Add validation utilities, update model conversion logic, fix manifest.json, rename deprecated flags, improve docs

Files changed (4) hide show

README.md +1 -1
convert.py +252 -257
sharp.mlpackage/Data/com.apple.CoreML/model.mlmodel +1 -1
sharp.mlpackage/Manifest.json +8 -8

README.md CHANGED Viewed

@@ -63,7 +63,7 @@ Use the provided [sharp.swift](sharp.swift) inference script to load the model a
 swiftc -O -o run_sharp sharp.swift -framework CoreML -framework CoreImage -framework AppKit
 # Run inference on an image and decimate the output by 50%
-./run_sharp sharp.mlpackage city.png city.ply -d 0.5
 ```
 > Inference on an Apple M4 Max takes ~1.9 seconds.

 swiftc -O -o run_sharp sharp.swift -framework CoreML -framework CoreImage -framework AppKit
 # Run inference on an image and decimate the output by 50%
+./run_sharp sharp.mlpackage test.png test.ply -d 0.5
 ```
 > Inference on an Apple M4 Max takes ~1.9 seconds.

convert.py CHANGED Viewed

@@ -8,6 +8,7 @@ from __future__ import annotations
 import argparse
 import logging
 from pathlib import Path
 from typing import Any
@@ -25,19 +26,92 @@ LOGGER = logging.getLogger(__name__)
 DEFAULT_MODEL_URL = "https://ml-site.cdn-apple.com/models/sharp/sharp_2572gikvuh.pt"
-class SafeClamp(nn.Module):
-    """Safe clamp operation that avoids tracing issues."""
-    def forward(self, x, min_val=1e-4, max_val=1e4):
-        return torch.clamp(x, min=min_val, max=max_val)
-class SafeDivision(nn.Module):
-    """Safe division that avoids division by zero."""
-    def forward(self, numerator, denominator):
-        return numerator / torch.clamp(denominator, min=1e-8)
 class SharpModelTraceable(nn.Module):
@@ -61,10 +135,10 @@ class SharpModelTraceable(nn.Module):
         self.prediction_head = predictor.prediction_head
         self.gaussian_composer = predictor.gaussian_composer
         self.depth_alignment = predictor.depth_alignment
-        # Replace problematic operations with custom modules
-        self.safe_clamp = SafeClamp()
-        self.safe_div = SafeDivision()
     def forward(
         self,
@@ -95,12 +169,17 @@ class SharpModelTraceable(nn.Module):
         # Apply depth alignment (inference mode)
         monodepth, _ = self.depth_alignment(monodepth, None, monodepth_output.decoder_features)
         # Initialize gaussians
         init_output = self.init_model(image, monodepth)
-        # Store global_scale for debugging if in eval mode (not during tracing)
-        if hasattr(self, '_store_global_scale'):
-            self._stored_global_scale = init_output.global_scale
         # Extract features
         image_features = self.feature_model(
@@ -358,142 +437,6 @@ def convert_to_coreml(
     return mlmodel
-def convert_to_coreml_with_preprocessing(
-    predictor: RGBGaussianPredictor,
-    output_path: Path,
-    input_shape: tuple[int, int] = (1536, 1536),
-) -> ct.models.MLModel:
-    """Convert SHARP model to Core ML with built-in image preprocessing.
-    This version includes image normalization as part of the model,
-    accepting uint8 images as input.
-    Args:
-        predictor: The SHARP RGBGaussianPredictor model.
-        output_path: Path to save the .mlmodel file.
-        input_shape: Input image shape (height, width).
-    Returns:
-        The converted Core ML model.
-    """
-    class SharpWithPreprocessing(nn.Module):
-        """SHARP model with integrated preprocessing."""
-        def __init__(self, base_model: SharpModelTraceable):
-            super().__init__()
-            self.base_model = base_model
-        def forward(
-            self,
-            image: torch.Tensor,
-            disparity_factor: torch.Tensor
-        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-            # Normalize image from [0, 255] to [0, 1]
-            image_normalized = image / 255.0
-            return self.base_model(image_normalized, disparity_factor)
-    model_wrapper = SharpWithPreprocessing(SharpModelTraceable(predictor))
-    model_wrapper.eval()
-    height, width = input_shape
-    example_image = torch.randint(0, 256, (1, 3, height, width), dtype=torch.float32)
-    example_disparity_factor = torch.tensor([1.0])
-    LOGGER.info("Tracing model with preprocessing...")
-    with torch.no_grad():
-        traced_model = torch.jit.trace(
-            model_wrapper,
-            (example_image, example_disparity_factor),
-            strict=False,
-        )
-    inputs = [
-        ct.ImageType(
-            name="image",
-            shape=(1, 3, height, width),
-            scale=1.0,  # Will be normalized in the model
-            color_layout=ct.colorlayout.RGB,
-        ),
-        ct.TensorType(
-            name="disparity_factor",
-            shape=(1,),
-            dtype=np.float32,
-        ),
-    ]
-    # Define output names with clear, descriptive labels
-    output_names = [
-        "mean_vectors_3d_positions",         # 3D positions (NDC space)
-        "singular_values_scales",            # Scale parameters (diagonal of covariance)
-        "quaternions_rotations",             # Rotation as quaternions
-        "colors_rgb_linear",                 # RGB colors in linear color space
-        "opacities_alpha_channel",           # Opacity values (alpha)
-    ]
-    # Define outputs with proper names for Core ML conversion
-    outputs = [
-        ct.TensorType(name=output_names[0], dtype=np.float32),
-        ct.TensorType(name=output_names[1], dtype=np.float32),
-        ct.TensorType(name=output_names[2], dtype=np.float32),
-        ct.TensorType(name=output_names[3], dtype=np.float32),
-        ct.TensorType(name=output_names[4], dtype=np.float32),
-    ]
-    mlmodel = ct.convert(
-        traced_model,
-        inputs=inputs,
-        outputs=outputs,  # Specify output names during conversion
-        convert_to="mlprogram",
-        compute_precision=ct.precision.FLOAT16,
-    )
-    mlmodel.author = "Apple Inc."
-    mlmodel.short_description = "SHARP model with integrated image preprocessing"
-    mlmodel.version = "1.0.0"
-    # Output descriptions with clear intent and units
-    output_descriptions = {
-        "mean_vectors_3d_positions": (
-            "3D positions of Gaussian splats in normalized device coordinates (NDC). "
-            "Shape: (1, N, 3), where N is the number of Gaussians."
-        ),
-        "singular_values_scales": (
-            "Scale factors for each Gaussian along its principal axes. "
-            "Represents size and anisotropy. Shape: (1, N, 3)."
-        ),
-        "quaternions_rotations": (
-            "Rotation of each Gaussian as a unit quaternion [w, x, y, z]. "
-            "Used to orient the ellipsoid. Shape: (1, N, 4)."
-        ),
-        "colors_rgb_linear": (
-            "RGB color values in linear RGB space (not gamma-corrected). "
-            "Shape: (1, N, 3), with range [0, 1]."
-        ),
-        "opacities_alpha_channel": (
-            "Opacity value per Gaussian (alpha channel), used for blending. "
-            "Shape: (1, N), where values are in [0, 1]."
-        ),
-    }
-    # Update output names and descriptions via spec BEFORE saving
-    spec = mlmodel.get_spec()
-    # Set output descriptions
-    for i, name in enumerate(output_names):
-        if i < len(spec.description.output):
-            output = spec.description.output[i]
-            output.name = name
-            output.shortDescription = output_descriptions[name]
-    LOGGER.info("Output names after update: %s", [o.name for o in spec.description.output])
-    # Save the model with correct names
-    mlmodel.save(str(output_path))
-    return mlmodel
 class QuaternionValidator:
     """Validator for quaternion comparisons with configurable tolerances and outlier analysis."""
@@ -658,6 +601,130 @@ class QuaternionValidator:
         }
 def format_validation_table(
     validation_results: list[dict],
     image_name: str,
@@ -1222,89 +1289,29 @@ def validate_with_single_image_detailed(
     """
     # Load and preprocess the input image
     test_image = load_and_preprocess_image(image_path, input_shape)
-    test_disparity = np.array([1.0], dtype=np.float32)
-    # Run PyTorch model
-    traceable_wrapper = SharpModelTraceable(pytorch_model)
-    traceable_wrapper.eval()
-    with torch.no_grad():
-        pt_outputs = traceable_wrapper(test_image, torch.from_numpy(test_disparity))
-    # Run Core ML model
-    test_image_np = test_image.numpy()
-    coreml_inputs = {
-        "image": test_image_np,
-        "disparity_factor": test_disparity,
-    }
-    coreml_outputs = mlmodel.predict(coreml_inputs)
-    # Output configuration
-    output_names = ["mean_vectors_3d_positions", "singular_values_scales", "quaternions_rotations", "colors_rgb_linear", "opacities_alpha_channel"]
     # Tolerances for real image validation
-    tolerances = {
-        "mean_vectors_3d_positions": 1.2,
-        "singular_values_scales": 0.01,
-        "colors_rgb_linear": 0.01,
-        "opacities_alpha_channel": 0.05,
-        "quaternions_rotations": 5.0,
-    }
-    # Use provided validator or create default
     if quat_validator is None:
-        quat_validator = QuaternionValidator()
-    # Collect validation results
-    validation_results = []
-    for i, name in enumerate(output_names):
-        pt_output = pt_outputs[i].numpy()
-        # Find matching Core ML output
-        coreml_key = None
-        if name in coreml_outputs:
-            coreml_key = name
-        else:
-            for key in coreml_outputs:
-                base_name = name.split('_')[0]
-                if base_name in key.lower():
-                    coreml_key = key
-                    break
-            if coreml_key is None:
-                coreml_key = list(coreml_outputs.keys())[i]
-        coreml_output = coreml_outputs[coreml_key]
-        result = {"output": name, "passed": True, "failure_reason": ""}
-        if name == "quaternions_rotations":
-            # Use QuaternionValidator
-            quat_result = quat_validator.validate(pt_output, coreml_output, image_name=image_path.name)
-            result.update({
-                "max_diff": f"{quat_result['stats']['max']:.6f}",
-                "mean_diff": f"{quat_result['stats']['mean']:.6f}",
-                "p99_diff": f"{quat_result['stats']['p99']:.6f}",
-                "passed": quat_result["passed"],
-                "failure_reason": "; ".join(quat_result["failure_reasons"]) if quat_result["failure_reasons"] else "",
-            })
-        else:
-            diff = np.abs(pt_output - coreml_output)
-            output_tolerance = tolerances.get(name, 0.01)
-            max_diff = np.max(diff)
-            result.update({
-                "max_diff": f"{max_diff:.6f}",
-                "mean_diff": f"{np.mean(diff):.6f}",
-                "p99_diff": f"{np.percentile(diff, 99):.6f}",
-            })
-            if max_diff > output_tolerance:
-                result["passed"] = False
-                result["failure_reason"] = f"max diff {max_diff:.6f} > tolerance {output_tolerance:.6f}"
-        validation_results.append(result)
     return validation_results
@@ -1469,11 +1476,6 @@ def main():
         action="store_true",
         help="Validate Core ML model against PyTorch",
     )
-    parser.add_argument(
-        "--with-preprocessing",
-        action="store_true",
-        help="Include image preprocessing (uint8 -> float normalization)",
-    )
     parser.add_argument(
         "-v", "--verbose",
         action="store_true",
@@ -1522,21 +1524,13 @@ def main():
     precision = ct.precision.FLOAT16 if args.precision == "float16" else ct.precision.FLOAT32
     # Convert to Core ML
-    if args.with_preprocessing:
-        LOGGER.info("Converting with integrated preprocessing...")
-        mlmodel = convert_to_coreml_with_preprocessing(
-            predictor,
-            args.output,
-            input_shape=input_shape,
-        )
-    else:
-        LOGGER.info("Converting using direct tracing...")
-        mlmodel = convert_to_coreml(
-            predictor,
-            args.output,
-            input_shape=input_shape,
-            compute_precision=precision,
-        )
     LOGGER.info(f"Core ML model saved to {args.output}")
@@ -1570,3 +1564,4 @@ def main():
 if __name__ == "__main__":
     exit(main())

 import argparse
 import logging
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 DEFAULT_MODEL_URL = "https://ml-site.cdn-apple.com/models/sharp/sharp_2572gikvuh.pt"
+# ============================================================================
+# Constants & Configuration
+# ============================================================================
+# Output names for Core ML model
+OUTPUT_NAMES = [
+    "mean_vectors_3d_positions",
+    "singular_values_scales",
+    "quaternions_rotations",
+    "colors_rgb_linear",
+    "opacities_alpha_channel",
+]
+# Output descriptions for Core ML metadata
+OUTPUT_DESCRIPTIONS = {
+    "mean_vectors_3d_positions": (
+        "3D positions of Gaussian splats in normalized device coordinates (NDC). "
+        "Shape: (1, N, 3), where N is the number of Gaussians."
+    ),
+    "singular_values_scales": (
+        "Scale factors for each Gaussian along its principal axes. "
+        "Represents size and anisotropy. Shape: (1, N, 3)."
+    ),
+    "quaternions_rotations": (
+        "Rotation of each Gaussian as a unit quaternion [w, x, y, z]. "
+        "Used to orient the ellipsoid. Shape: (1, N, 4)."
+    ),
+    "colors_rgb_linear": (
+        "RGB color values in linear RGB space (not gamma-corrected). "
+        "Shape: (1, N, 3), with range [0, 1]."
+    ),
+    "opacities_alpha_channel": (
+        "Opacity value per Gaussian (alpha channel), used for blending. "
+        "Shape: (1, N), where values are in [0, 1]."
+    ),
+}
+@dataclass
+class ToleranceConfig:
+    """Tolerance configuration for validation.
+    Every field defaults to ``None``; ``__post_init__`` replaces each ``None``
+    with a freshly built default dict, so instances never share a mutable
+    default and callers may override any subset of the four tables.
+    """
+    # Tolerances for random validation (tight), keyed by output name
+    random_tolerances: dict[str, float] | None = None
+    # Tolerances for real image validation (more lenient), keyed by output name
+    image_tolerances: dict[str, float] | None = None
+    # Angular tolerances for quaternions (in degrees), keyed by statistic
+    # ("mean", "p99", "p99_9", "max")
+    angular_tolerances_random: dict[str, float] | None = None
+    angular_tolerances_image: dict[str, float] | None = None
+    def __post_init__(self):
+        """Fill in the built-in defaults for every table left as ``None``."""
+        if self.random_tolerances is None:
+            self.random_tolerances = {
+                "mean_vectors_3d_positions": 0.001,
+                "singular_values_scales": 0.0001,
+                "quaternions_rotations": 2.0,
+                "colors_rgb_linear": 0.002,
+                "opacities_alpha_channel": 0.005,
+            }
+        if self.image_tolerances is None:
+            self.image_tolerances = {
+                "mean_vectors_3d_positions": 1.2,
+                "singular_values_scales": 0.01,
+                "quaternions_rotations": 5.0,
+                "colors_rgb_linear": 0.01,
+                "opacities_alpha_channel": 0.05,
+            }
+        if self.angular_tolerances_random is None:
+            self.angular_tolerances_random = {
+                "mean": 0.01,
+                "p99": 0.1,
+                "p99_9": 1.0,
+                "max": 5.0,
+            }
+        if self.angular_tolerances_image is None:
+            self.angular_tolerances_image = {
+                "mean": 0.2,
+                "p99": 2.0,
+                "p99_9": 5.0,
+                "max": 25.0,
+            }
 class SharpModelTraceable(nn.Module):
         self.prediction_head = predictor.prediction_head
         self.gaussian_composer = predictor.gaussian_composer
         self.depth_alignment = predictor.depth_alignment
+        # For debugging: last global_scale and monodepth minimum seen by forward()
+        self.last_global_scale = None
+        self.last_monodepth_min = None
     def forward(
         self,
         # Apply depth alignment (inference mode)
         monodepth, _ = self.depth_alignment(monodepth, None, monodepth_output.decoder_features)
+        # Store monodepth min for debugging (before normalization)
+        if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+            self.last_monodepth_min = monodepth.flatten().min().item()
         # Initialize gaussians
         init_output = self.init_model(image, monodepth)
+        # Store global_scale for debugging
+        if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+            if init_output.global_scale is not None:
+                self.last_global_scale = init_output.global_scale.item()
         # Extract features
         image_features = self.feature_model(
     return mlmodel
 class QuaternionValidator:
     """Validator for quaternion comparisons with configurable tolerances and outlier analysis."""
         }
+def find_coreml_output_key(name: str, coreml_outputs: dict) -> str:
+    """Find matching Core ML output key for a given output name.
+    Resolution order: an exact key match; then the first key whose lower-cased
+    form contains the name's leading ``_``-separated token; then the key at
+    the name's position in ``OUTPUT_NAMES`` (position 0 when the name is not
+    listed there).
+    Args:
+        name: The expected output name
+        coreml_outputs: Dictionary of Core ML outputs
+    Returns:
+        The matching key from coreml_outputs
+    Raises:
+        IndexError: If nothing matches and the positional fallback is out of
+            range for ``coreml_outputs`` (including when it is empty).
+    """
+    if name in coreml_outputs:
+        return name
+    # Try partial match; the leading token is loop-invariant, so compute it once
+    base_name = name.split('_')[0]
+    for key in coreml_outputs:
+        if base_name in key.lower():
+            return key
+    # Fallback to index-based lookup
+    output_index = OUTPUT_NAMES.index(name) if name in OUTPUT_NAMES else 0
+    keys = list(coreml_outputs)
+    if output_index >= len(keys):
+        # Same exception type as plain list indexing, but with context
+        raise IndexError(
+            f"No Core ML output matches {name!r}: positional fallback index "
+            f"{output_index} is out of range for {len(keys)} output(s)"
+        )
+    return keys[output_index]
+def run_inference_pair(
+    pytorch_model: RGBGaussianPredictor,
+    mlmodel: ct.models.MLModel,
+    image_tensor: torch.Tensor,
+    disparity_factor: float = 1.0,
+) -> tuple[list[np.ndarray], dict[str, np.ndarray]]:
+    """Evaluate one input with both the PyTorch wrapper and the Core ML model.
+    Args:
+        pytorch_model: The PyTorch model; it is wrapped in SharpModelTraceable
+        mlmodel: The Core ML model
+        image_tensor: Input image tensor, fed unchanged to both models
+        disparity_factor: Disparity factor value, fed to both models
+    Returns:
+        Tuple of (pytorch_outputs, coreml_outputs): the wrapper's outputs
+        converted to numpy arrays, and the result of ``mlmodel.predict``
+    """
+    # PyTorch side: evaluate the traceable wrapper without gradient tracking
+    reference = SharpModelTraceable(pytorch_model)
+    reference.eval()
+    disparity_tensor = torch.tensor([disparity_factor])
+    with torch.no_grad():
+        torch_results = reference(image_tensor, disparity_tensor)
+    pt_outputs_np = [tensor.numpy() for tensor in torch_results]
+    # Core ML side: same inputs, passed as numpy arrays under the model's input names
+    coreml_outputs = mlmodel.predict({
+        "image": image_tensor.numpy(),
+        "disparity_factor": np.array([disparity_factor], dtype=np.float32),
+    })
+    return pt_outputs_np, coreml_outputs
+def compare_outputs(
+    pt_outputs: list[np.ndarray],
+    coreml_outputs: dict[str, np.ndarray],
+    tolerances: dict[str, float],
+    quat_validator: QuaternionValidator,
+    image_name: str = "Unknown",
+) -> list[dict]:
+    """Compare PyTorch and Core ML outputs, one result row per OUTPUT_NAMES entry.
+    Args:
+        pt_outputs: List of PyTorch outputs, in OUTPUT_NAMES order
+        coreml_outputs: Dictionary of Core ML outputs
+        tolerances: Max-abs-difference limit per output name (0.01 if absent)
+        quat_validator: QuaternionValidator instance, used for the
+            "quaternions_rotations" output only
+        image_name: Name of the image being validated
+    Returns:
+        List of validation result dictionaries with keys "output", "passed",
+        "failure_reason", "max_diff", "mean_diff" and "p99_diff"
+    """
+    report = []
+    for position, name in enumerate(OUTPUT_NAMES):
+        reference = pt_outputs[position]
+        candidate = coreml_outputs[find_coreml_output_key(name, coreml_outputs)]
+        entry = {"output": name, "passed": True, "failure_reason": ""}
+        if name != "quaternions_rotations":
+            # Plain element-wise comparison against a max-difference limit
+            abs_err = np.abs(reference - candidate)
+            limit = tolerances.get(name, 0.01)
+            worst = np.max(abs_err)
+            entry["max_diff"] = f"{worst:.6f}"
+            entry["mean_diff"] = f"{np.mean(abs_err):.6f}"
+            entry["p99_diff"] = f"{np.percentile(abs_err, 99):.6f}"
+            if worst > limit:
+                entry["passed"] = False
+                entry["failure_reason"] = f"max diff {worst:.6f} > tolerance {limit:.6f}"
+        else:
+            # Quaternions are judged by the validator, which reports its own verdict
+            outcome = quat_validator.validate(reference, candidate, image_name=image_name)
+            entry["max_diff"] = f"{outcome['stats']['max']:.6f}"
+            entry["mean_diff"] = f"{outcome['stats']['mean']:.6f}"
+            entry["p99_diff"] = f"{outcome['stats']['p99']:.6f}"
+            entry["passed"] = outcome["passed"]
+            entry["failure_reason"] = "; ".join(outcome["failure_reasons"]) if outcome["failure_reasons"] else ""
+        report.append(entry)
+    return report
 def format_validation_table(
     validation_results: list[dict],
     image_name: str,
     """
     # Load and preprocess the input image
     test_image = load_and_preprocess_image(image_path, input_shape)
+    # Run inference on both models
+    pt_outputs, coreml_outputs = run_inference_pair(pytorch_model, mlmodel, test_image)
     # Tolerances for real image validation
+    tolerance_config = ToleranceConfig()
+    tolerances = tolerance_config.image_tolerances
+    # Use provided validator or create default with image tolerances
     if quat_validator is None:
+        quat_validator = QuaternionValidator(
+            angular_tolerances=tolerance_config.angular_tolerances_image
+        )
+    # Compare outputs
+    validation_results = compare_outputs(
+        pt_outputs,
+        coreml_outputs,
+        tolerances,
+        quat_validator,
+        image_name=image_path.name
+    )
     return validation_results
         action="store_true",
         help="Validate Core ML model against PyTorch",
     )
     parser.add_argument(
         "-v", "--verbose",
         action="store_true",
     precision = ct.precision.FLOAT16 if args.precision == "float16" else ct.precision.FLOAT32
     # Convert to Core ML
+    LOGGER.info("Converting using direct tracing...")
+    mlmodel = convert_to_coreml(
+        predictor,
+        args.output,
+        input_shape=input_shape,
+        compute_precision=precision,
+    )
     LOGGER.info(f"Core ML model saved to {args.output}")
 if __name__ == "__main__":
     exit(main())
+    exit(main())

sharp.mlpackage/Data/com.apple.CoreML/model.mlmodel CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ca2a548947bdf1616a9c7ddf093c27dc0aeb8225a1e50cb40eb098d7aa47a2b5
 size 938769

 version https://git-lfs.github.com/spec/v1
+oid sha256:3e9fd96f088b6d324250226cfcbe7e197b735dbb9322687c177b4c2a8377fb51
 size 938769

sharp.mlpackage/Manifest.json CHANGED Viewed

@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "1504890B-E584-4EC2-A1CF-F87AE1A1BAA0": {
-            "author": "com.apple.CoreML",
-            "description": "CoreML Model Weights",
-            "name": "weights",
-            "path": "com.apple.CoreML/weights"
-        },
-        "D59C5780-FA59-423A-8088-BCF64225C1B3": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Specification",
             "name": "model.mlmodel",
             "path": "com.apple.CoreML/model.mlmodel"
         }
     },
-    "rootModelIdentifier": "D59C5780-FA59-423A-8088-BCF64225C1B3"
 }

 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
+        "551E6A6B-AAB8-4DA8-B1D0-2D3A73254AD2": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Specification",
             "name": "model.mlmodel",
             "path": "com.apple.CoreML/model.mlmodel"
+        },
+        "DD041C71-3C41-47F0-830E-A829C8EEC1EA": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Weights",
+            "name": "weights",
+            "path": "com.apple.CoreML/weights"
         }
     },
+    "rootModelIdentifier": "551E6A6B-AAB8-4DA8-B1D0-2D3A73254AD2"
 }