Spaces:
Sleeping
Sleeping
| """ | |
| Intrinsic Dimensionality Estimation Module | |
| This module provides utilities for estimating the intrinsic dimensionality of | |
| high-dimensional feature representations using Maximum Likelihood Estimation (MLE). | |
| The intrinsic dimension represents the true underlying dimensionality of the data | |
| manifold, which is often much lower than the ambient feature space dimension. | |
| """ | |
| import logging | |
| from typing import Union | |
| import numpy as np | |
| import torch | |
| import skdim | |
| from ncut_pytorch.utils.sample import farthest_point_sampling | |
# ===== Constants =====

# Default value of ``max_samples`` in ``estimate_intrinsic_dimension``; inputs
# with more samples than this are reduced with farthest point sampling first.
DEFAULT_MAX_SAMPLES = 2_000

# Inputs with fewer samples than this are rejected with a ValueError.
MIN_SAMPLES_REQUIRED = 10
def _sample_features_for_dimension_estimation(
    flattened_features: np.ndarray,
    max_samples: int,
) -> np.ndarray:
    """Pick a subset of ``flattened_features`` with farthest point sampling on CPU.

    The sampler is pinned to ``device="cpu"`` to avoid unsupported CUDA
    kernels on HF ZeroGPU.

    Args:
        flattened_features: Feature array to sample from; a float32 tensor
            copy of it is what the sampler sees.
        max_samples: Sample count handed to ``farthest_point_sampling``.

    Returns:
        ``flattened_features`` indexed by the indices the sampler returned.
    """
    # The sampler works on a torch tensor; the original array is only indexed.
    features_f32 = torch.tensor(flattened_features, dtype=torch.float32)
    keep = farthest_point_sampling(features_f32, max_samples, device="cpu")
    return flattened_features[keep]
| # ===== Intrinsic Dimensionality Estimation ===== | |
def _to_sample_matrix(features: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
    """Convert ``features`` into a 2D NumPy array of shape (N, D).

    The last axis is kept as the feature axis; all leading axes are folded
    into the sample axis.

    Raises:
        ValueError: If ``features`` is None, empty, or zero-dimensional.
    """
    if features is None:
        raise ValueError("Features cannot be None")

    if isinstance(features, torch.Tensor):
        if features.numel() == 0:
            raise ValueError("Input tensor is empty")
        tensor = features.detach().cpu()
        # NumPy has no bfloat16 dtype, so Tensor.numpy() would raise TypeError.
        if tensor.dtype == torch.bfloat16:
            tensor = tensor.float()
        numpy_features = tensor.numpy()
    else:
        numpy_features = np.asarray(features)
        if numpy_features.size == 0:
            raise ValueError("Input array is empty")

    # A scalar has no last axis to treat as the feature axis.
    if numpy_features.ndim == 0:
        raise ValueError("Features must have at least one dimension, got a scalar")

    return numpy_features.reshape(-1, numpy_features.shape[-1])


def estimate_intrinsic_dimension(features: Union[torch.Tensor, np.ndarray],
                                 max_samples: int = DEFAULT_MAX_SAMPLES,
                                 use_global_estimation: bool = True) -> float:
    """
    Estimate the intrinsic dimensionality of feature representations.

    The features are flattened to (N, D) and fitted with ``skdim.id.MLE``.
    When N exceeds ``max_samples``, a subset chosen by farthest point sampling
    is fitted instead. If the estimator's ``dimension_`` is non-positive or
    non-finite, the mean of the finite, positive entries of ``dimension_pw_``
    is used as a fallback. The result is capped at the ambient dimension D.

    Args:
        features (Union[torch.Tensor, np.ndarray]): Input features of any shape
            with at least one axis. Will be flattened to (N, D) format, where D
            is the size of the last axis. bfloat16 tensors are upcast to
            float32 before conversion to NumPy.
        max_samples (int): Maximum number of samples to use for estimation.
            Inputs with more samples are downsampled to this many.
        use_global_estimation (bool): Accepted for API compatibility.
            NOTE: the current implementation does not read this flag; the
            global estimate is always tried first.

    Returns:
        float: Estimated intrinsic dimensionality, at most D.

    Raises:
        ValueError: If input features are None, empty, scalar, have fewer than
            ``MIN_SAMPLES_REQUIRED`` samples, or fewer than 2 feature dimensions.
        RuntimeError: If dimensionality estimation fails completely.

    Example:
        >>> features = torch.randn(1000, 512)  # 1000 samples, 512-dim features
        >>> intrinsic_dim = estimate_intrinsic_dimension(features)
        >>> print(f"Intrinsic dimension: {intrinsic_dim:.2f}")
    """
    # Validate and reshape to 2D format (N_samples, N_features)
    flattened_features = _to_sample_matrix(features)
    n_samples, n_features = flattened_features.shape

    # Validate minimum requirements
    if n_samples < MIN_SAMPLES_REQUIRED:
        raise ValueError(
            f"Insufficient samples for dimensionality estimation. "
            f"Need at least {MIN_SAMPLES_REQUIRED}, got {n_samples}"
        )
    if n_features < 2:
        raise ValueError(
            f"Feature dimension must be at least 2, got {n_features}"
        )

    # Apply farthest point sampling if dataset is too large
    if n_samples > max_samples:
        logging.info(
            f"Dataset has {n_samples} samples, downsampling to {max_samples} "
            f"using farthest point sampling for efficiency"
        )
        sampled_features = _sample_features_for_dimension_estimation(
            flattened_features,
            max_samples,
        )
    else:
        sampled_features = flattened_features

    # Validate sampled data quality (warn only; estimation is still attempted)
    if np.any(np.isnan(sampled_features)) or np.any(np.isinf(sampled_features)):
        logging.warning("Input features contain NaN or infinite values, which may affect estimation")

    # Estimate intrinsic dimensionality using MLE
    try:
        mle_estimator = skdim.id.MLE()
        fitted_estimator = mle_estimator.fit(sampled_features)
        estimated_dimension = fitted_estimator.dimension_

        # Handle failed global estimation
        if estimated_dimension <= 0 or not np.isfinite(estimated_dimension):
            local_dimensions = getattr(fitted_estimator, 'dimension_pw_', None)
            if local_dimensions is None:
                raise RuntimeError("Global dimensionality estimation failed and no local estimates available")

            # Fallback to the usable local dimension estimates
            valid_local_dims = local_dimensions[np.isfinite(local_dimensions) & (local_dimensions > 0)]
            if len(valid_local_dims) == 0:
                raise RuntimeError("Both global and local dimensionality estimation failed")

            estimated_dimension = float(np.mean(valid_local_dims))
            logging.warning(
                f"Global intrinsic dimension estimation failed (got {fitted_estimator.dimension_}). "
                f"Using mean of {len(valid_local_dims)} local estimates: {estimated_dimension:.2f}"
            )

        # Sanity check: intrinsic dimension should not exceed ambient dimension
        if estimated_dimension > n_features:
            logging.warning(
                f"Estimated intrinsic dimension ({estimated_dimension:.2f}) exceeds "
                f"ambient dimension ({n_features}). Capping to ambient dimension."
            )
            estimated_dimension = float(n_features)

        # Log results
        compression_ratio = n_features / estimated_dimension if estimated_dimension > 0 else np.inf
        logging.info(
            f"Intrinsic dimensionality estimation completed: "
            f"{estimated_dimension:.2f} (compression ratio: {compression_ratio:.1f}x)"
        )
        return float(estimated_dimension)
    except RuntimeError:
        # Already the documented failure type (including the ones raised
        # above); re-wrapping would only duplicate the message.
        raise
    except Exception as e:
        raise RuntimeError(f"Intrinsic dimensionality estimation failed: {e}") from e