"""
This module is adapted from the pyod library by yzhao062.
Original source: https://github.com/yzhao062/pyod
"""
|
|
| from __future__ import division |
| from __future__ import print_function |
| import warnings |
|
|
| import numpy as np |
| from scipy.spatial.distance import cdist |
| from sklearn.cluster import KMeans |
| from sklearn.utils import check_array |
| from sklearn.utils.validation import check_is_fitted |
| from sklearn.utils.estimator_checks import check_estimator |
|
|
| from ..utils.stat_models import pairwise_distances_no_broadcast |
| from ..utils.utility import check_parameter |
| from .base import BaseDetector |
| from ..utils.utility import zscore |
|
|
|
|
class CBLOF(BaseDetector):
    r"""The CBLOF operator calculates the outlier score based on cluster-based
    local outlier factor.

    CBLOF takes as an input the data set and the cluster model that was
    generated by a clustering algorithm. It classifies the clusters into small
    clusters and large clusters using the parameters alpha and beta.
    The anomaly score is then calculated based on the size of the cluster the
    point belongs to as well as the distance to the nearest large cluster.

    Use weighting for outlier factor based on the sizes of the clusters as
    proposed in the original publication. Since this might lead to unexpected
    behavior (outliers close to small clusters are not found), it is disabled
    by default. Outlier scores are then solely computed based on their
    distance to the closest large cluster center.

    By default, kMeans is used as the clustering algorithm instead of the
    Squeezer algorithm mentioned in the original paper for multiple reasons.

    See :cite:`he2003discovering` for details.

    Parameters
    ----------
    n_clusters : int, optional (default=8)
        The number of clusters to form as well as the number of
        centroids to generate.

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    clustering_estimator : Estimator, optional (default=None)
        The base clustering algorithm for performing data clustering.
        A valid clustering algorithm should be passed in. The estimator should
        have standard sklearn APIs, fit() and predict(). The estimator should
        have attributes ``labels_`` and ``cluster_centers_``.
        If ``cluster_centers_`` is not in the attributes once the model is fit,
        it is calculated as the mean of the samples in a cluster.

        If not set, CBLOF uses KMeans for scalability. See
        https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

    alpha : float in (0.5, 1), optional (default=0.9)
        Coefficient for deciding small and large clusters. The ratio
        of the number of samples in large clusters to the number of samples in
        small clusters.

    beta : int or float in (1,), optional (default=5).
        Coefficient for deciding small and large clusters. For a list of
        clusters sorted by size ``|C1|, |C2|, ..., |Cn|``,
        ``beta = |Ck| / |Ck-1|``.

    use_weights : bool, optional (default=False)
        If set to True, the size of clusters are used as weights in
        outlier score calculation.

    check_estimator : bool, optional (default=False)
        If set to True, check whether the base estimator is consistent with
        sklearn standard.

        .. warning::
            check_estimator may throw errors with scikit-learn 0.20 above.

    random_state : int, RandomState or None, optional (default=0)
        If int, random_state is the seed used by the random
        number generator; If RandomState instance, random_state is the random
        number generator; If None, the random number generator is the
        RandomState instance used by `np.random`.

    n_jobs : int, optional (default=1)
        Number of parallel jobs. Stored for API compatibility.

    normalize : bool, optional (default=True)
        If True, each sample is z-score normalized across its features
        before clustering (in ``fit``) and before scoring
        (in ``decision_function``).

    Attributes
    ----------
    clustering_estimator_ : Estimator, sklearn instance
        Base estimator for clustering.

    cluster_labels_ : list of shape (n_samples,)
        Cluster assignment for the training samples.

    n_clusters_ : int
        Actual number of clusters (possibly different from n_clusters).

    cluster_sizes_ : list of shape (n_clusters_,)
        The size of each cluster once fitted with the training data.

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher scores.
        This value is available once the detector is fitted.

    cluster_centers_ : numpy array of shape (n_clusters_, n_features)
        The center of each cluster.

    small_cluster_labels_ : list of clusters numbers
        The cluster assignments belonging to small clusters.

    large_cluster_labels_ : list of clusters numbers
        The cluster assignments belonging to large clusters.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, n_clusters=8, contamination=0.1,
                 clustering_estimator=None, alpha=0.9, beta=5,
                 use_weights=False, check_estimator=False, random_state=0,
                 n_jobs=1, normalize=True):
        super(CBLOF, self).__init__(contamination=contamination)
        self.n_clusters = n_clusters
        self.clustering_estimator = clustering_estimator
        self.alpha = alpha
        self.beta = beta
        self.use_weights = use_weights
        self.check_estimator = check_estimator
        self.random_state = random_state
        # BUG FIX: n_jobs was previously accepted but silently dropped;
        # store it so the parameter round-trips (sklearn convention).
        # NOTE(review): it is not currently forwarded to the estimator.
        self.n_jobs = n_jobs
        self.normalize = normalize

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)
        n_samples, n_features = X.shape
        if self.normalize:
            # Per-sample z-score across features (axis=1), mirroring the
            # normalization applied again in decision_function.
            X = zscore(X, axis=1, ddof=1)

        # check parameters and set default clustering algorithm
        self._validate_estimator(default=KMeans(
            n_clusters=self.n_clusters,
            random_state=self.random_state))

        self.clustering_estimator_.fit(X=X, y=y)
        # Get the labels of the clustering results;
        # labels_ is consistent across sklearn clustering algorithms.
        self.cluster_labels_ = self.clustering_estimator_.labels_
        self.cluster_sizes_ = np.bincount(self.cluster_labels_)

        # Get the actual number of clusters
        self.n_clusters_ = self.cluster_sizes_.shape[0]

        if self.n_clusters_ != self.n_clusters:
            # BUG FIX: the original concatenated message lacked a space
            # between "clusters" and "which".
            warnings.warn("The chosen clustering for CBLOF forms {0} "
                          "clusters which is inconsistent with n_clusters "
                          "({1}).".format(self.n_clusters_, self.n_clusters))

        self._set_cluster_centers(X, n_features)
        self._set_small_large_clusters(n_samples)

        self.decision_scores_ = self._decision_function(X,
                                                        self.cluster_labels_)

        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        X = check_array(X)
        # BUG FIX: fit() normalizes X when self.normalize is True, but the
        # original decision_function did not, so prediction-time distances
        # were computed in a different feature space than the fitted
        # cluster centers. Apply the same per-sample z-score here.
        if self.normalize:
            X = zscore(X, axis=1, ddof=1)
        labels = self.clustering_estimator_.predict(X)
        return self._decision_function(X, labels)

    def _validate_estimator(self, default=None):
        """Check the value of alpha and beta and clustering algorithm.
        """
        check_parameter(self.alpha, low=0, high=1, param_name='alpha',
                        include_left=False, include_right=False)

        check_parameter(self.beta, low=1, param_name='beta',
                        include_left=False)

        if self.clustering_estimator is not None:
            self.clustering_estimator_ = self.clustering_estimator
        else:
            self.clustering_estimator_ = default

        # make sure the base clustering algorithm is valid
        if self.clustering_estimator_ is None:
            raise ValueError("clustering algorithm cannot be None")

        if self.check_estimator:
            check_estimator(self.clustering_estimator_)

    def _set_cluster_centers(self, X, n_features):
        """Derive cluster centers, falling back to per-cluster means when
        the estimator does not expose ``cluster_centers_``."""
        # Noted not all clustering algorithms have cluster_centers_
        if hasattr(self.clustering_estimator_, 'cluster_centers_'):
            self.cluster_centers_ = self.clustering_estimator_.cluster_centers_
        else:
            # BUG FIX: the original concatenated warning lacked spaces at
            # the string-join boundaries ("havethe", "centeras").
            warnings.warn("The chosen clustering for CBLOF does not have "
                          "the center of clusters. Calculate the center "
                          "as the mean of the clusters.")
            self.cluster_centers_ = np.zeros([self.n_clusters_, n_features])
            for i in range(self.n_clusters_):
                self.cluster_centers_[i, :] = np.mean(
                    X[np.where(self.cluster_labels_ == i)], axis=0)

    def _set_small_large_clusters(self, n_samples):
        """Partition clusters into small and large ones using alpha/beta,
        following the original CBLOF publication."""
        # Sort the index of clusters by the number of samples belonging to it
        size_clusters = np.bincount(self.cluster_labels_)

        # Sort the order from the largest to the smallest
        sorted_cluster_indices = np.argsort(size_clusters * -1)

        # Initialize the lists of index that fulfill the requirements by
        # either alpha or beta
        alpha_list = []
        beta_list = []

        for i in range(1, self.n_clusters_):
            temp_sum = np.sum(size_clusters[sorted_cluster_indices[:i]])
            if temp_sum >= n_samples * self.alpha:
                alpha_list.append(i)

            if size_clusters[sorted_cluster_indices[i - 1]] / size_clusters[
                    sorted_cluster_indices[i]] >= self.beta:
                beta_list.append(i)

        # Find the separation index that fulfills both alpha and beta
        intersection = np.intersect1d(alpha_list, beta_list)

        if len(intersection) > 0:
            self._clustering_threshold = intersection[0]
        elif len(alpha_list) > 0:
            self._clustering_threshold = alpha_list[0]
        elif len(beta_list) > 0:
            self._clustering_threshold = beta_list[0]
        else:
            raise ValueError("Could not form valid cluster separation. Please "
                             "change n_clusters or change clustering method")

        self.small_cluster_labels_ = sorted_cluster_indices[
            self._clustering_threshold:]
        self.large_cluster_labels_ = sorted_cluster_indices[
            0:self._clustering_threshold]

        # No need to calculate samples in small clusters separately;
        # the centers of large clusters are all that the scoring needs.
        self._large_cluster_centers = self.cluster_centers_[
            self.large_cluster_labels_]

    def _decision_function(self, X, labels):
        """Compute CBLOF scores: distance to the nearest large-cluster
        center for samples in small clusters, distance to the own cluster
        center for samples in large clusters."""
        # Initialize the score array
        scores = np.zeros([X.shape[0], ])

        small_indices = np.where(
            np.isin(labels, self.small_cluster_labels_))[0]
        large_indices = np.where(
            np.isin(labels, self.large_cluster_labels_))[0]

        if small_indices.shape[0] != 0:
            # Calculate the outlier factor for the samples in small clusters
            dist_to_large_center = cdist(X[small_indices, :],
                                         self._large_cluster_centers)

            scores[small_indices] = np.min(dist_to_large_center, axis=1)

        if large_indices.shape[0] != 0:
            # Calculate the outlier factor for the samples in large clusters
            large_centers = self.cluster_centers_[labels[large_indices]]

            scores[large_indices] = pairwise_distances_no_broadcast(
                X[large_indices, :], large_centers)

        if self.use_weights:
            # Weights are calculated as the number of elements in the cluster
            scores = scores * self.cluster_sizes_[labels]

        return scores.ravel()