diff --git a/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_2/data-v1-dl-1666876.arff.gz b/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_2/data-v1-dl-1666876.arff.gz
new file mode 100644
index 0000000000000000000000000000000000000000..ee6e378589d722771363d186944ed1f0f78c9836
--- /dev/null
+++ b/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_2/data-v1-dl-1666876.arff.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfe8945b949770b0da42daf58ce67d1c5fee25cf7b4fd145161837c2abc09429
+size 1841
diff --git a/.venv/Lib/site-packages/sklearn/mixture/__init__.py b/.venv/Lib/site-packages/sklearn/mixture/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..af69fc1649307bba5a43fda5772eaa8c03f7aa1b
--- /dev/null
+++ b/.venv/Lib/site-packages/sklearn/mixture/__init__.py
@@ -0,0 +1,9 @@
+"""Mixture modeling algorithms."""
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+from ._bayesian_mixture import BayesianGaussianMixture
+from ._gaussian_mixture import GaussianMixture
+
+__all__ = ["GaussianMixture", "BayesianGaussianMixture"]
diff --git a/.venv/Lib/site-packages/sklearn/mixture/tests/__init__.py b/.venv/Lib/site-packages/sklearn/mixture/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/Lib/site-packages/sklearn/mixture/tests/test_bayesian_mixture.py b/.venv/Lib/site-packages/sklearn/mixture/tests/test_bayesian_mixture.py
new file mode 100644
index 0000000000000000000000000000000000000000..306c6a7fc2b16a7c9d6970d1fabd620b6f198f46
--- /dev/null
+++ b/.venv/Lib/site-packages/sklearn/mixture/tests/test_bayesian_mixture.py
@@ -0,0 +1,464 @@
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+import copy
+
+import numpy as np
+import pytest
+from scipy.special import gammaln
+
+from sklearn.exceptions import NotFittedError
+from sklearn.metrics.cluster import adjusted_rand_score
+from sklearn.mixture import BayesianGaussianMixture
+from sklearn.mixture._bayesian_mixture import _log_dirichlet_norm, _log_wishart_norm
+from sklearn.mixture.tests.test_gaussian_mixture import RandomData
+from sklearn.utils._testing import (
+    assert_almost_equal,
+    assert_array_equal,
+)
+
+COVARIANCE_TYPE = ["full", "tied", "diag", "spherical"]
+PRIOR_TYPE = ["dirichlet_process", "dirichlet_distribution"]
+
+
+def test_log_dirichlet_norm():
+    rng = np.random.RandomState(0)
+
+    weight_concentration = rng.rand(2)
+    expected_norm = gammaln(np.sum(weight_concentration)) - np.sum(
+        gammaln(weight_concentration)
+    )
+    predicted_norm = _log_dirichlet_norm(weight_concentration)
+
+    assert_almost_equal(expected_norm, predicted_norm)
+
+
+def test_log_wishart_norm():
+    rng = np.random.RandomState(0)
+
+    n_components, n_features = 5, 2
+    degrees_of_freedom = np.abs(rng.rand(n_components)) + 1.0
+    log_det_precisions_chol = n_features * np.log(range(2, 2 + n_components))
+
+    expected_norm = np.empty(5)
+    for k, (degrees_of_freedom_k, log_det_k) in enumerate(
+        zip(degrees_of_freedom, log_det_precisions_chol)
+    ):
+        expected_norm[k] = -(
+            degrees_of_freedom_k * (log_det_k + 0.5 * n_features * np.log(2.0))
+            + np.sum(
+                gammaln(
+                    0.5
+                    * (degrees_of_freedom_k - np.arange(0, n_features)[:, np.newaxis])
+                ),
+                0,
+            )
+        ).item()
+    predicted_norm = _log_wishart_norm(
+        degrees_of_freedom, log_det_precisions_chol, n_features
+    )
+
+    assert_almost_equal(expected_norm, predicted_norm)
+
+
+def test_bayesian_mixture_weights_prior_initialisation():
+    rng = np.random.RandomState(0)
+    n_samples, n_components, n_features = 10, 5, 2
+    X = rng.rand(n_samples, n_features)
+
+    # Check correct init for a given value of weight_concentration_prior
+    weight_concentration_prior = rng.rand()
+    bgmm = BayesianGaussianMixture(
+        weight_concentration_prior=weight_concentration_prior, random_state=rng
+    ).fit(X)
+    assert_almost_equal(weight_concentration_prior, bgmm.weight_concentration_prior_)
+
+    # Check correct init for the default value of weight_concentration_prior
+    bgmm = BayesianGaussianMixture(n_components=n_components, random_state=rng).fit(X)
+    assert_almost_equal(1.0 / n_components, bgmm.weight_concentration_prior_)
+
+
+def test_bayesian_mixture_mean_prior_initialisation():
+    rng = np.random.RandomState(0)
+    n_samples, n_components, n_features = 10, 3, 2
+    X = rng.rand(n_samples, n_features)
+
+    # Check correct init for a given value of mean_precision_prior
+    mean_precision_prior = rng.rand()
+    bgmm = BayesianGaussianMixture(
+        mean_precision_prior=mean_precision_prior, random_state=rng
+    ).fit(X)
+    assert_almost_equal(mean_precision_prior, bgmm.mean_precision_prior_)
+
+    # Check correct init for the default value of mean_precision_prior
+    bgmm = BayesianGaussianMixture(random_state=rng).fit(X)
+    assert_almost_equal(1.0, bgmm.mean_precision_prior_)
+
+    # Check correct init for a given value of mean_prior
+    mean_prior = rng.rand(n_features)
+    bgmm = BayesianGaussianMixture(
+        n_components=n_components, mean_prior=mean_prior, random_state=rng
+    ).fit(X)
+    assert_almost_equal(mean_prior, bgmm.mean_prior_)
+
+    # Check correct init for the default value of mean_prior
+    bgmm = BayesianGaussianMixture(n_components=n_components, random_state=rng).fit(X)
+    assert_almost_equal(X.mean(axis=0), bgmm.mean_prior_)
+
+
+def test_bayesian_mixture_precisions_prior_initialisation():
+    rng = np.random.RandomState(0)
+    n_samples, n_features = 10, 2
+    X = rng.rand(n_samples, n_features)
+
+    # Check the error message for a bad value of degrees_of_freedom_prior
+    bad_degrees_of_freedom_prior_ = n_features - 1.0
+    bgmm = BayesianGaussianMixture(
+        degrees_of_freedom_prior=bad_degrees_of_freedom_prior_, random_state=rng
+    )
+    msg = (
+        "The parameter 'degrees_of_freedom_prior' should be greater than"
+        f" {n_features -1}, but got {bad_degrees_of_freedom_prior_:.3f}."
+ ) + with pytest.raises(ValueError, match=msg): + bgmm.fit(X) + + # Check correct init for a given value of degrees_of_freedom_prior + degrees_of_freedom_prior = rng.rand() + n_features - 1.0 + bgmm = BayesianGaussianMixture( + degrees_of_freedom_prior=degrees_of_freedom_prior, random_state=rng + ).fit(X) + assert_almost_equal(degrees_of_freedom_prior, bgmm.degrees_of_freedom_prior_) + + # Check correct init for the default value of degrees_of_freedom_prior + degrees_of_freedom_prior_default = n_features + bgmm = BayesianGaussianMixture( + degrees_of_freedom_prior=degrees_of_freedom_prior_default, random_state=rng + ).fit(X) + assert_almost_equal( + degrees_of_freedom_prior_default, bgmm.degrees_of_freedom_prior_ + ) + + # Check correct init for a given value of covariance_prior + covariance_prior = { + "full": np.cov(X.T, bias=1) + 10, + "tied": np.cov(X.T, bias=1) + 5, + "diag": np.diag(np.atleast_2d(np.cov(X.T, bias=1))) + 3, + "spherical": rng.rand(), + } + + bgmm = BayesianGaussianMixture(random_state=rng) + for cov_type in ["full", "tied", "diag", "spherical"]: + bgmm.covariance_type = cov_type + bgmm.covariance_prior = covariance_prior[cov_type] + bgmm.fit(X) + assert_almost_equal(covariance_prior[cov_type], bgmm.covariance_prior_) + + # Check correct init for the default value of covariance_prior + covariance_prior_default = { + "full": np.atleast_2d(np.cov(X.T)), + "tied": np.atleast_2d(np.cov(X.T)), + "diag": np.var(X, axis=0, ddof=1), + "spherical": np.var(X, axis=0, ddof=1).mean(), + } + + bgmm = BayesianGaussianMixture(random_state=0) + for cov_type in ["full", "tied", "diag", "spherical"]: + bgmm.covariance_type = cov_type + bgmm.fit(X) + assert_almost_equal(covariance_prior_default[cov_type], bgmm.covariance_prior_) + + +def test_bayesian_mixture_check_is_fitted(): + rng = np.random.RandomState(0) + n_samples, n_features = 10, 2 + + # Check raise message + bgmm = BayesianGaussianMixture(random_state=rng) + X = rng.rand(n_samples, n_features) + + msg = "This BayesianGaussianMixture instance is not fitted yet." 
+    with pytest.raises(ValueError, match=msg):
+        bgmm.score(X)
+
+
+def test_bayesian_mixture_weights():
+    rng = np.random.RandomState(0)
+    n_samples, n_features = 10, 2
+
+    X = rng.rand(n_samples, n_features)
+
+    # Case Dirichlet distribution for the weight concentration prior type
+    bgmm = BayesianGaussianMixture(
+        weight_concentration_prior_type="dirichlet_distribution",
+        n_components=3,
+        random_state=rng,
+    ).fit(X)
+
+    expected_weights = bgmm.weight_concentration_ / np.sum(bgmm.weight_concentration_)
+    assert_almost_equal(expected_weights, bgmm.weights_)
+    assert_almost_equal(np.sum(bgmm.weights_), 1.0)
+
+    # Case Dirichlet process for the weight concentration prior type
+    dpgmm = BayesianGaussianMixture(
+        weight_concentration_prior_type="dirichlet_process",
+        n_components=3,
+        random_state=rng,
+    ).fit(X)
+    weight_dirichlet_sum = (
+        dpgmm.weight_concentration_[0] + dpgmm.weight_concentration_[1]
+    )
+    tmp = dpgmm.weight_concentration_[1] / weight_dirichlet_sum
+    expected_weights = (
+        dpgmm.weight_concentration_[0]
+        / weight_dirichlet_sum
+        * np.hstack((1, np.cumprod(tmp[:-1])))
+    )
+    expected_weights /= np.sum(expected_weights)
+    assert_almost_equal(expected_weights, dpgmm.weights_)
+    assert_almost_equal(np.sum(dpgmm.weights_), 1.0)
+
+
+@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
+def test_monotonic_likelihood():
+    # We check that each step of variational inference without regularization
+    # monotonically improves the lower bound on the training set
+    rng = np.random.RandomState(0)
+    rand_data = RandomData(rng, scale=20)
+    n_components = rand_data.n_components
+
+    for prior_type in PRIOR_TYPE:
+        for covar_type in COVARIANCE_TYPE:
+            X = rand_data.X[covar_type]
+            bgmm = BayesianGaussianMixture(
+                weight_concentration_prior_type=prior_type,
+                n_components=2 * n_components,
+                covariance_type=covar_type,
+                warm_start=True,
+                max_iter=1,
+                random_state=rng,
+                tol=1e-3,
+            )
+            current_lower_bound = -np.inf
+            # Do one training iteration at a time so we can make sure that the
+            # training log likelihood increases after each iteration.
+            for _ in range(600):
+                prev_lower_bound = current_lower_bound
+                current_lower_bound = bgmm.fit(X).lower_bound_
+                assert current_lower_bound >= prev_lower_bound
+
+                if bgmm.converged_:
+                    break
+            assert bgmm.converged_
+
+
+def test_compare_covar_type():
+    # We can compare the 'full' precision with the other cov_type if we apply
+    # 1 iter of the M-step (done during _initialize_parameters).
+ rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + X = rand_data.X["full"] + n_components = rand_data.n_components + + for prior_type in PRIOR_TYPE: + # Computation of the full_covariance + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type="full", + max_iter=1, + random_state=0, + tol=1e-7, + ) + bgmm._check_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + full_covariances = ( + bgmm.covariances_ * bgmm.degrees_of_freedom_[:, np.newaxis, np.newaxis] + ) + + # Check tied_covariance = mean(full_covariances, 0) + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type="tied", + max_iter=1, + random_state=0, + tol=1e-7, + ) + bgmm._check_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + + tied_covariance = bgmm.covariances_ * bgmm.degrees_of_freedom_ + assert_almost_equal(tied_covariance, np.mean(full_covariances, 0)) + + # Check diag_covariance = diag(full_covariances) + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type="diag", + max_iter=1, + random_state=0, + tol=1e-7, + ) + bgmm._check_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + + diag_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_[:, np.newaxis] + assert_almost_equal( + diag_covariances, np.array([np.diag(cov) for cov in full_covariances]) + ) + + # Check spherical_covariance = np.mean(diag_covariances, 0) + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type="spherical", + max_iter=1, + random_state=0, + tol=1e-7, + ) + bgmm._check_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + + spherical_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_ + assert_almost_equal(spherical_covariances, np.mean(diag_covariances, 1)) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_check_covariance_precision(): + # We check that the dot product of the covariance and the precision + # matrices is identity. 
+ rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + n_components, n_features = 2 * rand_data.n_components, 2 + + # Computation of the full_covariance + bgmm = BayesianGaussianMixture( + n_components=n_components, max_iter=100, random_state=rng, tol=1e-3, reg_covar=0 + ) + for covar_type in COVARIANCE_TYPE: + bgmm.covariance_type = covar_type + bgmm.fit(rand_data.X[covar_type]) + + if covar_type == "full": + for covar, precision in zip(bgmm.covariances_, bgmm.precisions_): + assert_almost_equal(np.dot(covar, precision), np.eye(n_features)) + elif covar_type == "tied": + assert_almost_equal( + np.dot(bgmm.covariances_, bgmm.precisions_), np.eye(n_features) + ) + + elif covar_type == "diag": + assert_almost_equal( + bgmm.covariances_ * bgmm.precisions_, + np.ones((n_components, n_features)), + ) + + else: + assert_almost_equal( + bgmm.covariances_ * bgmm.precisions_, np.ones(n_components) + ) + + +def test_invariant_translation(): + # We check here that adding a constant in the data change correctly the + # parameters of the mixture + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=100) + n_components = 2 * rand_data.n_components + + for prior_type in PRIOR_TYPE: + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + bgmm1 = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=n_components, + max_iter=100, + random_state=0, + tol=1e-3, + reg_covar=0, + ).fit(X) + bgmm2 = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=n_components, + max_iter=100, + random_state=0, + tol=1e-3, + reg_covar=0, + ).fit(X + 100) + + assert_almost_equal(bgmm1.means_, bgmm2.means_ - 100) + assert_almost_equal(bgmm1.weights_, bgmm2.weights_) + assert_almost_equal(bgmm1.covariances_, bgmm2.covariances_) + + +@pytest.mark.filterwarnings("ignore:.*did not converge.*") +@pytest.mark.parametrize( + "seed, max_iter, tol", + [ + (0, 2, 1e-7), # strict non-convergence + (1, 2, 1e-1), # loose non-convergence + (3, 300, 1e-7), # strict convergence + (4, 300, 1e-1), # loose convergence + ], +) +def test_bayesian_mixture_fit_predict(seed, max_iter, tol): + rng = np.random.RandomState(seed) + rand_data = RandomData(rng, n_samples=50, scale=7) + n_components = 2 * rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + bgmm1 = BayesianGaussianMixture( + n_components=n_components, + max_iter=max_iter, + random_state=rng, + tol=tol, + reg_covar=0, + ) + bgmm1.covariance_type = covar_type + bgmm2 = copy.deepcopy(bgmm1) + X = rand_data.X[covar_type] + + Y_pred1 = bgmm1.fit(X).predict(X) + Y_pred2 = bgmm2.fit_predict(X) + assert_array_equal(Y_pred1, Y_pred2) + + +def test_bayesian_mixture_fit_predict_n_init(): + # Check that fit_predict is equivalent to fit.predict, when n_init > 1 + X = np.random.RandomState(0).randn(50, 5) + gm = BayesianGaussianMixture(n_components=5, n_init=10, random_state=0) + y_pred1 = gm.fit_predict(X) + y_pred2 = gm.predict(X) + assert_array_equal(y_pred1, y_pred2) + + +def test_bayesian_mixture_predict_predict_proba(): + # this is the same test as test_gaussian_mixture_predict_predict_proba() + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + for prior_type in PRIOR_TYPE: + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + Y = rand_data.Y + bgmm = BayesianGaussianMixture( + n_components=rand_data.n_components, + random_state=rng, + weight_concentration_prior_type=prior_type, + covariance_type=covar_type, + ) + + # Check a warning message arrive 
if we don't do fit + msg = ( + "This BayesianGaussianMixture instance is not fitted yet. " + "Call 'fit' with appropriate arguments before using this " + "estimator." + ) + with pytest.raises(NotFittedError, match=msg): + bgmm.predict(X) + + bgmm.fit(X) + Y_pred = bgmm.predict(X) + Y_pred_proba = bgmm.predict_proba(X).argmax(axis=1) + assert_array_equal(Y_pred, Y_pred_proba) + assert adjusted_rand_score(Y, Y_pred) >= 0.95 diff --git a/.venv/Lib/site-packages/sklearn/mixture/tests/test_gaussian_mixture.py b/.venv/Lib/site-packages/sklearn/mixture/tests/test_gaussian_mixture.py new file mode 100644 index 0000000000000000000000000000000000000000..aa678797c82d75b3b5b0c036d4138c143f2d8325 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/mixture/tests/test_gaussian_mixture.py @@ -0,0 +1,1420 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import copy +import itertools +import re +import sys +import warnings +from io import StringIO +from unittest.mock import Mock + +import numpy as np +import pytest +from scipy import linalg, stats + +import sklearn +from sklearn.cluster import KMeans +from sklearn.covariance import EmpiricalCovariance +from sklearn.datasets import make_spd_matrix +from sklearn.exceptions import ConvergenceWarning, NotFittedError +from sklearn.metrics.cluster import adjusted_rand_score +from sklearn.mixture import GaussianMixture +from sklearn.mixture._gaussian_mixture import ( + _compute_log_det_cholesky, + _compute_precision_cholesky, + _estimate_gaussian_covariances_diag, + _estimate_gaussian_covariances_full, + _estimate_gaussian_covariances_spherical, + _estimate_gaussian_covariances_tied, + _estimate_gaussian_parameters, +) +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.extmath import fast_logdet + +COVARIANCE_TYPE = ["full", "tied", "diag", "spherical"] + + +def generate_data(n_samples, n_features, weights, means, precisions, covariance_type): + rng = np.random.RandomState(0) + + X = [] + if covariance_type == "spherical": + for _, (w, m, c) in enumerate(zip(weights, means, precisions["spherical"])): + X.append( + rng.multivariate_normal( + m, c * np.eye(n_features), int(np.round(w * n_samples)) + ) + ) + if covariance_type == "diag": + for _, (w, m, c) in enumerate(zip(weights, means, precisions["diag"])): + X.append( + rng.multivariate_normal(m, np.diag(c), int(np.round(w * n_samples))) + ) + if covariance_type == "tied": + for _, (w, m) in enumerate(zip(weights, means)): + X.append( + rng.multivariate_normal( + m, precisions["tied"], int(np.round(w * n_samples)) + ) + ) + if covariance_type == "full": + for _, (w, m, c) in enumerate(zip(weights, means, precisions["full"])): + X.append(rng.multivariate_normal(m, c, int(np.round(w * n_samples)))) + + X = np.vstack(X) + return X + + +class RandomData: + def __init__(self, rng, n_samples=200, n_components=2, n_features=2, scale=50): + self.n_samples = n_samples + self.n_components = n_components + self.n_features = n_features + + self.weights = rng.rand(n_components) + self.weights = self.weights / self.weights.sum() + self.means = rng.rand(n_components, n_features) * scale + self.covariances = { + "spherical": 0.5 + rng.rand(n_components), + "diag": (0.5 + rng.rand(n_components, n_features)) ** 2, + "tied": make_spd_matrix(n_features, random_state=rng), + "full": np.array( + [ + make_spd_matrix(n_features, random_state=rng) * 0.5 + for _ in range(n_components) + ] + ), + } + 
self.precisions = { + "spherical": 1.0 / self.covariances["spherical"], + "diag": 1.0 / self.covariances["diag"], + "tied": linalg.inv(self.covariances["tied"]), + "full": np.array( + [linalg.inv(covariance) for covariance in self.covariances["full"]] + ), + } + + self.X = dict( + zip( + COVARIANCE_TYPE, + [ + generate_data( + n_samples, + n_features, + self.weights, + self.means, + self.covariances, + covar_type, + ) + for covar_type in COVARIANCE_TYPE + ], + ) + ) + self.Y = np.hstack( + [ + np.full(int(np.round(w * n_samples)), k, dtype=int) + for k, w in enumerate(self.weights) + ] + ) + + +def test_gaussian_mixture_attributes(): + # test bad parameters + rng = np.random.RandomState(0) + X = rng.rand(10, 2) + + # test good parameters + n_components, tol, n_init, max_iter, reg_covar = 2, 1e-4, 3, 30, 1e-1 + covariance_type, init_params = "full", "random" + gmm = GaussianMixture( + n_components=n_components, + tol=tol, + n_init=n_init, + max_iter=max_iter, + reg_covar=reg_covar, + covariance_type=covariance_type, + init_params=init_params, + ).fit(X) + + assert gmm.n_components == n_components + assert gmm.covariance_type == covariance_type + assert gmm.tol == tol + assert gmm.reg_covar == reg_covar + assert gmm.max_iter == max_iter + assert gmm.n_init == n_init + assert gmm.init_params == init_params + + +def test_check_weights(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + + n_components = rand_data.n_components + X = rand_data.X["full"] + + g = GaussianMixture(n_components=n_components) + + # Check bad shape + weights_bad_shape = rng.rand(n_components, 1) + g.weights_init = weights_bad_shape + msg = re.escape( + "The parameter 'weights' should have the shape of " + f"({n_components},), but got {str(weights_bad_shape.shape)}" + ) + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check bad range + weights_bad_range = rng.rand(n_components) + 1 + g.weights_init = weights_bad_range + msg = re.escape( + "The parameter 'weights' should be in the range [0, 1], but got" + f" max value {np.min(weights_bad_range):.5f}, " + f"min value {np.max(weights_bad_range):.5f}" + ) + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check bad normalization + weights_bad_norm = rng.rand(n_components) + weights_bad_norm = weights_bad_norm / (weights_bad_norm.sum() + 1) + g.weights_init = weights_bad_norm + msg = re.escape( + "The parameter 'weights' should be normalized, " + f"but got sum(weights) = {np.sum(weights_bad_norm):.5f}" + ) + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check good weights matrix + weights = rand_data.weights + g = GaussianMixture(weights_init=weights, n_components=n_components) + g.fit(X) + assert_array_equal(weights, g.weights_init) + + +def test_check_means(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + + n_components, n_features = rand_data.n_components, rand_data.n_features + X = rand_data.X["full"] + + g = GaussianMixture(n_components=n_components) + + # Check means bad shape + means_bad_shape = rng.rand(n_components + 1, n_features) + g.means_init = means_bad_shape + msg = "The parameter 'means' should have the shape of " + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check good means matrix + means = rand_data.means + g.means_init = means + g.fit(X) + assert_array_equal(means, g.means_init) + + +def test_check_precisions(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + + n_components, n_features = rand_data.n_components, rand_data.n_features + + # Define the bad 
precisions for each covariance_type + precisions_bad_shape = { + "full": np.ones((n_components + 1, n_features, n_features)), + "tied": np.ones((n_features + 1, n_features + 1)), + "diag": np.ones((n_components + 1, n_features)), + "spherical": np.ones((n_components + 1)), + } + + # Define not positive-definite precisions + precisions_not_pos = np.ones((n_components, n_features, n_features)) + precisions_not_pos[0] = np.eye(n_features) + precisions_not_pos[0, 0, 0] = -1.0 + + precisions_not_positive = { + "full": precisions_not_pos, + "tied": precisions_not_pos[0], + "diag": np.full((n_components, n_features), -1.0), + "spherical": np.full(n_components, -1.0), + } + + not_positive_errors = { + "full": "symmetric, positive-definite", + "tied": "symmetric, positive-definite", + "diag": "positive", + "spherical": "positive", + } + + for covar_type in COVARIANCE_TYPE: + X = RandomData(rng).X[covar_type] + g = GaussianMixture( + n_components=n_components, covariance_type=covar_type, random_state=rng + ) + + # Check precisions with bad shapes + g.precisions_init = precisions_bad_shape[covar_type] + msg = f"The parameter '{covar_type} precision' should have the shape of" + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check not positive precisions + g.precisions_init = precisions_not_positive[covar_type] + msg = f"'{covar_type} precision' should be {not_positive_errors[covar_type]}" + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check the correct init of precisions_init + g.precisions_init = rand_data.precisions[covar_type] + g.fit(X) + assert_array_equal(rand_data.precisions[covar_type], g.precisions_init) + + +def test_suffstat_sk_full(): + # compare the precision matrix compute from the + # EmpiricalCovariance.covariance fitted on X*sqrt(resp) + # with _sufficient_sk_full, n_components=1 + rng = np.random.RandomState(0) + n_samples, n_features = 500, 2 + + # special case 1, assuming data is "centered" + X = rng.rand(n_samples, n_features) + resp = rng.rand(n_samples, 1) + X_resp = np.sqrt(resp) * X + nk = np.array([n_samples]) + xk = np.zeros((1, n_features)) + covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) + ecov = EmpiricalCovariance(assume_centered=True) + ecov.fit(X_resp) + assert_almost_equal(ecov.error_norm(covars_pred[0], norm="frobenius"), 0) + assert_almost_equal(ecov.error_norm(covars_pred[0], norm="spectral"), 0) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred, "full") + precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred]) + precs_est = np.array([linalg.inv(cov) for cov in covars_pred]) + assert_array_almost_equal(precs_est, precs_pred) + + # special case 2, assuming resp are all ones + resp = np.ones((n_samples, 1)) + nk = np.array([n_samples]) + xk = X.mean(axis=0).reshape((1, -1)) + covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) + ecov = EmpiricalCovariance(assume_centered=False) + ecov.fit(X) + assert_almost_equal(ecov.error_norm(covars_pred[0], norm="frobenius"), 0) + assert_almost_equal(ecov.error_norm(covars_pred[0], norm="spectral"), 0) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred, "full") + precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred]) + precs_est = np.array([linalg.inv(cov) for cov in covars_pred]) + assert_array_almost_equal(precs_est, precs_pred) + + +def test_suffstat_sk_tied(): + # use equation Nk * Sk / N = S_tied + rng = 
np.random.RandomState(0) + n_samples, n_features, n_components = 500, 2, 2 + + resp = rng.rand(n_samples, n_components) + resp = resp / resp.sum(axis=1)[:, np.newaxis] + X = rng.rand(n_samples, n_features) + nk = resp.sum(axis=0) + xk = np.dot(resp.T, X) / nk[:, np.newaxis] + + covars_pred_full = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) + covars_pred_full = ( + np.sum(nk[:, np.newaxis, np.newaxis] * covars_pred_full, 0) / n_samples + ) + + covars_pred_tied = _estimate_gaussian_covariances_tied(resp, X, nk, xk, 0) + + ecov = EmpiricalCovariance() + ecov.covariance_ = covars_pred_full + assert_almost_equal(ecov.error_norm(covars_pred_tied, norm="frobenius"), 0) + assert_almost_equal(ecov.error_norm(covars_pred_tied, norm="spectral"), 0) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred_tied, "tied") + precs_pred = np.dot(precs_chol_pred, precs_chol_pred.T) + precs_est = linalg.inv(covars_pred_tied) + assert_array_almost_equal(precs_est, precs_pred) + + +def test_suffstat_sk_diag(): + # test against 'full' case + rng = np.random.RandomState(0) + n_samples, n_features, n_components = 500, 2, 2 + + resp = rng.rand(n_samples, n_components) + resp = resp / resp.sum(axis=1)[:, np.newaxis] + X = rng.rand(n_samples, n_features) + nk = resp.sum(axis=0) + xk = np.dot(resp.T, X) / nk[:, np.newaxis] + covars_pred_full = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) + covars_pred_diag = _estimate_gaussian_covariances_diag(resp, X, nk, xk, 0) + + ecov = EmpiricalCovariance() + for cov_full, cov_diag in zip(covars_pred_full, covars_pred_diag): + ecov.covariance_ = np.diag(np.diag(cov_full)) + cov_diag = np.diag(cov_diag) + assert_almost_equal(ecov.error_norm(cov_diag, norm="frobenius"), 0) + assert_almost_equal(ecov.error_norm(cov_diag, norm="spectral"), 0) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred_diag, "diag") + assert_almost_equal(covars_pred_diag, 1.0 / precs_chol_pred**2) + + +def test_gaussian_suffstat_sk_spherical(): + # computing spherical covariance equals to the variance of one-dimension + # data after flattening, n_components=1 + rng = np.random.RandomState(0) + n_samples, n_features = 500, 2 + + X = rng.rand(n_samples, n_features) + X = X - X.mean() + resp = np.ones((n_samples, 1)) + nk = np.array([n_samples]) + xk = X.mean() + covars_pred_spherical = _estimate_gaussian_covariances_spherical(resp, X, nk, xk, 0) + covars_pred_spherical2 = np.dot(X.flatten().T, X.flatten()) / ( + n_features * n_samples + ) + assert_almost_equal(covars_pred_spherical, covars_pred_spherical2) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred_spherical, "spherical") + assert_almost_equal(covars_pred_spherical, 1.0 / precs_chol_pred**2) + + +def test_compute_log_det_cholesky(): + n_features = 2 + rand_data = RandomData(np.random.RandomState(0)) + + for covar_type in COVARIANCE_TYPE: + covariance = rand_data.covariances[covar_type] + + if covar_type == "full": + predected_det = np.array([linalg.det(cov) for cov in covariance]) + elif covar_type == "tied": + predected_det = linalg.det(covariance) + elif covar_type == "diag": + predected_det = np.array([np.prod(cov) for cov in covariance]) + elif covar_type == "spherical": + predected_det = covariance**n_features + + # We compute the cholesky decomposition of the covariance matrix + expected_det = _compute_log_det_cholesky( + _compute_precision_cholesky(covariance, covar_type), + covar_type, + 
n_features=n_features, + ) + assert_array_almost_equal(expected_det, -0.5 * np.log(predected_det)) + + +def _naive_lmvnpdf_diag(X, means, covars): + resp = np.empty((len(X), len(means))) + stds = np.sqrt(covars) + for i, (mean, std) in enumerate(zip(means, stds)): + resp[:, i] = stats.norm.logpdf(X, mean, std).sum(axis=1) + return resp + + +def test_gaussian_mixture_log_probabilities(): + from sklearn.mixture._gaussian_mixture import _estimate_log_gaussian_prob + + # test against with _naive_lmvnpdf_diag + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_samples = 500 + n_features = rand_data.n_features + n_components = rand_data.n_components + + means = rand_data.means + covars_diag = rng.rand(n_components, n_features) + X = rng.rand(n_samples, n_features) + log_prob_naive = _naive_lmvnpdf_diag(X, means, covars_diag) + + # full covariances + precs_full = np.array([np.diag(1.0 / np.sqrt(x)) for x in covars_diag]) + + log_prob = _estimate_log_gaussian_prob(X, means, precs_full, "full") + assert_array_almost_equal(log_prob, log_prob_naive) + + # diag covariances + precs_chol_diag = 1.0 / np.sqrt(covars_diag) + log_prob = _estimate_log_gaussian_prob(X, means, precs_chol_diag, "diag") + assert_array_almost_equal(log_prob, log_prob_naive) + + # tied + covars_tied = np.array([x for x in covars_diag]).mean(axis=0) + precs_tied = np.diag(np.sqrt(1.0 / covars_tied)) + + log_prob_naive = _naive_lmvnpdf_diag(X, means, [covars_tied] * n_components) + log_prob = _estimate_log_gaussian_prob(X, means, precs_tied, "tied") + + assert_array_almost_equal(log_prob, log_prob_naive) + + # spherical + covars_spherical = covars_diag.mean(axis=1) + precs_spherical = 1.0 / np.sqrt(covars_diag.mean(axis=1)) + log_prob_naive = _naive_lmvnpdf_diag( + X, means, [[k] * n_features for k in covars_spherical] + ) + log_prob = _estimate_log_gaussian_prob(X, means, precs_spherical, "spherical") + assert_array_almost_equal(log_prob, log_prob_naive) + + +# skip tests on weighted_log_probabilities, log_weights + + +def test_gaussian_mixture_estimate_log_prob_resp(): + # test whether responsibilities are normalized + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=5) + n_samples = rand_data.n_samples + n_features = rand_data.n_features + n_components = rand_data.n_components + + X = rng.rand(n_samples, n_features) + for covar_type in COVARIANCE_TYPE: + weights = rand_data.weights + means = rand_data.means + precisions = rand_data.precisions[covar_type] + g = GaussianMixture( + n_components=n_components, + random_state=rng, + weights_init=weights, + means_init=means, + precisions_init=precisions, + covariance_type=covar_type, + ) + g.fit(X) + resp = g.predict_proba(X) + assert_array_almost_equal(resp.sum(axis=1), np.ones(n_samples)) + assert_array_equal(g.weights_init, weights) + assert_array_equal(g.means_init, means) + assert_array_equal(g.precisions_init, precisions) + + +def test_gaussian_mixture_predict_predict_proba(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + Y = rand_data.Y + g = GaussianMixture( + n_components=rand_data.n_components, + random_state=rng, + weights_init=rand_data.weights, + means_init=rand_data.means, + precisions_init=rand_data.precisions[covar_type], + covariance_type=covar_type, + ) + + # Check a warning message arrive if we don't do fit + msg = ( + "This GaussianMixture instance is not fitted yet. Call 'fit' " + "with appropriate arguments before using this estimator." 
+ ) + with pytest.raises(NotFittedError, match=msg): + g.predict(X) + + g.fit(X) + Y_pred = g.predict(X) + Y_pred_proba = g.predict_proba(X).argmax(axis=1) + assert_array_equal(Y_pred, Y_pred_proba) + assert adjusted_rand_score(Y, Y_pred) > 0.95 + + +@pytest.mark.filterwarnings("ignore:.*did not converge.*") +@pytest.mark.parametrize( + "seed, max_iter, tol", + [ + (0, 2, 1e-7), # strict non-convergence + (1, 2, 1e-1), # loose non-convergence + (3, 300, 1e-7), # strict convergence + (4, 300, 1e-1), # loose convergence + ], +) +def test_gaussian_mixture_fit_predict(seed, max_iter, tol): + rng = np.random.RandomState(seed) + rand_data = RandomData(rng) + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + Y = rand_data.Y + g = GaussianMixture( + n_components=rand_data.n_components, + random_state=rng, + weights_init=rand_data.weights, + means_init=rand_data.means, + precisions_init=rand_data.precisions[covar_type], + covariance_type=covar_type, + max_iter=max_iter, + tol=tol, + ) + + # check if fit_predict(X) is equivalent to fit(X).predict(X) + f = copy.deepcopy(g) + Y_pred1 = f.fit(X).predict(X) + Y_pred2 = g.fit_predict(X) + assert_array_equal(Y_pred1, Y_pred2) + assert adjusted_rand_score(Y, Y_pred2) > 0.95 + + +def test_gaussian_mixture_fit_predict_n_init(): + # Check that fit_predict is equivalent to fit.predict, when n_init > 1 + X = np.random.RandomState(0).randn(1000, 5) + gm = GaussianMixture(n_components=5, n_init=5, random_state=0) + y_pred1 = gm.fit_predict(X) + y_pred2 = gm.predict(X) + assert_array_equal(y_pred1, y_pred2) + + +def test_gaussian_mixture_fit(): + # recover the ground truth + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_features = rand_data.n_features + n_components = rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + g = GaussianMixture( + n_components=n_components, + n_init=20, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + g.fit(X) + + # needs more data to pass the test with rtol=1e-7 + assert_allclose( + np.sort(g.weights_), np.sort(rand_data.weights), rtol=0.1, atol=1e-2 + ) + + arg_idx1 = g.means_[:, 0].argsort() + arg_idx2 = rand_data.means[:, 0].argsort() + assert_allclose( + g.means_[arg_idx1], rand_data.means[arg_idx2], rtol=0.1, atol=1e-2 + ) + + if covar_type == "full": + prec_pred = g.precisions_ + prec_test = rand_data.precisions["full"] + elif covar_type == "tied": + prec_pred = np.array([g.precisions_] * n_components) + prec_test = np.array([rand_data.precisions["tied"]] * n_components) + elif covar_type == "spherical": + prec_pred = np.array([np.eye(n_features) * c for c in g.precisions_]) + prec_test = np.array( + [np.eye(n_features) * c for c in rand_data.precisions["spherical"]] + ) + elif covar_type == "diag": + prec_pred = np.array([np.diag(d) for d in g.precisions_]) + prec_test = np.array([np.diag(d) for d in rand_data.precisions["diag"]]) + + arg_idx1 = np.trace(prec_pred, axis1=1, axis2=2).argsort() + arg_idx2 = np.trace(prec_test, axis1=1, axis2=2).argsort() + for k, h in zip(arg_idx1, arg_idx2): + ecov = EmpiricalCovariance() + ecov.covariance_ = prec_test[h] + # the accuracy depends on the number of data and randomness, rng + assert_allclose(ecov.error_norm(prec_pred[k]), 0, atol=0.15) + + +def test_gaussian_mixture_fit_best_params(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_components = rand_data.n_components + n_init = 10 + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + g = 
GaussianMixture(
+            n_components=n_components,
+            n_init=1,
+            reg_covar=0,
+            random_state=rng,
+            covariance_type=covar_type,
+        )
+        ll = []
+        for _ in range(n_init):
+            g.fit(X)
+            ll.append(g.score(X))
+        ll = np.array(ll)
+        g_best = GaussianMixture(
+            n_components=n_components,
+            n_init=n_init,
+            reg_covar=0,
+            random_state=rng,
+            covariance_type=covar_type,
+        )
+        g_best.fit(X)
+        assert_almost_equal(ll.min(), g_best.score(X))
+
+
+def test_gaussian_mixture_fit_convergence_warning():
+    rng = np.random.RandomState(0)
+    rand_data = RandomData(rng, scale=1)
+    n_components = rand_data.n_components
+    max_iter = 1
+    for covar_type in COVARIANCE_TYPE:
+        X = rand_data.X[covar_type]
+        g = GaussianMixture(
+            n_components=n_components,
+            n_init=1,
+            max_iter=max_iter,
+            reg_covar=0,
+            random_state=rng,
+            covariance_type=covar_type,
+        )
+        msg = (
+            "Best performing initialization did not converge. "
+            "Try different init parameters, or increase max_iter, "
+            "tol, or check for degenerate data."
+        )
+        with pytest.warns(ConvergenceWarning, match=msg):
+            g.fit(X)
+
+
+def test_multiple_init():
+    # Test that multiple inits do not perform much worse than a single one
+    rng = np.random.RandomState(0)
+    n_samples, n_features, n_components = 50, 5, 2
+    X = rng.randn(n_samples, n_features)
+    for cv_type in COVARIANCE_TYPE:
+        train1 = (
+            GaussianMixture(
+                n_components=n_components, covariance_type=cv_type, random_state=0
+            )
+            .fit(X)
+            .score(X)
+        )
+        train2 = (
+            GaussianMixture(
+                n_components=n_components,
+                covariance_type=cv_type,
+                random_state=0,
+                n_init=5,
+            )
+            .fit(X)
+            .score(X)
+        )
+        assert train2 >= train1
+
+
+def test_gaussian_mixture_n_parameters():
+    # Test that the right number of parameters is estimated
+    rng = np.random.RandomState(0)
+    n_samples, n_features, n_components = 50, 5, 2
+    X = rng.randn(n_samples, n_features)
+    n_params = {"spherical": 13, "diag": 21, "tied": 26, "full": 41}
+    for cv_type in COVARIANCE_TYPE:
+        g = GaussianMixture(
+            n_components=n_components, covariance_type=cv_type, random_state=rng
+        ).fit(X)
+        assert g._n_parameters() == n_params[cv_type]
+
+
+def test_bic_1d_1component():
+    # Test all of the covariance_types return the same BIC score for
+    # 1-dimensional, 1 component fits.
+ rng = np.random.RandomState(0) + n_samples, n_dim, n_components = 100, 1, 1 + X = rng.randn(n_samples, n_dim) + bic_full = ( + GaussianMixture( + n_components=n_components, covariance_type="full", random_state=rng + ) + .fit(X) + .bic(X) + ) + for covariance_type in ["tied", "diag", "spherical"]: + bic = ( + GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + random_state=rng, + ) + .fit(X) + .bic(X) + ) + assert_almost_equal(bic_full, bic) + + +def test_gaussian_mixture_aic_bic(): + # Test the aic and bic criteria + rng = np.random.RandomState(0) + n_samples, n_features, n_components = 50, 3, 2 + X = rng.randn(n_samples, n_features) + # standard gaussian entropy + sgh = 0.5 * ( + fast_logdet(np.cov(X.T, bias=1)) + n_features * (1 + np.log(2 * np.pi)) + ) + for cv_type in COVARIANCE_TYPE: + g = GaussianMixture( + n_components=n_components, + covariance_type=cv_type, + random_state=rng, + max_iter=200, + ) + g.fit(X) + aic = 2 * n_samples * sgh + 2 * g._n_parameters() + bic = 2 * n_samples * sgh + np.log(n_samples) * g._n_parameters() + bound = n_features / np.sqrt(n_samples) + assert (g.aic(X) - aic) / n_samples < bound + assert (g.bic(X) - bic) / n_samples < bound + + +def test_gaussian_mixture_verbose(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_components = rand_data.n_components + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + g = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + verbose=1, + ) + h = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + verbose=2, + ) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + g.fit(X) + h.fit(X) + finally: + sys.stdout = old_stdout + + +@pytest.mark.filterwarnings("ignore:.*did not converge.*") +@pytest.mark.parametrize("seed", (0, 1, 2)) +def test_warm_start(seed): + random_state = seed + rng = np.random.RandomState(random_state) + n_samples, n_features, n_components = 500, 2, 2 + X = rng.rand(n_samples, n_features) + + # Assert the warm_start give the same result for the same number of iter + g = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=2, + reg_covar=0, + random_state=random_state, + warm_start=False, + ) + h = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=1, + reg_covar=0, + random_state=random_state, + warm_start=True, + ) + + g.fit(X) + score1 = h.fit(X).score(X) + score2 = h.fit(X).score(X) + + assert_almost_equal(g.weights_, h.weights_) + assert_almost_equal(g.means_, h.means_) + assert_almost_equal(g.precisions_, h.precisions_) + assert score2 > score1 + + # Assert that by using warm_start we can converge to a good solution + g = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=5, + reg_covar=0, + random_state=random_state, + warm_start=False, + tol=1e-6, + ) + h = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=5, + reg_covar=0, + random_state=random_state, + warm_start=True, + tol=1e-6, + ) + + g.fit(X) + assert not g.converged_ + + h.fit(X) + # depending on the data there is large variability in the number of + # refit necessary to converge due to the complete randomness of the + # data + for _ in range(1000): + h.fit(X) + if h.converged_: + break + assert h.converged_ + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_convergence_detected_with_warm_start(): + # We check 
that convergence is detected when warm_start=True + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_components = rand_data.n_components + X = rand_data.X["full"] + + for max_iter in (1, 2, 50): + gmm = GaussianMixture( + n_components=n_components, + warm_start=True, + max_iter=max_iter, + random_state=rng, + ) + for _ in range(100): + gmm.fit(X) + if gmm.converged_: + break + assert gmm.converged_ + assert max_iter >= gmm.n_iter_ + + +def test_score(): + covar_type = "full" + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + n_components = rand_data.n_components + X = rand_data.X[covar_type] + + # Check the error message if we don't call fit + gmm1 = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + msg = ( + "This GaussianMixture instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this estimator." + ) + with pytest.raises(NotFittedError, match=msg): + gmm1.score(X) + + # Check score value + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + gmm1.fit(X) + gmm_score = gmm1.score(X) + gmm_score_proba = gmm1.score_samples(X).mean() + assert_almost_equal(gmm_score, gmm_score_proba) + + # Check if the score increase + gmm2 = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ).fit(X) + assert gmm2.score(X) > gmm1.score(X) + + +def test_score_samples(): + covar_type = "full" + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + n_components = rand_data.n_components + X = rand_data.X[covar_type] + + # Check the error message if we don't call fit + gmm = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + msg = ( + "This GaussianMixture instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this estimator." + ) + with pytest.raises(NotFittedError, match=msg): + gmm.score_samples(X) + + gmm_score_samples = gmm.fit(X).score_samples(X) + assert gmm_score_samples.shape[0] == rand_data.n_samples + + +def test_monotonic_likelihood(): + # We check that each step of the EM without regularization improve + # monotonically the training set likelihood + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + n_components = rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + gmm = GaussianMixture( + n_components=n_components, + covariance_type=covar_type, + reg_covar=0, + warm_start=True, + max_iter=1, + random_state=rng, + tol=1e-7, + ) + current_log_likelihood = -np.inf + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + # Do one training iteration at a time so we can make sure that the + # training log likelihood increases after each iteration. + for _ in range(600): + prev_log_likelihood = current_log_likelihood + current_log_likelihood = gmm.fit(X).score(X) + assert current_log_likelihood >= prev_log_likelihood + + if gmm.converged_: + break + + assert gmm.converged_ + + +def test_regularisation(): + # We train the GaussianMixture on degenerate data by defining two clusters + # of a 0 covariance. 
+ rng = np.random.RandomState(0) + n_samples, n_features = 10, 5 + + X = np.vstack( + (np.ones((n_samples // 2, n_features)), np.zeros((n_samples // 2, n_features))) + ) + + for covar_type in COVARIANCE_TYPE: + gmm = GaussianMixture( + n_components=n_samples, + reg_covar=0, + covariance_type=covar_type, + random_state=rng, + ) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + msg = re.escape( + "Fitting the mixture model failed because some components have" + " ill-defined empirical covariance (for instance caused by " + "singleton or collapsed samples). Try to decrease the number " + "of components, or increase reg_covar." + ) + with pytest.raises(ValueError, match=msg): + gmm.fit(X) + + gmm.set_params(reg_covar=1e-6).fit(X) + + +def test_property(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + n_components = rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + gmm = GaussianMixture( + n_components=n_components, + covariance_type=covar_type, + random_state=rng, + n_init=5, + ) + gmm.fit(X) + if covar_type == "full": + for prec, covar in zip(gmm.precisions_, gmm.covariances_): + assert_array_almost_equal(linalg.inv(prec), covar) + elif covar_type == "tied": + assert_array_almost_equal(linalg.inv(gmm.precisions_), gmm.covariances_) + else: + assert_array_almost_equal(gmm.precisions_, 1.0 / gmm.covariances_) + + +def test_sample(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7, n_components=3) + n_features, n_components = rand_data.n_features, rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + + gmm = GaussianMixture( + n_components=n_components, covariance_type=covar_type, random_state=rng + ) + # To sample we need that GaussianMixture is fitted + msg = "This GaussianMixture instance is not fitted" + with pytest.raises(NotFittedError, match=msg): + gmm.sample(0) + gmm.fit(X) + + msg = "Invalid value for 'n_samples'" + with pytest.raises(ValueError, match=msg): + gmm.sample(0) + + # Just to make sure the class samples correctly + n_samples = 20000 + X_s, y_s = gmm.sample(n_samples) + + for k in range(n_components): + if covar_type == "full": + assert_array_almost_equal( + gmm.covariances_[k], np.cov(X_s[y_s == k].T), decimal=1 + ) + elif covar_type == "tied": + assert_array_almost_equal( + gmm.covariances_, np.cov(X_s[y_s == k].T), decimal=1 + ) + elif covar_type == "diag": + assert_array_almost_equal( + gmm.covariances_[k], np.diag(np.cov(X_s[y_s == k].T)), decimal=1 + ) + else: + assert_array_almost_equal( + gmm.covariances_[k], + np.var(X_s[y_s == k] - gmm.means_[k]), + decimal=1, + ) + + means_s = np.array([np.mean(X_s[y_s == k], 0) for k in range(n_components)]) + assert_array_almost_equal(gmm.means_, means_s, decimal=1) + + # Check shapes of sampled data, see + # https://github.com/scikit-learn/scikit-learn/issues/7701 + assert X_s.shape == (n_samples, n_features) + + for sample_size in range(1, 100): + X_s, _ = gmm.sample(sample_size) + assert X_s.shape == (sample_size, n_features) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_init(): + # We check that by increasing the n_init number we have a better solution + for random_state in range(15): + rand_data = RandomData( + np.random.RandomState(random_state), n_samples=50, scale=1 + ) + n_components = rand_data.n_components + X = rand_data.X["full"] + + gmm1 = GaussianMixture( + n_components=n_components, n_init=1, 
max_iter=1, random_state=random_state + ).fit(X) + gmm2 = GaussianMixture( + n_components=n_components, n_init=10, max_iter=1, random_state=random_state + ).fit(X) + + assert gmm2.lower_bound_ >= gmm1.lower_bound_ + + +def test_gaussian_mixture_setting_best_params(): + """`GaussianMixture`'s best_parameters, `n_iter_` and `lower_bound_` + must be set appropriately in the case of divergence. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/18216 + """ + rnd = np.random.RandomState(0) + n_samples = 30 + X = rnd.uniform(size=(n_samples, 3)) + + # following initialization parameters were found to lead to divergence + means_init = np.array( + [ + [0.670637869618158, 0.21038256107384043, 0.12892629765485303], + [0.09394051075844147, 0.5759464955561779, 0.929296197576212], + [0.5033230372781258, 0.9569852381759425, 0.08654043447295741], + [0.18578301420435747, 0.5531158970919143, 0.19388943970532435], + [0.4548589928173794, 0.35182513658825276, 0.568146063202464], + [0.609279894978321, 0.7929063819678847, 0.9620097270828052], + ] + ) + precisions_init = np.array( + [ + 999999.999604483, + 999999.9990869573, + 553.7603944542167, + 204.78596008931834, + 15.867423501783637, + 85.4595728389735, + ] + ) + weights_init = [ + 0.03333333333333341, + 0.03333333333333341, + 0.06666666666666674, + 0.06666666666666674, + 0.7000000000000001, + 0.10000000000000007, + ] + + gmm = GaussianMixture( + covariance_type="spherical", + reg_covar=0, + means_init=means_init, + weights_init=weights_init, + random_state=rnd, + n_components=len(weights_init), + precisions_init=precisions_init, + max_iter=1, + ) + # ensure that no error is thrown during fit + gmm.fit(X) + + # check that the fit did not converge + assert not gmm.converged_ + + # check that parameters are set for gmm + for attr in [ + "weights_", + "means_", + "covariances_", + "precisions_cholesky_", + "n_iter_", + "lower_bound_", + ]: + assert hasattr(gmm, attr) + + +@pytest.mark.parametrize( + "init_params", ["random", "random_from_data", "k-means++", "kmeans"] +) +def test_init_means_not_duplicated(init_params, global_random_seed): + # Check that all initialisations provide not duplicated starting means + rng = np.random.RandomState(global_random_seed) + rand_data = RandomData(rng, scale=5) + n_components = rand_data.n_components + X = rand_data.X["full"] + + gmm = GaussianMixture( + n_components=n_components, init_params=init_params, random_state=rng, max_iter=0 + ) + gmm.fit(X) + + means = gmm.means_ + for i_mean, j_mean in itertools.combinations(means, r=2): + assert not np.allclose(i_mean, j_mean) + + +@pytest.mark.parametrize( + "init_params", ["random", "random_from_data", "k-means++", "kmeans"] +) +def test_means_for_all_inits(init_params, global_random_seed): + # Check fitted means properties for all initializations + rng = np.random.RandomState(global_random_seed) + rand_data = RandomData(rng, scale=5) + n_components = rand_data.n_components + X = rand_data.X["full"] + + gmm = GaussianMixture( + n_components=n_components, init_params=init_params, random_state=rng + ) + gmm.fit(X) + + assert gmm.means_.shape == (n_components, X.shape[1]) + assert np.all(X.min(axis=0) <= gmm.means_) + assert np.all(gmm.means_ <= X.max(axis=0)) + assert gmm.converged_ + + +def test_max_iter_zero(): + # Check that max_iter=0 returns initialisation as expected + # Pick arbitrary initial means and check equal to max_iter=0 + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=5) + n_components = rand_data.n_components + 
X = rand_data.X["full"] + means_init = [[20, 30], [30, 25]] + gmm = GaussianMixture( + n_components=n_components, + random_state=rng, + means_init=means_init, + tol=1e-06, + max_iter=0, + ) + gmm.fit(X) + + assert_allclose(gmm.means_, means_init) + + +def test_gaussian_mixture_precisions_init_diag(): + """Check that we properly initialize `precision_cholesky_` when we manually + provide the precision matrix. + + In this regard, we check the consistency between estimating the precision + matrix and providing the same precision matrix as initialization. It should + lead to the same results with the same number of iterations. + + If the initialization is wrong then the number of iterations will increase. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/16944 + """ + # generate a toy dataset + n_samples = 300 + rng = np.random.RandomState(0) + shifted_gaussian = rng.randn(n_samples, 2) + np.array([20, 20]) + C = np.array([[0.0, -0.7], [3.5, 0.7]]) + stretched_gaussian = np.dot(rng.randn(n_samples, 2), C) + X = np.vstack([shifted_gaussian, stretched_gaussian]) + + # common parameters to check the consistency of precision initialization + n_components, covariance_type, reg_covar, random_state = 2, "diag", 1e-6, 0 + + # execute the manual initialization to compute the precision matrix: + # - run KMeans to have an initial guess + # - estimate the covariance + # - compute the precision matrix from the estimated covariance + resp = np.zeros((X.shape[0], n_components)) + label = ( + KMeans(n_clusters=n_components, n_init=1, random_state=random_state) + .fit(X) + .labels_ + ) + resp[np.arange(X.shape[0]), label] = 1 + _, _, covariance = _estimate_gaussian_parameters( + X, resp, reg_covar=reg_covar, covariance_type=covariance_type + ) + precisions_init = 1 / covariance + + gm_with_init = GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + reg_covar=reg_covar, + precisions_init=precisions_init, + random_state=random_state, + ).fit(X) + + gm_without_init = GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + reg_covar=reg_covar, + random_state=random_state, + ).fit(X) + + assert gm_without_init.n_iter_ == gm_with_init.n_iter_ + assert_allclose( + gm_with_init.precisions_cholesky_, gm_without_init.precisions_cholesky_ + ) + + +def _generate_data(seed, n_samples, n_features, n_components): + """Randomly generate samples and responsibilities.""" + rs = np.random.RandomState(seed) + X = rs.random_sample((n_samples, n_features)) + resp = rs.random_sample((n_samples, n_components)) + resp /= resp.sum(axis=1)[:, np.newaxis] + return X, resp + + +def _calculate_precisions(X, resp, covariance_type): + """Calculate precision matrix of X and its Cholesky decomposition + for the given covariance type. + """ + reg_covar = 1e-6 + weights, means, covariances = _estimate_gaussian_parameters( + X, resp, reg_covar, covariance_type + ) + precisions_cholesky = _compute_precision_cholesky(covariances, covariance_type) + + _, n_components = resp.shape + # Instantiate a `GaussianMixture` model in order to use its + # `_set_parameters` method to return the `precisions_` and + # `precisions_cholesky_` from matching the `covariance_type` + # provided. 
+ gmm = GaussianMixture(n_components=n_components, covariance_type=covariance_type) + params = (weights, means, covariances, precisions_cholesky) + gmm._set_parameters(params) + return gmm.precisions_, gmm.precisions_cholesky_ + + +@pytest.mark.parametrize("covariance_type", COVARIANCE_TYPE) +def test_gaussian_mixture_precisions_init(covariance_type, global_random_seed): + """Non-regression test for #26415.""" + + X, resp = _generate_data( + seed=global_random_seed, + n_samples=100, + n_features=3, + n_components=4, + ) + + precisions_init, desired_precisions_cholesky = _calculate_precisions( + X, resp, covariance_type + ) + gmm = GaussianMixture( + covariance_type=covariance_type, precisions_init=precisions_init + ) + gmm._initialize(X, resp) + actual_precisions_cholesky = gmm.precisions_cholesky_ + assert_allclose(actual_precisions_cholesky, desired_precisions_cholesky) + + +def test_gaussian_mixture_single_component_stable(): + """ + Non-regression test for #23032 ensuring 1-component GM works on only a + few samples. + """ + rng = np.random.RandomState(0) + X = rng.multivariate_normal(np.zeros(2), np.identity(2), size=3) + gm = GaussianMixture(n_components=1) + gm.fit(X).sample() + + +def test_gaussian_mixture_all_init_does_not_estimate_gaussian_parameters( + monkeypatch, + global_random_seed, +): + """When all init parameters are provided, the Gaussian parameters + are not estimated. + + Non-regression test for gh-26015. + """ + + mock = Mock(side_effect=_estimate_gaussian_parameters) + monkeypatch.setattr( + sklearn.mixture._gaussian_mixture, "_estimate_gaussian_parameters", mock + ) + + rng = np.random.RandomState(global_random_seed) + rand_data = RandomData(rng) + + gm = GaussianMixture( + n_components=rand_data.n_components, + weights_init=rand_data.weights, + means_init=rand_data.means, + precisions_init=rand_data.precisions["full"], + random_state=rng, + ) + gm.fit(rand_data.X["full"]) + # The initial Gaussian parameters are not estimated; they are estimated once + # per M-step. + assert mock.call_count == gm.n_iter_ diff --git a/.venv/Lib/site-packages/sklearn/mixture/tests/test_mixture.py b/.venv/Lib/site-packages/sklearn/mixture/tests/test_mixture.py new file mode 100644 index 0000000000000000000000000000000000000000..d1a45ffef5e07f5eafacd704e69c1f0060c9bb93 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/mixture/tests/test_mixture.py @@ -0,0 +1,30 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +import pytest + +from sklearn.mixture import BayesianGaussianMixture, GaussianMixture + + +@pytest.mark.parametrize("estimator", [GaussianMixture(), BayesianGaussianMixture()]) +def test_gaussian_mixture_n_iter(estimator): + # Check that n_iter_ is the number of iterations performed.
+ rng = np.random.RandomState(0) + X = rng.rand(10, 5) + max_iter = 1 + estimator.set_params(max_iter=max_iter) + estimator.fit(X) + assert estimator.n_iter_ == max_iter + + +@pytest.mark.parametrize("estimator", [GaussianMixture(), BayesianGaussianMixture()]) +def test_mixture_n_components_greater_than_n_samples_error(estimator): + """Check error when n_components > n_samples.""" + rng = np.random.RandomState(0) + X = rng.rand(10, 5) + estimator.set_params(n_components=12) + + msg = "Expected n_samples >= n_components" + with pytest.raises(ValueError, match=msg): + estimator.fit(X) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/__init__.py b/.venv/Lib/site-packages/sklearn/model_selection/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3009b9310d81166691ef1d4db3d41268ee906af5 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/__init__.py @@ -0,0 +1,99 @@ +"""Tools for model selection, such as cross validation and hyper-parameter tuning.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import typing + +from ._classification_threshold import ( + FixedThresholdClassifier, + TunedThresholdClassifierCV, +) +from ._plot import LearningCurveDisplay, ValidationCurveDisplay +from ._search import GridSearchCV, ParameterGrid, ParameterSampler, RandomizedSearchCV +from ._split import ( + BaseCrossValidator, + BaseShuffleSplit, + GroupKFold, + GroupShuffleSplit, + KFold, + LeaveOneGroupOut, + LeaveOneOut, + LeavePGroupsOut, + LeavePOut, + PredefinedSplit, + RepeatedKFold, + RepeatedStratifiedKFold, + ShuffleSplit, + StratifiedGroupKFold, + StratifiedKFold, + StratifiedShuffleSplit, + TimeSeriesSplit, + check_cv, + train_test_split, +) +from ._validation import ( + cross_val_predict, + cross_val_score, + cross_validate, + learning_curve, + permutation_test_score, + validation_curve, +) + +if typing.TYPE_CHECKING: + # Avoid errors in type checkers (e.g. mypy) for experimental estimators. + # TODO: remove this check once the estimator is no longer experimental. + from ._search_successive_halving import ( # noqa + HalvingGridSearchCV, + HalvingRandomSearchCV, + ) + + +__all__ = [ + "BaseCrossValidator", + "BaseShuffleSplit", + "GridSearchCV", + "TimeSeriesSplit", + "KFold", + "GroupKFold", + "GroupShuffleSplit", + "LeaveOneGroupOut", + "LeaveOneOut", + "LeavePGroupsOut", + "LeavePOut", + "RepeatedKFold", + "RepeatedStratifiedKFold", + "ParameterGrid", + "ParameterSampler", + "PredefinedSplit", + "RandomizedSearchCV", + "ShuffleSplit", + "StratifiedKFold", + "StratifiedGroupKFold", + "StratifiedShuffleSplit", + "FixedThresholdClassifier", + "TunedThresholdClassifierCV", + "check_cv", + "cross_val_predict", + "cross_val_score", + "cross_validate", + "learning_curve", + "LearningCurveDisplay", + "permutation_test_score", + "train_test_split", + "validation_curve", + "ValidationCurveDisplay", +] + + +# TODO: remove this check once the estimator is no longer experimental. +def __getattr__(name): + if name in {"HalvingGridSearchCV", "HalvingRandomSearchCV"}: + raise ImportError( + f"{name} is experimental and the API might change without any " + "deprecation cycle. 
To use it, you need to explicitly import " + "enable_halving_search_cv:\n" + "from sklearn.experimental import enable_halving_search_cv" + ) + raise AttributeError(f"module {__name__} has no attribute {name}") diff --git a/.venv/Lib/site-packages/sklearn/model_selection/_classification_threshold.py b/.venv/Lib/site-packages/sklearn/model_selection/_classification_threshold.py new file mode 100644 index 0000000000000000000000000000000000000000..2bff98d232492e00d5aea424331f32f7090d0bf6 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/_classification_threshold.py @@ -0,0 +1,892 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from collections.abc import MutableMapping +from numbers import Integral, Real + +import numpy as np + +from ..base import ( + BaseEstimator, + ClassifierMixin, + MetaEstimatorMixin, + _fit_context, + clone, +) +from ..exceptions import NotFittedError +from ..metrics import ( + check_scoring, + get_scorer_names, +) +from ..metrics._scorer import ( + _CurveScorer, + _threshold_scores_to_class_labels, +) +from ..utils import _safe_indexing +from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions +from ..utils._response import _get_response_values_binary +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.multiclass import type_of_target +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_method_params, + _estimator_has, + _num_samples, + check_is_fitted, + indexable, +) +from ._split import StratifiedShuffleSplit, check_cv + + +def _check_is_fitted(estimator): + try: + check_is_fitted(estimator.estimator) + except NotFittedError: + check_is_fitted(estimator, "estimator_") + + +class BaseThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): + """Base class for binary classifiers that set a non-default decision threshold. + + In this base class, we define the following interface: + + - the validation of common parameters in `fit`; + - the different prediction methods that can be used with the classifier. + + .. versionadded:: 1.5 + + Parameters + ---------- + estimator : estimator instance + The binary classifier, fitted or not, for which we want to optimize + the decision threshold used during `predict`. + + response_method : {"auto", "decision_function", "predict_proba"}, default="auto" + Methods by the classifier `estimator` corresponding to the + decision function for which we want to find a threshold. It can be: + + * if `"auto"`, it will try to invoke, for each classifier, + `"predict_proba"` or `"decision_function"` in that order. + * otherwise, one of `"predict_proba"` or `"decision_function"`. + If the method is not implemented by the classifier, it will raise an + error. 
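A minimal sketch of the `"auto"` resolution just described, mirroring the `_get_response_method` helper defined on this class below; `resolve_response_method` is a hypothetical standalone name used only for illustration, not part of the patch:

```python
def resolve_response_method(response_method):
    # "auto" expands to an ordered preference list: `predict_proba` is
    # tried first, then `decision_function`; a concrete method name is
    # passed through unchanged.
    if response_method == "auto":
        return ["predict_proba", "decision_function"]
    return response_method


assert resolve_response_method("auto") == ["predict_proba", "decision_function"]
assert resolve_response_method("decision_function") == "decision_function"
```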
+ """ + + _parameter_constraints: dict = { + "estimator": [ + HasMethods(["fit", "predict_proba"]), + HasMethods(["fit", "decision_function"]), + ], + "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], + } + + def __init__(self, estimator, *, response_method="auto"): + self.estimator = estimator + self.response_method = response_method + + def _get_response_method(self): + """Define the response method.""" + if self.response_method == "auto": + response_method = ["predict_proba", "decision_function"] + else: + response_method = self.response_method + return response_method + + @_fit_context( + # *ThresholdClassifier*.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **params): + """Fit the classifier. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + **params : dict + Parameters to pass to the `fit` method of the underlying + classifier. + + Returns + ------- + self : object + Returns an instance of self. + """ + _raise_for_params(params, self, None) + + X, y = indexable(X, y) + + y_type = type_of_target(y, input_name="y") + if y_type != "binary": + raise ValueError( + f"Only binary classification is supported. Unknown label type: {y_type}" + ) + + self._fit(X, y, **params) + + if hasattr(self.estimator_, "n_features_in_"): + self.n_features_in_ = self.estimator_.n_features_in_ + if hasattr(self.estimator_, "feature_names_in_"): + self.feature_names_in_ = self.estimator_.feature_names_in_ + + return self + + @property + def classes_(self): + """Classes labels.""" + return self.estimator_.classes_ + + @available_if(_estimator_has("predict_proba")) + def predict_proba(self, X): + """Predict class probabilities for `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + probabilities : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. + """ + _check_is_fitted(self) + estimator = getattr(self, "estimator_", self.estimator) + return estimator.predict_proba(X) + + @available_if(_estimator_has("predict_log_proba")) + def predict_log_proba(self, X): + """Predict logarithm class probabilities for `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + log_probabilities : ndarray of shape (n_samples, n_classes) + The logarithm class probabilities of the input samples. + """ + _check_is_fitted(self) + estimator = getattr(self, "estimator_", self.estimator) + return estimator.predict_log_proba(X) + + @available_if(_estimator_has("decision_function")) + def decision_function(self, X): + """Decision function for samples in `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + decisions : ndarray of shape (n_samples,) + The decision function computed the fitted estimator. 
+ """ + _check_is_fitted(self) + estimator = getattr(self, "estimator_", self.estimator) + return estimator.decision_function(X) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_class = False + return tags + + +class FixedThresholdClassifier(BaseThresholdClassifier): + """Binary classifier that manually sets the decision threshold. + + This classifier allows to change the default decision threshold used for + converting posterior probability estimates (i.e. output of `predict_proba`) or + decision scores (i.e. output of `decision_function`) into a class label. + + Here, the threshold is not optimized and is set to a constant value. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.5 + + Parameters + ---------- + estimator : estimator instance + The binary classifier, fitted or not, for which we want to optimize + the decision threshold used during `predict`. + + threshold : {"auto"} or float, default="auto" + The decision threshold to use when converting posterior probability estimates + (i.e. output of `predict_proba`) or decision scores (i.e. output of + `decision_function`) into a class label. When `"auto"`, the threshold is set + to 0.5 if `predict_proba` is used as `response_method`, otherwise it is set to + 0 (i.e. the default threshold for `decision_function`). + + pos_label : int, float, bool or str, default=None + The label of the positive class. Used to process the output of the + `response_method` method. When `pos_label=None`, if `y_true` is in `{-1, 1}` or + `{0, 1}`, `pos_label` is set to 1, otherwise an error will be raised. + + response_method : {"auto", "decision_function", "predict_proba"}, default="auto" + Methods by the classifier `estimator` corresponding to the + decision function for which we want to find a threshold. It can be: + + * if `"auto"`, it will try to invoke `"predict_proba"` or `"decision_function"` + in that order. + * otherwise, one of `"predict_proba"` or `"decision_function"`. + If the method is not implemented by the classifier, it will raise an + error. + + Attributes + ---------- + estimator_ : estimator instance + The fitted classifier used when predicting. + + classes_ : ndarray of shape (n_classes,) + The class labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + See Also + -------- + sklearn.model_selection.TunedThresholdClassifierCV : Classifier that post-tunes + the decision threshold based on some metrics and using cross-validation. + sklearn.calibration.CalibratedClassifierCV : Estimator that calibrates + probabilities. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.metrics import confusion_matrix + >>> from sklearn.model_selection import FixedThresholdClassifier, train_test_split + >>> X, y = make_classification( + ... n_samples=1_000, weights=[0.9, 0.1], class_sep=0.8, random_state=42 + ... ) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, stratify=y, random_state=42 + ... 
) + >>> classifier = LogisticRegression(random_state=0).fit(X_train, y_train) + >>> print(confusion_matrix(y_test, classifier.predict(X_test))) + [[217 7] + [ 19 7]] + >>> classifier_other_threshold = FixedThresholdClassifier( + ... classifier, threshold=0.1, response_method="predict_proba" + ... ).fit(X_train, y_train) + >>> print(confusion_matrix(y_test, classifier_other_threshold.predict(X_test))) + [[184 40] + [ 6 20]] + """ + + _parameter_constraints: dict = { + **BaseThresholdClassifier._parameter_constraints, + "threshold": [StrOptions({"auto"}), Real], + "pos_label": [Real, str, "boolean", None], + } + + def __init__( + self, + estimator, + *, + threshold="auto", + pos_label=None, + response_method="auto", + ): + super().__init__(estimator=estimator, response_method=response_method) + self.pos_label = pos_label + self.threshold = threshold + + @property + def classes_(self): + if estimator := getattr(self, "estimator_", None): + return estimator.classes_ + try: + check_is_fitted(self.estimator) + return self.estimator.classes_ + except NotFittedError: + raise AttributeError( + "The underlying estimator is not fitted yet." + ) from NotFittedError + + def _fit(self, X, y, **params): + """Fit the classifier. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + **params : dict + Parameters to pass to the `fit` method of the underlying + classifier. + + Returns + ------- + self : object + Returns an instance of self. + """ + routed_params = process_routing(self, "fit", **params) + self.estimator_ = clone(self.estimator).fit(X, y, **routed_params.estimator.fit) + return self + + def predict(self, X): + """Predict the target of new samples. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The samples, as accepted by `estimator.predict`. + + Returns + ------- + class_labels : ndarray of shape (n_samples,) + The predicted class. + """ + _check_is_fitted(self) + + estimator = getattr(self, "estimator_", self.estimator) + + y_score, _, response_method_used = _get_response_values_binary( + estimator, + X, + self._get_response_method(), + pos_label=self.pos_label, + return_response_method_used=True, + ) + + if self.threshold == "auto": + decision_threshold = 0.5 if response_method_used == "predict_proba" else 0.0 + else: + decision_threshold = self.threshold + + return _threshold_scores_to_class_labels( + y_score, decision_threshold, self.classes_, self.pos_label + ) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + return router + + +def _fit_and_score_over_thresholds( + classifier, + X, + y, + *, + fit_params, + train_idx, + val_idx, + curve_scorer, + score_params, +): + """Fit a classifier and compute the scores for different decision thresholds. + + Parameters + ---------- + classifier : estimator instance + The classifier to fit and use for scoring. If `classifier` is already fitted, + it will be used as is. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The entire dataset. 
+ + y : array-like of shape (n_samples,) + The entire target vector. + + fit_params : dict + Parameters to pass to the `fit` method of the underlying classifier. + + train_idx : ndarray of shape (n_train_samples,) or None + The indices of the training set. If `None`, `classifier` is expected to be + already fitted. + + val_idx : ndarray of shape (n_val_samples,) + The indices of the validation set used to score `classifier`. If `train_idx` + is `None`, the entire set will be used. + + curve_scorer : scorer instance + The scorer taking `classifier` and the validation set as input and outputting + decision thresholds and scores as a curve. Note that this is different from + the usual scorer that outputs a single score value: + + * when `score_method` is one of the four constraint metrics, the curve scorer + will output a curve of two scores parametrized by the decision threshold, e.g. + TPR/TNR or precision/recall curves for each threshold; + * otherwise, the curve scorer will output a single score value for each + threshold. + + score_params : dict + Parameters to pass to the `score` method of the underlying scorer. + + Returns + ------- + scores : ndarray of shape (thresholds,) or tuple of such arrays + The scores computed for each decision threshold. When TPR/TNR or precision/ + recall are computed, `scores` is a tuple of two arrays. + + potential_thresholds : ndarray of shape (thresholds,) + The decision thresholds used to compute the scores. They are returned in + ascending order. + """ + + if train_idx is not None: + X_train, X_val = _safe_indexing(X, train_idx), _safe_indexing(X, val_idx) + y_train, y_val = _safe_indexing(y, train_idx), _safe_indexing(y, val_idx) + fit_params_train = _check_method_params(X, fit_params, indices=train_idx) + score_params_val = _check_method_params(X, score_params, indices=val_idx) + classifier.fit(X_train, y_train, **fit_params_train) + else: # prefit estimator, only a validation set is provided + X_val, y_val, score_params_val = X, y, score_params + + return curve_scorer(classifier, X_val, y_val, **score_params_val) + + +def _mean_interpolated_score(target_thresholds, cv_thresholds, cv_scores): + """Compute the mean interpolated score across folds by defining common thresholds. + + Parameters + ---------- + target_thresholds : ndarray of shape (thresholds,) + The thresholds to use to compute the mean score. + + cv_thresholds : ndarray of shape (n_folds, thresholds_fold) + The thresholds used to compute the scores for each fold. + + cv_scores : ndarray of shape (n_folds, thresholds_fold) + The scores computed for each threshold for each fold. + + Returns + ------- + mean_score : ndarray of shape (thresholds,) + The mean score across all folds for each target threshold. + """ + return np.mean( + [ + np.interp(target_thresholds, split_thresholds, split_score) + for split_thresholds, split_score in zip(cv_thresholds, cv_scores) + ], + axis=0, + ) + + +class TunedThresholdClassifierCV(BaseThresholdClassifier): + """Classifier that post-tunes the decision threshold using cross-validation. + + This estimator post-tunes the decision threshold (cut-off point) that is + used for converting posterior probability estimates (i.e. output of + `predict_proba`) or decision scores (i.e. output of `decision_function`) + into a class label. The tuning is done by optimizing a binary metric, + potentially constrained by another metric. + + Read more in the :ref:`User Guide `. + + ..
versionadded:: 1.5 + + Parameters + ---------- + estimator : estimator instance + The classifier, fitted or not, for which we want to optimize + the decision threshold used during `predict`. + + scoring : str or callable, default="balanced_accuracy" + The objective metric to be optimized. Can be one of: + + * a string associated to a scoring function for binary classification + (see :ref:`scoring_parameter`); + * a scorer callable object created with :func:`~sklearn.metrics.make_scorer`; + + response_method : {"auto", "decision_function", "predict_proba"}, default="auto" + Methods by the classifier `estimator` corresponding to the + decision function for which we want to find a threshold. It can be: + + * if `"auto"`, it will try to invoke, for each classifier, + `"predict_proba"` or `"decision_function"` in that order. + * otherwise, one of `"predict_proba"` or `"decision_function"`. + If the method is not implemented by the classifier, it will raise an + error. + + thresholds : int or array-like, default=100 + The number of decision thresholds to use when discretizing the output of the + classifier's `response_method`. Pass an array-like to manually specify the + thresholds to use. + + cv : int, float, cross-validation generator, iterable or "prefit", default=None + Determines the cross-validation splitting strategy to train the classifier. + Possible inputs for cv are: + + * `None`, to use the default 5-fold stratified K-fold cross validation; + * An integer, to specify the number of folds in a stratified k-fold; + * A float, to specify a single shuffle split. The float should be in (0, 1) + and represents the size of the validation set; + * An object to be used as a cross-validation generator; + * An iterable yielding train, test splits; + * `"prefit"`, to bypass the cross-validation. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. warning:: + Using `cv="prefit"` and passing the same dataset for fitting `estimator` + and tuning the cut-off point is subject to undesired overfitting. You can + refer to :ref:`TunedThresholdClassifierCV_no_cv` for an example. + + This option should only be used when the set used to fit `estimator` is + different from the one used to tune the cut-off point (by calling + :meth:`TunedThresholdClassifierCV.fit`). + + refit : bool, default=True + Whether or not to refit the classifier on the entire training set once + the decision threshold has been found. + Note that forcing `refit=False` on a cross-validation with more + than a single split will raise an error. Similarly, `refit=True` in + conjunction with `cv="prefit"` will raise an error. + + n_jobs : int, default=None + The number of jobs to run in parallel. When `cv` represents a + cross-validation strategy, the fitting and scoring on each data split + is done in parallel. ``None`` means 1 unless in a + :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. See :term:`Glossary ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of cross-validation when `cv` is a float. + See :term:`Glossary `. + + store_cv_results : bool, default=False + Whether to store all scores and thresholds computed during the cross-validation + process. + + Attributes + ---------- + estimator_ : estimator instance + The fitted classifier used when predicting. + + best_threshold_ : float + The new decision threshold.
+ + best_score_ : float or None + The optimal score of the objective metric, evaluated at `best_threshold_`. + + cv_results_ : dict or None + A dictionary containing the scores and thresholds computed during the + cross-validation process. Only exist if `store_cv_results=True`. The + keys are `"thresholds"` and `"scores"`. + + classes_ : ndarray of shape (n_classes,) + The class labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + See Also + -------- + sklearn.model_selection.FixedThresholdClassifier : Classifier that uses a + constant threshold. + sklearn.calibration.CalibratedClassifierCV : Estimator that calibrates + probabilities. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.metrics import classification_report + >>> from sklearn.model_selection import TunedThresholdClassifierCV, train_test_split + >>> X, y = make_classification( + ... n_samples=1_000, weights=[0.9, 0.1], class_sep=0.8, random_state=42 + ... ) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, stratify=y, random_state=42 + ... ) + >>> classifier = RandomForestClassifier(random_state=0).fit(X_train, y_train) + >>> print(classification_report(y_test, classifier.predict(X_test))) + precision recall f1-score support + + 0 0.94 0.99 0.96 224 + 1 0.80 0.46 0.59 26 + + accuracy 0.93 250 + macro avg 0.87 0.72 0.77 250 + weighted avg 0.93 0.93 0.92 250 + + >>> classifier_tuned = TunedThresholdClassifierCV( + ... classifier, scoring="balanced_accuracy" + ... ).fit(X_train, y_train) + >>> print( + ... f"Cut-off point found at {classifier_tuned.best_threshold_:.3f}" + ... ) + Cut-off point found at 0.342 + >>> print(classification_report(y_test, classifier_tuned.predict(X_test))) + precision recall f1-score support + + 0 0.96 0.95 0.96 224 + 1 0.61 0.65 0.63 26 + + accuracy 0.92 250 + macro avg 0.78 0.80 0.79 250 + weighted avg 0.92 0.92 0.92 250 + + """ + + _parameter_constraints: dict = { + **BaseThresholdClassifier._parameter_constraints, + "scoring": [ + StrOptions(set(get_scorer_names())), + callable, + MutableMapping, + ], + "thresholds": [Interval(Integral, 1, None, closed="left"), "array-like"], + "cv": [ + "cv_object", + StrOptions({"prefit"}), + Interval(RealNotInt, 0.0, 1.0, closed="neither"), + ], + "refit": ["boolean"], + "n_jobs": [Integral, None], + "random_state": ["random_state"], + "store_cv_results": ["boolean"], + } + + def __init__( + self, + estimator, + *, + scoring="balanced_accuracy", + response_method="auto", + thresholds=100, + cv=None, + refit=True, + n_jobs=None, + random_state=None, + store_cv_results=False, + ): + super().__init__(estimator=estimator, response_method=response_method) + self.scoring = scoring + self.thresholds = thresholds + self.cv = cv + self.refit = refit + self.n_jobs = n_jobs + self.random_state = random_state + self.store_cv_results = store_cv_results + + def _fit(self, X, y, **params): + """Fit the classifier and post-tune the decision threshold. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. 
+ + **params : dict + Parameters to pass to the `fit` method of the underlying + classifier and to the `scoring` scorer. + + Returns + ------- + self : object + Returns an instance of self. + """ + if isinstance(self.cv, Real) and 0 < self.cv < 1: + cv = StratifiedShuffleSplit( + n_splits=1, test_size=self.cv, random_state=self.random_state + ) + elif self.cv == "prefit": + if self.refit is True: + raise ValueError("When cv='prefit', refit cannot be True.") + try: + check_is_fitted(self.estimator, "classes_") + except NotFittedError as exc: + raise NotFittedError( + """When cv='prefit', `estimator` must be fitted.""" + ) from exc + cv = self.cv + else: + cv = check_cv(self.cv, y=y, classifier=True) + if self.refit is False and cv.get_n_splits() > 1: + raise ValueError("When cv has several folds, refit cannot be False.") + + routed_params = process_routing(self, "fit", **params) + self._curve_scorer = self._get_curve_scorer() + + # in the following block, we: + # - define the final classifier `self.estimator_` and train it if necessary + # - define `classifier` to be used to post-tune the decision threshold + # - define `split` to be used to fit/score `classifier` + if cv == "prefit": + self.estimator_ = self.estimator + classifier = self.estimator_ + splits = [(None, range(_num_samples(X)))] + else: + self.estimator_ = clone(self.estimator) + classifier = clone(self.estimator) + splits = cv.split(X, y, **routed_params.splitter.split) + + if self.refit: + # train on the whole dataset + X_train, y_train, fit_params_train = X, y, routed_params.estimator.fit + else: + # single split cross-validation + train_idx, _ = next(cv.split(X, y, **routed_params.splitter.split)) + X_train = _safe_indexing(X, train_idx) + y_train = _safe_indexing(y, train_idx) + fit_params_train = _check_method_params( + X, routed_params.estimator.fit, indices=train_idx + ) + + self.estimator_.fit(X_train, y_train, **fit_params_train) + + cv_scores, cv_thresholds = zip( + *Parallel(n_jobs=self.n_jobs)( + delayed(_fit_and_score_over_thresholds)( + clone(classifier) if cv != "prefit" else classifier, + X, + y, + fit_params=routed_params.estimator.fit, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=self._curve_scorer, + score_params=routed_params.scorer.score, + ) + for train_idx, val_idx in splits + ) + ) + + if any(np.isclose(th[0], th[-1]) for th in cv_thresholds): + raise ValueError( + "The provided estimator makes constant predictions. Therefore, it is " + "impossible to optimize the decision threshold." + ) + + # find the global min and max thresholds across all folds + min_threshold = min( + split_thresholds.min() for split_thresholds in cv_thresholds + ) + max_threshold = max( + split_thresholds.max() for split_thresholds in cv_thresholds + ) + if isinstance(self.thresholds, Integral): + decision_thresholds = np.linspace( + min_threshold, max_threshold, num=self.thresholds + ) + else: + decision_thresholds = np.asarray(self.thresholds) + + objective_scores = _mean_interpolated_score( + decision_thresholds, cv_thresholds, cv_scores + ) + best_idx = objective_scores.argmax() + self.best_score_ = objective_scores[best_idx] + self.best_threshold_ = decision_thresholds[best_idx] + if self.store_cv_results: + self.cv_results_ = { + "thresholds": decision_thresholds, + "scores": objective_scores, + } + + return self + + def predict(self, X): + """Predict the target of new samples. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The samples, as accepted by `estimator.predict`. + + Returns + ------- + class_labels : ndarray of shape (n_samples,) + The predicted class. + """ + check_is_fitted(self, "estimator_") + pos_label = self._curve_scorer._get_pos_label() + y_score, _ = _get_response_values_binary( + self.estimator_, + X, + self._get_response_method(), + pos_label=pos_label, + ) + + return _threshold_scores_to_class_labels( + y_score, self.best_threshold_, self.classes_, pos_label + ) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add( + estimator=self.estimator, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + .add( + splitter=self.cv, + method_mapping=MethodMapping().add(callee="split", caller="fit"), + ) + .add( + scorer=self._get_curve_scorer(), + method_mapping=MethodMapping().add(callee="score", caller="fit"), + ) + ) + return router + + def _get_curve_scorer(self): + """Get the curve scorer based on the objective metric used.""" + scoring = check_scoring(self.estimator, scoring=self.scoring) + curve_scorer = _CurveScorer.from_scorer( + scoring, self._get_response_method(), self.thresholds + ) + return curve_scorer diff --git a/.venv/Lib/site-packages/sklearn/model_selection/_plot.py b/.venv/Lib/site-packages/sklearn/model_selection/_plot.py new file mode 100644 index 0000000000000000000000000000000000000000..241be09dfb9b483ac441dd46e7f757feabfe25e4 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/_plot.py @@ -0,0 +1,877 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np + +from ..utils._optional_dependencies import check_matplotlib_support +from ..utils._plotting import _interval_max_min_ratio, _validate_score_name +from ._validation import learning_curve, validation_curve + + +class _BaseCurveDisplay: + def _plot_curve( + self, + x_data, + *, + ax=None, + negate_score=False, + score_name=None, + score_type="test", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + check_matplotlib_support(f"{self.__class__.__name__}.plot") + + import matplotlib.pyplot as plt + + if ax is None: + _, ax = plt.subplots() + + if negate_score: + train_scores, test_scores = -self.train_scores, -self.test_scores + else: + train_scores, test_scores = self.train_scores, self.test_scores + + if std_display_style not in ("errorbar", "fill_between", None): + raise ValueError( + f"Unknown std_display_style: {std_display_style}. Should be one of" + " 'errorbar', 'fill_between', or None." + ) + + if score_type not in ("test", "train", "both"): + raise ValueError( + f"Unknown score_type: {score_type}. Should be one of 'test', " + "'train', or 'both'." 
+ ) + + if score_type == "train": + scores = {"Train": train_scores} + elif score_type == "test": + scores = {"Test": test_scores} + else: # score_type == "both" + scores = {"Train": train_scores, "Test": test_scores} + + if std_display_style in ("fill_between", None): + # plot the mean score + if line_kw is None: + line_kw = {} + + self.lines_ = [] + for line_label, score in scores.items(): + self.lines_.append( + *ax.plot( + x_data, + score.mean(axis=1), + label=line_label, + **line_kw, + ) + ) + self.errorbar_ = None + self.fill_between_ = None # overwritten below by fill_between + + if std_display_style == "errorbar": + if errorbar_kw is None: + errorbar_kw = {} + + self.errorbar_ = [] + for line_label, score in scores.items(): + self.errorbar_.append( + ax.errorbar( + x_data, + score.mean(axis=1), + score.std(axis=1), + label=line_label, + **errorbar_kw, + ) + ) + self.lines_, self.fill_between_ = None, None + elif std_display_style == "fill_between": + if fill_between_kw is None: + fill_between_kw = {} + default_fill_between_kw = {"alpha": 0.5} + fill_between_kw = {**default_fill_between_kw, **fill_between_kw} + + self.fill_between_ = [] + for line_label, score in scores.items(): + self.fill_between_.append( + ax.fill_between( + x_data, + score.mean(axis=1) - score.std(axis=1), + score.mean(axis=1) + score.std(axis=1), + **fill_between_kw, + ) + ) + + score_name = self.score_name if score_name is None else score_name + + ax.legend() + + # We found that a ratio, smaller or bigger than 5, between the largest and + # smallest gap of the x values is a good indicator to choose between linear + # and log scale. + if _interval_max_min_ratio(x_data) > 5: + xscale = "symlog" if x_data.min() <= 0 else "log" + else: + xscale = "linear" + + ax.set_xscale(xscale) + ax.set_ylabel(f"{score_name}") + + self.ax_ = ax + self.figure_ = ax.figure + + +class LearningCurveDisplay(_BaseCurveDisplay): + """Learning Curve visualization. + + It is recommended to use + :meth:`~sklearn.model_selection.LearningCurveDisplay.from_estimator` to + create a :class:`~sklearn.model_selection.LearningCurveDisplay` instance. + All parameters are stored as attributes. + + Read more in the :ref:`User Guide ` for general information + about the visualization API and + :ref:`detailed documentation ` regarding the learning + curve visualization. + + .. versionadded:: 1.2 + + Parameters + ---------- + train_sizes : ndarray of shape (n_unique_ticks,) + Numbers of training examples that has been used to generate the + learning curve. + + train_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on test set. + + score_name : str, default=None + The name of the score used in `learning_curve`. It will override the name + inferred from the `scoring` parameter. If `score` is `None`, we use `"Score"` if + `negate_score` is `False` and `"Negative score"` otherwise. If `scoring` is a + string or a callable, we infer the name. We replace `_` by spaces and capitalize + the first letter. We remove `neg_` and replace it by `"Negative"` if + `negate_score` is `False` or just remove it otherwise. + + Attributes + ---------- + ax_ : matplotlib Axes + Axes with the learning curve. + + figure_ : matplotlib Figure + Figure containing the learning curve. + + errorbar_ : list of matplotlib Artist or None + When the `std_display_style` is `"errorbar"`, this is a list of + `matplotlib.container.ErrorbarContainer` objects. 
If another style is + used, `errorbar_` is `None`. + + lines_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.lines.Line2D` objects corresponding to the mean train and + test scores. If another style is used, `line_` is `None`. + + fill_between_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.collections.PolyCollection` objects. If another style is + used, `fill_between_` is `None`. + + See Also + -------- + sklearn.model_selection.learning_curve : Compute the learning curve. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import LearningCurveDisplay, learning_curve + >>> from sklearn.tree import DecisionTreeClassifier + >>> X, y = load_iris(return_X_y=True) + >>> tree = DecisionTreeClassifier(random_state=0) + >>> train_sizes, train_scores, test_scores = learning_curve( + ... tree, X, y) + >>> display = LearningCurveDisplay(train_sizes=train_sizes, + ... train_scores=train_scores, test_scores=test_scores, score_name="Score") + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__(self, *, train_sizes, train_scores, test_scores, score_name=None): + self.train_sizes = train_sizes + self.train_scores = train_scores + self.test_scores = test_scores + self.score_name = score_name + + def plot( + self, + ax=None, + *, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Plot visualization. + + Parameters + ---------- + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.learning_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If None, no standard deviation representation is + displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.LearningCurveDisplay` + Object that stores computed values. 
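A hedged end-to-end sketch of the `plot` options documented above, using only parameters listed in this docstring:

```python
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import LearningCurveDisplay
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
display = LearningCurveDisplay.from_estimator(
    DecisionTreeClassifier(random_state=0), X, y
)
# Re-draw the stored curves: validation scores only, spread shown as error bars.
display.plot(score_type="test", std_display_style="errorbar")
plt.show()
```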
+ """ + self._plot_curve( + self.train_sizes, + ax=ax, + negate_score=negate_score, + score_name=score_name, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + self.ax_.set_xlabel("Number of samples in the training set") + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + groups=None, + train_sizes=np.linspace(0.1, 1.0, 5), + cv=None, + scoring=None, + exploit_incremental_learning=False, + n_jobs=None, + pre_dispatch="all", + verbose=0, + shuffle=False, + random_state=None, + error_score=np.nan, + fit_params=None, + ax=None, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Create a learning curve display from an estimator. + + Read more in the :ref:`User Guide ` for general + information about the visualization API and :ref:`detailed + documentation ` regarding the learning curve + visualization. + + Parameters + ---------- + estimator : object type that implements the "fit" and "predict" methods + An object of that type which is cloned for each validation. + + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + train_sizes : array-like of shape (n_ticks,), \ + default=np.linspace(0.1, 1.0, 5) + Relative or absolute numbers of training examples that will be used + to generate the learning curve. If the dtype is float, it is + regarded as a fraction of the maximum size of the training set + (that is determined by the selected validation method), i.e. it has + to be within (0, 1]. Otherwise it is interpreted as absolute sizes + of the training sets. Note that for classification the number of + samples usually have to be big enough to contain at least one + sample from each class. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and `y` is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. In all + other cases, :class:`~sklearn.model_selection.KFold` is used. These + splitters are instantiated with `shuffle=False` so the splits will + be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + scoring : str or callable, default=None + A string (see :ref:`scoring_parameter`) or + a scorer callable object / function with signature + `scorer(estimator, X, y)` (see :ref:`scoring_callable`). + + exploit_incremental_learning : bool, default=False + If the estimator supports incremental learning, this will be + used to speed up fitting for different training set sizes. 
+ + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and + computing the score are parallelized over the different training + and test sets. `None` means 1 unless in a + :obj:`joblib.parallel_backend` context. `-1` means using all + processors. See :term:`Glossary ` for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. + + shuffle : bool, default=False + Whether to shuffle training data before taking prefixes of it + based on`train_sizes`. + + random_state : int, RandomState instance or None, default=None + Used when `shuffle` is True. Pass an int for reproducible + output across multiple function calls. + See :term:`Glossary `. + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator + fitting. If set to 'raise', the error is raised. If a numeric value + is given, FitFailedWarning is raised. + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.learning_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If `None`, no representation of the standard deviation + is displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.LearningCurveDisplay` + Object that stores computed values. 
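A short sketch of the `train_sizes` semantics documented above: floats in (0, 1] are read as fractions of the maximum training-set size, integers as absolute sample counts. The variable names and the sample count of 120 are illustrative only:

```python
import numpy as np

train_sizes_fractional = np.linspace(0.1, 1.0, 5)  # 10%..100% of the training folds
train_sizes_absolute = np.array([20, 50, 80])      # exact numbers of samples
# With e.g. 120 available training samples, the fractions resolve to roughly:
print(np.round(train_sizes_fractional * 120).astype(int))  # [ 12  39  66  93 120]
```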
+ + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import LearningCurveDisplay + >>> from sklearn.tree import DecisionTreeClassifier + >>> X, y = load_iris(return_X_y=True) + >>> tree = DecisionTreeClassifier(random_state=0) + >>> LearningCurveDisplay.from_estimator(tree, X, y) + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + + score_name = _validate_score_name(score_name, scoring, negate_score) + + train_sizes, train_scores, test_scores = learning_curve( + estimator, + X, + y, + groups=groups, + train_sizes=train_sizes, + cv=cv, + scoring=scoring, + exploit_incremental_learning=exploit_incremental_learning, + n_jobs=n_jobs, + pre_dispatch=pre_dispatch, + verbose=verbose, + shuffle=shuffle, + random_state=random_state, + error_score=error_score, + return_times=False, + fit_params=fit_params, + ) + + viz = cls( + train_sizes=train_sizes, + train_scores=train_scores, + test_scores=test_scores, + score_name=score_name, + ) + return viz.plot( + ax=ax, + negate_score=negate_score, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + + +class ValidationCurveDisplay(_BaseCurveDisplay): + """Validation Curve visualization. + + It is recommended to use + :meth:`~sklearn.model_selection.ValidationCurveDisplay.from_estimator` to + create a :class:`~sklearn.model_selection.ValidationCurveDisplay` instance. + All parameters are stored as attributes. + + Read more in the :ref:`User Guide ` for general information + about the visualization API and :ref:`detailed documentation + ` regarding the validation curve visualization. + + .. versionadded:: 1.3 + + Parameters + ---------- + param_name : str + Name of the parameter that has been varied. + + param_range : array-like of shape (n_ticks,) + The values of the parameter that have been evaluated. + + train_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on test set. + + score_name : str, default=None + The name of the score used in `validation_curve`. It will override the name + inferred from the `scoring` parameter. If `score` is `None`, we use `"Score"` if + `negate_score` is `False` and `"Negative score"` otherwise. If `scoring` is a + string or a callable, we infer the name. We replace `_` by spaces and capitalize + the first letter. We remove `neg_` and replace it by `"Negative"` if + `negate_score` is `False` or just remove it otherwise. + + Attributes + ---------- + ax_ : matplotlib Axes + Axes with the validation curve. + + figure_ : matplotlib Figure + Figure containing the validation curve. + + errorbar_ : list of matplotlib Artist or None + When the `std_display_style` is `"errorbar"`, this is a list of + `matplotlib.container.ErrorbarContainer` objects. If another style is + used, `errorbar_` is `None`. + + lines_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.lines.Line2D` objects corresponding to the mean train and + test scores. If another style is used, `line_` is `None`. + + fill_between_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.collections.PolyCollection` objects. If another style is + used, `fill_between_` is `None`. 
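A hedged illustration of the attributes listed above: with the default `std_display_style="fill_between"`, `lines_` and `fill_between_` are populated while `errorbar_` stays `None`:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ValidationCurveDisplay

X, y = make_classification(n_samples=200, random_state=0)
display = ValidationCurveDisplay.from_estimator(
    LogisticRegression(), X, y, param_name="C", param_range=np.logspace(-3, 3, 5)
)
# Default style: shaded bands plus mean lines, no error-bar containers.
assert display.errorbar_ is None
assert display.lines_ is not None and display.fill_between_ is not None
```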
+ + See Also + -------- + sklearn.model_selection.validation_curve : Compute the validation curve. + + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import ValidationCurveDisplay, validation_curve + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> train_scores, test_scores = validation_curve( + ... logistic_regression, X, y, param_name=param_name, param_range=param_range + ... ) + >>> display = ValidationCurveDisplay( + ... param_name=param_name, param_range=param_range, + ... train_scores=train_scores, test_scores=test_scores, score_name="Score" + ... ) + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__( + self, *, param_name, param_range, train_scores, test_scores, score_name=None + ): + self.param_name = param_name + self.param_range = param_range + self.train_scores = train_scores + self.test_scores = test_scores + self.score_name = score_name + + def plot( + self, + ax=None, + *, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Plot visualization. + + Parameters + ---------- + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.validation_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If None, no standard deviation representation is + displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.ValidationCurveDisplay` + Object that stores computed values. 
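A hedged sketch of `negate_score` as documented above: paired with a `neg_*` scorer, negating flips the sign back so the y-axis shows the error itself rather than its negation:

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import ValidationCurveDisplay

X, y = make_regression(n_samples=200, random_state=0)
# "neg_mean_squared_error" yields negative scores; negate_score=True
# plots the positive MSE instead.
ValidationCurveDisplay.from_estimator(
    Ridge(), X, y, param_name="alpha", param_range=np.logspace(-3, 3, 5),
    scoring="neg_mean_squared_error", negate_score=True,
)
plt.show()
```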
+ """ + self._plot_curve( + self.param_range, + ax=ax, + negate_score=negate_score, + score_name=score_name, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + self.ax_.set_xlabel(f"{self.param_name}") + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + param_name, + param_range, + groups=None, + cv=None, + scoring=None, + n_jobs=None, + pre_dispatch="all", + verbose=0, + error_score=np.nan, + fit_params=None, + ax=None, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Create a validation curve display from an estimator. + + Read more in the :ref:`User Guide ` for general + information about the visualization API and :ref:`detailed + documentation ` regarding the validation curve + visualization. + + Parameters + ---------- + estimator : object type that implements the "fit" and "predict" methods + An object of that type which is cloned for each validation. + + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + param_name : str + Name of the parameter that will be varied. + + param_range : array-like of shape (n_values,) + The values of the parameter that will be evaluated. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and `y` is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. In all + other cases, :class:`~sklearn.model_selection.KFold` is used. These + splitters are instantiated with `shuffle=False` so the splits will + be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + scoring : str or callable, default=None + A string (see :ref:`scoring_parameter`) or + a scorer callable object / function with signature + `scorer(estimator, X, y)` (see :ref:`scoring_callable`). + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and + computing the score are parallelized over the different training + and test sets. `None` means 1 unless in a + :obj:`joblib.parallel_backend` context. `-1` means using all + processors. See :term:`Glossary ` for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. 
+ + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator + fitting. If set to 'raise', the error is raised. If a numeric value + is given, FitFailedWarning is raised. + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.validation_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If `None`, no representation of the standard deviation + is displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.ValidationCurveDisplay` + Object that stores computed values. + + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import ValidationCurveDisplay + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> ValidationCurveDisplay.from_estimator( + ... logistic_regression, X, y, param_name=param_name, + ... param_range=param_range, + ... 
) + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + + score_name = _validate_score_name(score_name, scoring, negate_score) + + train_scores, test_scores = validation_curve( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + groups=groups, + cv=cv, + scoring=scoring, + n_jobs=n_jobs, + pre_dispatch=pre_dispatch, + verbose=verbose, + error_score=error_score, + fit_params=fit_params, + ) + + viz = cls( + param_name=param_name, + param_range=np.asarray(param_range), + train_scores=train_scores, + test_scores=test_scores, + score_name=score_name, + ) + return viz.plot( + ax=ax, + negate_score=negate_score, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/_search.py b/.venv/Lib/site-packages/sklearn/model_selection/_search.py new file mode 100644 index 0000000000000000000000000000000000000000..75266a41ae71d7f049d240db1a84dca14d477493 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/_search.py @@ -0,0 +1,1954 @@ +""" +The :mod:`sklearn.model_selection._search` includes utilities to fine-tune the +parameters of an estimator. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import operator +import time +import warnings +from abc import ABCMeta, abstractmethod +from collections import defaultdict +from collections.abc import Iterable, Mapping, Sequence +from copy import deepcopy +from functools import partial, reduce +from itertools import product + +import numpy as np +from numpy.ma import MaskedArray +from scipy.stats import rankdata + +from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier +from ..exceptions import NotFittedError +from ..metrics import check_scoring +from ..metrics._scorer import ( + _check_multimetric_scoring, + _MultimetricScorer, + get_scorer_names, +) +from ..utils import Bunch, check_random_state +from ..utils._estimator_html_repr import _VisualBlock +from ..utils._param_validation import HasMethods, Interval, StrOptions +from ..utils._tags import get_tags +from ..utils.deprecation import _deprecate_Xt_in_inverse_transform +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.parallel import Parallel, delayed +from ..utils.random import sample_without_replacement +from ..utils.validation import _check_method_params, check_is_fitted, indexable +from ._split import check_cv +from ._validation import ( + _aggregate_score_dicts, + _fit_and_score, + _insert_error_scores, + _normalize_score_results, + _warn_or_raise_about_fit_failures, +) + +__all__ = ["GridSearchCV", "ParameterGrid", "ParameterSampler", "RandomizedSearchCV"] + + +class ParameterGrid: + """Grid of parameters with a discrete number of values for each. + + Can be used to iterate over parameter value combinations with the + Python built-in function iter. + The order of the generated parameter combinations is deterministic. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + param_grid : dict of str to sequence, or sequence of such + The parameter grid to explore, as a dictionary mapping estimator + parameters to sequences of allowed values. + + An empty dict signifies default parameters. 
+ + A sequence of dicts signifies a sequence of grids to search, and is + useful to avoid exploring parameter combinations that make no sense + or have no effect. See the examples below. + + Examples + -------- + >>> from sklearn.model_selection import ParameterGrid + >>> param_grid = {'a': [1, 2], 'b': [True, False]} + >>> list(ParameterGrid(param_grid)) == ( + ... [{'a': 1, 'b': True}, {'a': 1, 'b': False}, + ... {'a': 2, 'b': True}, {'a': 2, 'b': False}]) + True + + >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}] + >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'}, + ... {'kernel': 'rbf', 'gamma': 1}, + ... {'kernel': 'rbf', 'gamma': 10}] + True + >>> ParameterGrid(grid)[1] == {'kernel': 'rbf', 'gamma': 1} + True + + See Also + -------- + GridSearchCV : Uses :class:`ParameterGrid` to perform a full parallelized + parameter search. + """ + + def __init__(self, param_grid): + if not isinstance(param_grid, (Mapping, Iterable)): + raise TypeError( + f"Parameter grid should be a dict or a list, got: {param_grid!r} of" + f" type {type(param_grid).__name__}" + ) + + if isinstance(param_grid, Mapping): + # wrap dictionary in a singleton list to support either dict + # or list of dicts + param_grid = [param_grid] + + # check if all entries are dictionaries of lists + for grid in param_grid: + if not isinstance(grid, dict): + raise TypeError(f"Parameter grid is not a dict ({grid!r})") + for key, value in grid.items(): + if isinstance(value, np.ndarray) and value.ndim > 1: + raise ValueError( + f"Parameter array for {key!r} should be one-dimensional, got:" + f" {value!r} with shape {value.shape}" + ) + if isinstance(value, str) or not isinstance( + value, (np.ndarray, Sequence) + ): + raise TypeError( + f"Parameter grid for parameter {key!r} needs to be a list or a" + f" numpy array, but got {value!r} (of type " + f"{type(value).__name__}) instead. Single values " + "need to be wrapped in a list with one element." + ) + if len(value) == 0: + raise ValueError( + f"Parameter grid for parameter {key!r} need " + f"to be a non-empty sequence, got: {value!r}" + ) + + self.param_grid = param_grid + + def __iter__(self): + """Iterate over the points in the grid. + + Returns + ------- + params : iterator over dict of str to any + Yields dictionaries mapping each estimator parameter to one of its + allowed values. + """ + for p in self.param_grid: + # Always sort the keys of a dictionary, for reproducibility + items = sorted(p.items()) + if not items: + yield {} + else: + keys, values = zip(*items) + for v in product(*values): + params = dict(zip(keys, v)) + yield params + + def __len__(self): + """Number of points on the grid.""" + # Product function that can handle iterables (np.prod can't). + product = partial(reduce, operator.mul) + return sum( + product(len(v) for v in p.values()) if p else 1 for p in self.param_grid + ) + + def __getitem__(self, ind): + """Get the parameters that would be ``ind``th in iteration + + Parameters + ---------- + ind : int + The iteration index + + Returns + ------- + params : dict of str to any + Equal to list(self)[ind] + """ + # This is used to make discrete sampling without replacement memory + # efficient. 
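+        # Decode `ind` as a mixed-radix number over each sub-grid. E.g. for
+        # {'a': [1, 2], 'b': [True, False]} and ind=2: keys are sorted then
+        # reversed to ('b', 'a') with sizes (2, 2); divmod(2, 2) -> (1, 0)
+        # selects b=True, then divmod(1, 2) -> (0, 1) selects a=2, matching
+        # list(grid)[2] == {'a': 2, 'b': True}.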
+        for sub_grid in self.param_grid:
+            # XXX: could memoize information used here
+            if not sub_grid:
+                if ind == 0:
+                    return {}
+                else:
+                    ind -= 1
+                    continue
+
+            # Reverse so most frequent cycling parameter comes first
+            keys, values_lists = zip(*sorted(sub_grid.items())[::-1])
+            sizes = [len(v_list) for v_list in values_lists]
+            total = np.prod(sizes)
+
+            if ind >= total:
+                # Try the next grid
+                ind -= total
+            else:
+                out = {}
+                for key, v_list, n in zip(keys, values_lists, sizes):
+                    ind, offset = divmod(ind, n)
+                    out[key] = v_list[offset]
+                return out
+
+        raise IndexError("ParameterGrid index out of range")
+
+
+class ParameterSampler:
+    """Generator on parameters sampled from given distributions.
+
+    Non-deterministic iterable over random candidate combinations for hyper-
+    parameter search. If all parameters are presented as a list,
+    sampling without replacement is performed. If at least one parameter
+    is given as a distribution, sampling with replacement is used.
+    It is highly recommended to use continuous distributions for continuous
+    parameters.
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    param_distributions : dict
+        Dictionary with parameters names (`str`) as keys and distributions
+        or lists of parameters to try. Distributions must provide a ``rvs``
+        method for sampling (such as those from scipy.stats.distributions).
+        If a list is given, it is sampled uniformly.
+        If a list of dicts is given, first a dict is sampled uniformly, and
+        then a parameter is sampled using that dict as above.
+
+    n_iter : int
+        Number of parameter settings that are produced.
+
+    random_state : int, RandomState instance or None, default=None
+        Pseudo random number generator state used for random uniform sampling
+        from lists of possible values instead of scipy.stats distributions.
+        Pass an int for reproducible output across multiple
+        function calls.
+        See :term:`Glossary `.
+
+    Returns
+    -------
+    params : dict of str to any
+        **Yields** dictionaries mapping each estimator parameter to
+        a sampled value.
+
+    Examples
+    --------
+    >>> from sklearn.model_selection import ParameterSampler
+    >>> from scipy.stats.distributions import expon
+    >>> import numpy as np
+    >>> rng = np.random.RandomState(0)
+    >>> param_grid = {'a':[1, 2], 'b': expon()}
+    >>> param_list = list(ParameterSampler(param_grid, n_iter=4,
+    ...                                    random_state=rng))
+    >>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items())
+    ...                 for d in param_list]
+    >>> rounded_list == [{'b': 0.89856, 'a': 1},
+    ...                  {'b': 0.923223, 'a': 1},
+    ...                  {'b': 1.878964, 'a': 2},
+    ...
{'b': 1.038159, 'a': 2}] + True + """ + + def __init__(self, param_distributions, n_iter, *, random_state=None): + if not isinstance(param_distributions, (Mapping, Iterable)): + raise TypeError( + "Parameter distribution is not a dict or a list," + f" got: {param_distributions!r} of type " + f"{type(param_distributions).__name__}" + ) + + if isinstance(param_distributions, Mapping): + # wrap dictionary in a singleton list to support either dict + # or list of dicts + param_distributions = [param_distributions] + + for dist in param_distributions: + if not isinstance(dist, dict): + raise TypeError( + "Parameter distribution is not a dict ({!r})".format(dist) + ) + for key in dist: + if not isinstance(dist[key], Iterable) and not hasattr( + dist[key], "rvs" + ): + raise TypeError( + f"Parameter grid for parameter {key!r} is not iterable " + f"or a distribution (value={dist[key]})" + ) + self.n_iter = n_iter + self.random_state = random_state + self.param_distributions = param_distributions + + def _is_all_lists(self): + return all( + all(not hasattr(v, "rvs") for v in dist.values()) + for dist in self.param_distributions + ) + + def __iter__(self): + rng = check_random_state(self.random_state) + + # if all distributions are given as lists, we want to sample without + # replacement + if self._is_all_lists(): + # look up sampled parameter settings in parameter grid + param_grid = ParameterGrid(self.param_distributions) + grid_size = len(param_grid) + n_iter = self.n_iter + + if grid_size < n_iter: + warnings.warn( + "The total space of parameters %d is smaller " + "than n_iter=%d. Running %d iterations. For exhaustive " + "searches, use GridSearchCV." % (grid_size, self.n_iter, grid_size), + UserWarning, + ) + n_iter = grid_size + for i in sample_without_replacement(grid_size, n_iter, random_state=rng): + yield param_grid[i] + + else: + for _ in range(self.n_iter): + dist = rng.choice(self.param_distributions) + # Always sort the keys of a dictionary, for reproducibility + items = sorted(dist.items()) + params = dict() + for k, v in items: + if hasattr(v, "rvs"): + params[k] = v.rvs(random_state=rng) + else: + params[k] = v[rng.randint(len(v))] + yield params + + def __len__(self): + """Number of points that will be sampled.""" + if self._is_all_lists(): + grid_size = len(ParameterGrid(self.param_distributions)) + return min(self.n_iter, grid_size) + else: + return self.n_iter + + +def _check_refit(search_cv, attr): + if not search_cv.refit: + raise AttributeError( + f"This {type(search_cv).__name__} instance was initialized with " + f"`refit=False`. {attr} is available only after refitting on the best " + "parameters. You can refit an estimator manually using the " + "`best_params_` attribute" + ) + + +def _search_estimator_has(attr): + """Check if we can delegate a method to the underlying estimator. + + Calling a prediction method will only be available if `refit=True`. In + such case, we check first the fitted best estimator. If it is not + fitted, we check the unfitted estimator. + + Checking the unfitted estimator allows to use `hasattr` on the `SearchCV` + instance even before calling `fit`. 
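+
+    For instance (illustrative), `hasattr(search, "predict_proba")` is True
+    for an unfitted `GridSearchCV(LogisticRegression(), param_grid)` because
+    the unfitted sub-estimator already exposes `predict_proba`.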
+ """ + + def check(self): + _check_refit(self, attr) + if hasattr(self, "best_estimator_"): + # raise an AttributeError if `attr` does not exist + getattr(self.best_estimator_, attr) + return True + # raise an AttributeError if `attr` does not exist + getattr(self.estimator, attr) + return True + + return check + + +def _yield_masked_array_for_each_param(candidate_params): + """ + Yield a masked array for each candidate param. + + `candidate_params` is a sequence of params which were used in + a `GridSearchCV`. We use masked arrays for the results, as not + all params are necessarily present in each element of + `candidate_params`. For example, if using `GridSearchCV` with + a `SVC` model, then one might search over params like: + + - kernel=["rbf"], gamma=[0.1, 1] + - kernel=["poly"], degree=[1, 2] + + and then param `'gamma'` would not be present in entries of + `candidate_params` corresponding to `kernel='poly'`. + """ + n_candidates = len(candidate_params) + param_results = defaultdict(dict) + + for cand_idx, params in enumerate(candidate_params): + for name, value in params.items(): + param_results["param_%s" % name][cand_idx] = value + + for key, param_result in param_results.items(): + param_list = list(param_result.values()) + try: + arr = np.array(param_list) + except ValueError: + # This can happen when param_list contains lists of different + # lengths, for example: + # param_list=[[1], [2, 3]] + arr_dtype = np.dtype(object) + else: + # There are two cases when we don't use the automatically inferred + # dtype when creating the array and we use object instead: + # - string dtype + # - when array.ndim > 1, that means that param_list was something + # like a list of same-size sequences, which gets turned into a + # multi-dimensional array but we want a 1d array + arr_dtype = arr.dtype if arr.dtype.kind != "U" and arr.ndim == 1 else object + + # Use one MaskedArray and mask all the places where the param is not + # applicable for that candidate (which may not contain all the params). 
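+        # Illustrative example: for candidate_params
+        # [{'kernel': 'rbf', 'gamma': 0.1}, {'kernel': 'poly', 'degree': 2}],
+        # the 'param_gamma' array holds 0.1 unmasked at index 0 and stays
+        # masked at index 1, where gamma does not apply.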
+ ma = MaskedArray(np.empty(n_candidates, dtype=arr_dtype), mask=True) + for index, value in param_result.items(): + # Setting the value at an index unmasks that index + ma[index] = value + yield (key, ma) + + +class BaseSearchCV(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): + """Abstract base class for hyper parameter search with cross-validation.""" + + _parameter_constraints: dict = { + "estimator": [HasMethods(["fit"])], + "scoring": [ + StrOptions(set(get_scorer_names())), + callable, + list, + tuple, + dict, + None, + ], + "n_jobs": [numbers.Integral, None], + "refit": ["boolean", str, callable], + "cv": ["cv_object"], + "verbose": ["verbose"], + "pre_dispatch": [numbers.Integral, str], + "error_score": [StrOptions({"raise"}), numbers.Real], + "return_train_score": ["boolean"], + } + + @abstractmethod + def __init__( + self, + estimator, + *, + scoring=None, + n_jobs=None, + refit=True, + cv=None, + verbose=0, + pre_dispatch="2*n_jobs", + error_score=np.nan, + return_train_score=True, + ): + self.scoring = scoring + self.estimator = estimator + self.n_jobs = n_jobs + self.refit = refit + self.cv = cv + self.verbose = verbose + self.pre_dispatch = pre_dispatch + self.error_score = error_score + self.return_train_score = return_train_score + + @property + # TODO(1.8) remove this property + def _estimator_type(self): + return self.estimator._estimator_type + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + sub_estimator_tags = get_tags(self.estimator) + tags.estimator_type = sub_estimator_tags.estimator_type + tags.classifier_tags = deepcopy(sub_estimator_tags.classifier_tags) + tags.regressor_tags = deepcopy(sub_estimator_tags.regressor_tags) + # allows cross-validation to see 'precomputed' metrics + tags.input_tags.pairwise = get_tags(self.estimator).input_tags.pairwise + tags.array_api_support = get_tags(self.estimator).array_api_support + return tags + + def score(self, X, y=None, **params): + """Return the score on the given data, if the estimator has been refit. + + This uses the score defined by ``scoring`` where provided, and the + ``best_estimator_.score`` method otherwise. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples, n_output) \ + or (n_samples,), default=None + Target relative to X for classification or regression; + None for unsupervised learning. + + **params : dict + Parameters to be passed to the underlying scorer(s). + + .. versionadded:: 1.4 + Only available if `enable_metadata_routing=True`. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + score : float + The score defined by ``scoring`` if provided, and the + ``best_estimator_.score`` method otherwise. 
+ """ + _check_refit(self, "score") + check_is_fitted(self) + + _raise_for_params(params, self, "score") + + if _routing_enabled(): + score_params = process_routing(self, "score", **params).scorer["score"] + else: + score_params = dict() + + if self.scorer_ is None: + raise ValueError( + "No score function explicitly defined, " + "and the estimator doesn't provide one %s" % self.best_estimator_ + ) + if isinstance(self.scorer_, dict): + if self.multimetric_: + scorer = self.scorer_[self.refit] + else: + scorer = self.scorer_ + return scorer(self.best_estimator_, X, y, **score_params) + + # callable + score = self.scorer_(self.best_estimator_, X, y, **score_params) + if self.multimetric_: + score = score[self.refit] + return score + + @available_if(_search_estimator_has("score_samples")) + def score_samples(self, X): + """Call score_samples on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``score_samples``. + + .. versionadded:: 0.24 + + Parameters + ---------- + X : iterable + Data to predict on. Must fulfill input requirements + of the underlying estimator. + + Returns + ------- + y_score : ndarray of shape (n_samples,) + The ``best_estimator_.score_samples`` method. + """ + check_is_fitted(self) + return self.best_estimator_.score_samples(X) + + @available_if(_search_estimator_has("predict")) + def predict(self, X): + """Call predict on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``predict``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + The predicted labels or values for `X` based on the estimator with + the best found parameters. + """ + check_is_fitted(self) + return self.best_estimator_.predict(X) + + @available_if(_search_estimator_has("predict_proba")) + def predict_proba(self, X): + """Call predict_proba on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``predict_proba``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_classes) + Predicted class probabilities for `X` based on the estimator with + the best found parameters. The order of the classes corresponds + to that in the fitted attribute :term:`classes_`. + """ + check_is_fitted(self) + return self.best_estimator_.predict_proba(X) + + @available_if(_search_estimator_has("predict_log_proba")) + def predict_log_proba(self, X): + """Call predict_log_proba on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``predict_log_proba``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_classes) + Predicted class log-probabilities for `X` based on the estimator + with the best found parameters. The order of the classes + corresponds to that in the fitted attribute :term:`classes_`. 
+ """ + check_is_fitted(self) + return self.best_estimator_.predict_log_proba(X) + + @available_if(_search_estimator_has("decision_function")) + def decision_function(self, X): + """Call decision_function on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``decision_function``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + y_score : ndarray of shape (n_samples,) or (n_samples, n_classes) \ + or (n_samples, n_classes * (n_classes-1) / 2) + Result of the decision function for `X` based on the estimator with + the best found parameters. + """ + check_is_fitted(self) + return self.best_estimator_.decision_function(X) + + @available_if(_search_estimator_has("transform")) + def transform(self, X): + """Call transform on the estimator with the best found parameters. + + Only available if the underlying estimator supports ``transform`` and + ``refit=True``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) + `X` transformed in the new space based on the estimator with + the best found parameters. + """ + check_is_fitted(self) + return self.best_estimator_.transform(X) + + @available_if(_search_estimator_has("inverse_transform")) + def inverse_transform(self, X=None, Xt=None): + """Call inverse_transform on the estimator with the best found params. + + Only available if the underlying estimator implements + ``inverse_transform`` and ``refit=True``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Xt : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + .. deprecated:: 1.5 + `Xt` was deprecated in 1.5 and will be removed in 1.7. Use `X` instead. + + Returns + ------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Result of the `inverse_transform` function for `Xt` based on the + estimator with the best found parameters. + """ + X = _deprecate_Xt_in_inverse_transform(X, Xt) + check_is_fitted(self) + return self.best_estimator_.inverse_transform(X) + + @property + def n_features_in_(self): + """Number of features seen during :term:`fit`. + + Only available when `refit=True`. + """ + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the search estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute.".format( + self.__class__.__name__ + ) + ) from nfe + + return self.best_estimator_.n_features_in_ + + @property + def classes_(self): + """Class labels. + + Only available when `refit=True` and the estimator is a classifier. + """ + _search_estimator_has("classes_")(self) + return self.best_estimator_.classes_ + + def _run_search(self, evaluate_candidates): + """Repeatedly calls `evaluate_candidates` to conduct a search. 
+
+        This method, implemented in sub-classes, makes it possible to
+        customize the scheduling of evaluations: GridSearchCV and
+        RandomizedSearchCV schedule evaluations for their whole parameter
+        search space at once, but other more sequential approaches are also
+        possible: for instance, it is possible to iteratively schedule
+        evaluations for new regions of the parameter search space based on
+        previously collected evaluation results. This makes it possible to
+        implement Bayesian optimization or more generally sequential
+        model-based optimization by deriving from the BaseSearchCV abstract
+        base class. For example, Successive Halving is implemented by calling
+        `evaluate_candidates` multiple times (once per iteration of the SH
+        process), each time passing a different set of candidates with `X`
+        and `y` of increasing sizes.
+
+        Parameters
+        ----------
+        evaluate_candidates : callable
+            This callback accepts:
+                - a list of candidates, where each candidate is a dict of
+                  parameter settings.
+                - an optional `cv` parameter which can be used to e.g.
+                  evaluate candidates on different dataset splits, or
+                  evaluate candidates on subsampled data (as done in the
+                  SuccessiveHalving estimators). By default, the original `cv`
+                  parameter is used, and it is available as a private
+                  `_checked_cv_orig` attribute.
+                - an optional `more_results` dict. Each key will be added to
+                  the `cv_results_` attribute. Values should be lists of
+                  length `n_candidates`.
+
+            It returns a dict of all results so far, formatted like
+            ``cv_results_``.
+
+            Important note (relevant whether the default cv is used or not):
+            in randomized splitters, and unless the random_state parameter of
+            cv was set to an int, calling cv.split() multiple times will
+            yield different splits. Since cv.split() is called in
+            evaluate_candidates, this means that candidates will be evaluated
+            on different splits each time evaluate_candidates is called. This
+            might be a methodological issue depending on the search strategy
+            that you're implementing. To prevent randomized splitters from
+            being used, you may use _split._yields_constant_splits().
+
+        Examples
+        --------
+
+        ::
+
+            def _run_search(self, evaluate_candidates):
+                'Try C=0.1 only if C=1 is better than C=10'
+                all_results = evaluate_candidates([{'C': 1}, {'C': 10}])
+                score = all_results['mean_test_score']
+                if score[0] < score[1]:
+                    evaluate_candidates([{'C': 0.1}])
+        """
+        raise NotImplementedError("_run_search not implemented.")
+
+    def _check_refit_for_multimetric(self, scores):
+        """Check that `refit` is compatible with the computed `scores`."""
+        multimetric_refit_msg = (
+            "For multi-metric scoring, the parameter refit must be set to a "
+            "scorer key or a callable to refit an estimator with the best "
+            "parameter setting on the whole data and make the best_* "
+            "attributes available for that metric. If this is not needed, "
+            f"refit should be set to False explicitly. {self.refit!r} was "
+            "passed."
+        )
+
+        valid_refit_dict = isinstance(self.refit, str) and self.refit in scores
+
+        if (
+            self.refit is not False
+            and not valid_refit_dict
+            and not callable(self.refit)
+        ):
+            raise ValueError(multimetric_refit_msg)
+
+    @staticmethod
+    def _select_best_index(refit, refit_metric, results):
+        """Select the index of the best combination of hyperparameters."""
+        if callable(refit):
+            # If callable, refit is expected to return the index of the best
+            # parameter set.
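+            # A hypothetical example of such a callable, roughly equivalent
+            # to the default rank-based selection below:
+            #   refit=lambda results: int(np.argmax(results["mean_test_score"]))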
+            best_index = refit(results)
+            if not isinstance(best_index, numbers.Integral):
+                raise TypeError("best_index_ returned is not an integer")
+            if best_index < 0 or best_index >= len(results["params"]):
+                raise IndexError("best_index_ index out of range")
+        else:
+            best_index = results[f"rank_test_{refit_metric}"].argmin()
+        return best_index
+
+    def _get_scorers(self):
+        """Get the scorer(s) to be used.
+
+        This is used in ``fit`` and ``get_metadata_routing``.
+
+        Returns
+        -------
+        scorers, refit_metric
+        """
+        refit_metric = "score"
+
+        if callable(self.scoring):
+            scorers = self.scoring
+        elif self.scoring is None or isinstance(self.scoring, str):
+            scorers = check_scoring(self.estimator, self.scoring)
+        else:
+            scorers = _check_multimetric_scoring(self.estimator, self.scoring)
+            self._check_refit_for_multimetric(scorers)
+            refit_metric = self.refit
+            scorers = _MultimetricScorer(
+                scorers=scorers, raise_exc=(self.error_score == "raise")
+            )
+
+        return scorers, refit_metric
+
+    def _get_routed_params_for_fit(self, params):
+        """Get the parameters to be used for routing.
+
+        This is a method instead of a snippet in ``fit`` since it's used twice,
+        here in ``fit``, and in ``HalvingRandomSearchCV.fit``.
+        """
+        if _routing_enabled():
+            routed_params = process_routing(self, "fit", **params)
+        else:
+            params = params.copy()
+            groups = params.pop("groups", None)
+            routed_params = Bunch(
+                estimator=Bunch(fit=params),
+                splitter=Bunch(split={"groups": groups}),
+                scorer=Bunch(score={}),
+            )
+        return routed_params
+
+    @_fit_context(
+        # *SearchCV.estimator is not validated yet
+        prefer_skip_nested_validation=False
+    )
+    def fit(self, X, y=None, **params):
+        """Run fit with all sets of parameters.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features) or (n_samples, n_samples)
+            Training vectors, where `n_samples` is the number of samples and
+            `n_features` is the number of features. For precomputed kernel or
+            distance matrix, the expected shape of X is (n_samples, n_samples).
+
+        y : array-like of shape (n_samples, n_output) \
+                or (n_samples,), default=None
+            Target relative to X for classification or regression;
+            None for unsupervised learning.
+
+        **params : dict of str -> object
+            Parameters passed to the ``fit`` method of the estimator, the scorer,
+            and the CV splitter.
+
+            If a fit parameter is an array-like whose length is equal to
+            `n_samples`, then it will be split by cross-validation along with
+            `X` and `y`. For example, the :term:`sample_weight` parameter is
+            split because `len(sample_weight) = len(X)`. However, this behavior
+            does not apply to `groups`, which is passed to the splitter configured
+            via the `cv` parameter of the constructor. Thus, `groups` is used
+            *to perform the split* and determines which samples are
+            assigned to each side of a split.
+
+        Returns
+        -------
+        self : object
+            Instance of fitted estimator.
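+
+        Examples
+        --------
+        An illustrative sketch of a sample-aligned fit parameter (``w`` is a
+        hypothetical weight array with ``len(w) == len(X)``)::
+
+            search = GridSearchCV(SVC(), {"C": [1, 10]})
+            search.fit(X, y, sample_weight=w)  # w is split along with X and y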
+        """
+        estimator = self.estimator
+        scorers, refit_metric = self._get_scorers()
+
+        X, y = indexable(X, y)
+        params = _check_method_params(X, params=params)
+
+        routed_params = self._get_routed_params_for_fit(params)
+
+        cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator))
+        n_splits = cv_orig.get_n_splits(X, y, **routed_params.splitter.split)
+
+        base_estimator = clone(self.estimator)
+
+        parallel = Parallel(n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch)
+
+        fit_and_score_kwargs = dict(
+            scorer=scorers,
+            fit_params=routed_params.estimator.fit,
+            score_params=routed_params.scorer.score,
+            return_train_score=self.return_train_score,
+            return_n_test_samples=True,
+            return_times=True,
+            return_parameters=False,
+            error_score=self.error_score,
+            verbose=self.verbose,
+        )
+        results = {}
+        with parallel:
+            all_candidate_params = []
+            all_out = []
+            all_more_results = defaultdict(list)
+
+            def evaluate_candidates(candidate_params, cv=None, more_results=None):
+                cv = cv or cv_orig
+                candidate_params = list(candidate_params)
+                n_candidates = len(candidate_params)
+
+                if self.verbose > 0:
+                    print(
+                        "Fitting {0} folds for each of {1} candidates,"
+                        " totalling {2} fits".format(
+                            n_splits, n_candidates, n_candidates * n_splits
+                        )
+                    )
+
+                out = parallel(
+                    delayed(_fit_and_score)(
+                        clone(base_estimator),
+                        X,
+                        y,
+                        train=train,
+                        test=test,
+                        parameters=parameters,
+                        split_progress=(split_idx, n_splits),
+                        candidate_progress=(cand_idx, n_candidates),
+                        **fit_and_score_kwargs,
+                    )
+                    for (cand_idx, parameters), (split_idx, (train, test)) in product(
+                        enumerate(candidate_params),
+                        enumerate(cv.split(X, y, **routed_params.splitter.split)),
+                    )
+                )
+
+                if len(out) < 1:
+                    raise ValueError(
+                        "No fits were performed. "
+                        "Was the CV iterator empty? "
+                        "Were there no candidates?"
+                    )
+                elif len(out) != n_candidates * n_splits:
+                    raise ValueError(
+                        "cv.split and cv.get_n_splits returned "
+                        "inconsistent results. Expected {} "
+                        "splits, got {}".format(n_splits, len(out) // n_candidates)
+                    )
+
+                _warn_or_raise_about_fit_failures(out, self.error_score)
+
+                # For callable self.scoring, the return type is only known after
+                # calling. If the return type is a dictionary, the error scores
+                # can now be inserted with the correct key. The type checking
+                # of out will be done in `_insert_error_scores`.
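+                # E.g. (illustrative) a custom callable such as
+                #   def scoring(estimator, X, y):
+                #       return {"accuracy": ..., "f1": ...}
+                # only reveals its dict keys once called, so dict-shaped error
+                # scores for failed fits are patched in afterwards.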
+ if callable(self.scoring): + _insert_error_scores(out, self.error_score) + + all_candidate_params.extend(candidate_params) + all_out.extend(out) + + if more_results is not None: + for key, value in more_results.items(): + all_more_results[key].extend(value) + + nonlocal results + results = self._format_results( + all_candidate_params, n_splits, all_out, all_more_results + ) + + return results + + self._run_search(evaluate_candidates) + + # multimetric is determined here because in the case of a callable + # self.scoring the return type is only known after calling + first_test_score = all_out[0]["test_scores"] + self.multimetric_ = isinstance(first_test_score, dict) + + # check refit_metric now for a callable scorer that is multimetric + if callable(self.scoring) and self.multimetric_: + self._check_refit_for_multimetric(first_test_score) + refit_metric = self.refit + + # For multi-metric evaluation, store the best_index_, best_params_ and + # best_score_ iff refit is one of the scorer names + # In single metric evaluation, refit_metric is "score" + if self.refit or not self.multimetric_: + self.best_index_ = self._select_best_index( + self.refit, refit_metric, results + ) + if not callable(self.refit): + # With a non-custom callable, we can select the best score + # based on the best index + self.best_score_ = results[f"mean_test_{refit_metric}"][ + self.best_index_ + ] + self.best_params_ = results["params"][self.best_index_] + + if self.refit: + # here we clone the estimator as well as the parameters, since + # sometimes the parameters themselves might be estimators, e.g. + # when we search over different estimators in a pipeline. + # ref: https://github.com/scikit-learn/scikit-learn/pull/26786 + self.best_estimator_ = clone(base_estimator).set_params( + **clone(self.best_params_, safe=False) + ) + + refit_start_time = time.time() + if y is not None: + self.best_estimator_.fit(X, y, **routed_params.estimator.fit) + else: + self.best_estimator_.fit(X, **routed_params.estimator.fit) + refit_end_time = time.time() + self.refit_time_ = refit_end_time - refit_start_time + + if hasattr(self.best_estimator_, "feature_names_in_"): + self.feature_names_in_ = self.best_estimator_.feature_names_in_ + + # Store the only scorer not as a dict for single metric evaluation + if isinstance(scorers, _MultimetricScorer): + self.scorer_ = scorers._scorers + else: + self.scorer_ = scorers + + self.cv_results_ = results + self.n_splits_ = n_splits + + return self + + def _format_results(self, candidate_params, n_splits, out, more_results=None): + n_candidates = len(candidate_params) + out = _aggregate_score_dicts(out) + + results = dict(more_results or {}) + for key, val in results.items(): + # each value is a list (as per evaluate_candidate's convention) + # we convert it to an array for consistency with the other keys + results[key] = np.asarray(val) + + def _store(key_name, array, weights=None, splits=False, rank=False): + """A small helper to store the scores/times to the cv_results_""" + # When iterated first by splits, then by parameters + # We want `array` to have `n_candidates` rows and `n_splits` cols. 
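+            # `out` arrives candidate-major (candidates are the outer loop of
+            # the product in `evaluate_candidates`), e.g. with 2 candidates
+            # and 3 splits: [c0s0, c0s1, c0s2, c1s0, c1s1, c1s2], so the
+            # row-major reshape below yields one row per candidate.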
+ array = np.array(array, dtype=np.float64).reshape(n_candidates, n_splits) + if splits: + for split_idx in range(n_splits): + # Uses closure to alter the results + results["split%d_%s" % (split_idx, key_name)] = array[:, split_idx] + + array_means = np.average(array, axis=1, weights=weights) + results["mean_%s" % key_name] = array_means + + if key_name.startswith(("train_", "test_")) and np.any( + ~np.isfinite(array_means) + ): + warnings.warn( + ( + f"One or more of the {key_name.split('_')[0]} scores " + f"are non-finite: {array_means}" + ), + category=UserWarning, + ) + + # Weighted std is not directly available in numpy + array_stds = np.sqrt( + np.average( + (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights + ) + ) + results["std_%s" % key_name] = array_stds + + if rank: + # When the fit/scoring fails `array_means` contains NaNs, we + # will exclude them from the ranking process and consider them + # as tied with the worst performers. + if np.isnan(array_means).all(): + # All fit/scoring routines failed. + rank_result = np.ones_like(array_means, dtype=np.int32) + else: + min_array_means = np.nanmin(array_means) - 1 + array_means = np.nan_to_num(array_means, nan=min_array_means) + rank_result = rankdata(-array_means, method="min").astype( + np.int32, copy=False + ) + results["rank_%s" % key_name] = rank_result + + _store("fit_time", out["fit_time"]) + _store("score_time", out["score_time"]) + # Store a list of param dicts at the key 'params' + for param, ma in _yield_masked_array_for_each_param(candidate_params): + results[param] = ma + results["params"] = candidate_params + + test_scores_dict = _normalize_score_results(out["test_scores"]) + if self.return_train_score: + train_scores_dict = _normalize_score_results(out["train_scores"]) + + for scorer_name in test_scores_dict: + # Computed the (weighted) mean and std for test scores alone + _store( + "test_%s" % scorer_name, + test_scores_dict[scorer_name], + splits=True, + rank=True, + weights=None, + ) + if self.return_train_score: + _store( + "train_%s" % scorer_name, + train_scores_dict[scorer_name], + splits=True, + ) + + return results + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.4 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__) + router.add( + estimator=self.estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + + scorer, _ = self._get_scorers() + router.add( + scorer=scorer, + method_mapping=MethodMapping() + .add(caller="score", callee="score") + .add(caller="fit", callee="score"), + ) + router.add( + splitter=self.cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + return router + + def _sk_visual_block_(self): + if hasattr(self, "best_estimator_"): + key, estimator = "best_estimator_", self.best_estimator_ + else: + key, estimator = "estimator", self.estimator + + return _VisualBlock( + "parallel", + [estimator], + names=[f"{key}: {estimator.__class__.__name__}"], + name_details=[str(estimator)], + ) + + +class GridSearchCV(BaseSearchCV): + """Exhaustive search over specified parameter values for an estimator. + + Important members are fit, predict. + + GridSearchCV implements a "fit" and a "score" method. 
+ It also implements "score_samples", "predict", "predict_proba", + "decision_function", "transform" and "inverse_transform" if they are + implemented in the estimator used. + + The parameters of the estimator used to apply these methods are optimized + by cross-validated grid-search over a parameter grid. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``scoring`` must be passed. + + param_grid : dict or list of dictionaries + Dictionary with parameters names (`str`) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + scoring : str, callable, list, tuple or dict, default=None + Strategy to evaluate the performance of the cross-validated model on + the test set. + + If `scoring` represents a single score, one can use: + + - a single string (see :ref:`scoring_parameter`); + - a callable (see :ref:`scoring_callable`) that returns a single value. + + If `scoring` represents multiple scores, one can use: + + - a list or tuple of unique strings; + - a callable returning a dictionary where the keys are the metric + names and the values are the metric scores; + - a dictionary with metric names as keys and callables as values. + + See :ref:`multimetric_grid_search` for an example. + + n_jobs : int, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + + refit : bool, str, or callable, default=True + Refit an estimator using the best found parameters on the whole + dataset. + + For multiple metric evaluation, this needs to be a `str` denoting the + scorer that would be used to find the best parameters for refitting + the estimator at the end. + + Where there are considerations other than maximum score in + choosing a best estimator, ``refit`` can be set to a function which + returns the selected ``best_index_`` given ``cv_results_``. In that + case, the ``best_estimator_`` and ``best_params_`` will be set + according to the returned ``best_index_`` while the ``best_score_`` + attribute will not be available. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``GridSearchCV`` instance. + + Also for multiple metric evaluation, the attributes ``best_index_``, + ``best_score_`` and ``best_params_`` will only be available if + ``refit`` is set and all of them will be determined w.r.t this specific + scorer. + + See ``scoring`` parameter to know more about multiple metric + evaluation. + + See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_digits.py` + to see how to design a custom selection strategy using a callable + via `refit`. + + .. versionchanged:: 0.20 + Support for callable added. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. 
+ Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + verbose : int + Controls the verbosity: the higher, the more messages. + + - >1 : the computation time for each fold and parameter candidate is + displayed; + - >2 : the score is also displayed; + - >3 : the fold and candidate parameter indexes are also displayed + together with the starting time of the computation. + + pre_dispatch : int, or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately created and spawned. Use + this for lightweight and fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. If a numeric value is given, + FitFailedWarning is raised. This parameter does not affect the refit + step, which will always raise the error. + + return_train_score : bool, default=False + If ``False``, the ``cv_results_`` attribute will not include training + scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + + Attributes + ---------- + cv_results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas ``DataFrame``. 
+ + For instance the below given table + + +------------+-----------+------------+-----------------+---+---------+ + |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_t...| + +============+===========+============+=================+===+=========+ + | 'poly' | -- | 2 | 0.80 |...| 2 | + +------------+-----------+------------+-----------------+---+---------+ + | 'poly' | -- | 3 | 0.70 |...| 4 | + +------------+-----------+------------+-----------------+---+---------+ + | 'rbf' | 0.1 | -- | 0.80 |...| 3 | + +------------+-----------+------------+-----------------+---+---------+ + | 'rbf' | 0.2 | -- | 0.93 |...| 1 | + +------------+-----------+------------+-----------------+---+---------+ + + will be represented by a ``cv_results_`` dict of:: + + { + 'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'], + mask = [False False False False]...) + 'param_gamma': masked_array(data = [-- -- 0.1 0.2], + mask = [ True True False False]...), + 'param_degree': masked_array(data = [2.0 3.0 -- --], + mask = [False False True True]...), + 'split0_test_score' : [0.80, 0.70, 0.80, 0.93], + 'split1_test_score' : [0.82, 0.50, 0.70, 0.78], + 'mean_test_score' : [0.81, 0.60, 0.75, 0.85], + 'std_test_score' : [0.01, 0.10, 0.05, 0.08], + 'rank_test_score' : [2, 4, 3, 1], + 'split0_train_score' : [0.80, 0.92, 0.70, 0.93], + 'split1_train_score' : [0.82, 0.55, 0.70, 0.87], + 'mean_train_score' : [0.81, 0.74, 0.70, 0.90], + 'std_train_score' : [0.01, 0.19, 0.00, 0.03], + 'mean_fit_time' : [0.73, 0.63, 0.43, 0.49], + 'std_fit_time' : [0.01, 0.02, 0.01, 0.01], + 'mean_score_time' : [0.01, 0.06, 0.04, 0.04], + 'std_score_time' : [0.00, 0.00, 0.00, 0.01], + 'params' : [{'kernel': 'poly', 'degree': 2}, ...], + } + + NOTE + + The key ``'params'`` is used to store a list of parameter + settings dicts for all the parameter candidates. + + The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and + ``std_score_time`` are all in seconds. + + For multi-metric evaluation, the scores for all the scorers are + available in the ``cv_results_`` dict at the keys ending with that + scorer's name (``'_'``) instead of ``'_score'`` shown + above. ('split0_test_precision', 'mean_train_precision' etc.) + + best_estimator_ : estimator + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if ``refit=False``. + + See ``refit`` parameter for more information on allowed values. + + best_score_ : float + Mean cross-validated score of the best_estimator + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + This attribute is not available if ``refit`` is a function. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + best_index_ : int + The index (of the ``cv_results_`` arrays) which corresponds to the best + candidate parameter setting. + + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest + mean score (``search.best_score_``). + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + scorer_ : function or a dict + Scorer function used on the held out data to choose the best + parameters for the model. + + For multi-metric evaluation, this attribute holds the validated + ``scoring`` dict which maps the scorer key to the scorer callable. 
+ + n_splits_ : int + The number of cross-validation splits (folds/iterations). + + refit_time_ : float + Seconds used for refitting the best model on the whole dataset. + + This is present only if ``refit`` is not False. + + .. versionadded:: 0.20 + + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `n_features_in_` when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `feature_names_in_` when fit. + + .. versionadded:: 1.0 + + See Also + -------- + ParameterGrid : Generates all the combinations of a hyperparameter grid. + train_test_split : Utility function to split the data into a development + set usable for fitting a GridSearchCV instance and an evaluation set + for its final evaluation. + sklearn.metrics.make_scorer : Make a scorer from a performance metric or + loss function. + + Notes + ----- + The parameters selected are those that maximize the score of the left out + data, unless an explicit score is passed in which case it is used instead. + + If `n_jobs` was set to a value higher than one, the data is copied for each + point in the grid (and not `n_jobs` times). This is done for efficiency + reasons if individual jobs take very little time, but may raise errors if + the dataset is large and not enough memory is available. A workaround in + this case is to set `pre_dispatch`. Then, the memory is copied only + `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * + n_jobs`. + + Examples + -------- + >>> from sklearn import svm, datasets + >>> from sklearn.model_selection import GridSearchCV + >>> iris = datasets.load_iris() + >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} + >>> svc = svm.SVC() + >>> clf = GridSearchCV(svc, parameters) + >>> clf.fit(iris.data, iris.target) + GridSearchCV(estimator=SVC(), + param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')}) + >>> sorted(clf.cv_results_.keys()) + ['mean_fit_time', 'mean_score_time', 'mean_test_score',... + 'param_C', 'param_kernel', 'params',... + 'rank_test_score', 'split0_test_score',... + 'split2_test_score', ... 
+ 'std_fit_time', 'std_score_time', 'std_test_score'] + """ + + _parameter_constraints: dict = { + **BaseSearchCV._parameter_constraints, + "param_grid": [dict, list], + } + + def __init__( + self, + estimator, + param_grid, + *, + scoring=None, + n_jobs=None, + refit=True, + cv=None, + verbose=0, + pre_dispatch="2*n_jobs", + error_score=np.nan, + return_train_score=False, + ): + super().__init__( + estimator=estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + cv=cv, + verbose=verbose, + pre_dispatch=pre_dispatch, + error_score=error_score, + return_train_score=return_train_score, + ) + self.param_grid = param_grid + + def _run_search(self, evaluate_candidates): + """Search all candidates in param_grid""" + evaluate_candidates(ParameterGrid(self.param_grid)) + + +class RandomizedSearchCV(BaseSearchCV): + """Randomized search on hyper parameters. + + RandomizedSearchCV implements a "fit" and a "score" method. + It also implements "score_samples", "predict", "predict_proba", + "decision_function", "transform" and "inverse_transform" if they are + implemented in the estimator used. + + The parameters of the estimator used to apply these methods are optimized + by cross-validated search over parameter settings. + + In contrast to GridSearchCV, not all parameter values are tried out, but + rather a fixed number of parameter settings is sampled from the specified + distributions. The number of parameter settings that are tried is + given by n_iter. + + If all parameters are presented as a list, + sampling without replacement is performed. If at least one parameter + is given as a distribution, sampling with replacement is used. + It is highly recommended to use continuous distributions for continuous + parameters. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.14 + + Parameters + ---------- + estimator : estimator object + An object of that type is instantiated for each grid point. + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``scoring`` must be passed. + + param_distributions : dict or list of dicts + Dictionary with parameters names (`str`) as keys and distributions + or lists of parameters to try. Distributions must provide a ``rvs`` + method for sampling (such as those from scipy.stats.distributions). + If a list is given, it is sampled uniformly. + If a list of dicts is given, first a dict is sampled uniformly, and + then a parameter is sampled using that dict as above. + + n_iter : int, default=10 + Number of parameter settings that are sampled. n_iter trades + off runtime vs quality of the solution. + + scoring : str, callable, list, tuple or dict, default=None + Strategy to evaluate the performance of the cross-validated model on + the test set. + + If `scoring` represents a single score, one can use: + + - a single string (see :ref:`scoring_parameter`); + - a callable (see :ref:`scoring_callable`) that returns a single value. + + If `scoring` represents multiple scores, one can use: + + - a list or tuple of unique strings; + - a callable returning a dictionary where the keys are the metric + names and the values are the metric scores; + - a dictionary with metric names as keys and callables as values. + + See :ref:`multimetric_grid_search` for an example. + + If None, the estimator's score method is used. + + n_jobs : int, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 
+ ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + + refit : bool, str, or callable, default=True + Refit an estimator using the best found parameters on the whole + dataset. + + For multiple metric evaluation, this needs to be a `str` denoting the + scorer that would be used to find the best parameters for refitting + the estimator at the end. + + Where there are considerations other than maximum score in + choosing a best estimator, ``refit`` can be set to a function which + returns the selected ``best_index_`` given the ``cv_results_``. In that + case, the ``best_estimator_`` and ``best_params_`` will be set + according to the returned ``best_index_`` while the ``best_score_`` + attribute will not be available. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``RandomizedSearchCV`` instance. + + Also for multiple metric evaluation, the attributes ``best_index_``, + ``best_score_`` and ``best_params_`` will only be available if + ``refit`` is set and all of them will be determined w.r.t this specific + scorer. + + See ``scoring`` parameter to know more about multiple metric + evaluation. + + .. versionchanged:: 0.20 + Support for callable added. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + verbose : int + Controls the verbosity: the higher, the more messages. + + - >1 : the computation time for each fold and parameter candidate is + displayed; + - >2 : the score is also displayed; + - >3 : the fold and candidate parameter indexes are also displayed + together with the starting time of the computation. + + pre_dispatch : int, or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately created and spawned. Use + this for lightweight and fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for random uniform sampling + from lists of possible values instead of scipy.stats distributions. + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. 
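+
+        For instance, fixing ``random_state`` makes the sampled candidates
+        reproducible (an illustrative sketch, not part of the original
+        docstring; ``loguniform`` comes from ``scipy.stats``):
+
+        >>> from scipy.stats import loguniform
+        >>> from sklearn.model_selection import ParameterSampler
+        >>> space = {'C': loguniform(1e-3, 1e2)}
+        >>> first = list(ParameterSampler(space, n_iter=2, random_state=0))
+        >>> second = list(ParameterSampler(space, n_iter=2, random_state=0))
+        >>> first == second
+        True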
+ + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. If a numeric value is given, + FitFailedWarning is raised. This parameter does not affect the refit + step, which will always raise the error. + + return_train_score : bool, default=False + If ``False``, the ``cv_results_`` attribute will not include training + scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + + Attributes + ---------- + cv_results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas ``DataFrame``. + + For instance the below given table + + +--------------+-------------+-------------------+---+---------------+ + | param_kernel | param_gamma | split0_test_score |...|rank_test_score| + +==============+=============+===================+===+===============+ + | 'rbf' | 0.1 | 0.80 |...| 1 | + +--------------+-------------+-------------------+---+---------------+ + | 'rbf' | 0.2 | 0.84 |...| 3 | + +--------------+-------------+-------------------+---+---------------+ + | 'rbf' | 0.3 | 0.70 |...| 2 | + +--------------+-------------+-------------------+---+---------------+ + + will be represented by a ``cv_results_`` dict of:: + + { + 'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'], + mask = False), + 'param_gamma' : masked_array(data = [0.1 0.2 0.3], mask = False), + 'split0_test_score' : [0.80, 0.84, 0.70], + 'split1_test_score' : [0.82, 0.50, 0.70], + 'mean_test_score' : [0.81, 0.67, 0.70], + 'std_test_score' : [0.01, 0.24, 0.00], + 'rank_test_score' : [1, 3, 2], + 'split0_train_score' : [0.80, 0.92, 0.70], + 'split1_train_score' : [0.82, 0.55, 0.70], + 'mean_train_score' : [0.81, 0.74, 0.70], + 'std_train_score' : [0.01, 0.19, 0.00], + 'mean_fit_time' : [0.73, 0.63, 0.43], + 'std_fit_time' : [0.01, 0.02, 0.01], + 'mean_score_time' : [0.01, 0.06, 0.04], + 'std_score_time' : [0.00, 0.00, 0.00], + 'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...], + } + + NOTE + + The key ``'params'`` is used to store a list of parameter + settings dicts for all the parameter candidates. + + The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and + ``std_score_time`` are all in seconds. + + For multi-metric evaluation, the scores for all the scorers are + available in the ``cv_results_`` dict at the keys ending with that + scorer's name (``'_'``) instead of ``'_score'`` shown + above. ('split0_test_precision', 'mean_train_precision' etc.) + + best_estimator_ : estimator + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if ``refit=False``. + + For multi-metric evaluation, this attribute is present only if + ``refit`` is specified. + + See ``refit`` parameter for more information on allowed values. + + best_score_ : float + Mean cross-validated score of the best_estimator. + + For multi-metric evaluation, this is not available if ``refit`` is + ``False``. See ``refit`` parameter for more information. 
+ + This attribute is not available if ``refit`` is a function. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + + For multi-metric evaluation, this is not available if ``refit`` is + ``False``. See ``refit`` parameter for more information. + + best_index_ : int + The index (of the ``cv_results_`` arrays) which corresponds to the best + candidate parameter setting. + + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest + mean score (``search.best_score_``). + + For multi-metric evaluation, this is not available if ``refit`` is + ``False``. See ``refit`` parameter for more information. + + scorer_ : function or a dict + Scorer function used on the held out data to choose the best + parameters for the model. + + For multi-metric evaluation, this attribute holds the validated + ``scoring`` dict which maps the scorer key to the scorer callable. + + n_splits_ : int + The number of cross-validation splits (folds/iterations). + + refit_time_ : float + Seconds used for refitting the best model on the whole dataset. + + This is present only if ``refit`` is not False. + + .. versionadded:: 0.20 + + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `n_features_in_` when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `feature_names_in_` when fit. + + .. versionadded:: 1.0 + + See Also + -------- + GridSearchCV : Does exhaustive search over a grid of parameters. + ParameterSampler : A generator over parameter settings, constructed from + param_distributions. + + Notes + ----- + The parameters selected are those that maximize the score of the held-out + data, according to the scoring parameter. + + If `n_jobs` was set to a value higher than one, the data is copied for each + parameter setting(and not `n_jobs` times). This is done for efficiency + reasons if individual jobs take very little time, but may raise errors if + the dataset is large and not enough memory is available. A workaround in + this case is to set `pre_dispatch`. Then, the memory is copied only + `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * + n_jobs`. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import RandomizedSearchCV + >>> from scipy.stats import uniform + >>> iris = load_iris() + >>> logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200, + ... random_state=0) + >>> distributions = dict(C=uniform(loc=0, scale=4), + ... 
penalty=['l2', 'l1']) + >>> clf = RandomizedSearchCV(logistic, distributions, random_state=0) + >>> search = clf.fit(iris.data, iris.target) + >>> search.best_params_ + {'C': np.float64(2...), 'penalty': 'l1'} + """ + + _parameter_constraints: dict = { + **BaseSearchCV._parameter_constraints, + "param_distributions": [dict, list], + "n_iter": [Interval(numbers.Integral, 1, None, closed="left")], + "random_state": ["random_state"], + } + + def __init__( + self, + estimator, + param_distributions, + *, + n_iter=10, + scoring=None, + n_jobs=None, + refit=True, + cv=None, + verbose=0, + pre_dispatch="2*n_jobs", + random_state=None, + error_score=np.nan, + return_train_score=False, + ): + self.param_distributions = param_distributions + self.n_iter = n_iter + self.random_state = random_state + super().__init__( + estimator=estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + cv=cv, + verbose=verbose, + pre_dispatch=pre_dispatch, + error_score=error_score, + return_train_score=return_train_score, + ) + + def _run_search(self, evaluate_candidates): + """Search n_iter candidates from param_distributions""" + evaluate_candidates( + ParameterSampler( + self.param_distributions, self.n_iter, random_state=self.random_state + ) + ) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/_search_successive_halving.py b/.venv/Lib/site-packages/sklearn/model_selection/_search_successive_halving.py new file mode 100644 index 0000000000000000000000000000000000000000..662335d7140bb153e234d60c596997861873e877 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/_search_successive_halving.py @@ -0,0 +1,1063 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import abstractmethod +from math import ceil, floor, log +from numbers import Integral, Real + +import numpy as np + +from ..base import _fit_context, is_classifier +from ..metrics._scorer import get_scorer_names +from ..utils import resample +from ..utils._param_validation import Interval, StrOptions +from ..utils.multiclass import check_classification_targets +from ..utils.validation import _num_samples, validate_data +from . 
import ParameterGrid, ParameterSampler +from ._search import BaseSearchCV +from ._split import _yields_constant_splits, check_cv + +__all__ = ["HalvingGridSearchCV", "HalvingRandomSearchCV"] + + +class _SubsampleMetaSplitter: + """Splitter that subsamples a given fraction of the dataset""" + + def __init__(self, *, base_cv, fraction, subsample_test, random_state): + self.base_cv = base_cv + self.fraction = fraction + self.subsample_test = subsample_test + self.random_state = random_state + + def split(self, X, y, **kwargs): + for train_idx, test_idx in self.base_cv.split(X, y, **kwargs): + train_idx = resample( + train_idx, + replace=False, + random_state=self.random_state, + n_samples=int(self.fraction * len(train_idx)), + ) + if self.subsample_test: + test_idx = resample( + test_idx, + replace=False, + random_state=self.random_state, + n_samples=int(self.fraction * len(test_idx)), + ) + yield train_idx, test_idx + + +def _top_k(results, k, itr): + # Return the best candidates of a given iteration + iteration, mean_test_score, params = ( + np.asarray(a) + for a in (results["iter"], results["mean_test_score"], results["params"]) + ) + iter_indices = np.flatnonzero(iteration == itr) + scores = mean_test_score[iter_indices] + # argsort() places NaNs at the end of the array so we move NaNs to the + # front of the array so the last `k` items are the those with the + # highest scores. + sorted_indices = np.roll(np.argsort(scores), np.count_nonzero(np.isnan(scores))) + return np.array(params[iter_indices][sorted_indices[-k:]]) + + +class BaseSuccessiveHalving(BaseSearchCV): + """Implements successive halving. + + Ref: + Almost optimal exploration in multi-armed bandits, ICML 13 + Zohar Karnin, Tomer Koren, Oren Somekh + """ + + _parameter_constraints: dict = { + **BaseSearchCV._parameter_constraints, + # overwrite `scoring` since multi-metrics are not supported + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "random_state": ["random_state"], + "max_resources": [ + Interval(Integral, 0, None, closed="neither"), + StrOptions({"auto"}), + ], + "min_resources": [ + Interval(Integral, 0, None, closed="neither"), + StrOptions({"exhaust", "smallest"}), + ], + "resource": [str], + "factor": [Interval(Real, 0, None, closed="neither")], + "aggressive_elimination": ["boolean"], + } + _parameter_constraints.pop("pre_dispatch") # not used in this class + + def __init__( + self, + estimator, + *, + scoring=None, + n_jobs=None, + refit=True, + cv=5, + verbose=0, + random_state=None, + error_score=np.nan, + return_train_score=True, + max_resources="auto", + min_resources="exhaust", + resource="n_samples", + factor=3, + aggressive_elimination=False, + ): + super().__init__( + estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + cv=cv, + verbose=verbose, + error_score=error_score, + return_train_score=return_train_score, + ) + + self.random_state = random_state + self.max_resources = max_resources + self.resource = resource + self.factor = factor + self.min_resources = min_resources + self.aggressive_elimination = aggressive_elimination + + def _check_input_parameters(self, X, y, split_params): + # We need to enforce that successive calls to cv.split() yield the same + # splits: see https://github.com/scikit-learn/scikit-learn/issues/15149 + if not _yields_constant_splits(self._checked_cv_orig): + raise ValueError( + "The cv parameter must yield consistent folds across " + "calls to split(). Set its random_state to an int, or set " + "shuffle=False." 
+ ) + + if ( + self.resource != "n_samples" + and self.resource not in self.estimator.get_params() + ): + raise ValueError( + f"Cannot use resource={self.resource} which is not supported " + f"by estimator {self.estimator.__class__.__name__}" + ) + + if isinstance(self, HalvingRandomSearchCV): + if self.min_resources == self.n_candidates == "exhaust": + # for n_candidates=exhaust to work, we need to know what + # min_resources is. Similarly min_resources=exhaust needs to + # know the actual number of candidates. + raise ValueError( + "n_candidates and min_resources cannot be both set to 'exhaust'." + ) + + self.min_resources_ = self.min_resources + if self.min_resources_ in ("smallest", "exhaust"): + if self.resource == "n_samples": + n_splits = self._checked_cv_orig.get_n_splits(X, y, **split_params) + # please see https://gph.is/1KjihQe for a justification + magic_factor = 2 + self.min_resources_ = n_splits * magic_factor + if is_classifier(self.estimator): + y = validate_data(self, X="no_validation", y=y) + check_classification_targets(y) + n_classes = np.unique(y).shape[0] + self.min_resources_ *= n_classes + else: + self.min_resources_ = 1 + # if 'exhaust', min_resources_ might be set to a higher value later + # in _run_search + + self.max_resources_ = self.max_resources + if self.max_resources_ == "auto": + if not self.resource == "n_samples": + raise ValueError( + "resource can only be 'n_samples' when max_resources='auto'" + ) + self.max_resources_ = _num_samples(X) + + if self.min_resources_ > self.max_resources_: + raise ValueError( + f"min_resources_={self.min_resources_} is greater " + f"than max_resources_={self.max_resources_}." + ) + + if self.min_resources_ == 0: + raise ValueError( + f"min_resources_={self.min_resources_}: you might have passed " + "an empty dataset X." + ) + + @staticmethod + def _select_best_index(refit, refit_metric, results): + """Custom refit callable to return the index of the best candidate. + + We want the best candidate out of the last iteration. By default + BaseSearchCV would return the best candidate out of all iterations. + + Currently, we only support for a single metric thus `refit` and + `refit_metric` are not required. + """ + last_iter = np.max(results["iter"]) + last_iter_indices = np.flatnonzero(results["iter"] == last_iter) + + test_scores = results["mean_test_score"][last_iter_indices] + # If all scores are NaNs there is no way to pick between them, + # so we (arbitrarily) declare the zero'th entry the best one + if np.isnan(test_scores).all(): + best_idx = 0 + else: + best_idx = np.nanargmax(test_scores) + + return last_iter_indices[best_idx] + + @_fit_context( + # Halving*SearchCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None, **params): + """Run fit with all sets of parameters. + + Parameters + ---------- + + X : array-like, shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like, shape (n_samples,) or (n_samples, n_output), optional + Target relative to X for classification or regression; + None for unsupervised learning. + + **params : dict of string -> object + Parameters passed to the ``fit`` method of the estimator. + + Returns + ------- + self : object + Instance of fitted estimator. 
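+
+        For example (a minimal sketch, not from the original docstring; any
+        estimator and small grid would do, and the experimental flag must be
+        imported first):
+
+        >>> from sklearn.experimental import enable_halving_search_cv  # noqa
+        >>> from sklearn.model_selection import HalvingGridSearchCV
+        >>> from sklearn.datasets import load_iris
+        >>> from sklearn.svm import SVC
+        >>> X, y = load_iris(return_X_y=True)
+        >>> search = HalvingGridSearchCV(SVC(), {"C": [1, 10]}).fit(X, y)
+        >>> search.n_iterations_
+        1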
+ """ + self._checked_cv_orig = check_cv( + self.cv, y, classifier=is_classifier(self.estimator) + ) + + routed_params = self._get_routed_params_for_fit(params) + self._check_input_parameters( + X=X, y=y, split_params=routed_params.splitter.split + ) + + self._n_samples_orig = _num_samples(X) + + super().fit(X, y=y, **params) + + # Set best_score_: BaseSearchCV does not set it, as refit is a callable + self.best_score_ = self.cv_results_["mean_test_score"][self.best_index_] + + return self + + def _run_search(self, evaluate_candidates): + candidate_params = self._generate_candidate_params() + + if self.resource != "n_samples" and any( + self.resource in candidate for candidate in candidate_params + ): + # Can only check this now since we need the candidates list + raise ValueError( + f"Cannot use parameter {self.resource} as the resource since " + "it is part of the searched parameters." + ) + + # n_required_iterations is the number of iterations needed so that the + # last iterations evaluates less than `factor` candidates. + n_required_iterations = 1 + floor(log(len(candidate_params), self.factor)) + + if self.min_resources == "exhaust": + # To exhaust the resources, we want to start with the biggest + # min_resources possible so that the last (required) iteration + # uses as many resources as possible + last_iteration = n_required_iterations - 1 + self.min_resources_ = max( + self.min_resources_, + self.max_resources_ // self.factor**last_iteration, + ) + + # n_possible_iterations is the number of iterations that we can + # actually do starting from min_resources and without exceeding + # max_resources. Depending on max_resources and the number of + # candidates, this may be higher or smaller than + # n_required_iterations. + n_possible_iterations = 1 + floor( + log(self.max_resources_ // self.min_resources_, self.factor) + ) + + if self.aggressive_elimination: + n_iterations = n_required_iterations + else: + n_iterations = min(n_possible_iterations, n_required_iterations) + + if self.verbose: + print(f"n_iterations: {n_iterations}") + print(f"n_required_iterations: {n_required_iterations}") + print(f"n_possible_iterations: {n_possible_iterations}") + print(f"min_resources_: {self.min_resources_}") + print(f"max_resources_: {self.max_resources_}") + print(f"aggressive_elimination: {self.aggressive_elimination}") + print(f"factor: {self.factor}") + + self.n_resources_ = [] + self.n_candidates_ = [] + + for itr in range(n_iterations): + power = itr # default + if self.aggressive_elimination: + # this will set n_resources to the initial value (i.e. the + # value of n_resources at the first iteration) for as many + # iterations as needed (while candidates are being + # eliminated), and then go on as usual. 
+ power = max(0, itr - n_required_iterations + n_possible_iterations) + + n_resources = int(self.factor**power * self.min_resources_) + # guard, probably not needed + n_resources = min(n_resources, self.max_resources_) + self.n_resources_.append(n_resources) + + n_candidates = len(candidate_params) + self.n_candidates_.append(n_candidates) + + if self.verbose: + print("-" * 10) + print(f"iter: {itr}") + print(f"n_candidates: {n_candidates}") + print(f"n_resources: {n_resources}") + + if self.resource == "n_samples": + # subsampling will be done in cv.split() + cv = _SubsampleMetaSplitter( + base_cv=self._checked_cv_orig, + fraction=n_resources / self._n_samples_orig, + subsample_test=True, + random_state=self.random_state, + ) + + else: + # Need copy so that the n_resources of next iteration does + # not overwrite + candidate_params = [c.copy() for c in candidate_params] + for candidate in candidate_params: + candidate[self.resource] = n_resources + cv = self._checked_cv_orig + + more_results = { + "iter": [itr] * n_candidates, + "n_resources": [n_resources] * n_candidates, + } + + results = evaluate_candidates( + candidate_params, cv, more_results=more_results + ) + + n_candidates_to_keep = ceil(n_candidates / self.factor) + candidate_params = _top_k(results, n_candidates_to_keep, itr) + + self.n_remaining_candidates_ = len(candidate_params) + self.n_required_iterations_ = n_required_iterations + self.n_possible_iterations_ = n_possible_iterations + self.n_iterations_ = n_iterations + + @abstractmethod + def _generate_candidate_params(self): + pass + + +class HalvingGridSearchCV(BaseSuccessiveHalving): + """Search over specified parameter values with successive halving. + + The search strategy starts evaluating all the candidates with a small + amount of resources and iteratively selects the best candidates, using + more and more resources. + + Read more in the :ref:`User guide `. + + .. note:: + + This estimator is still **experimental** for now: the predictions + and the API might change without any deprecation cycle. To use it, + you need to explicitly import ``enable_halving_search_cv``:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> # now you can import normally from model_selection + >>> from sklearn.model_selection import HalvingGridSearchCV + + Parameters + ---------- + estimator : estimator object + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``scoring`` must be passed. + + param_grid : dict or list of dictionaries + Dictionary with parameters names (string) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + factor : int or float, default=3 + The 'halving' parameter, which determines the proportion of candidates + that are selected for each subsequent iteration. For example, + ``factor=3`` means that only one third of the candidates are selected. + + resource : ``'n_samples'`` or str, default='n_samples' + Defines the resource that increases with each iteration. By default, + the resource is the number of samples. It can also be set to any + parameter of the base estimator that accepts positive integer + values, e.g. 'n_iterations' or 'n_estimators' for a gradient + boosting estimator. 
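+        For instance (an illustrative sketch; ``clf`` and ``param_grid`` are
+        hypothetical names assumed to be defined elsewhere)::
+
+            HalvingGridSearchCV(clf, param_grid, resource='n_estimators',
+                                max_resources=100)
+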
In this case ``max_resources`` cannot be 'auto' + and must be set explicitly. + + max_resources : int, default='auto' + The maximum amount of resource that any candidate is allowed to use + for a given iteration. By default, this is set to ``n_samples`` when + ``resource='n_samples'`` (default), else an error is raised. + + min_resources : {'exhaust', 'smallest'} or int, default='exhaust' + The minimum amount of resource that any candidate is allowed to use + for a given iteration. Equivalently, this defines the amount of + resources `r0` that are allocated for each candidate at the first + iteration. + + - 'smallest' is a heuristic that sets `r0` to a small value: + + - ``n_splits * 2`` when ``resource='n_samples'`` for a regression problem + - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a + classification problem + - ``1`` when ``resource != 'n_samples'`` + + - 'exhaust' will set `r0` such that the **last** iteration uses as + much resources as possible. Namely, the last iteration will use the + highest value smaller than ``max_resources`` that is a multiple of + both ``min_resources`` and ``factor``. In general, using 'exhaust' + leads to a more accurate estimator, but is slightly more time + consuming. + + Note that the amount of resources used at each iteration is always a + multiple of ``min_resources``. + + aggressive_elimination : bool, default=False + This is only relevant in cases where there isn't enough resources to + reduce the remaining candidates to at most `factor` after the last + iteration. If ``True``, then the search process will 'replay' the + first iteration for as long as needed until the number of candidates + is small enough. This is ``False`` by default, which means that the + last iteration may evaluate more than ``factor`` candidates. See + :ref:`aggressive_elimination` for more details. + + cv : int, cross-validation generator or iterable, default=5 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. note:: + Due to implementation details, the folds produced by `cv` must be + the same across multiple calls to `cv.split()`. For + built-in `scikit-learn` iterators, this can be achieved by + deactivating shuffling (`shuffle=False`), or by setting the + `cv`'s `random_state` parameter to an integer. + + scoring : str, callable, or None, default=None + A single string (see :ref:`scoring_parameter`) or a callable + (see :ref:`scoring_callable`) to evaluate the predictions on the test set. + If None, the estimator's score method is used. + + refit : bool, default=True + If True, refit an estimator using the best found parameters on the + whole dataset. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``HalvingGridSearchCV`` instance. + + error_score : 'raise' or numeric + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. 
If a numeric value is given,
+        FitFailedWarning is raised. This parameter does not affect the refit
+        step, which will always raise the error. Default is ``np.nan``.
+
+    return_train_score : bool, default=False
+        If ``False``, the ``cv_results_`` attribute will not include training
+        scores.
+        Computing training scores is used to get insights on how different
+        parameter settings impact the overfitting/underfitting trade-off.
+        However computing the scores on the training set can be computationally
+        expensive and is not strictly required to select the parameters that
+        yield the best generalization performance.
+
+    random_state : int, RandomState instance or None, default=None
+        Pseudo random number generator state used for subsampling the dataset
+        when `resource != 'n_samples'`. Ignored otherwise.
+        Pass an int for reproducible output across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    n_jobs : int or None, default=None
+        Number of jobs to run in parallel.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    verbose : int
+        Controls the verbosity: the higher, the more messages.
+
+    Attributes
+    ----------
+    n_resources_ : list of int
+        The amount of resources used at each iteration.
+
+    n_candidates_ : list of int
+        The number of candidate parameters that were evaluated at each
+        iteration.
+
+    n_remaining_candidates_ : int
+        The number of candidate parameters that are left after the last
+        iteration. It corresponds to `ceil(n_candidates[-1] / factor)`.
+
+    max_resources_ : int
+        The maximum number of resources that any candidate is allowed to use
+        for a given iteration. Note that since the number of resources used
+        at each iteration must be a multiple of ``min_resources_``, the
+        actual number of resources used at the last iteration may be smaller
+        than ``max_resources_``.
+
+    min_resources_ : int
+        The amount of resources that are allocated for each candidate at the
+        first iteration.
+
+    n_iterations_ : int
+        The actual number of iterations that were run. This is equal to
+        ``n_required_iterations_`` if ``aggressive_elimination`` is ``True``.
+        Else, this is equal to ``min(n_possible_iterations_,
+        n_required_iterations_)``.
+
+    n_possible_iterations_ : int
+        The number of iterations that are possible starting with
+        ``min_resources_`` resources and without exceeding
+        ``max_resources_``.
+
+    n_required_iterations_ : int
+        The number of iterations that are required to end up with less than
+        ``factor`` candidates at the last iteration, starting with
+        ``min_resources_`` resources. This will be greater than
+        ``n_possible_iterations_`` when there aren't enough resources.
+
+    cv_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns, that can be
+        imported into a pandas ``DataFrame``. It contains lots of information
+        for analysing the results of a search.
+        Please refer to the :ref:`User guide <successive_halving_cv_results>`
+        for details.
+
+    best_estimator_ : estimator or dict
+        Estimator that was chosen by the search, i.e. estimator
+        which gave highest score (or smallest loss if specified)
+        on the left out data. Not available if ``refit=False``.
+
+    best_score_ : float
+        Mean cross-validated score of the best_estimator.
+
+    best_params_ : dict
+        Parameter setting that gave the best results on the hold out data.
+
+    best_index_ : int
+        The index (of the ``cv_results_`` arrays) which corresponds to the best
+        candidate parameter setting.
+ + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest + mean score (``search.best_score_``). + + scorer_ : function or a dict + Scorer function used on the held out data to choose the best + parameters for the model. + + n_splits_ : int + The number of cross-validation splits (folds/iterations). + + refit_time_ : float + Seconds used for refitting the best model on the whole dataset. + + This is present only if ``refit`` is not False. + + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `n_features_in_` when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `feature_names_in_` when fit. + + .. versionadded:: 1.0 + + See Also + -------- + :class:`HalvingRandomSearchCV`: + Random search over a set of parameters using successive halving. + + Notes + ----- + The parameters selected are those that maximize the score of the held-out + data, according to the scoring parameter. + + All parameter combinations scored with a NaN will share the lowest rank. + + Examples + -------- + + >>> from sklearn.datasets import load_iris + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> from sklearn.model_selection import HalvingGridSearchCV + ... + >>> X, y = load_iris(return_X_y=True) + >>> clf = RandomForestClassifier(random_state=0) + ... + >>> param_grid = {"max_depth": [3, None], + ... "min_samples_split": [5, 10]} + >>> search = HalvingGridSearchCV(clf, param_grid, resource='n_estimators', + ... max_resources=10, + ... random_state=0).fit(X, y) + >>> search.best_params_ # doctest: +SKIP + {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9} + """ + + _parameter_constraints: dict = { + **BaseSuccessiveHalving._parameter_constraints, + "param_grid": [dict, list], + } + + def __init__( + self, + estimator, + param_grid, + *, + factor=3, + resource="n_samples", + max_resources="auto", + min_resources="exhaust", + aggressive_elimination=False, + cv=5, + scoring=None, + refit=True, + error_score=np.nan, + return_train_score=True, + random_state=None, + n_jobs=None, + verbose=0, + ): + super().__init__( + estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + verbose=verbose, + cv=cv, + random_state=random_state, + error_score=error_score, + return_train_score=return_train_score, + max_resources=max_resources, + resource=resource, + factor=factor, + min_resources=min_resources, + aggressive_elimination=aggressive_elimination, + ) + self.param_grid = param_grid + + def _generate_candidate_params(self): + return ParameterGrid(self.param_grid) + + +class HalvingRandomSearchCV(BaseSuccessiveHalving): + """Randomized search on hyper parameters. 
+
+    The search strategy starts evaluating all the candidates with a small
+    amount of resources and iteratively selects the best candidates, using more
+    and more resources.
+
+    The candidates are sampled at random from the parameter space and the
+    number of sampled candidates is determined by ``n_candidates``.
+
+    Read more in the :ref:`User guide <successive_halving_user_guide>`.
+
+    .. note::
+
+        This estimator is still **experimental** for now: the predictions
+        and the API might change without any deprecation cycle. To use it,
+        you need to explicitly import ``enable_halving_search_cv``::
+
+            >>> # explicitly require this experimental feature
+            >>> from sklearn.experimental import enable_halving_search_cv  # noqa
+            >>> # now you can import normally from model_selection
+            >>> from sklearn.model_selection import HalvingRandomSearchCV
+
+    Parameters
+    ----------
+    estimator : estimator object
+        This is assumed to implement the scikit-learn estimator interface.
+        Either estimator needs to provide a ``score`` function,
+        or ``scoring`` must be passed.
+
+    param_distributions : dict or list of dicts
+        Dictionary with parameter names (`str`) as keys and distributions
+        or lists of parameters to try. Distributions must provide a ``rvs``
+        method for sampling (such as those from scipy.stats.distributions).
+        If a list is given, it is sampled uniformly.
+        If a list of dicts is given, first a dict is sampled uniformly, and
+        then a parameter is sampled using that dict as above.
+
+    n_candidates : "exhaust" or int, default="exhaust"
+        The number of candidate parameters to sample, at the first
+        iteration. Using 'exhaust' will sample enough candidates so that the
+        last iteration uses as many resources as possible, based on
+        `min_resources`, `max_resources` and `factor`. In this case,
+        `min_resources` cannot be 'exhaust'.
+
+    factor : int or float, default=3
+        The 'halving' parameter, which determines the proportion of candidates
+        that are selected for each subsequent iteration. For example,
+        ``factor=3`` means that only one third of the candidates are selected.
+
+    resource : ``'n_samples'`` or str, default='n_samples'
+        Defines the resource that increases with each iteration. By default,
+        the resource is the number of samples. It can also be set to any
+        parameter of the base estimator that accepts positive integer
+        values, e.g. 'n_iterations' or 'n_estimators' for a gradient
+        boosting estimator. In this case ``max_resources`` cannot be 'auto'
+        and must be set explicitly.
+
+    max_resources : int, default='auto'
+        The maximum number of resources that any candidate is allowed to use
+        for a given iteration. By default, this is set to ``n_samples`` when
+        ``resource='n_samples'`` (default), else an error is raised.
+
+    min_resources : {'exhaust', 'smallest'} or int, default='smallest'
+        The minimum amount of resource that any candidate is allowed to use
+        for a given iteration. Equivalently, this defines the amount of
+        resources `r0` that are allocated for each candidate at the first
+        iteration.
+
+        - 'smallest' is a heuristic that sets `r0` to a small value:
+
+          - ``n_splits * 2`` when ``resource='n_samples'`` for a regression problem
+          - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a
+            classification problem
+          - ``1`` when ``resource != 'n_samples'``
+
+        - 'exhaust' will set `r0` such that the **last** iteration uses as
+          many resources as possible. Namely, the last iteration will use the
+          highest value smaller than ``max_resources`` that is a multiple of
+          both ``min_resources`` and ``factor``.
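+          For instance (an illustrative sketch): with 9 candidates,
+          ``factor=3`` and ``max_resources=1000``, 'exhaust' would set
+          ``r0 = 1000 // 3**2 = 111``, so the third and last iteration
+          can use ``111 * 9 = 999`` samples.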
In general, using 'exhaust' + leads to a more accurate estimator, but is slightly more time + consuming. 'exhaust' isn't available when `n_candidates='exhaust'`. + + Note that the amount of resources used at each iteration is always a + multiple of ``min_resources``. + + aggressive_elimination : bool, default=False + This is only relevant in cases where there isn't enough resources to + reduce the remaining candidates to at most `factor` after the last + iteration. If ``True``, then the search process will 'replay' the + first iteration for as long as needed until the number of candidates + is small enough. This is ``False`` by default, which means that the + last iteration may evaluate more than ``factor`` candidates. See + :ref:`aggressive_elimination` for more details. + + cv : int, cross-validation generator or an iterable, default=5 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. note:: + Due to implementation details, the folds produced by `cv` must be + the same across multiple calls to `cv.split()`. For + built-in `scikit-learn` iterators, this can be achieved by + deactivating shuffling (`shuffle=False`), or by setting the + `cv`'s `random_state` parameter to an integer. + + scoring : str, callable, or None, default=None + A single string (see :ref:`scoring_parameter`) or a callable + (see :ref:`scoring_callable`) to evaluate the predictions on the test set. + If None, the estimator's score method is used. + + refit : bool, default=True + If True, refit an estimator using the best found parameters on the + whole dataset. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``HalvingRandomSearchCV`` instance. + + error_score : 'raise' or numeric + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. If a numeric value is given, + FitFailedWarning is raised. This parameter does not affect the refit + step, which will always raise the error. Default is ``np.nan``. + + return_train_score : bool, default=False + If ``False``, the ``cv_results_`` attribute will not include training + scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for subsampling the dataset + when `resources != 'n_samples'`. Also used for random uniform + sampling from lists of possible values instead of scipy.stats + distributions. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + n_jobs : int or None, default=None + Number of jobs to run in parallel. 
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    verbose : int
+        Controls the verbosity: the higher, the more messages.
+
+    Attributes
+    ----------
+    n_resources_ : list of int
+        The amount of resources used at each iteration.
+
+    n_candidates_ : list of int
+        The number of candidate parameters that were evaluated at each
+        iteration.
+
+    n_remaining_candidates_ : int
+        The number of candidate parameters that are left after the last
+        iteration. It corresponds to `ceil(n_candidates[-1] / factor)`.
+
+    max_resources_ : int
+        The maximum number of resources that any candidate is allowed to use
+        for a given iteration. Note that since the number of resources used at
+        each iteration must be a multiple of ``min_resources_``, the actual
+        number of resources used at the last iteration may be smaller than
+        ``max_resources_``.
+
+    min_resources_ : int
+        The amount of resources that are allocated for each candidate at the
+        first iteration.
+
+    n_iterations_ : int
+        The actual number of iterations that were run. This is equal to
+        ``n_required_iterations_`` if ``aggressive_elimination`` is ``True``.
+        Else, this is equal to ``min(n_possible_iterations_,
+        n_required_iterations_)``.
+
+    n_possible_iterations_ : int
+        The number of iterations that are possible starting with
+        ``min_resources_`` resources and without exceeding
+        ``max_resources_``.
+
+    n_required_iterations_ : int
+        The number of iterations that are required to end up with less than
+        ``factor`` candidates at the last iteration, starting with
+        ``min_resources_`` resources. This will be greater than
+        ``n_possible_iterations_`` when there aren't enough resources.
+
+    cv_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns, that can be
+        imported into a pandas ``DataFrame``. It contains lots of information
+        for analysing the results of a search.
+        Please refer to the :ref:`User guide <successive_halving_cv_results>`
+        for details.
+
+    best_estimator_ : estimator or dict
+        Estimator that was chosen by the search, i.e. estimator
+        which gave highest score (or smallest loss if specified)
+        on the left out data. Not available if ``refit=False``.
+
+    best_score_ : float
+        Mean cross-validated score of the best_estimator.
+
+    best_params_ : dict
+        Parameter setting that gave the best results on the hold out data.
+
+    best_index_ : int
+        The index (of the ``cv_results_`` arrays) which corresponds to the best
+        candidate parameter setting.
+
+        The dict at ``search.cv_results_['params'][search.best_index_]`` gives
+        the parameter setting for the best model, that gives the highest
+        mean score (``search.best_score_``).
+
+    scorer_ : function or a dict
+        Scorer function used on the held out data to choose the best
+        parameters for the model.
+
+    n_splits_ : int
+        The number of cross-validation splits (folds/iterations).
+
+    refit_time_ : float
+        Seconds used for refitting the best model on the whole dataset.
+
+        This is present only if ``refit`` is not False.
+
+    multimetric_ : bool
+        Whether or not the scorers compute several metrics.
+
+    classes_ : ndarray of shape (n_classes,)
+        The class labels. This is present only if ``refit`` is specified and
+        the underlying estimator is a classifier.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `n_features_in_` when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `feature_names_in_` when fit. + + .. versionadded:: 1.0 + + See Also + -------- + :class:`HalvingGridSearchCV`: + Search over a grid of parameters using successive halving. + + Notes + ----- + The parameters selected are those that maximize the score of the held-out + data, according to the scoring parameter. + + All parameter combinations scored with a NaN will share the lowest rank. + + Examples + -------- + + >>> from sklearn.datasets import load_iris + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> from sklearn.model_selection import HalvingRandomSearchCV + >>> from scipy.stats import randint + >>> import numpy as np + ... + >>> X, y = load_iris(return_X_y=True) + >>> clf = RandomForestClassifier(random_state=0) + >>> np.random.seed(0) + ... + >>> param_distributions = {"max_depth": [3, None], + ... "min_samples_split": randint(2, 11)} + >>> search = HalvingRandomSearchCV(clf, param_distributions, + ... resource='n_estimators', + ... max_resources=10, + ... random_state=0).fit(X, y) + >>> search.best_params_ # doctest: +SKIP + {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9} + """ + + _parameter_constraints: dict = { + **BaseSuccessiveHalving._parameter_constraints, + "param_distributions": [dict, list], + "n_candidates": [ + Interval(Integral, 0, None, closed="neither"), + StrOptions({"exhaust"}), + ], + } + + def __init__( + self, + estimator, + param_distributions, + *, + n_candidates="exhaust", + factor=3, + resource="n_samples", + max_resources="auto", + min_resources="smallest", + aggressive_elimination=False, + cv=5, + scoring=None, + refit=True, + error_score=np.nan, + return_train_score=True, + random_state=None, + n_jobs=None, + verbose=0, + ): + super().__init__( + estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + verbose=verbose, + cv=cv, + random_state=random_state, + error_score=error_score, + return_train_score=return_train_score, + max_resources=max_resources, + resource=resource, + factor=factor, + min_resources=min_resources, + aggressive_elimination=aggressive_elimination, + ) + self.param_distributions = param_distributions + self.n_candidates = n_candidates + + def _generate_candidate_params(self): + n_candidates_first_iter = self.n_candidates + if n_candidates_first_iter == "exhaust": + # This will generate enough candidate so that the last iteration + # uses as much resources as possible + n_candidates_first_iter = self.max_resources_ // self.min_resources_ + return ParameterSampler( + self.param_distributions, + n_candidates_first_iter, + random_state=self.random_state, + ) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/_split.py b/.venv/Lib/site-packages/sklearn/model_selection/_split.py new file mode 100644 index 0000000000000000000000000000000000000000..2acbef99007e91e1724bc8712c9e6aa59aa956b1 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/_split.py @@ -0,0 +1,2987 @@ +""" +The :mod:`sklearn.model_selection._split` module includes classes and +functions 
to split the data based on a preset strategy. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import warnings +from abc import ABCMeta, abstractmethod +from collections import defaultdict +from collections.abc import Iterable +from inspect import signature +from itertools import chain, combinations +from math import ceil, floor + +import numpy as np +from scipy.special import comb + +from ..utils import ( + _safe_indexing, + check_random_state, + indexable, + metadata_routing, +) +from ..utils._array_api import ( + _convert_to_numpy, + ensure_common_namespace_device, + get_namespace, +) +from ..utils._param_validation import Interval, RealNotInt, validate_params +from ..utils.extmath import _approximate_mode +from ..utils.metadata_routing import _MetadataRequester +from ..utils.multiclass import type_of_target +from ..utils.validation import _num_samples, check_array, column_or_1d + +__all__ = [ + "BaseCrossValidator", + "KFold", + "GroupKFold", + "LeaveOneGroupOut", + "LeaveOneOut", + "LeavePGroupsOut", + "LeavePOut", + "RepeatedStratifiedKFold", + "RepeatedKFold", + "ShuffleSplit", + "GroupShuffleSplit", + "StratifiedKFold", + "StratifiedGroupKFold", + "StratifiedShuffleSplit", + "PredefinedSplit", + "train_test_split", + "check_cv", +] + + +class _UnsupportedGroupCVMixin: + """Mixin for splitters that do not support Groups.""" + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + return super().split(X, y, groups=groups) + + +class GroupsConsumerMixin(_MetadataRequester): + """A Mixin to ``groups`` by default. + + This Mixin makes the object to request ``groups`` by default as ``True``. + + .. versionadded:: 1.3 + """ + + __metadata_request__split = {"groups": True} + + +class BaseCrossValidator(_MetadataRequester, metaclass=ABCMeta): + """Base class for all cross-validators. + + Implementations must define `_iter_test_masks` or `_iter_test_indices`. + """ + + # This indicates that by default CV splitters don't have a "groups" kwarg, + # unless indicated by inheriting from ``GroupsConsumerMixin``. + # This also prevents ``set_split_request`` to be generated for splitters + # which don't support ``groups``. + __metadata_request__split = {"groups": metadata_routing.UNUSED} + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. 
+ + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + X, y, groups = indexable(X, y, groups) + indices = np.arange(_num_samples(X)) + for test_index in self._iter_test_masks(X, y, groups): + train_index = indices[np.logical_not(test_index)] + test_index = indices[test_index] + yield train_index, test_index + + # Since subclasses must implement either _iter_test_masks or + # _iter_test_indices, neither can be abstract. + def _iter_test_masks(self, X=None, y=None, groups=None): + """Generates boolean masks corresponding to test sets. + + By default, delegates to _iter_test_indices(X, y, groups) + """ + for test_index in self._iter_test_indices(X, y, groups): + test_mask = np.zeros(_num_samples(X), dtype=bool) + test_mask[test_index] = True + yield test_mask + + def _iter_test_indices(self, X=None, y=None, groups=None): + """Generates integer indices corresponding to test sets.""" + raise NotImplementedError + + @abstractmethod + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator.""" + + def __repr__(self): + return _build_repr(self) + + +class LeaveOneOut(_UnsupportedGroupCVMixin, BaseCrossValidator): + """Leave-One-Out cross-validator. + + Provides train/test indices to split data in train/test sets. Each + sample is used once as a test set (singleton) while the remaining + samples form the training set. + + Note: ``LeaveOneOut()`` is equivalent to ``KFold(n_splits=n)`` and + ``LeavePOut(p=1)`` where ``n`` is the number of samples. + + Due to the high number of test sets (which is the same as the + number of samples) this cross-validation method can be very costly. + For large datasets one should favor :class:`KFold`, :class:`ShuffleSplit` + or :class:`StratifiedKFold`. + + Read more in the :ref:`User Guide `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import LeaveOneOut + >>> X = np.array([[1, 2], [3, 4]]) + >>> y = np.array([1, 2]) + >>> loo = LeaveOneOut() + >>> loo.get_n_splits(X) + 2 + >>> print(loo) + LeaveOneOut() + >>> for i, (train_index, test_index) in enumerate(loo.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[1] + Test: index=[0] + Fold 1: + Train: index=[0] + Test: index=[1] + + See Also + -------- + LeaveOneGroupOut : For splitting the data according to explicit, + domain-specific stratification of the dataset. + GroupKFold : K-fold iterator variant with non-overlapping groups. + """ + + def _iter_test_indices(self, X, y=None, groups=None): + n_samples = _num_samples(X) + if n_samples <= 1: + raise ValueError( + "Cannot perform LeaveOneOut with n_samples={}.".format(n_samples) + ) + return range(n_samples) + + def get_n_splits(self, X, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. 
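+
+        For example (a small sketch):
+
+        >>> from sklearn.model_selection import LeaveOneOut
+        >>> LeaveOneOut().get_n_splits([[1], [2], [3]])
+        3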
+ """ + if X is None: + raise ValueError("The 'X' parameter should not be None.") + return _num_samples(X) + + +class LeavePOut(_UnsupportedGroupCVMixin, BaseCrossValidator): + """Leave-P-Out cross-validator. + + Provides train/test indices to split data in train/test sets. This results + in testing on all distinct samples of size p, while the remaining n - p + samples form the training set in each iteration. + + Note: ``LeavePOut(p)`` is NOT equivalent to + ``KFold(n_splits=n_samples // p)`` which creates non-overlapping test sets. + + Due to the high number of iterations which grows combinatorically with the + number of samples this cross-validation method can be very costly. For + large datasets one should favor :class:`KFold`, :class:`StratifiedKFold` + or :class:`ShuffleSplit`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + p : int + Size of the test sets. Must be strictly less than the number of + samples. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import LeavePOut + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + >>> y = np.array([1, 2, 3, 4]) + >>> lpo = LeavePOut(2) + >>> lpo.get_n_splits(X) + 6 + >>> print(lpo) + LeavePOut(p=2) + >>> for i, (train_index, test_index) in enumerate(lpo.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[2 3] + Test: index=[0 1] + Fold 1: + Train: index=[1 3] + Test: index=[0 2] + Fold 2: + Train: index=[1 2] + Test: index=[0 3] + Fold 3: + Train: index=[0 3] + Test: index=[1 2] + Fold 4: + Train: index=[0 2] + Test: index=[1 3] + Fold 5: + Train: index=[0 1] + Test: index=[2 3] + """ + + def __init__(self, p): + self.p = p + + def _iter_test_indices(self, X, y=None, groups=None): + n_samples = _num_samples(X) + if n_samples <= self.p: + raise ValueError( + "p={} must be strictly less than the number of samples={}".format( + self.p, n_samples + ) + ) + for combination in combinations(range(n_samples), self.p): + yield np.array(combination) + + def get_n_splits(self, X, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + """ + if X is None: + raise ValueError("The 'X' parameter should not be None.") + return int(comb(_num_samples(X), self.p, exact=True)) + + +class _BaseKFold(BaseCrossValidator, metaclass=ABCMeta): + """Base class for K-Fold cross-validators and TimeSeriesSplit.""" + + @abstractmethod + def __init__(self, n_splits, *, shuffle, random_state): + if not isinstance(n_splits, numbers.Integral): + raise ValueError( + "The number of folds must be of Integral type. " + "%s of type %s was passed." % (n_splits, type(n_splits)) + ) + n_splits = int(n_splits) + + if n_splits <= 1: + raise ValueError( + "k-fold cross-validation requires at least one" + " train/test split by setting n_splits=2 or more," + " got n_splits={0}.".format(n_splits) + ) + + if not isinstance(shuffle, bool): + raise TypeError("shuffle must be True or False; got {0}".format(shuffle)) + + if not shuffle and random_state is not None: # None is the default + raise ValueError( + ( + "Setting a random_state has no effect since shuffle is " + "False. 
You should leave " + "random_state to its default (None), or set shuffle=True." + ), + ) + + self.n_splits = n_splits + self.shuffle = shuffle + self.random_state = random_state + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + X, y, groups = indexable(X, y, groups) + n_samples = _num_samples(X) + if self.n_splits > n_samples: + raise ValueError( + ( + "Cannot have number of splits n_splits={0} greater" + " than the number of samples: n_samples={1}." + ).format(self.n_splits, n_samples) + ) + + for train, test in super().split(X, y, groups): + yield train, test + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return self.n_splits + + +class KFold(_UnsupportedGroupCVMixin, _BaseKFold): + """K-Fold cross-validator. + + Provides train/test indices to split data in train/test sets. Split + dataset into k consecutive folds (without shuffling by default). + + Each fold is then used once as a validation while the k - 1 remaining + folds form the training set. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + shuffle : bool, default=False + Whether to shuffle the data before splitting into batches. + Note that the samples within each split will not be shuffled. + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold. Otherwise, this + parameter has no effect. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import KFold + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([1, 2, 3, 4]) + >>> kf = KFold(n_splits=2) + >>> kf.get_n_splits(X) + 2 + >>> print(kf) + KFold(n_splits=2, random_state=None, shuffle=False) + >>> for i, (train_index, test_index) in enumerate(kf.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... 
print(f" Test: index={test_index}") + Fold 0: + Train: index=[2 3] + Test: index=[0 1] + Fold 1: + Train: index=[0 1] + Test: index=[2 3] + + Notes + ----- + The first ``n_samples % n_splits`` folds have size + ``n_samples // n_splits + 1``, other folds have size + ``n_samples // n_splits``, where ``n_samples`` is the number of samples. + + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + + See Also + -------- + StratifiedKFold : Takes class information into account to avoid building + folds with imbalanced class distributions (for binary or multiclass + classification tasks). + + GroupKFold : K-fold iterator variant with non-overlapping groups. + + RepeatedKFold : Repeats K-Fold n times. + """ + + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + + def _iter_test_indices(self, X, y=None, groups=None): + n_samples = _num_samples(X) + indices = np.arange(n_samples) + if self.shuffle: + check_random_state(self.random_state).shuffle(indices) + + n_splits = self.n_splits + fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int) + fold_sizes[: n_samples % n_splits] += 1 + current = 0 + for fold_size in fold_sizes: + start, stop = current, current + fold_size + yield indices[start:stop] + current = stop + + +class GroupKFold(GroupsConsumerMixin, _BaseKFold): + """K-fold iterator variant with non-overlapping groups. + + Each group will appear exactly once in the test set across all folds (the + number of distinct groups has to be at least equal to the number of folds). + + The folds are approximately balanced in the sense that the number of + samples is approximately the same in each test fold when `shuffle` is True. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + shuffle : bool, default=False + Whether to shuffle the groups before splitting into batches. + Note that the samples within each split will not be shuffled. + + .. versionadded:: 1.6 + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold. Otherwise, this + parameter has no effect. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + .. versionadded:: 1.6 + + Notes + ----- + Groups appear in an arbitrary order throughout the folds. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import GroupKFold + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]) + >>> y = np.array([1, 2, 3, 4, 5, 6]) + >>> groups = np.array([0, 0, 2, 2, 3, 3]) + >>> group_kfold = GroupKFold(n_splits=2) + >>> group_kfold.get_n_splits(X, y, groups) + 2 + >>> print(group_kfold) + GroupKFold(n_splits=2, random_state=None, shuffle=False) + >>> for i, (train_index, test_index) in enumerate(group_kfold.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}, group={groups[train_index]}") + ... 
print(f" Test: index={test_index}, group={groups[test_index]}") + Fold 0: + Train: index=[2 3], group=[2 2] + Test: index=[0 1 4 5], group=[0 0 3 3] + Fold 1: + Train: index=[0 1 4 5], group=[0 0 3 3] + Test: index=[2 3], group=[2 2] + + See Also + -------- + LeaveOneGroupOut : For splitting the data according to explicit + domain-specific stratification of the dataset. + + StratifiedKFold : Takes class information into account to avoid building + folds with imbalanced class proportions (for binary or multiclass + classification tasks). + """ + + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): + super().__init__(n_splits, shuffle=shuffle, random_state=random_state) + + def _iter_test_indices(self, X, y, groups): + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) + + unique_groups, group_idx = np.unique(groups, return_inverse=True) + n_groups = len(unique_groups) + + if self.n_splits > n_groups: + raise ValueError( + "Cannot have number of splits n_splits=%d greater" + " than the number of groups: %d." % (self.n_splits, n_groups) + ) + + if self.shuffle: + # Split and shuffle unique groups across n_splits + rng = check_random_state(self.random_state) + unique_groups = rng.permutation(unique_groups) + split_groups = np.array_split(unique_groups, self.n_splits) + + for test_group_ids in split_groups: + test_mask = np.isin(groups, test_group_ids) + yield np.where(test_mask)[0] + + else: + # Weight groups by their number of occurrences + n_samples_per_group = np.bincount(group_idx) + + # Distribute the most frequent groups first + indices = np.argsort(n_samples_per_group)[::-1] + n_samples_per_group = n_samples_per_group[indices] + + # Total weight of each fold + n_samples_per_fold = np.zeros(self.n_splits) + + # Mapping from group index to fold index + group_to_fold = np.zeros(len(unique_groups)) + + # Distribute samples by adding the largest weight to the lightest fold + for group_index, weight in enumerate(n_samples_per_group): + lightest_fold = np.argmin(n_samples_per_fold) + n_samples_per_fold[lightest_fold] += weight + group_to_fold[indices[group_index]] = lightest_fold + + indices = group_to_fold[group_idx] + + for f in range(self.n_splits): + yield np.where(indices == f)[0] + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + return super().split(X, y, groups) + + +class StratifiedKFold(_BaseKFold): + """Stratified K-Fold cross-validator. + + Provides train/test indices to split data in train/test sets. + + This cross-validation object is a variation of KFold that returns + stratified folds. The folds are made by preserving the percentage of + samples for each class. + + Read more in the :ref:`User Guide `. 
+ + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + shuffle : bool, default=False + Whether to shuffle each class's samples before splitting into batches. + Note that the samples within each split will not be shuffled. + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold for each class. + Otherwise, leave `random_state` as `None`. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import StratifiedKFold + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> skf = StratifiedKFold(n_splits=2) + >>> skf.get_n_splits(X, y) + 2 + >>> print(skf) + StratifiedKFold(n_splits=2, random_state=None, shuffle=False) + >>> for i, (train_index, test_index) in enumerate(skf.split(X, y)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[1 3] + Test: index=[0 2] + Fold 1: + Train: index=[0 2] + Test: index=[1 3] + + Notes + ----- + The implementation is designed to: + + * Generate test sets such that all contain the same distribution of + classes, or as close as possible. + * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to + ``y = [1, 0]`` should not change the indices generated. + * Preserve order dependencies in the dataset ordering, when + ``shuffle=False``: all samples from class k in some test set were + contiguous in y, or separated in y by samples from classes other than k. + * Generate test sets where the smallest and largest differ by at most one + sample. + + .. versionchanged:: 0.22 + The previous implementation did not follow the last constraint. + + See Also + -------- + RepeatedStratifiedKFold : Repeats Stratified K-Fold n times. + """ + + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + + def _make_test_folds(self, X, y=None): + rng = check_random_state(self.random_state) + # XXX: as of now, cross-validation splitters only operate in NumPy-land + # without attempting to leverage array API namespace features. However + # they might be fed by array API inputs, e.g. in CV-enabled estimators so + # we need the following explicit conversion: + xp, is_array_api = get_namespace(y) + if is_array_api: + y = _convert_to_numpy(y, xp) + else: + y = np.asarray(y) + type_of_target_y = type_of_target(y) + allowed_target_types = ("binary", "multiclass") + if type_of_target_y not in allowed_target_types: + raise ValueError( + "Supported target types are: {}. Got {!r} instead.".format( + allowed_target_types, type_of_target_y + ) + ) + + y = column_or_1d(y) + + _, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True) + # y_inv encodes y according to lexicographic order. We invert y_idx to + # map the classes so that they are encoded by order of appearance: + # 0 represents the first label appearing in y, 1 the second, etc. 
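+        # For example, with y = np.array(["b", "a", "b", "c"]):
+        #   y_idx == [1, 0, 3]     (first occurrence of "a", "b", "c" in y)
+        #   y_inv == [1, 0, 1, 2]  (y encoded in lexicographic order)
+        # so that below class_perm == [1, 0, 2] and y_encoded == [0, 1, 0, 2],
+        # i.e. "b" -> 0 because it appears first in y.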
+        _, class_perm = np.unique(y_idx, return_inverse=True)
+        y_encoded = class_perm[y_inv]
+
+        n_classes = len(y_idx)
+        y_counts = np.bincount(y_encoded)
+        min_groups = np.min(y_counts)
+        if np.all(self.n_splits > y_counts):
+            raise ValueError(
+                "n_splits=%d cannot be greater than the"
+                " number of members in each class." % (self.n_splits)
+            )
+        if self.n_splits > min_groups:
+            warnings.warn(
+                "The least populated class in y has only %d"
+                " members, which is less than n_splits=%d."
+                % (min_groups, self.n_splits),
+                UserWarning,
+            )
+
+        # Determine the optimal number of samples from each class in each fold,
+        # using round robin over the sorted y. (This can be done directly from
+        # counts, but that code is unreadable.)
+        y_order = np.sort(y_encoded)
+        allocation = np.asarray(
+            [
+                np.bincount(y_order[i :: self.n_splits], minlength=n_classes)
+                for i in range(self.n_splits)
+            ]
+        )
+
+        # To maintain the data order dependencies as best as possible within
+        # the stratification constraint, we assign samples from each class in
+        # blocks (and then mess that up when shuffle=True).
+        test_folds = np.empty(len(y), dtype="i")
+        for k in range(n_classes):
+            # since the kth column of allocation stores the number of samples
+            # of class k in each test set, this generates blocks of fold
+            # indices corresponding to the allocation for class k.
+            folds_for_class = np.arange(self.n_splits).repeat(allocation[:, k])
+            if self.shuffle:
+                rng.shuffle(folds_for_class)
+            test_folds[y_encoded == k] = folds_for_class
+        return test_folds
+
+    def _iter_test_masks(self, X, y=None, groups=None):
+        test_folds = self._make_test_folds(X, y)
+        for i in range(self.n_splits):
+            yield test_folds == i
+
+    def split(self, X, y, groups=None):
+        """Generate indices to split data into training and test set.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data, where `n_samples` is the number of samples
+            and `n_features` is the number of features.
+
+            Note that providing ``y`` is sufficient to generate the splits and
+            hence ``np.zeros(n_samples)`` may be used as a placeholder for
+            ``X`` instead of actual training data.
+
+        y : array-like of shape (n_samples,)
+            The target variable for supervised learning problems.
+            Stratification is done based on the y labels.
+
+        groups : object
+            Always ignored, exists for compatibility.
+
+        Yields
+        ------
+        train : ndarray
+            The training set indices for that split.
+
+        test : ndarray
+            The testing set indices for that split.
+
+        Notes
+        -----
+        Randomized CV splitters may return different results for each call of
+        split. You can make the results identical by setting `random_state`
+        to an integer.
+        """
+        if groups is not None:
+            warnings.warn(
+                f"The groups parameter is ignored by {self.__class__.__name__}",
+                UserWarning,
+            )
+        y = check_array(y, input_name="y", ensure_2d=False, dtype=None)
+        return super().split(X, y, groups)
+
+
+class StratifiedGroupKFold(GroupsConsumerMixin, _BaseKFold):
+    """Stratified K-Fold iterator variant with non-overlapping groups.
+
+    This cross-validation object is a variation of StratifiedKFold that
+    attempts to return stratified folds with non-overlapping groups. The folds
+    are made by preserving the percentage of samples for each class.
+
+    Each group will appear exactly once in the test set across all folds (the
+    number of distinct groups has to be at least equal to the number of folds).
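The round-robin allocation computed in ``_make_test_folds`` above can be replayed standalone; a minimal sketch with hypothetical class counts (NumPy only):

import numpy as np

n_splits, n_classes = 3, 2
y_encoded = np.array([0, 0, 0, 0, 0, 1, 1, 1])  # 5 samples of class 0, 3 of class 1

y_order = np.sort(y_encoded)
allocation = np.asarray(
    [np.bincount(y_order[i::n_splits], minlength=n_classes) for i in range(n_splits)]
)
print(allocation)
# [[2 1]
#  [2 1]
#  [1 1]]  -> test folds of sizes 3, 3, 2; per-class counts differ by at most one.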
+ + The difference between :class:`GroupKFold` + and `StratifiedGroupKFold` is that + the former attempts to create balanced folds such that the number of + distinct groups is approximately the same in each fold, whereas + `StratifiedGroupKFold` attempts to create folds which preserve the + percentage of samples for each class as much as possible given the + constraint of non-overlapping groups between splits. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + shuffle : bool, default=False + Whether to shuffle each class's samples before splitting into batches. + Note that the samples within each split will not be shuffled. + This implementation can only shuffle groups that have approximately the + same y distribution, no global shuffle will be performed. + + random_state : int or RandomState instance, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold for each class. + Otherwise, leave `random_state` as `None`. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import StratifiedGroupKFold + >>> X = np.ones((17, 2)) + >>> y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + >>> groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8]) + >>> sgkf = StratifiedGroupKFold(n_splits=3) + >>> sgkf.get_n_splits(X, y) + 3 + >>> print(sgkf) + StratifiedGroupKFold(n_splits=3, random_state=None, shuffle=False) + >>> for i, (train_index, test_index) in enumerate(sgkf.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" group={groups[train_index]}") + ... print(f" Test: index={test_index}") + ... print(f" group={groups[test_index]}") + Fold 0: + Train: index=[ 0 1 2 3 7 8 9 10 11 15 16] + group=[1 1 2 2 4 5 5 5 5 8 8] + Test: index=[ 4 5 6 12 13 14] + group=[3 3 3 6 6 7] + Fold 1: + Train: index=[ 4 5 6 7 8 9 10 11 12 13 14] + group=[3 3 3 4 5 5 5 5 6 6 7] + Test: index=[ 0 1 2 3 15 16] + group=[1 1 2 2 8 8] + Fold 2: + Train: index=[ 0 1 2 3 4 5 6 12 13 14 15 16] + group=[1 1 2 2 3 3 3 6 6 7 8 8] + Test: index=[ 7 8 9 10 11] + group=[4 5 5 5 5] + + Notes + ----- + The implementation is designed to: + + * Mimic the behavior of StratifiedKFold as much as possible for trivial + groups (e.g. when each group contains only one sample). + * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to + ``y = [1, 0]`` should not change the indices generated. + * Stratify based on samples as much as possible while keeping + non-overlapping groups constraint. That means that in some cases when + there is a small number of groups containing a large number of samples + the stratification will not be possible and the behavior will be close + to GroupKFold. + + See also + -------- + StratifiedKFold: Takes class information into account to build folds which + retain class distributions (for binary or multiclass classification + tasks). + + GroupKFold: K-fold iterator variant with non-overlapping groups. 
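A short sketch of the non-overlapping-groups guarantee described above, on hypothetical toy data (public API only):

import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

X = np.zeros((12, 1))
y = np.array([0] * 6 + [1] * 6)
groups = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])

for train_idx, test_idx in StratifiedGroupKFold(n_splits=3).split(X, y, groups):
    # A group never appears on both sides of a split.
    assert not set(groups[train_idx]) & set(groups[test_idx])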
+ """ + + def __init__(self, n_splits=5, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + + def _iter_test_indices(self, X, y, groups): + # Implementation is based on this kaggle kernel: + # https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation + # and is a subject to Apache 2.0 License. You may obtain a copy of the + # License at http://www.apache.org/licenses/LICENSE-2.0 + # Changelist: + # - Refactored function to a class following scikit-learn KFold + # interface. + # - Added heuristic for assigning group to the least populated fold in + # cases when all other criteria are equal + # - Swtch from using python ``Counter`` to ``np.unique`` to get class + # distribution + # - Added scikit-learn checks for input: checking that target is binary + # or multiclass, checking passed random state, checking that number + # of splits is less than number of members in each class, checking + # that least populated class has more members than there are splits. + rng = check_random_state(self.random_state) + y = np.asarray(y) + type_of_target_y = type_of_target(y) + allowed_target_types = ("binary", "multiclass") + if type_of_target_y not in allowed_target_types: + raise ValueError( + "Supported target types are: {}. Got {!r} instead.".format( + allowed_target_types, type_of_target_y + ) + ) + + y = column_or_1d(y) + _, y_inv, y_cnt = np.unique(y, return_inverse=True, return_counts=True) + if np.all(self.n_splits > y_cnt): + raise ValueError( + "n_splits=%d cannot be greater than the" + " number of members in each class." % (self.n_splits) + ) + n_smallest_class = np.min(y_cnt) + if self.n_splits > n_smallest_class: + warnings.warn( + "The least populated class in y has only %d" + " members, which is less than n_splits=%d." 
+ % (n_smallest_class, self.n_splits), + UserWarning, + ) + n_classes = len(y_cnt) + + _, groups_inv, groups_cnt = np.unique( + groups, return_inverse=True, return_counts=True + ) + y_counts_per_group = np.zeros((len(groups_cnt), n_classes)) + for class_idx, group_idx in zip(y_inv, groups_inv): + y_counts_per_group[group_idx, class_idx] += 1 + + y_counts_per_fold = np.zeros((self.n_splits, n_classes)) + groups_per_fold = defaultdict(set) + + if self.shuffle: + rng.shuffle(y_counts_per_group) + + # Stable sort to keep shuffled order for groups with the same + # class distribution variance + sorted_groups_idx = np.argsort( + -np.std(y_counts_per_group, axis=1), kind="mergesort" + ) + + for group_idx in sorted_groups_idx: + group_y_counts = y_counts_per_group[group_idx] + best_fold = self._find_best_fold( + y_counts_per_fold=y_counts_per_fold, + y_cnt=y_cnt, + group_y_counts=group_y_counts, + ) + y_counts_per_fold[best_fold] += group_y_counts + groups_per_fold[best_fold].add(group_idx) + + for i in range(self.n_splits): + test_indices = [ + idx + for idx, group_idx in enumerate(groups_inv) + if group_idx in groups_per_fold[i] + ] + yield test_indices + + def _find_best_fold(self, y_counts_per_fold, y_cnt, group_y_counts): + best_fold = None + min_eval = np.inf + min_samples_in_fold = np.inf + for i in range(self.n_splits): + y_counts_per_fold[i] += group_y_counts + # Summarise the distribution over classes in each proposed fold + std_per_class = np.std(y_counts_per_fold / y_cnt.reshape(1, -1), axis=0) + y_counts_per_fold[i] -= group_y_counts + fold_eval = np.mean(std_per_class) + samples_in_fold = np.sum(y_counts_per_fold[i]) + is_current_fold_better = ( + fold_eval < min_eval + or np.isclose(fold_eval, min_eval) + and samples_in_fold < min_samples_in_fold + ) + if is_current_fold_better: + min_eval = fold_eval + min_samples_in_fold = samples_in_fold + best_fold = i + return best_fold + + +class TimeSeriesSplit(_BaseKFold): + """Time Series cross-validator. + + Provides train/test indices to split time series data samples + that are observed at fixed time intervals, in train/test sets. + In each split, test indices must be higher than before, and thus shuffling + in cross validator is inappropriate. + + This cross-validation object is a variation of :class:`KFold`. + In the kth split, it returns first k folds as train set and the + (k+1)th fold as test set. + + Note that unlike standard cross-validation methods, successive + training sets are supersets of those that come before them. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + .. versionadded:: 0.18 + + Parameters + ---------- + n_splits : int, default=5 + Number of splits. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + max_train_size : int, default=None + Maximum size for a single training set. + + test_size : int, default=None + Used to limit the size of the test set. Defaults to + ``n_samples // (n_splits + 1)``, which is the maximum allowed value + with ``gap=0``. + + .. versionadded:: 0.24 + + gap : int, default=0 + Number of samples to exclude from the end of each train set before + the test set. + + .. 
versionadded:: 0.24 + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import TimeSeriesSplit + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([1, 2, 3, 4, 5, 6]) + >>> tscv = TimeSeriesSplit() + >>> print(tscv) + TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None) + >>> for i, (train_index, test_index) in enumerate(tscv.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[0] + Test: index=[1] + Fold 1: + Train: index=[0 1] + Test: index=[2] + Fold 2: + Train: index=[0 1 2] + Test: index=[3] + Fold 3: + Train: index=[0 1 2 3] + Test: index=[4] + Fold 4: + Train: index=[0 1 2 3 4] + Test: index=[5] + >>> # Fix test_size to 2 with 12 samples + >>> X = np.random.randn(12, 2) + >>> y = np.random.randint(0, 2, 12) + >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2) + >>> for i, (train_index, test_index) in enumerate(tscv.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[0 1 2 3 4 5] + Test: index=[6 7] + Fold 1: + Train: index=[0 1 2 3 4 5 6 7] + Test: index=[8 9] + Fold 2: + Train: index=[0 1 2 3 4 5 6 7 8 9] + Test: index=[10 11] + >>> # Add in a 2 period gap + >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap=2) + >>> for i, (train_index, test_index) in enumerate(tscv.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[0 1 2 3] + Test: index=[6 7] + Fold 1: + Train: index=[0 1 2 3 4 5] + Test: index=[8 9] + Fold 2: + Train: index=[0 1 2 3 4 5 6 7] + Test: index=[10 11] + + For a more extended example see + :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`. + + Notes + ----- + The training set has size ``i * n_samples // (n_splits + 1) + + n_samples % (n_splits + 1)`` in the ``i`` th split, + with a test set of size ``n_samples//(n_splits + 1)`` by default, + where ``n_samples`` is the number of samples. Note that this + formula is only valid when ``test_size`` and ``max_train_size`` are + left to their default values. + """ + + def __init__(self, n_splits=5, *, max_train_size=None, test_size=None, gap=0): + super().__init__(n_splits, shuffle=False, random_state=None) + self.max_train_size = max_train_size + self.test_size = test_size + self.gap = gap + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Always ignored, exists for compatibility. + + groups : array-like of shape (n_samples,) + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + return self._split(X) + + def _split(self, X): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. 
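A worked instance of the sizing formula quoted in the Notes above, for the first doctest (``n_samples=6``, ``n_splits=5``, default ``test_size`` and ``max_train_size``):

n_samples, n_splits = 6, 5
test_size = n_samples // (n_splits + 1)  # 6 // 6 == 1 sample per test set
train_sizes = [
    i * n_samples // (n_splits + 1) + n_samples % (n_splits + 1)
    for i in range(1, n_splits + 1)
]
print(test_size, train_sizes)  # 1 [1, 2, 3, 4, 5], matching the folds shown above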
+ + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + (X,) = indexable(X) + n_samples = _num_samples(X) + n_splits = self.n_splits + n_folds = n_splits + 1 + gap = self.gap + test_size = ( + self.test_size if self.test_size is not None else n_samples // n_folds + ) + + # Make sure we have enough samples for the given split parameters + if n_folds > n_samples: + raise ValueError( + f"Cannot have number of folds={n_folds} greater" + f" than the number of samples={n_samples}." + ) + if n_samples - gap - (test_size * n_splits) <= 0: + raise ValueError( + f"Too many splits={n_splits} for number of samples" + f"={n_samples} with test_size={test_size} and gap={gap}." + ) + + indices = np.arange(n_samples) + test_starts = range(n_samples - n_splits * test_size, n_samples, test_size) + + for test_start in test_starts: + train_end = test_start - gap + if self.max_train_size and self.max_train_size < train_end: + yield ( + indices[train_end - self.max_train_size : train_end], + indices[test_start : test_start + test_size], + ) + else: + yield ( + indices[:train_end], + indices[test_start : test_start + test_size], + ) + + +class LeaveOneGroupOut(GroupsConsumerMixin, BaseCrossValidator): + """Leave One Group Out cross-validator. + + Provides train/test indices to split data such that each training set is + comprised of all samples except ones belonging to one specific group. + Arbitrary domain specific group information is provided as an array of integers + that encodes the group of each sample. + + For instance the groups could be the year of collection of the samples + and thus allow for cross-validation against time-based splits. + + Read more in the :ref:`User Guide `. + + Notes + ----- + Splits are ordered according to the index of the group left out. The first + split has testing set consisting of the group whose index in `groups` is + lowest, and so on. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import LeaveOneGroupOut + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + >>> y = np.array([1, 2, 1, 2]) + >>> groups = np.array([1, 1, 2, 2]) + >>> logo = LeaveOneGroupOut() + >>> logo.get_n_splits(X, y, groups) + 2 + >>> logo.get_n_splits(groups=groups) # 'groups' is always required + 2 + >>> print(logo) + LeaveOneGroupOut() + >>> for i, (train_index, test_index) in enumerate(logo.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}, group={groups[train_index]}") + ... print(f" Test: index={test_index}, group={groups[test_index]}") + Fold 0: + Train: index=[2 3], group=[2 2] + Test: index=[0 1], group=[1 1] + Fold 1: + Train: index=[0 1], group=[1 1] + Test: index=[2 3], group=[2 2] + + See also + -------- + GroupKFold: K-fold iterator variant with non-overlapping groups. + """ + + def _iter_test_masks(self, X, y, groups): + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + # We make a copy of groups to avoid side-effects during iteration + groups = check_array( + groups, input_name="groups", copy=True, ensure_2d=False, dtype=None + ) + unique_groups = np.unique(groups) + if len(unique_groups) <= 1: + raise ValueError( + "The groups parameter contains fewer than 2 unique groups " + "(%s). LeaveOneGroupOut expects at least 2." 
% unique_groups + ) + for i in unique_groups: + yield groups == i + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. This 'groups' parameter must always be specified to + calculate the number of splits, though the other parameters can be + omitted. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) + return len(np.unique(groups)) + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + return super().split(X, y, groups) + + +class LeavePGroupsOut(GroupsConsumerMixin, BaseCrossValidator): + """Leave P Group(s) Out cross-validator. + + Provides train/test indices to split data according to a third-party + provided group. This group information can be used to encode arbitrary + domain specific stratifications of the samples as integers. + + For instance the groups could be the year of collection of the samples + and thus allow for cross-validation against time-based splits. + + The difference between LeavePGroupsOut and LeaveOneGroupOut is that + the former builds the test sets with all the samples assigned to + ``p`` different values of the groups while the latter uses samples + all assigned the same groups. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_groups : int + Number of groups (``p``) to leave out in the test split. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import LeavePGroupsOut + >>> X = np.array([[1, 2], [3, 4], [5, 6]]) + >>> y = np.array([1, 2, 1]) + >>> groups = np.array([1, 2, 3]) + >>> lpgo = LeavePGroupsOut(n_groups=2) + >>> lpgo.get_n_splits(X, y, groups) + 3 + >>> lpgo.get_n_splits(groups=groups) # 'groups' is always required + 3 + >>> print(lpgo) + LeavePGroupsOut(n_groups=2) + >>> for i, (train_index, test_index) in enumerate(lpgo.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}, group={groups[train_index]}") + ... print(f" Test: index={test_index}, group={groups[test_index]}") + Fold 0: + Train: index=[2], group=[3] + Test: index=[0 1], group=[1 2] + Fold 1: + Train: index=[1], group=[2] + Test: index=[0 2], group=[1 3] + Fold 2: + Train: index=[0], group=[1] + Test: index=[1 2], group=[2 3] + + See Also + -------- + GroupKFold : K-fold iterator variant with non-overlapping groups. 
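The number of splits above is the binomial coefficient over unique groups; a minimal check on toy groups, assuming only the public API plus ``math.comb``:

from math import comb

import numpy as np
from sklearn.model_selection import LeavePGroupsOut

X = np.zeros((6, 1))
groups = np.array([1, 1, 2, 2, 3, 4])  # 4 unique groups

lpgo = LeavePGroupsOut(n_groups=2)
n_observed = sum(1 for _ in lpgo.split(X, groups=groups))
assert n_observed == lpgo.get_n_splits(groups=groups) == comb(4, 2)  # 6 splits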
+ """ + + def __init__(self, n_groups): + self.n_groups = n_groups + + def _iter_test_masks(self, X, y, groups): + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array( + groups, input_name="groups", copy=True, ensure_2d=False, dtype=None + ) + unique_groups = np.unique(groups) + if self.n_groups >= len(unique_groups): + raise ValueError( + "The groups parameter contains fewer than (or equal to) " + "n_groups (%d) numbers of unique groups (%s). LeavePGroupsOut " + "expects that at least n_groups + 1 (%d) unique groups be " + "present" % (self.n_groups, unique_groups, self.n_groups + 1) + ) + combi = combinations(range(len(unique_groups)), self.n_groups) + for indices in combi: + test_index = np.zeros(_num_samples(X), dtype=bool) + for l in unique_groups[np.array(indices)]: + test_index[groups == l] = True + yield test_index + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. This 'groups' parameter must always be specified to + calculate the number of splits, though the other parameters can be + omitted. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) + return int(comb(len(np.unique(groups)), self.n_groups, exact=True)) + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + return super().split(X, y, groups) + + +class _RepeatedSplits(_MetadataRequester, metaclass=ABCMeta): + """Repeated splits for an arbitrary randomized CV splitter. + + Repeats splits for cross-validators n times with different randomization + in each repetition. + + Parameters + ---------- + cv : callable + Cross-validator class. + + n_repeats : int, default=10 + Number of times cross-validator needs to be repeated. + + random_state : int, RandomState instance or None, default=None + Passes `random_state` to the arbitrary repeating cross validator. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + **cvargs : additional params + Constructor parameters for cv. Must not contain random_state + and shuffle. + """ + + # This indicates that by default CV splitters don't have a "groups" kwarg, + # unless indicated by inheriting from ``GroupsConsumerMixin``. + # This also prevents ``set_split_request`` to be generated for splitters + # which don't support ``groups``. 
+ __metadata_request__split = {"groups": metadata_routing.UNUSED} + + def __init__(self, cv, *, n_repeats=10, random_state=None, **cvargs): + if not isinstance(n_repeats, numbers.Integral): + raise ValueError("Number of repetitions must be of Integral type.") + + if n_repeats <= 0: + raise ValueError("Number of repetitions must be greater than 0.") + + if any(key in cvargs for key in ("random_state", "shuffle")): + raise ValueError("cvargs must not contain random_state or shuffle.") + + self.cv = cv + self.n_repeats = n_repeats + self.random_state = random_state + self.cvargs = cvargs + + def split(self, X, y=None, groups=None): + """Generates indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + n_repeats = self.n_repeats + rng = check_random_state(self.random_state) + + for idx in range(n_repeats): + cv = self.cv(random_state=rng, shuffle=True, **self.cvargs) + for train_index, test_index in cv.split(X, y, groups): + yield train_index, test_index + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + ``np.zeros(n_samples)`` may be used as a placeholder. + + y : object + Always ignored, exists for compatibility. + ``np.zeros(n_samples)`` may be used as a placeholder. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + rng = check_random_state(self.random_state) + cv = self.cv(random_state=rng, shuffle=True, **self.cvargs) + return cv.get_n_splits(X, y, groups) * self.n_repeats + + def __repr__(self): + return _build_repr(self) + + +class RepeatedKFold(_UnsupportedGroupCVMixin, _RepeatedSplits): + """Repeated K-Fold cross validator. + + Repeats K-Fold n times with different randomization in each repetition. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + n_repeats : int, default=10 + Number of times cross-validator needs to be repeated. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of each repeated cross-validation instance. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import RepeatedKFold + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124) + >>> rkf.get_n_splits(X, y) + 4 + >>> print(rkf) + RepeatedKFold(n_repeats=2, n_splits=2, random_state=2652124) + >>> for i, (train_index, test_index) in enumerate(rkf.split(X)): + ... print(f"Fold {i}:") + ... 
print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + ... + Fold 0: + Train: index=[0 1] + Test: index=[2 3] + Fold 1: + Train: index=[2 3] + Test: index=[0 1] + Fold 2: + Train: index=[1 2] + Test: index=[0 3] + Fold 3: + Train: index=[0 3] + Test: index=[1 2] + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + + See Also + -------- + RepeatedStratifiedKFold : Repeats Stratified K-Fold n times. + """ + + def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): + super().__init__( + KFold, n_repeats=n_repeats, random_state=random_state, n_splits=n_splits + ) + + +class RepeatedStratifiedKFold(_UnsupportedGroupCVMixin, _RepeatedSplits): + """Repeated Stratified K-Fold cross validator. + + Repeats Stratified K-Fold n times with different randomization in each + repetition. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + n_repeats : int, default=10 + Number of times cross-validator needs to be repeated. + + random_state : int, RandomState instance or None, default=None + Controls the generation of the random states for each repetition. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import RepeatedStratifiedKFold + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2, + ... random_state=36851234) + >>> rskf.get_n_splits(X, y) + 4 + >>> print(rskf) + RepeatedStratifiedKFold(n_repeats=2, n_splits=2, random_state=36851234) + >>> for i, (train_index, test_index) in enumerate(rskf.split(X, y)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + ... + Fold 0: + Train: index=[1 2] + Test: index=[0 3] + Fold 1: + Train: index=[0 3] + Test: index=[1 2] + Fold 2: + Train: index=[1 3] + Test: index=[0 2] + Fold 3: + Train: index=[0 2] + Test: index=[1 3] + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + + See Also + -------- + RepeatedKFold : Repeats K-Fold n times. + """ + + def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): + super().__init__( + StratifiedKFold, + n_repeats=n_repeats, + random_state=random_state, + n_splits=n_splits, + ) + + def split(self, X, y, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Note that providing ``y`` is sufficient to generate the splits and + hence ``np.zeros(n_samples)`` may be used as a placeholder for + ``X`` instead of actual training data. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + Stratification is done based on the y labels. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. 
+ + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + """ + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) + return super().split(X, y, groups=groups) + + +class BaseShuffleSplit(_MetadataRequester, metaclass=ABCMeta): + """Base class for *ShuffleSplit. + + Parameters + ---------- + n_splits : int, default=10 + Number of re-shuffling & splitting iterations. + + test_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.1. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the training and testing indices produced. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + """ + + # This indicates that by default CV splitters don't have a "groups" kwarg, + # unless indicated by inheriting from ``GroupsConsumerMixin``. + # This also prevents ``set_split_request`` to be generated for splitters + # which don't support ``groups``. + __metadata_request__split = {"groups": metadata_routing.UNUSED} + + def __init__( + self, n_splits=10, *, test_size=None, train_size=None, random_state=None + ): + self.n_splits = n_splits + self.test_size = test_size + self.train_size = train_size + self.random_state = random_state + self._default_test_size = 0.1 + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. 
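The partitioning in ``_iter_indices`` below draws one permutation per split and slices it; a standalone sketch of that scheme with hypothetical sizes (NumPy only):

import numpy as np

rng = np.random.RandomState(0)
n_samples, n_test, n_train = 10, 3, 7

permutation = rng.permutation(n_samples)
test_idx = permutation[:n_test]                   # first n_test shuffled indices
train_idx = permutation[n_test:n_test + n_train]  # next n_train indices
assert np.intersect1d(test_idx, train_idx).size == 0  # disjoint within one split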
+ """ + X, y, groups = indexable(X, y, groups) + for train, test in self._iter_indices(X, y, groups): + yield train, test + + def _iter_indices(self, X, y=None, groups=None): + """Generate (train, test) indices""" + n_samples = _num_samples(X) + n_train, n_test = _validate_shuffle_split( + n_samples, + self.test_size, + self.train_size, + default_test_size=self._default_test_size, + ) + + rng = check_random_state(self.random_state) + for i in range(self.n_splits): + # random partition + permutation = rng.permutation(n_samples) + ind_test = permutation[:n_test] + ind_train = permutation[n_test : (n_test + n_train)] + yield ind_train, ind_test + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return self.n_splits + + def __repr__(self): + return _build_repr(self) + + +class ShuffleSplit(_UnsupportedGroupCVMixin, BaseShuffleSplit): + """Random permutation cross-validator. + + Yields indices to split data into training and test sets. + + Note: contrary to other cross-validation strategies, random splits + do not guarantee that test sets across all folds will be mutually exclusive, + and might include overlapping samples. However, this is still very likely for + sizeable datasets. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=10 + Number of re-shuffling & splitting iterations. + + test_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.1. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the training and testing indices produced. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import ShuffleSplit + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [3, 4], [5, 6]]) + >>> y = np.array([1, 2, 1, 2, 1, 2]) + >>> rs = ShuffleSplit(n_splits=5, test_size=.25, random_state=0) + >>> rs.get_n_splits(X) + 5 + >>> print(rs) + ShuffleSplit(n_splits=5, random_state=0, test_size=0.25, train_size=None) + >>> for i, (train_index, test_index) in enumerate(rs.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... 
print(f" Test: index={test_index}") + Fold 0: + Train: index=[1 3 0 4] + Test: index=[5 2] + Fold 1: + Train: index=[4 0 2 5] + Test: index=[1 3] + Fold 2: + Train: index=[1 2 4 0] + Test: index=[3 5] + Fold 3: + Train: index=[3 4 1 0] + Test: index=[5 2] + Fold 4: + Train: index=[3 5 1 0] + Test: index=[2 4] + >>> # Specify train and test size + >>> rs = ShuffleSplit(n_splits=5, train_size=0.5, test_size=.25, + ... random_state=0) + >>> for i, (train_index, test_index) in enumerate(rs.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[1 3 0] + Test: index=[5 2] + Fold 1: + Train: index=[4 0 2] + Test: index=[1 3] + Fold 2: + Train: index=[1 2 4] + Test: index=[3 5] + Fold 3: + Train: index=[3 4 1] + Test: index=[5 2] + Fold 4: + Train: index=[3 5 1] + Test: index=[2 4] + """ + + def __init__( + self, n_splits=10, *, test_size=None, train_size=None, random_state=None + ): + super().__init__( + n_splits=n_splits, + test_size=test_size, + train_size=train_size, + random_state=random_state, + ) + self._default_test_size = 0.1 + + +class GroupShuffleSplit(GroupsConsumerMixin, BaseShuffleSplit): + """Shuffle-Group(s)-Out cross-validation iterator. + + Provides randomized train/test indices to split data according to a + third-party provided group. This group information can be used to encode + arbitrary domain specific stratifications of the samples as integers. + + For instance the groups could be the year of collection of the samples + and thus allow for cross-validation against time-based splits. + + The difference between :class:`LeavePGroupsOut` and ``GroupShuffleSplit`` is that + the former generates splits using all subsets of size ``p`` unique groups, + whereas ``GroupShuffleSplit`` generates a user-determined number of random + test splits, each with a user-determined fraction of unique groups. + + For example, a less computationally intensive alternative to + ``LeavePGroupsOut(p=10)`` would be + ``GroupShuffleSplit(test_size=10, n_splits=100)``. + + Contrary to other cross-validation strategies, the random splits + do not guarantee that test sets across all folds will be mutually exclusive, + and might include overlapping samples. However, this is still very likely for + sizeable datasets. + + Note: The parameters ``test_size`` and ``train_size`` refer to groups, and + not to samples as in :class:`ShuffleSplit`. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=5 + Number of re-shuffling & splitting iterations. + + test_size : float, int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of groups to include in the test split (rounded up). If int, + represents the absolute number of test groups. If None, the value is + set to the complement of the train size. If ``train_size`` is also None, + it will be set to 0.2. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the groups to include in the train split. If + int, represents the absolute number of train groups. If None, + the value is automatically set to the complement of the test size. 
+ + random_state : int, RandomState instance or None, default=None + Controls the randomness of the training and testing indices produced. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import GroupShuffleSplit + >>> X = np.ones(shape=(8, 2)) + >>> y = np.ones(shape=(8, 1)) + >>> groups = np.array([1, 1, 2, 2, 2, 3, 3, 3]) + >>> print(groups.shape) + (8,) + >>> gss = GroupShuffleSplit(n_splits=2, train_size=.7, random_state=42) + >>> gss.get_n_splits() + 2 + >>> print(gss) + GroupShuffleSplit(n_splits=2, random_state=42, test_size=None, train_size=0.7) + >>> for i, (train_index, test_index) in enumerate(gss.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}, group={groups[train_index]}") + ... print(f" Test: index={test_index}, group={groups[test_index]}") + Fold 0: + Train: index=[2 3 4 5 6 7], group=[2 2 2 3 3 3] + Test: index=[0 1], group=[1 1] + Fold 1: + Train: index=[0 1 5 6 7], group=[1 1 3 3 3] + Test: index=[2 3 4], group=[2 2 2] + + See Also + -------- + ShuffleSplit : Shuffles samples to create independent test/train sets. + + LeavePGroupsOut : Train set leaves out all possible subsets of `p` groups. + """ + + def __init__( + self, n_splits=5, *, test_size=None, train_size=None, random_state=None + ): + super().__init__( + n_splits=n_splits, + test_size=test_size, + train_size=train_size, + random_state=random_state, + ) + self._default_test_size = 0.2 + + def _iter_indices(self, X, y, groups): + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) + classes, group_indices = np.unique(groups, return_inverse=True) + for group_train, group_test in super()._iter_indices(X=classes): + # these are the indices of classes in the partition + # invert them into data indices + + train = np.flatnonzero(np.isin(group_indices, group_train)) + test = np.flatnonzero(np.isin(group_indices, group_test)) + + yield train, test + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + """ + return super().split(X, y, groups) + + +class StratifiedShuffleSplit(BaseShuffleSplit): + """Stratified ShuffleSplit cross-validator. + + Provides train/test indices to split data in train/test sets. + + This cross-validation object is a merge of :class:`StratifiedKFold` and + :class:`ShuffleSplit`, which returns stratified randomized folds. The folds + are made by preserving the percentage of samples for each class. 
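+
+    For instance, with four samples of class 0 and two of class 1, every
+    test set of size three keeps the 2:1 class ratio exactly (an
+    illustrative sketch; which indices are drawn still depends on
+    ``random_state``):
+
+    >>> import numpy as np
+    >>> from sklearn.model_selection import StratifiedShuffleSplit
+    >>> y = np.array([0, 0, 0, 0, 1, 1])
+    >>> sss = StratifiedShuffleSplit(n_splits=3, test_size=3, random_state=0)
+    >>> for _, test_index in sss.split(np.zeros((6, 2)), y):
+    ...     print(np.bincount(y[test_index]))
+    [2 1]
+    [2 1]
+    [2 1]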
+ + Note: like the :class:`ShuffleSplit` strategy, stratified random splits + do not guarantee that test sets across all folds will be mutually exclusive, + and might include overlapping samples. However, this is still very likely for + sizeable datasets. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=10 + Number of re-shuffling & splitting iterations. + + test_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.1. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the training and testing indices produced. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import StratifiedShuffleSplit + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 0, 1, 1, 1]) + >>> sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0) + >>> sss.get_n_splits(X, y) + 5 + >>> print(sss) + StratifiedShuffleSplit(n_splits=5, random_state=0, ...) + >>> for i, (train_index, test_index) in enumerate(sss.split(X, y)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[5 2 3] + Test: index=[4 1 0] + Fold 1: + Train: index=[5 1 4] + Test: index=[0 2 3] + Fold 2: + Train: index=[5 0 2] + Test: index=[4 3 1] + Fold 3: + Train: index=[4 1 0] + Test: index=[2 3 5] + Fold 4: + Train: index=[0 5 1] + Test: index=[3 4 2] + """ + + def __init__( + self, n_splits=10, *, test_size=None, train_size=None, random_state=None + ): + super().__init__( + n_splits=n_splits, + test_size=test_size, + train_size=train_size, + random_state=random_state, + ) + self._default_test_size = 0.1 + + def _iter_indices(self, X, y, groups=None): + n_samples = _num_samples(X) + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) + n_train, n_test = _validate_shuffle_split( + n_samples, + self.test_size, + self.train_size, + default_test_size=self._default_test_size, + ) + + # Convert to numpy as not all operations are supported by the Array API. 
+ # `y` is probably never a very large array, which means that converting it + # should be cheap + xp, _ = get_namespace(y) + y = _convert_to_numpy(y, xp=xp) + + if y.ndim == 2: + # for multi-label y, map each distinct row to a string repr + # using join because str(row) uses an ellipsis if len(row) > 1000 + y = np.array([" ".join(row.astype("str")) for row in y]) + + classes, y_indices = np.unique(y, return_inverse=True) + n_classes = classes.shape[0] + + class_counts = np.bincount(y_indices) + if np.min(class_counts) < 2: + raise ValueError( + "The least populated class in y has only 1" + " member, which is too few. The minimum" + " number of groups for any class cannot" + " be less than 2." + ) + + if n_train < n_classes: + raise ValueError( + "The train_size = %d should be greater or " + "equal to the number of classes = %d" % (n_train, n_classes) + ) + if n_test < n_classes: + raise ValueError( + "The test_size = %d should be greater or " + "equal to the number of classes = %d" % (n_test, n_classes) + ) + + # Find the sorted list of instances for each class: + # (np.unique above performs a sort, so code is O(n logn) already) + class_indices = np.split( + np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1] + ) + + rng = check_random_state(self.random_state) + + for _ in range(self.n_splits): + # if there are ties in the class-counts, we want + # to make sure to break them anew in each iteration + n_i = _approximate_mode(class_counts, n_train, rng) + class_counts_remaining = class_counts - n_i + t_i = _approximate_mode(class_counts_remaining, n_test, rng) + + train = [] + test = [] + + for i in range(n_classes): + permutation = rng.permutation(class_counts[i]) + perm_indices_class_i = class_indices[i].take(permutation, mode="clip") + + train.extend(perm_indices_class_i[: n_i[i]]) + test.extend(perm_indices_class_i[n_i[i] : n_i[i] + t_i[i]]) + + train = rng.permutation(train) + test = rng.permutation(test) + + yield train, test + + def split(self, X, y, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Note that providing ``y`` is sufficient to generate the splits and + hence ``np.zeros(n_samples)`` may be used as a placeholder for + ``X`` instead of actual training data. + + y : array-like of shape (n_samples,) or (n_samples, n_labels) + The target variable for supervised learning problems. + Stratification is done based on the y labels. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) + return super().split(X, y, groups) + + +def _validate_shuffle_split(n_samples, test_size, train_size, default_test_size=None): + """ + Validation helper to check if the train/test sizes are meaningful w.r.t. the + size of the data (n_samples). 
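+
+    For example (an illustrative sketch of the resolution rules: a float
+    ``test_size`` is ceiled, a float ``train_size`` is floored, and a
+    missing size defaults to the complement of the other):
+
+    >>> _validate_shuffle_split(10, None, None, default_test_size=0.25)
+    (7, 3)
+    >>> _validate_shuffle_split(10, 0.25, 0.5)
+    (5, 3)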
+ """ + if test_size is None and train_size is None: + test_size = default_test_size + + test_size_type = np.asarray(test_size).dtype.kind + train_size_type = np.asarray(train_size).dtype.kind + + if ( + test_size_type == "i" + and (test_size >= n_samples or test_size <= 0) + or test_size_type == "f" + and (test_size <= 0 or test_size >= 1) + ): + raise ValueError( + "test_size={0} should be either positive and smaller" + " than the number of samples {1} or a float in the " + "(0, 1) range".format(test_size, n_samples) + ) + + if ( + train_size_type == "i" + and (train_size >= n_samples or train_size <= 0) + or train_size_type == "f" + and (train_size <= 0 or train_size >= 1) + ): + raise ValueError( + "train_size={0} should be either positive and smaller" + " than the number of samples {1} or a float in the " + "(0, 1) range".format(train_size, n_samples) + ) + + if train_size is not None and train_size_type not in ("i", "f"): + raise ValueError("Invalid value for train_size: {}".format(train_size)) + if test_size is not None and test_size_type not in ("i", "f"): + raise ValueError("Invalid value for test_size: {}".format(test_size)) + + if train_size_type == "f" and test_size_type == "f" and train_size + test_size > 1: + raise ValueError( + "The sum of test_size and train_size = {}, should be in the (0, 1)" + " range. Reduce test_size and/or train_size.".format(train_size + test_size) + ) + + if test_size_type == "f": + n_test = ceil(test_size * n_samples) + elif test_size_type == "i": + n_test = float(test_size) + + if train_size_type == "f": + n_train = floor(train_size * n_samples) + elif train_size_type == "i": + n_train = float(train_size) + + if train_size is None: + n_train = n_samples - n_test + elif test_size is None: + n_test = n_samples - n_train + + if n_train + n_test > n_samples: + raise ValueError( + "The sum of train_size and test_size = %d, " + "should be smaller than the number of " + "samples %d. Reduce test_size and/or " + "train_size." % (n_train + n_test, n_samples) + ) + + n_train, n_test = int(n_train), int(n_test) + + if n_train == 0: + raise ValueError( + "With n_samples={}, test_size={} and train_size={}, the " + "resulting train set will be empty. Adjust any of the " + "aforementioned parameters.".format(n_samples, test_size, train_size) + ) + + return n_train, n_test + + +class PredefinedSplit(BaseCrossValidator): + """Predefined split cross-validator. + + Provides train/test indices to split data into train/test sets using a + predefined scheme specified by the user with the ``test_fold`` parameter. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.16 + + Parameters + ---------- + test_fold : array-like of shape (n_samples,) + The entry ``test_fold[i]`` represents the index of the test set that + sample ``i`` belongs to. It is possible to exclude sample ``i`` from + any test set (i.e. include sample ``i`` in every training set) by + setting ``test_fold[i]`` equal to -1. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import PredefinedSplit + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> test_fold = [0, 1, -1, 1] + >>> ps = PredefinedSplit(test_fold) + >>> ps.get_n_splits() + 2 + >>> print(ps) + PredefinedSplit(test_fold=array([ 0, 1, -1, 1])) + >>> for i, (train_index, test_index) in enumerate(ps.split()): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... 
print(f" Test: index={test_index}") + Fold 0: + Train: index=[1 2 3] + Test: index=[0] + Fold 1: + Train: index=[0 2] + Test: index=[1 3] + """ + + def __init__(self, test_fold): + self.test_fold = np.array(test_fold, dtype=int) + self.test_fold = column_or_1d(self.test_fold) + self.unique_folds = np.unique(self.test_fold) + self.unique_folds = self.unique_folds[self.unique_folds != -1] + + def split(self, X=None, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + return self._split() + + def _split(self): + """Generate indices to split data into training and test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + ind = np.arange(len(self.test_fold)) + for test_index in self._iter_test_masks(): + train_index = ind[np.logical_not(test_index)] + test_index = ind[test_index] + yield train_index, test_index + + def _iter_test_masks(self): + """Generates boolean masks corresponding to test sets.""" + for f in self.unique_folds: + test_index = np.where(self.test_fold == f)[0] + test_mask = np.zeros(len(self.test_fold), dtype=bool) + test_mask[test_index] = True + yield test_mask + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return len(self.unique_folds) + + +class _CVIterableWrapper(BaseCrossValidator): + """Wrapper class for old style cv objects and iterables.""" + + def __init__(self, cv): + self.cv = list(cv) + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return len(self.cv) + + def split(self, X=None, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + for train, test in self.cv: + yield train, test + + +def check_cv(cv=5, y=None, *, classifier=False): + """Input checker utility for building a cross-validator. 
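+
+    Depending on the inputs, this resolves to a concrete splitter: integers
+    (and `None`) become :class:`KFold` or :class:`StratifiedKFold`, objects
+    exposing a ``split`` method pass through unchanged, and any other
+    iterable of (train, test) index pairs is wrapped so it can be re-used.
+    A small sketch of the wrapping branch:
+
+    >>> from sklearn.model_selection import check_cv
+    >>> splits = [([0, 1], [2]), ([0, 2], [1])]
+    >>> check_cv(splits).get_n_splits()
+    2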
+ + Parameters + ---------- + cv : int, cross-validation generator, iterable or None, default=5 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable that generates (train, test) splits as arrays of indices. + + For integer/None inputs, if classifier is True and ``y`` is either + binary or multiclass, :class:`StratifiedKFold` is used. In all other + cases, :class:`KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value changed from 3-fold to 5-fold. + + y : array-like, default=None + The target variable for supervised learning problems. + + classifier : bool, default=False + Whether the task is a classification task, in which case + stratified KFold will be used. + + Returns + ------- + checked_cv : a cross-validator instance. + The return value is a cross-validator which generates the train/test + splits via the ``split`` method. + + Examples + -------- + >>> from sklearn.model_selection import check_cv + >>> check_cv(cv=5, y=None, classifier=False) + KFold(...) + >>> check_cv(cv=5, y=[1, 1, 0, 0, 0, 0], classifier=True) + StratifiedKFold(...) + """ + cv = 5 if cv is None else cv + if isinstance(cv, numbers.Integral): + if ( + classifier + and (y is not None) + and (type_of_target(y, input_name="y") in ("binary", "multiclass")) + ): + return StratifiedKFold(cv) + else: + return KFold(cv) + + if not hasattr(cv, "split") or isinstance(cv, str): + if not isinstance(cv, Iterable) or isinstance(cv, str): + raise ValueError( + "Expected cv as an integer, cross-validation " + "object (from sklearn.model_selection) " + "or an iterable. Got %s." % cv + ) + return _CVIterableWrapper(cv) + + return cv # New style cv objects are passed without any modification + + +@validate_params( + { + "test_size": [ + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(numbers.Integral, 1, None, closed="left"), + None, + ], + "train_size": [ + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(numbers.Integral, 1, None, closed="left"), + None, + ], + "random_state": ["random_state"], + "shuffle": ["boolean"], + "stratify": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def train_test_split( + *arrays, + test_size=None, + train_size=None, + random_state=None, + shuffle=True, + stratify=None, +): + """Split arrays or matrices into random train and test subsets. + + Quick utility that wraps input validation, + ``next(ShuffleSplit().split(X, y))``, and application to input data + into a single call for splitting (and optionally subsampling) data into a + one-liner. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + *arrays : sequence of indexables with same length / shape[0] + Allowed inputs are lists, numpy arrays, scipy-sparse + matrices or pandas dataframes. + + test_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.25. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. 
If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, default=None + Controls the shuffling applied to the data before applying the split. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + shuffle : bool, default=True + Whether or not to shuffle the data before splitting. If shuffle=False + then stratify must be None. + + stratify : array-like, default=None + If not None, data is split in a stratified fashion, using this as + the class labels. + Read more in the :ref:`User Guide `. + + Returns + ------- + splitting : list, length=2 * len(arrays) + List containing train-test split of inputs. + + .. versionadded:: 0.16 + If the input is sparse, the output will be a + ``scipy.sparse.csr_matrix``. Else, output type is the same as the + input type. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import train_test_split + >>> X, y = np.arange(10).reshape((5, 2)), range(5) + >>> X + array([[0, 1], + [2, 3], + [4, 5], + [6, 7], + [8, 9]]) + >>> list(y) + [0, 1, 2, 3, 4] + + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, test_size=0.33, random_state=42) + ... + >>> X_train + array([[4, 5], + [0, 1], + [6, 7]]) + >>> y_train + [2, 0, 3] + >>> X_test + array([[2, 3], + [8, 9]]) + >>> y_test + [1, 4] + + >>> train_test_split(y, shuffle=False) + [[0, 1, 2], [3, 4]] + """ + n_arrays = len(arrays) + if n_arrays == 0: + raise ValueError("At least one array required as input") + + arrays = indexable(*arrays) + + n_samples = _num_samples(arrays[0]) + n_train, n_test = _validate_shuffle_split( + n_samples, test_size, train_size, default_test_size=0.25 + ) + + if shuffle is False: + if stratify is not None: + raise ValueError( + "Stratified train/test split is not implemented for shuffle=False" + ) + + train = np.arange(n_train) + test = np.arange(n_train, n_train + n_test) + + else: + if stratify is not None: + CVClass = StratifiedShuffleSplit + else: + CVClass = ShuffleSplit + + cv = CVClass(test_size=n_test, train_size=n_train, random_state=random_state) + + train, test = next(cv.split(X=arrays[0], y=stratify)) + + train, test = ensure_common_namespace_device(arrays[0], train, test) + + return list( + chain.from_iterable( + (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays + ) + ) + + +# Tell nose that train_test_split is not a test. +# (Needed for external libraries that may use nose.) +# Use setattr to avoid mypy errors when monkeypatching. +setattr(train_test_split, "__test__", False) + + +def _pprint(params, offset=0, printer=repr): + """Pretty print the dictionary 'params' + + Parameters + ---------- + params : dict + The dictionary to pretty print + + offset : int, default=0 + The offset in characters to add at the begin of each line. + + printer : callable, default=repr + The function to convert entries to strings, typically + the builtin str or repr + + """ + # Do a multi-line justified repr: + options = np.get_printoptions() + np.set_printoptions(precision=5, threshold=64, edgeitems=2) + params_list = list() + this_line_length = offset + line_sep = ",\n" + (1 + offset // 2) * " " + for i, (k, v) in enumerate(sorted(params.items())): + if isinstance(v, float): + # use str for representing floating point numbers + # this way we get consistent representation across + # architectures and versions. 
+ this_repr = "%s=%s" % (k, str(v)) + else: + # use repr of the rest + this_repr = "%s=%s" % (k, printer(v)) + if len(this_repr) > 500: + this_repr = this_repr[:300] + "..." + this_repr[-100:] + if i > 0: + if this_line_length + len(this_repr) >= 75 or "\n" in this_repr: + params_list.append(line_sep) + this_line_length = len(line_sep) + else: + params_list.append(", ") + this_line_length += 2 + params_list.append(this_repr) + this_line_length += len(this_repr) + + np.set_printoptions(**options) + lines = "".join(params_list) + # Strip trailing space to avoid nightmare in doctests + lines = "\n".join(l.rstrip(" ") for l in lines.split("\n")) + return lines + + +def _build_repr(self): + # XXX This is copied from BaseEstimator's get_params + cls = self.__class__ + init = getattr(cls.__init__, "deprecated_original", cls.__init__) + # Ignore varargs, kw and default values and pop self + init_signature = signature(init) + # Consider the constructor parameters excluding 'self' + if init is object.__init__: + args = [] + else: + args = sorted( + [ + p.name + for p in init_signature.parameters.values() + if p.name != "self" and p.kind != p.VAR_KEYWORD + ] + ) + class_name = self.__class__.__name__ + params = dict() + for key in args: + # We need deprecation warnings to always be on in order to + # catch deprecated param values. + # This is set in utils/__init__.py but it gets overwritten + # when running under python3 somehow. + warnings.simplefilter("always", FutureWarning) + try: + with warnings.catch_warnings(record=True) as w: + value = getattr(self, key, None) + if value is None and hasattr(self, "cvargs"): + value = self.cvargs.get(key, None) + if len(w) and w[0].category is FutureWarning: + # if the parameter is deprecated, don't show it + continue + finally: + warnings.filters.pop(0) + params[key] = value + + return "%s(%s)" % (class_name, _pprint(params, offset=len(class_name))) + + +def _yields_constant_splits(cv): + # Return True if calling cv.split() always returns the same splits + # We assume that if a cv doesn't have a shuffle parameter, it shuffles by + # default (e.g. ShuffleSplit). If it actually doesn't shuffle (e.g. + # LeaveOneOut), then it won't have a random_state parameter anyway, in + # which case it will default to 0, leading to output=True + shuffle = getattr(cv, "shuffle", True) + random_state = getattr(cv, "random_state", 0) + return isinstance(random_state, numbers.Integral) or not shuffle diff --git a/.venv/Lib/site-packages/sklearn/model_selection/_validation.py b/.venv/Lib/site-packages/sklearn/model_selection/_validation.py new file mode 100644 index 0000000000000000000000000000000000000000..08e7c4bff566aefdf008c0470cc680fe8c76cdd0 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/_validation.py @@ -0,0 +1,2550 @@ +""" +The :mod:`sklearn.model_selection._validation` module includes classes and +functions to validate the model. 
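+
+The public entry points are :func:`cross_validate`, :func:`cross_val_score`,
+:func:`cross_val_predict`, :func:`permutation_test_score`,
+:func:`learning_curve` and :func:`validation_curve`.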
+"""
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+
+import numbers
+import time
+import warnings
+from collections import Counter
+from contextlib import suppress
+from functools import partial
+from numbers import Real
+from traceback import format_exc
+
+import numpy as np
+import scipy.sparse as sp
+from joblib import logger
+
+from ..base import clone, is_classifier
+from ..exceptions import FitFailedWarning, UnsetMetadataPassedError
+from ..metrics import check_scoring, get_scorer_names
+from ..metrics._scorer import _MultimetricScorer
+from ..preprocessing import LabelEncoder
+from ..utils import Bunch, _safe_indexing, check_random_state, indexable
+from ..utils._array_api import device, get_namespace
+from ..utils._param_validation import (
+    HasMethods,
+    Integral,
+    Interval,
+    StrOptions,
+    validate_params,
+)
+from ..utils.metadata_routing import (
+    MetadataRouter,
+    MethodMapping,
+    _routing_enabled,
+    process_routing,
+)
+from ..utils.metaestimators import _safe_split
+from ..utils.parallel import Parallel, delayed
+from ..utils.validation import _check_method_params, _num_samples
+from ._split import check_cv
+
+__all__ = [
+    "cross_validate",
+    "cross_val_score",
+    "cross_val_predict",
+    "permutation_test_score",
+    "learning_curve",
+    "validation_curve",
+]
+
+
+def _check_params_groups_deprecation(fit_params, params, groups, version):
+    """A helper function to check deprecations on `groups` and `fit_params`.
+
+    # TODO(SLEP6): To be removed when set_config(enable_metadata_routing=False) is not
+    # possible.
+    """
+    if params is not None and fit_params is not None:
+        raise ValueError(
+            "`params` and `fit_params` cannot both be provided. Pass parameters "
+            "via `params`. `fit_params` is deprecated and will be removed in "
+            f"version {version}."
+        )
+    elif fit_params is not None:
+        warnings.warn(
+            (
+                f"`fit_params` is deprecated and will be removed in version {version}. "
+                "Pass parameters via `params` instead."
+            ),
+            FutureWarning,
+        )
+        params = fit_params
+
+    params = {} if params is None else params
+
+    _check_groups_routing_disabled(groups)
+
+    return params
+
+
+# TODO(SLEP6): To be removed when set_config(enable_metadata_routing=False) is not
+# possible.
+def _check_groups_routing_disabled(groups):
+    if groups is not None and _routing_enabled():
+        raise ValueError(
+            "`groups` can only be passed if metadata routing is not enabled via"
+            " `sklearn.set_config(enable_metadata_routing=True)`. When routing is"
+            " enabled, pass `groups` alongside other metadata via the `params` argument"
+            " instead."
+    )
+
+
+@validate_params(
+    {
+        "estimator": [HasMethods("fit")],
+        "X": ["array-like", "sparse matrix"],
+        "y": ["array-like", None],
+        "groups": ["array-like", None],
+        "scoring": [
+            StrOptions(set(get_scorer_names())),
+            callable,
+            list,
+            tuple,
+            dict,
+            None,
+        ],
+        "cv": ["cv_object"],
+        "n_jobs": [Integral, None],
+        "verbose": ["verbose"],
+        "params": [dict, None],
+        "pre_dispatch": [Integral, str],
+        "return_train_score": ["boolean"],
+        "return_estimator": ["boolean"],
+        "return_indices": ["boolean"],
+        "error_score": [StrOptions({"raise"}), Real],
+    },
+    prefer_skip_nested_validation=False,  # estimator is not validated yet
+)
+def cross_validate(
+    estimator,
+    X,
+    y=None,
+    *,
+    groups=None,
+    scoring=None,
+    cv=None,
+    n_jobs=None,
+    verbose=0,
+    params=None,
+    pre_dispatch="2*n_jobs",
+    return_train_score=False,
+    return_estimator=False,
+    return_indices=False,
+    error_score=np.nan,
+):
+    """Evaluate metric(s) by cross-validation and also record fit/score times.
+
+    Read more in the :ref:`User Guide <multimetric_cross_validation>`.
+
+    Parameters
+    ----------
+    estimator : estimator object implementing 'fit'
+        The object to use to fit the data.
+
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        The data to fit. Can be for example a list, or an array.
+
+    y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
+        The target variable to try to predict in the case of
+        supervised learning.
+
+    groups : array-like of shape (n_samples,), default=None
+        Group labels for the samples used while splitting the dataset into
+        train/test set. Only used in conjunction with a "Group" :term:`cv`
+        instance (e.g., :class:`GroupKFold`).
+
+        .. versionchanged:: 1.4
+            ``groups`` can only be passed if metadata routing is not enabled
+            via ``sklearn.set_config(enable_metadata_routing=True)``. When routing
+            is enabled, pass ``groups`` alongside other metadata via the ``params``
+            argument instead. E.g.:
+            ``cross_validate(..., params={'groups': groups})``.
+
+    scoring : str, callable, list, tuple, or dict, default=None
+        Strategy to evaluate the performance of the cross-validated model on
+        the test set. If `None`, the
+        :ref:`default evaluation criterion ` of the estimator
+        is used.
+
+        If `scoring` represents a single score, one can use:
+
+        - a single string (see :ref:`scoring_parameter`);
+        - a callable (see :ref:`scoring_callable`) that returns a single value.
+
+        If `scoring` represents multiple scores, one can use:
+
+        - a list or tuple of unique strings;
+        - a callable returning a dictionary where the keys are the metric
+          names and the values are the metric scores;
+        - a dictionary with metric names as keys and callables as values.
+
+        See :ref:`multimetric_grid_search` for an example.
+
+    cv : int, cross-validation generator or an iterable, default=None
+        Determines the cross-validation splitting strategy.
+        Possible inputs for cv are:
+
+        - None, to use the default 5-fold cross validation,
+        - int, to specify the number of folds in a `(Stratified)KFold`,
+        - :term:`CV splitter`,
+        - An iterable yielding (train, test) splits as arrays of indices.
+
+        For int/None inputs, if the estimator is a classifier and ``y`` is
+        either binary or multiclass, :class:`StratifiedKFold` is used. In all
+        other cases, :class:`KFold` is used. These splitters are instantiated
+        with `shuffle=False` so the splits will be the same across calls.
+
+        Refer :ref:`User Guide <cross_validation>` for the various
+        cross-validation strategies that can be used here.
+
+        ..
versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and computing + the score are parallelized over the cross-validation splits. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + The verbosity level. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit``, the scorer, + and the CV splitter. + + .. versionadded:: 1.4 + + pre_dispatch : int or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + return_train_score : bool, default=False + Whether to include train scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + + return_estimator : bool, default=False + Whether to return the estimators fitted on each split. + + .. versionadded:: 0.20 + + return_indices : bool, default=False + Whether to return the train-test indices selected for each split. + + .. versionadded:: 1.3 + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. + If a numeric value is given, FitFailedWarning is raised. + + .. versionadded:: 0.20 + + Returns + ------- + scores : dict of float arrays of shape (n_splits,) + Array of scores of the estimator for each run of the cross validation. + + A dict of arrays containing the score/time arrays for each scorer is + returned. The possible keys for this ``dict`` are: + + ``test_score`` + The score array for test scores on each cv split. + Suffix ``_score`` in ``test_score`` changes to a specific + metric like ``test_r2`` or ``test_auc`` if there are + multiple scoring metrics in the scoring parameter. + ``train_score`` + The score array for train scores on each cv split. + Suffix ``_score`` in ``train_score`` changes to a specific + metric like ``train_r2`` or ``train_auc`` if there are + multiple scoring metrics in the scoring parameter. + This is available only if ``return_train_score`` parameter + is ``True``. + ``fit_time`` + The time for fitting the estimator on the train + set for each cv split. + ``score_time`` + The time for scoring the estimator on the test set for each + cv split. (Note time for scoring on the train set is not + included even if ``return_train_score`` is set to ``True`` + ``estimator`` + The estimator objects for each cv split. + This is available only if ``return_estimator`` parameter + is set to ``True``. + ``indices`` + The train/test positional indices for each cv split. 
A dictionary + is returned where the keys are either `"train"` or `"test"` + and the associated values are a list of integer-dtyped NumPy + arrays with the indices. Available only if `return_indices=True`. + + See Also + -------- + cross_val_score : Run cross-validation for single metric evaluation. + + cross_val_predict : Get predictions from each split of cross-validation for + diagnostic purposes. + + sklearn.metrics.make_scorer : Make a scorer from a performance metric or + loss function. + + Examples + -------- + >>> from sklearn import datasets, linear_model + >>> from sklearn.model_selection import cross_validate + >>> from sklearn.metrics import make_scorer + >>> from sklearn.metrics import confusion_matrix + >>> from sklearn.svm import LinearSVC + >>> diabetes = datasets.load_diabetes() + >>> X = diabetes.data[:150] + >>> y = diabetes.target[:150] + >>> lasso = linear_model.Lasso() + + Single metric evaluation using ``cross_validate`` + + >>> cv_results = cross_validate(lasso, X, y, cv=3) + >>> sorted(cv_results.keys()) + ['fit_time', 'score_time', 'test_score'] + >>> cv_results['test_score'] + array([0.3315057 , 0.08022103, 0.03531816]) + + Multiple metric evaluation using ``cross_validate`` + (please refer the ``scoring`` parameter doc for more information) + + >>> scores = cross_validate(lasso, X, y, cv=3, + ... scoring=('r2', 'neg_mean_squared_error'), + ... return_train_score=True) + >>> print(scores['test_neg_mean_squared_error']) + [-3635.5... -3573.3... -6114.7...] + >>> print(scores['train_r2']) + [0.28009951 0.3908844 0.22784907] + """ + _check_groups_routing_disabled(groups) + + X, y = indexable(X, y) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + + scorers = check_scoring( + estimator, scoring=scoring, raise_exc=(error_score == "raise") + ) + + if _routing_enabled(): + # For estimators, a MetadataRouter is created in get_metadata_routing + # methods. For these router methods, we create the router to use + # `process_routing` on it. + router = ( + MetadataRouter(owner="cross_validate") + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata to the predict method for + # scoring? + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + .add( + scorer=scorers, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + unrequested_params = sorted(e.unrequested_params) + raise UnsetMetadataPassedError( + message=( + f"{unrequested_params} are passed to cross validation but are not" + " explicitly set as requested or not requested for cross_validate's" + f" estimator: {estimator.__class__.__name__}. Call" + " `.set_fit_request({{metadata}}=True)` on the estimator for" + f" each metadata in {unrequested_params} that you" + " want to use and `metadata=False` for not using it. See the" + " Metadata Routing User guide" + " for more" + " information." 
+ ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.estimator = Bunch(fit=params) + routed_params.scorer = Bunch(score={}) + + indices = cv.split(X, y, **routed_params.splitter.split) + if return_indices: + # materialize the indices since we need to store them in the returned dict + indices = list(indices) + + # We clone the estimator to make sure that all the folds are + # independent, and that it is pickle-able. + parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) + results = parallel( + delayed(_fit_and_score)( + clone(estimator), + X, + y, + scorer=scorers, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + return_train_score=return_train_score, + return_times=True, + return_estimator=return_estimator, + error_score=error_score, + ) + for train, test in indices + ) + + _warn_or_raise_about_fit_failures(results, error_score) + + # For callable scoring, the return type is only know after calling. If the + # return type is a dictionary, the error scores can now be inserted with + # the correct key. + if callable(scoring): + _insert_error_scores(results, error_score) + + results = _aggregate_score_dicts(results) + + ret = {} + ret["fit_time"] = results["fit_time"] + ret["score_time"] = results["score_time"] + + if return_estimator: + ret["estimator"] = results["estimator"] + + if return_indices: + ret["indices"] = {} + ret["indices"]["train"], ret["indices"]["test"] = zip(*indices) + + test_scores_dict = _normalize_score_results(results["test_scores"]) + if return_train_score: + train_scores_dict = _normalize_score_results(results["train_scores"]) + + for name in test_scores_dict: + ret["test_%s" % name] = test_scores_dict[name] + if return_train_score: + key = "train_%s" % name + ret[key] = train_scores_dict[name] + + return ret + + +def _insert_error_scores(results, error_score): + """Insert error in `results` by replacing them inplace with `error_score`. + + This only applies to multimetric scores because `_fit_and_score` will + handle the single metric case. 
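+
+    A minimal sketch of the in-place replacement (hypothetical `results`
+    entries, with `nan` as the error score):
+
+    >>> results = [
+    ...     {"fit_error": "ValueError", "test_scores": None},
+    ...     {"fit_error": None, "test_scores": {"r2": 0.5}},
+    ... ]
+    >>> _insert_error_scores(results, error_score=float("nan"))
+    >>> results[0]["test_scores"]
+    {'r2': nan}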
+    """
+    successful_score = None
+    failed_indices = []
+    for i, result in enumerate(results):
+        if result["fit_error"] is not None:
+            failed_indices.append(i)
+        elif successful_score is None:
+            successful_score = result["test_scores"]
+
+    if isinstance(successful_score, dict):
+        formatted_error = {name: error_score for name in successful_score}
+        for i in failed_indices:
+            results[i]["test_scores"] = formatted_error.copy()
+            if "train_scores" in results[i]:
+                results[i]["train_scores"] = formatted_error.copy()
+
+
+def _normalize_score_results(scores, scalar_score_key="score"):
+    """Create a scoring dictionary based on the type of `scores`."""
+    if isinstance(scores[0], dict):
+        # multimetric scoring
+        return _aggregate_score_dicts(scores)
+    # scalar
+    return {scalar_score_key: scores}
+
+
+def _warn_or_raise_about_fit_failures(results, error_score):
+    fit_errors = [
+        result["fit_error"] for result in results if result["fit_error"] is not None
+    ]
+    if fit_errors:
+        num_failed_fits = len(fit_errors)
+        num_fits = len(results)
+        fit_errors_counter = Counter(fit_errors)
+        delimiter = "-" * 80 + "\n"
+        fit_errors_summary = "\n".join(
+            f"{delimiter}{n} fits failed with the following error:\n{error}"
+            for error, n in fit_errors_counter.items()
+        )
+
+        if num_failed_fits == num_fits:
+            all_fits_failed_message = (
+                f"\nAll the {num_fits} fits failed.\n"
+                "It is very likely that your model is misconfigured.\n"
+                "You can try to debug the error by setting error_score='raise'.\n\n"
+                f"Below are more details about the failures:\n{fit_errors_summary}"
+            )
+            raise ValueError(all_fits_failed_message)
+
+        else:
+            some_fits_failed_message = (
+                f"\n{num_failed_fits} fits failed out of a total of {num_fits}.\n"
+                "The score on these train-test partitions for these parameters"
+                f" will be set to {error_score}.\n"
+                "If these failures are not expected, you can try to debug them "
+                "by setting error_score='raise'.\n\n"
+                f"Below are more details about the failures:\n{fit_errors_summary}"
+            )
+            warnings.warn(some_fits_failed_message, FitFailedWarning)
+
+
+@validate_params(
+    {
+        "estimator": [HasMethods("fit")],
+        "X": ["array-like", "sparse matrix"],
+        "y": ["array-like", None],
+        "groups": ["array-like", None],
+        "scoring": [StrOptions(set(get_scorer_names())), callable, None],
+        "cv": ["cv_object"],
+        "n_jobs": [Integral, None],
+        "verbose": ["verbose"],
+        "params": [dict, None],
+        "pre_dispatch": [Integral, str, None],
+        "error_score": [StrOptions({"raise"}), Real],
+    },
+    prefer_skip_nested_validation=False,  # estimator is not validated yet
+)
+def cross_val_score(
+    estimator,
+    X,
+    y=None,
+    *,
+    groups=None,
+    scoring=None,
+    cv=None,
+    n_jobs=None,
+    verbose=0,
+    params=None,
+    pre_dispatch="2*n_jobs",
+    error_score=np.nan,
+):
+    """Evaluate a score by cross-validation.
+
+    Read more in the :ref:`User Guide <cross_validation>`.
+
+    Parameters
+    ----------
+    estimator : estimator object implementing 'fit'
+        The object to use to fit the data.
+
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        The data to fit. Can be for example a list, or an array.
+
+    y : array-like of shape (n_samples,) or (n_samples, n_outputs), \
+            default=None
+        The target variable to try to predict in the case of
+        supervised learning.
+
+    groups : array-like of shape (n_samples,), default=None
+        Group labels for the samples used while splitting the dataset into
+        train/test set. Only used in conjunction with a "Group" :term:`cv`
+        instance (e.g., :class:`GroupKFold`).
+
+        ..
versionchanged:: 1.4 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``cross_val_score(..., params={'groups': groups})``. + + scoring : str or callable, default=None + A str (see :ref:`scoring_parameter`) or a scorer callable object / function with + signature ``scorer(estimator, X, y)`` which should return only a single value. + + Similar to :func:`cross_validate` + but only a single metric is permitted. + + If `None`, the estimator's default scorer (if available) is used. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - `None`, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable that generates (train, test) splits as arrays of indices. + + For `int`/`None` inputs, if the estimator is a classifier and `y` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + `cv` default value if `None` changed from 3-fold to 5-fold. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and computing + the score are parallelized over the cross-validation splits. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + The verbosity level. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit``, the scorer, + and the CV splitter. + + .. versionadded:: 1.4 + + pre_dispatch : int or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - ``None``, in which case all the jobs are immediately created and spawned. Use + this for lightweight and fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. + If a numeric value is given, FitFailedWarning is raised. + + .. versionadded:: 0.20 + + Returns + ------- + scores : ndarray of float of shape=(len(list(cv)),) + Array of scores of the estimator for each run of the cross validation. + + See Also + -------- + cross_validate : To run cross-validation on multiple metrics and also to + return train scores, fit times and score times. + + cross_val_predict : Get predictions from each split of cross-validation for + diagnostic purposes. + + sklearn.metrics.make_scorer : Make a scorer from a performance metric or + loss function. 
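+
+    Notes
+    -----
+    This function is a thin convenience wrapper around :func:`cross_validate`:
+    it forwards all arguments with a single scorer and returns only the
+    ``test_score`` array from the resulting dictionary.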
+
+    Examples
+    --------
+    >>> from sklearn import datasets, linear_model
+    >>> from sklearn.model_selection import cross_val_score
+    >>> diabetes = datasets.load_diabetes()
+    >>> X = diabetes.data[:150]
+    >>> y = diabetes.target[:150]
+    >>> lasso = linear_model.Lasso()
+    >>> print(cross_val_score(lasso, X, y, cv=3))
+    [0.3315057 0.08022103 0.03531816]
+    """
+    # To ensure the multimetric format is not supported (single scorer only)
+    scorer = check_scoring(estimator, scoring=scoring)
+
+    cv_results = cross_validate(
+        estimator=estimator,
+        X=X,
+        y=y,
+        groups=groups,
+        scoring={"score": scorer},
+        cv=cv,
+        n_jobs=n_jobs,
+        verbose=verbose,
+        params=params,
+        pre_dispatch=pre_dispatch,
+        error_score=error_score,
+    )
+    return cv_results["test_score"]
+
+
+def _fit_and_score(
+    estimator,
+    X,
+    y,
+    *,
+    scorer,
+    train,
+    test,
+    verbose,
+    parameters,
+    fit_params,
+    score_params,
+    return_train_score=False,
+    return_parameters=False,
+    return_n_test_samples=False,
+    return_times=False,
+    return_estimator=False,
+    split_progress=None,
+    candidate_progress=None,
+    error_score=np.nan,
+):
+    """Fit estimator and compute scores for a given dataset split.
+
+    Parameters
+    ----------
+    estimator : estimator object implementing 'fit'
+        The object to use to fit the data.
+
+    X : array-like of shape (n_samples, n_features)
+        The data to fit.
+
+    y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None
+        The target variable to try to predict in the case of
+        supervised learning.
+
+    scorer : A single callable or dict mapping scorer name to the callable
+        If it is a single callable, the return value for ``train_scores`` and
+        ``test_scores`` is a single float.
+
+        For a dict, it should be one mapping the scorer name to the scorer
+        callable object / function.
+
+        The callable object / function should have signature
+        ``scorer(estimator, X, y)``.
+
+    train : array-like of shape (n_train_samples,)
+        Indices of training samples.
+
+    test : array-like of shape (n_test_samples,)
+        Indices of test samples.
+
+    verbose : int
+        The verbosity level.
+
+    error_score : 'raise' or numeric, default=np.nan
+        Value to assign to the score if an error occurs in estimator fitting.
+        If set to 'raise', the error is raised.
+        If a numeric value is given, FitFailedWarning is raised.
+
+    parameters : dict or None
+        Parameters to be set on the estimator.
+
+    fit_params : dict or None
+        Parameters that will be passed to ``estimator.fit``.
+
+    score_params : dict or None
+        Parameters that will be passed to the scorer.
+
+    return_train_score : bool, default=False
+        Compute and return score on training set.
+
+    return_parameters : bool, default=False
+        Return parameters that have been used for the estimator.
+
+    split_progress : {list, tuple} of int, default=None
+        A list or tuple of format (<current_split_id>, <max_nb_of_splits>).
+
+    candidate_progress : {list, tuple} of int, default=None
+        A list or tuple of format
+        (<current_candidate_id>, <max_nb_of_candidates>).
+
+    return_n_test_samples : bool, default=False
+        Whether to return the ``n_test_samples``.
+
+    return_times : bool, default=False
+        Whether to return the fit/score times.
+
+    return_estimator : bool, default=False
+        Whether to return the fitted estimator.
+
+    Returns
+    -------
+    result : dict with the following attributes
+        train_scores : dict of scorer name -> float
+            Score on training set (for all the scorers),
+            returned only if `return_train_score` is `True`.
+        test_scores : dict of scorer name -> float
+            Score on testing set (for all the scorers).
+        n_test_samples : int
+            Number of test samples.
+        fit_time : float
+            Time spent for fitting in seconds.
+ score_time : float + Time spent for scoring in seconds. + parameters : dict or None + The parameters that have been evaluated. + estimator : estimator object + The fitted estimator. + fit_error : str or None + Traceback str if the fit failed, None if the fit succeeded. + """ + xp, _ = get_namespace(X) + X_device = device(X) + + # Make sure that we can fancy index X even if train and test are provided + # as NumPy arrays by NumPy only cross-validation splitters. + train, test = xp.asarray(train, device=X_device), xp.asarray(test, device=X_device) + + if not isinstance(error_score, numbers.Number) and error_score != "raise": + raise ValueError( + "error_score must be the string 'raise' or a numeric value. " + "(Hint: if using 'raise', please make sure that it has been " + "spelled correctly.)" + ) + + progress_msg = "" + if verbose > 2: + if split_progress is not None: + progress_msg = f" {split_progress[0]+1}/{split_progress[1]}" + if candidate_progress and verbose > 9: + progress_msg += f"; {candidate_progress[0]+1}/{candidate_progress[1]}" + + if verbose > 1: + if parameters is None: + params_msg = "" + else: + sorted_keys = sorted(parameters) # Ensure deterministic o/p + params_msg = ", ".join(f"{k}={parameters[k]}" for k in sorted_keys) + if verbose > 9: + start_msg = f"[CV{progress_msg}] START {params_msg}" + print(f"{start_msg}{(80 - len(start_msg)) * '.'}") + + # Adjust length of sample weights + fit_params = fit_params if fit_params is not None else {} + fit_params = _check_method_params(X, params=fit_params, indices=train) + score_params = score_params if score_params is not None else {} + score_params_train = _check_method_params(X, params=score_params, indices=train) + score_params_test = _check_method_params(X, params=score_params, indices=test) + + if parameters is not None: + # here we clone the parameters, since sometimes the parameters + # themselves might be estimators, e.g. when we search over different + # estimators in a pipeline. 
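+        # (illustrative: a grid such as {"clf": [SVC(), LogisticRegression()]}
+        # makes `parameters["clf"]` an estimator instance, which must be
+        # cloned so that folds do not share fitted state)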
+ # ref: https://github.com/scikit-learn/scikit-learn/pull/26786 + estimator = estimator.set_params(**clone(parameters, safe=False)) + + start_time = time.time() + + X_train, y_train = _safe_split(estimator, X, y, train) + X_test, y_test = _safe_split(estimator, X, y, test, train) + + result = {} + try: + if y_train is None: + estimator.fit(X_train, **fit_params) + else: + estimator.fit(X_train, y_train, **fit_params) + + except Exception: + # Note fit time as time until error + fit_time = time.time() - start_time + score_time = 0.0 + if error_score == "raise": + raise + elif isinstance(error_score, numbers.Number): + if isinstance(scorer, _MultimetricScorer): + test_scores = {name: error_score for name in scorer._scorers} + if return_train_score: + train_scores = test_scores.copy() + else: + test_scores = error_score + if return_train_score: + train_scores = error_score + result["fit_error"] = format_exc() + else: + result["fit_error"] = None + + fit_time = time.time() - start_time + test_scores = _score( + estimator, X_test, y_test, scorer, score_params_test, error_score + ) + score_time = time.time() - start_time - fit_time + if return_train_score: + train_scores = _score( + estimator, X_train, y_train, scorer, score_params_train, error_score + ) + + if verbose > 1: + total_time = score_time + fit_time + end_msg = f"[CV{progress_msg}] END " + result_msg = params_msg + (";" if params_msg else "") + if verbose > 2: + if isinstance(test_scores, dict): + for scorer_name in sorted(test_scores): + result_msg += f" {scorer_name}: (" + if return_train_score: + scorer_scores = train_scores[scorer_name] + result_msg += f"train={scorer_scores:.3f}, " + result_msg += f"test={test_scores[scorer_name]:.3f})" + else: + result_msg += ", score=" + if return_train_score: + result_msg += f"(train={train_scores:.3f}, test={test_scores:.3f})" + else: + result_msg += f"{test_scores:.3f}" + result_msg += f" total time={logger.short_format_time(total_time)}" + + # Right align the result_msg + end_msg += "." * (80 - len(end_msg) - len(result_msg)) + end_msg += result_msg + print(end_msg) + + result["test_scores"] = test_scores + if return_train_score: + result["train_scores"] = train_scores + if return_n_test_samples: + result["n_test_samples"] = _num_samples(X_test) + if return_times: + result["fit_time"] = fit_time + result["score_time"] = score_time + if return_parameters: + result["parameters"] = parameters + if return_estimator: + result["estimator"] = estimator + return result + + +def _score(estimator, X_test, y_test, scorer, score_params, error_score="raise"): + """Compute the score(s) of an estimator on a given test set. + + Will return a dict of floats if `scorer` is a _MultiMetricScorer, otherwise a single + float is returned. + """ + score_params = {} if score_params is None else score_params + + try: + if y_test is None: + scores = scorer(estimator, X_test, **score_params) + else: + scores = scorer(estimator, X_test, y_test, **score_params) + except Exception: + if isinstance(scorer, _MultimetricScorer): + # If `_MultimetricScorer` raises exception, the `error_score` + # parameter is equal to "raise". + raise + else: + if error_score == "raise": + raise + else: + scores = error_score + warnings.warn( + ( + "Scoring failed. The score on this train-test partition for " + f"these parameters will be set to {error_score}. 
Details: \n"
+                    f"{format_exc()}"
+                ),
+                UserWarning,
+            )
+
+    # Check non-raised error messages in `_MultimetricScorer`
+    if isinstance(scorer, _MultimetricScorer):
+        exception_messages = [
+            (name, str_e) for name, str_e in scores.items() if isinstance(str_e, str)
+        ]
+        if exception_messages:
+            # error_score != "raise"
+            for name, str_e in exception_messages:
+                scores[name] = error_score
+                warnings.warn(
+                    (
+                        "Scoring failed. The score on this train-test partition for "
+                        f"these parameters will be set to {error_score}. Details: \n"
+                        f"{str_e}"
+                    ),
+                    UserWarning,
+                )
+
+    error_msg = "scoring must return a number, got %s (%s) instead. (scorer=%s)"
+    if isinstance(scores, dict):
+        for name, score in scores.items():
+            if hasattr(score, "item"):
+                with suppress(ValueError):
+                    # e.g. unwrap memmapped scalars
+                    score = score.item()
+            if not isinstance(score, numbers.Number):
+                raise ValueError(error_msg % (score, type(score), name))
+            scores[name] = score
+    else:  # scalar
+        if hasattr(scores, "item"):
+            with suppress(ValueError):
+                # e.g. unwrap memmapped scalars
+                scores = scores.item()
+        if not isinstance(scores, numbers.Number):
+            raise ValueError(error_msg % (scores, type(scores), scorer))
+    return scores
+
+
+@validate_params(
+    {
+        "estimator": [HasMethods(["fit", "predict"])],
+        "X": ["array-like", "sparse matrix"],
+        "y": ["array-like", "sparse matrix", None],
+        "groups": ["array-like", None],
+        "cv": ["cv_object"],
+        "n_jobs": [Integral, None],
+        "verbose": ["verbose"],
+        "params": [dict, None],
+        "pre_dispatch": [Integral, str, None],
+        "method": [
+            StrOptions(
+                {
+                    "predict",
+                    "predict_proba",
+                    "predict_log_proba",
+                    "decision_function",
+                }
+            )
+        ],
+    },
+    prefer_skip_nested_validation=False,  # estimator is not validated yet
+)
+def cross_val_predict(
+    estimator,
+    X,
+    y=None,
+    *,
+    groups=None,
+    cv=None,
+    n_jobs=None,
+    verbose=0,
+    params=None,
+    pre_dispatch="2*n_jobs",
+    method="predict",
+):
+    """Generate cross-validated estimates for each input data point.
+
+    The data is split according to the cv parameter. Each sample belongs
+    to exactly one test set, and its prediction is computed with an
+    estimator fitted on the corresponding training set.
+
+    Passing these predictions into an evaluation metric may not be a valid
+    way to measure generalization performance. Results can differ from
+    :func:`cross_validate` and :func:`cross_val_score` unless all test sets
+    have equal size and the metric decomposes over samples.
+
+    Read more in the :ref:`User Guide <cross_validation>`.
+
+    Parameters
+    ----------
+    estimator : estimator
+        The estimator instance to use to fit the data. It must implement a `fit`
+        method and the method given by the `method` parameter.
+
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        The data to fit. Can be, for example, a list or an array of at least 2d.
+
+    y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs), \
+            default=None
+        The target variable to try to predict in the case of
+        supervised learning.
+
+    groups : array-like of shape (n_samples,), default=None
+        Group labels for the samples used while splitting the dataset into
+        train/test set. Only used in conjunction with a "Group" :term:`cv`
+        instance (e.g., :class:`GroupKFold`).
+
+        .. versionchanged:: 1.4
+            ``groups`` can only be passed if metadata routing is not enabled
+            via ``sklearn.set_config(enable_metadata_routing=True)``. When routing
+            is enabled, pass ``groups`` alongside other metadata via the ``params``
+            argument instead.
E.g.: + ``cross_val_predict(..., params={'groups': groups})``. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable that generates (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and + predicting are parallelized over the cross-validation splits. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + The verbosity level. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit`` and the CV + splitter. + + .. versionadded:: 1.4 + + pre_dispatch : int or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately created and spawned. Use + this for lightweight and fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + method : {'predict', 'predict_proba', 'predict_log_proba', \ + 'decision_function'}, default='predict' + The method to be invoked by `estimator`. + + Returns + ------- + predictions : ndarray + This is the result of calling `method`. Shape: + + - When `method` is 'predict' and in special case where `method` is + 'decision_function' and the target is binary: (n_samples,) + - When `method` is one of {'predict_proba', 'predict_log_proba', + 'decision_function'} (unless special case above): + (n_samples, n_classes) + - If `estimator` is :term:`multioutput`, an extra dimension + 'n_outputs' is added to the end of each shape above. + + See Also + -------- + cross_val_score : Calculate score for each CV split. + cross_validate : Calculate one or more scores and timings for each CV + split. + + Notes + ----- + In the case that one or more classes are absent in a training portion, a + default score needs to be assigned to all instances for that class if + ``method`` produces columns per class, as in {'decision_function', + 'predict_proba', 'predict_log_proba'}. For ``predict_proba`` this value is + 0. In order to ensure finite output, we approximate negative infinity by + the minimum finite float value for the dtype in other cases. 
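+
+    For instance, if class ``2`` is missing from one training fold, that
+    fold's ``predict_proba`` output gains a column of zeros at position 2,
+    so every fold returns an array with the same ``(n_samples, n_classes)``
+    shape and column order.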
+ + Examples + -------- + >>> from sklearn import datasets, linear_model + >>> from sklearn.model_selection import cross_val_predict + >>> diabetes = datasets.load_diabetes() + >>> X = diabetes.data[:150] + >>> y = diabetes.target[:150] + >>> lasso = linear_model.Lasso() + >>> y_pred = cross_val_predict(lasso, X, y, cv=3) + """ + _check_groups_routing_disabled(groups) + X, y = indexable(X, y) + + if _routing_enabled(): + # For estimators, a MetadataRouter is created in get_metadata_routing + # methods. For these router methods, we create the router to use + # `process_routing` on it. + router = ( + MetadataRouter(owner="cross_validate") + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata for the predict method. + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + ) + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + unrequested_params = sorted(e.unrequested_params) + raise UnsetMetadataPassedError( + message=( + f"{unrequested_params} are passed to `cross_val_predict` but are" + " not explicitly set as requested or not requested for" + f" cross_validate's estimator: {estimator.__class__.__name__} Call" + " `.set_fit_request({{metadata}}=True)` on the estimator for" + f" each metadata in {unrequested_params} that you want to use and" + " `metadata=False` for not using it. See the Metadata Routing User" + " guide " + " for more information." + ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.estimator = Bunch(fit=params) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + splits = list(cv.split(X, y, **routed_params.splitter.split)) + + test_indices = np.concatenate([test for _, test in splits]) + if not _check_is_permutation(test_indices, _num_samples(X)): + raise ValueError("cross_val_predict only works for partitions") + + # If classification methods produce multiple columns of output, + # we need to manually encode classes to ensure consistent column ordering. + encode = ( + method in ["decision_function", "predict_proba", "predict_log_proba"] + and y is not None + ) + if encode: + y = np.asarray(y) + if y.ndim == 1: + le = LabelEncoder() + y = le.fit_transform(y) + elif y.ndim == 2: + y_enc = np.zeros_like(y, dtype=int) + for i_label in range(y.shape[1]): + y_enc[:, i_label] = LabelEncoder().fit_transform(y[:, i_label]) + y = y_enc + + # We clone the estimator to make sure that all the folds are + # independent, and that it is pickle-able. 
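+    # Illustrative note (not part of scikit-learn, assuming NumPy semantics):
+    # the fold predictions gathered below are concatenated in test-fold order
+    # and restored to input order with an inverse permutation, e.g.:
+    #     test_indices = np.array([2, 0, 1])
+    #     inv = np.empty(3, dtype=int)
+    #     inv[test_indices] = np.arange(3)   # inv == array([1, 2, 0])
+    #     predictions[inv]                   # back to sample order 0, 1, 2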
+ parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) + predictions = parallel( + delayed(_fit_and_predict)( + clone(estimator), + X, + y, + train, + test, + routed_params.estimator.fit, + method, + ) + for train, test in splits + ) + + inv_test_indices = np.empty(len(test_indices), dtype=int) + inv_test_indices[test_indices] = np.arange(len(test_indices)) + + if sp.issparse(predictions[0]): + predictions = sp.vstack(predictions, format=predictions[0].format) + elif encode and isinstance(predictions[0], list): + # `predictions` is a list of method outputs from each fold. + # If each of those is also a list, then treat this as a + # multioutput-multiclass task. We need to separately concatenate + # the method outputs for each label into an `n_labels` long list. + n_labels = y.shape[1] + concat_pred = [] + for i_label in range(n_labels): + label_preds = np.concatenate([p[i_label] for p in predictions]) + concat_pred.append(label_preds) + predictions = concat_pred + else: + predictions = np.concatenate(predictions) + + if isinstance(predictions, list): + return [p[inv_test_indices] for p in predictions] + else: + return predictions[inv_test_indices] + + +def _fit_and_predict(estimator, X, y, train, test, fit_params, method): + """Fit estimator and predict values for a given dataset split. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object implementing 'fit' and 'predict' + The object to use to fit the data. + + X : array-like of shape (n_samples, n_features) + The data to fit. + + .. versionchanged:: 0.20 + X is only required to be an object with finite length or shape now + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + The target variable to try to predict in the case of + supervised learning. + + train : array-like of shape (n_train_samples,) + Indices of training samples. + + test : array-like of shape (n_test_samples,) + Indices of test samples. + + fit_params : dict or None + Parameters that will be passed to ``estimator.fit``. + + method : str + Invokes the passed method name of the passed estimator. 
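+        For example, ``method='predict_proba'`` makes each fold return the
+        class-probability matrix for its test samples.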
+ + Returns + ------- + predictions : sequence + Result of calling 'estimator.method' + """ + # Adjust length of sample weights + fit_params = fit_params if fit_params is not None else {} + fit_params = _check_method_params(X, params=fit_params, indices=train) + + X_train, y_train = _safe_split(estimator, X, y, train) + X_test, _ = _safe_split(estimator, X, y, test, train) + + if y_train is None: + estimator.fit(X_train, **fit_params) + else: + estimator.fit(X_train, y_train, **fit_params) + func = getattr(estimator, method) + predictions = func(X_test) + + encode = ( + method in ["decision_function", "predict_proba", "predict_log_proba"] + and y is not None + ) + + if encode: + if isinstance(predictions, list): + predictions = [ + _enforce_prediction_order( + estimator.classes_[i_label], + predictions[i_label], + n_classes=len(set(y[:, i_label])), + method=method, + ) + for i_label in range(len(predictions)) + ] + else: + # A 2D y array should be a binary label indicator matrix + n_classes = len(set(y)) if y.ndim == 1 else y.shape[1] + predictions = _enforce_prediction_order( + estimator.classes_, predictions, n_classes, method + ) + return predictions + + +def _enforce_prediction_order(classes, predictions, n_classes, method): + """Ensure that prediction arrays have correct column order + + When doing cross-validation, if one or more classes are + not present in the subset of data used for training, + then the output prediction array might not have the same + columns as other folds. Use the list of class names + (assumed to be ints) to enforce the correct column order. + + Note that `classes` is the list of classes in this fold + (a subset of the classes in the full training set) + and `n_classes` is the number of classes in the full training set. + """ + if n_classes != len(classes): + recommendation = ( + "To fix this, use a cross-validation " + "technique resulting in properly " + "stratified folds" + ) + warnings.warn( + "Number of classes in training fold ({}) does " + "not match total number of classes ({}). " + "Results may not be appropriate for your use case. " + "{}".format(len(classes), n_classes, recommendation), + RuntimeWarning, + ) + if method == "decision_function": + if predictions.ndim == 2 and predictions.shape[1] != len(classes): + # This handles the case when the shape of predictions + # does not match the number of classes used to train + # it with. This case is found when sklearn.svm.SVC is + # set to `decision_function_shape='ovo'`. + raise ValueError( + "Output shape {} of {} does not match " + "number of classes ({}) in fold. " + "Irregular decision_function outputs " + "are not currently supported by " + "cross_val_predict".format(predictions.shape, method, len(classes)) + ) + if len(classes) <= 2: + # In this special case, `predictions` contains a 1D array. + raise ValueError( + "Only {} class/es in training fold, but {} " + "in overall dataset. This " + "is not supported for decision_function " + "with imbalanced folds. 
{}".format(
+                    len(classes), n_classes, recommendation
+                )
+            )
+
+        float_min = np.finfo(predictions.dtype).min
+        default_values = {
+            "decision_function": float_min,
+            "predict_log_proba": float_min,
+            "predict_proba": 0,
+        }
+        predictions_for_all_classes = np.full(
+            (_num_samples(predictions), n_classes),
+            default_values[method],
+            dtype=predictions.dtype,
+        )
+        predictions_for_all_classes[:, classes] = predictions
+        predictions = predictions_for_all_classes
+    return predictions
+
+
+def _check_is_permutation(indices, n_samples):
+    """Check whether `indices` is a reordering of the array np.arange(n_samples).
+
+    Parameters
+    ----------
+    indices : ndarray
+        int array to test
+    n_samples : int
+        number of expected elements
+
+    Returns
+    -------
+    is_partition : bool
+        True iff sorted(indices) is np.arange(n_samples)
+    """
+    if len(indices) != n_samples:
+        return False
+    hit = np.zeros(n_samples, dtype=bool)
+    hit[indices] = True
+    if not np.all(hit):
+        return False
+    return True
+
+
+@validate_params(
+    {
+        "estimator": [HasMethods("fit")],
+        "X": ["array-like", "sparse matrix"],
+        "y": ["array-like", None],
+        "groups": ["array-like", None],
+        "cv": ["cv_object"],
+        "n_permutations": [Interval(Integral, 1, None, closed="left")],
+        "n_jobs": [Integral, None],
+        "random_state": ["random_state"],
+        "verbose": ["verbose"],
+        "scoring": [StrOptions(set(get_scorer_names())), callable, None],
+        "fit_params": [dict, None],
+        "params": [dict, None],
+    },
+    prefer_skip_nested_validation=False,  # estimator is not validated yet
+)
+def permutation_test_score(
+    estimator,
+    X,
+    y,
+    *,
+    groups=None,
+    cv=None,
+    n_permutations=100,
+    n_jobs=None,
+    random_state=0,
+    verbose=0,
+    scoring=None,
+    fit_params=None,
+    params=None,
+):
+    """Evaluate the significance of a cross-validated score with permutations.
+
+    Permutes targets to generate 'randomized data' and computes the empirical
+    p-value against the null hypothesis that features and targets are
+    independent.
+
+    The p-value represents the fraction of randomized data sets where the
+    estimator performed as well as or better than on the original data. A
+    small p-value suggests that there is a real dependency between features
+    and targets which has been used by the estimator to give good predictions.
+    A large p-value may be due to a lack of real dependency between features
+    and targets, or to the estimator not being able to use the dependency to
+    give good predictions.
+
+    Read more in the :ref:`User Guide <permutation_test_score>`.
+
+    Parameters
+    ----------
+    estimator : estimator object implementing 'fit'
+        The object to use to fit the data.
+
+    X : array-like of shape at least 2D
+        The data to fit.
+
+    y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None
+        The target variable to try to predict in the case of
+        supervised learning.
+
+    groups : array-like of shape (n_samples,), default=None
+        Labels to constrain permutation within groups, i.e. ``y`` values
+        are permuted among samples with the same group identifier.
+        When not specified, ``y`` values are permuted among all samples.
+
+        When a grouped cross-validator is used, the group labels are
+        also passed on to the ``split`` method of the cross-validator. The
+        cross-validator uses them for grouping the samples while splitting
+        the dataset into train/test set.
+
+        .. versionchanged:: 1.6
+            ``groups`` can only be passed if metadata routing is not enabled
+            via ``sklearn.set_config(enable_metadata_routing=True)``. When routing
+            is enabled, pass ``groups`` alongside other metadata via the ``params``
+            argument instead. E.g.:
+            ``permutation_test_score(..., params={'groups': groups})``.
+
+    cv : int, cross-validation generator or an iterable, default=None
+        Determines the cross-validation splitting strategy.
+        Possible inputs for cv are:
+
+        - `None`, to use the default 5-fold cross validation,
+        - int, to specify the number of folds in a `(Stratified)KFold`,
+        - :term:`CV splitter`,
+        - An iterable yielding (train, test) splits as arrays of indices.
+
+        For `int`/`None` inputs, if the estimator is a classifier and `y` is
+        either binary or multiclass, :class:`StratifiedKFold` is used. In all
+        other cases, :class:`KFold` is used. These splitters are instantiated
+        with `shuffle=False` so the splits will be the same across calls.
+
+        Refer to the :ref:`User Guide <cross_validation>` for the various
+        cross-validation strategies that can be used here.
+
+        .. versionchanged:: 0.22
+            `cv` default value if `None` changed from 3-fold to 5-fold.
+
+    n_permutations : int, default=100
+        Number of times to permute ``y``.
+
+    n_jobs : int, default=None
+        Number of jobs to run in parallel. Training the estimator and computing
+        the cross-validated score are parallelized over the permutations.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    random_state : int, RandomState instance or None, default=0
+        Pass an int for reproducible output for permutation of
+        ``y`` values among samples. See :term:`Glossary <random_state>`.
+
+    verbose : int, default=0
+        The verbosity level.
+
+    scoring : str or callable, default=None
+        A single str (see :ref:`scoring_parameter`) or a callable
+        (see :ref:`scoring_callable`) to evaluate the predictions on the test set.
+
+        If `None`, the estimator's score method is used.
+
+    fit_params : dict, default=None
+        Parameters to pass to the fit method of the estimator.
+
+        .. deprecated:: 1.6
+            This parameter is deprecated and will be removed in version 1.8. Use
+            ``params`` instead.
+
+    params : dict, default=None
+        Parameters to pass to the `fit` method of the estimator, the scorer
+        and the cv splitter.
+
+        - If `enable_metadata_routing=False` (default): Parameters directly passed to
+          the `fit` method of the estimator.
+
+        - If `enable_metadata_routing=True`: Parameters safely routed to the `fit`
+          method of the estimator, `cv` object and `scorer`. See :ref:`Metadata
+          Routing User Guide <metadata_routing>` for more details.
+
+        .. versionadded:: 1.6
+
+    Returns
+    -------
+    score : float
+        The true score without permuting targets.
+
+    permutation_scores : array of shape (n_permutations,)
+        The scores obtained for each permutation.
+
+    pvalue : float
+        The p-value, which approximates the probability that the score would
+        be obtained by chance. This is calculated as:
+
+        `(C + 1) / (n_permutations + 1)`
+
+        where C is the number of permutations whose score >= the true score.
+
+        The best possible p-value is 1/(n_permutations + 1), the worst is 1.0.
+
+    Notes
+    -----
+    This function implements Test 1 in:
+
+    Ojala and Garriga. `Permutation Tests for Studying Classifier Performance
+    <http://www.jmlr.org/papers/volume11/ojala10a/ojala10a.pdf>`_. The
+    Journal of Machine Learning Research (2010) vol.
11 + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import permutation_test_score + >>> X, y = make_classification(random_state=0) + >>> estimator = LogisticRegression() + >>> score, permutation_scores, pvalue = permutation_test_score( + ... estimator, X, y, random_state=0 + ... ) + >>> print(f"Original Score: {score:.3f}") + Original Score: 0.810 + >>> print( + ... f"Permutation Scores: {permutation_scores.mean():.3f} +/- " + ... f"{permutation_scores.std():.3f}" + ... ) + Permutation Scores: 0.505 +/- 0.057 + >>> print(f"P-value: {pvalue:.3f}") + P-value: 0.010 + """ + params = _check_params_groups_deprecation(fit_params, params, groups, "1.8") + + X, y, groups = indexable(X, y, groups) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + scorer = check_scoring(estimator, scoring=scoring) + random_state = check_random_state(random_state) + + if _routing_enabled(): + router = ( + MetadataRouter(owner="permutation_test_score") + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata to the predict method for + # scoring? + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + scorer=scorer, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + unrequested_params = sorted(e.unrequested_params) + raise UnsetMetadataPassedError( + message=( + f"{unrequested_params} are passed to `permutation_test_score`" + " but are not explicitly set as requested or not requested" + " for permutation_test_score's" + f" estimator: {estimator.__class__.__name__}. Call" + " `.set_fit_request({{metadata}}=True)` on the estimator for" + f" each metadata in {unrequested_params} that you" + " want to use and `metadata=False` for not using it. See the" + " Metadata Routing User guide" + " for more" + " information." + ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + + else: + routed_params = Bunch() + routed_params.estimator = Bunch(fit=params) + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.scorer = Bunch(score={}) + + # We clone the estimator to make sure that all the folds are + # independent, and that it is pickle-able. 
+ score = _permutation_test_score( + clone(estimator), + X, + y, + cv, + scorer, + split_params=routed_params.splitter.split, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + ) + permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)( + delayed(_permutation_test_score)( + clone(estimator), + X, + _shuffle(y, groups, random_state), + cv, + scorer, + split_params=routed_params.splitter.split, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + ) + for _ in range(n_permutations) + ) + permutation_scores = np.array(permutation_scores) + pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1) + return score, permutation_scores, pvalue + + +def _permutation_test_score( + estimator, X, y, cv, scorer, split_params, fit_params, score_params +): + """Auxiliary function for permutation_test_score""" + # Adjust length of sample weights + fit_params = fit_params if fit_params is not None else {} + score_params = score_params if score_params is not None else {} + + avg_score = [] + for train, test in cv.split(X, y, **split_params): + X_train, y_train = _safe_split(estimator, X, y, train) + X_test, y_test = _safe_split(estimator, X, y, test, train) + fit_params_train = _check_method_params(X, params=fit_params, indices=train) + score_params_test = _check_method_params(X, params=score_params, indices=test) + estimator.fit(X_train, y_train, **fit_params_train) + avg_score.append(scorer(estimator, X_test, y_test, **score_params_test)) + return np.mean(avg_score) + + +def _shuffle(y, groups, random_state): + """Return a shuffled copy of y eventually shuffle among same groups.""" + if groups is None: + indices = random_state.permutation(len(y)) + else: + indices = np.arange(len(groups)) + for group in np.unique(groups): + this_mask = groups == group + indices[this_mask] = random_state.permutation(indices[this_mask]) + return _safe_indexing(y, indices) + + +@validate_params( + { + "estimator": [HasMethods(["fit"])], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", None], + "groups": ["array-like", None], + "train_sizes": ["array-like"], + "cv": ["cv_object"], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "exploit_incremental_learning": ["boolean"], + "n_jobs": [Integral, None], + "pre_dispatch": [Integral, str], + "verbose": ["verbose"], + "shuffle": ["boolean"], + "random_state": ["random_state"], + "error_score": [StrOptions({"raise"}), Real], + "return_times": ["boolean"], + "fit_params": [dict, None], + "params": [dict, None], + }, + prefer_skip_nested_validation=False, # estimator is not validated yet +) +def learning_curve( + estimator, + X, + y, + *, + groups=None, + train_sizes=np.linspace(0.1, 1.0, 5), + cv=None, + scoring=None, + exploit_incremental_learning=False, + n_jobs=None, + pre_dispatch="all", + verbose=0, + shuffle=False, + random_state=None, + error_score=np.nan, + return_times=False, + fit_params=None, + params=None, +): + """Learning curve. + + Determines cross-validated training and test scores for different training + set sizes. + + A cross-validation generator splits the whole dataset k times in training + and test data. Subsets of the training set with varying sizes will be used + to train the estimator and a score for each training subset size and the + test set will be computed. Afterwards, the scores will be averaged over + all k runs for each training subset size. + + Read more in the :ref:`User Guide `. 
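+
+    For example, with the default 5-fold strategy on 100 samples, each
+    training split holds 80 samples, so ``train_sizes=[0.3, 0.6, 0.9]``
+    maps to absolute subset sizes of 24, 48 and 72, as in the Examples
+    section below.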
+ + Parameters + ---------- + estimator : object type that implements the "fit" method + An object of that type which is cloned for each validation. It must + also implement "predict" unless `scoring` is a callable that doesn't + rely on "predict" to compute a score. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + .. versionchanged:: 1.6 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``learning_curve(..., params={'groups': groups})``. + + train_sizes : array-like of shape (n_ticks,), \ + default=np.linspace(0.1, 1.0, 5) + Relative or absolute numbers of training examples that will be used to + generate the learning curve. If the dtype is float, it is regarded as a + fraction of the maximum size of the training set (that is determined + by the selected validation method), i.e. it has to be within (0, 1]. + Otherwise it is interpreted as absolute sizes of the training sets. + Note that for classification the number of samples usually has to + be big enough to contain at least one sample from each class. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + scoring : str or callable, default=None + A str (see :ref:`scoring_parameter`) or a scorer callable object / function with + signature ``scorer(estimator, X, y)``. + + exploit_incremental_learning : bool, default=False + If the estimator supports incremental learning, this will be + used to speed up fitting for different training set sizes. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and computing + the score are parallelized over the different training and test sets. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. 
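+        For example, ``pre_dispatch='2*n_jobs'`` with ``n_jobs=4`` keeps at
+        most eight jobs dispatched at any time.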
+
+    verbose : int, default=0
+        Controls the verbosity: the higher, the more messages.
+
+    shuffle : bool, default=False
+        Whether to shuffle training data before taking prefixes of it
+        based on ``train_sizes``.
+
+    random_state : int, RandomState instance or None, default=None
+        Used when ``shuffle`` is True. Pass an int for reproducible
+        output across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    error_score : 'raise' or numeric, default=np.nan
+        Value to assign to the score if an error occurs in estimator fitting.
+        If set to 'raise', the error is raised.
+        If a numeric value is given, FitFailedWarning is raised.
+
+        .. versionadded:: 0.20
+
+    return_times : bool, default=False
+        Whether to return the fit and score times.
+
+    fit_params : dict, default=None
+        Parameters to pass to the fit method of the estimator.
+
+        .. deprecated:: 1.6
+            This parameter is deprecated and will be removed in version 1.8. Use
+            ``params`` instead.
+
+    params : dict, default=None
+        Parameters to pass to the `fit` method of the estimator and to the scorer.
+
+        - If `enable_metadata_routing=False` (default): Parameters directly passed to
+          the `fit` method of the estimator.
+
+        - If `enable_metadata_routing=True`: Parameters safely routed to the `fit`
+          method of the estimator. See :ref:`Metadata Routing User Guide
+          <metadata_routing>` for more details.
+
+        .. versionadded:: 1.6
+
+    Returns
+    -------
+    train_sizes_abs : array of shape (n_unique_ticks,)
+        Numbers of training examples that have been used to generate the
+        learning curve. Note that the number of ticks might be less
+        than n_ticks because duplicate entries will be removed.
+
+    train_scores : array of shape (n_ticks, n_cv_folds)
+        Scores on training sets.
+
+    test_scores : array of shape (n_ticks, n_cv_folds)
+        Scores on test sets.
+
+    fit_times : array of shape (n_ticks, n_cv_folds)
+        Times spent for fitting in seconds. Only present if ``return_times``
+        is True.
+
+    score_times : array of shape (n_ticks, n_cv_folds)
+        Times spent for scoring in seconds. Only present if ``return_times``
+        is True.
+
+    Examples
+    --------
+    >>> from sklearn.datasets import make_classification
+    >>> from sklearn.tree import DecisionTreeClassifier
+    >>> from sklearn.model_selection import learning_curve
+    >>> X, y = make_classification(n_samples=100, n_features=10, random_state=42)
+    >>> tree = DecisionTreeClassifier(max_depth=4, random_state=42)
+    >>> train_size_abs, train_scores, test_scores = learning_curve(
+    ...     tree, X, y, train_sizes=[0.3, 0.6, 0.9]
+    ... )
+    >>> for train_size, cv_train_scores, cv_test_scores in zip(
+    ...     train_size_abs, train_scores, test_scores
+    ... ):
+    ...     print(f"{train_size} samples were used to train the model")
+    ...     print(f"The average train accuracy is {cv_train_scores.mean():.2f}")
+    ...     
print(f"The average test accuracy is {cv_test_scores.mean():.2f}") + 24 samples were used to train the model + The average train accuracy is 1.00 + The average test accuracy is 0.85 + 48 samples were used to train the model + The average train accuracy is 1.00 + The average test accuracy is 0.90 + 72 samples were used to train the model + The average train accuracy is 1.00 + The average test accuracy is 0.93 + """ + if exploit_incremental_learning and not hasattr(estimator, "partial_fit"): + raise ValueError( + "An estimator must support the partial_fit interface " + "to exploit incremental learning" + ) + + params = _check_params_groups_deprecation(fit_params, params, groups, "1.8") + + X, y, groups = indexable(X, y, groups) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + + scorer = check_scoring(estimator, scoring=scoring) + + if _routing_enabled(): + router = ( + MetadataRouter(owner="learning_curve") + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata to the predict method for + # scoring? + method_mapping=MethodMapping() + .add(caller="fit", callee="fit") + .add(caller="fit", callee="partial_fit"), + ) + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + scorer=scorer, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + unrequested_params = sorted(e.unrequested_params) + raise UnsetMetadataPassedError( + message=( + f"{unrequested_params} are passed to `learning_curve` but are not" + " explicitly set as requested or not requested for learning_curve's" + f" estimator: {estimator.__class__.__name__}. Call" + " `.set_fit_request({{metadata}}=True)` on the estimator for" + f" each metadata in {unrequested_params} that you" + " want to use and `metadata=False` for not using it. See the" + " Metadata Routing User guide" + " for more" + " information." + ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + + else: + routed_params = Bunch() + routed_params.estimator = Bunch(fit=params, partial_fit=params) + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.scorer = Bunch(score={}) + + # Store cv as list as we will be iterating over the list multiple times + cv_iter = list(cv.split(X, y, **routed_params.splitter.split)) + + n_max_training_samples = len(cv_iter[0][0]) + # Because the lengths of folds can be significantly different, it is + # not guaranteed that we use all of the available training data when we + # use the first 'n_max_training_samples' samples. 
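+    # Illustrative note (not part of scikit-learn): _translate_train_sizes
+    # maps fractional ticks to absolute counts relative to that first-fold
+    # training size, e.g. with n_max_training_samples == 80:
+    #     [0.5, 1.0] -> [40, 80]   (floats are fractions of the maximum)
+    #     [5, 10]    -> [5, 10]    (ints are taken as absolute sizes)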
+ train_sizes_abs = _translate_train_sizes(train_sizes, n_max_training_samples) + n_unique_ticks = train_sizes_abs.shape[0] + if verbose > 0: + print("[learning_curve] Training set sizes: " + str(train_sizes_abs)) + + parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose) + + if shuffle: + rng = check_random_state(random_state) + cv_iter = ((rng.permutation(train), test) for train, test in cv_iter) + + if exploit_incremental_learning: + classes = np.unique(y) if is_classifier(estimator) else None + out = parallel( + delayed(_incremental_fit_estimator)( + clone(estimator), + X, + y, + classes, + train, + test, + train_sizes_abs, + scorer, + return_times, + error_score=error_score, + fit_params=routed_params.estimator.partial_fit, + score_params=routed_params.scorer.score, + ) + for train, test in cv_iter + ) + out = np.asarray(out).transpose((2, 1, 0)) + else: + train_test_proportions = [] + for train, test in cv_iter: + for n_train_samples in train_sizes_abs: + train_test_proportions.append((train[:n_train_samples], test)) + + results = parallel( + delayed(_fit_and_score)( + clone(estimator), + X, + y, + scorer=scorer, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + return_train_score=True, + error_score=error_score, + return_times=return_times, + ) + for train, test in train_test_proportions + ) + _warn_or_raise_about_fit_failures(results, error_score) + results = _aggregate_score_dicts(results) + train_scores = results["train_scores"].reshape(-1, n_unique_ticks).T + test_scores = results["test_scores"].reshape(-1, n_unique_ticks).T + out = [train_scores, test_scores] + + if return_times: + fit_times = results["fit_time"].reshape(-1, n_unique_ticks).T + score_times = results["score_time"].reshape(-1, n_unique_ticks).T + out.extend([fit_times, score_times]) + + ret = train_sizes_abs, out[0], out[1] + + if return_times: + ret = ret + (out[2], out[3]) + + return ret + + +def _translate_train_sizes(train_sizes, n_max_training_samples): + """Determine absolute sizes of training subsets and validate 'train_sizes'. + + Examples: + _translate_train_sizes([0.5, 1.0], 10) -> [5, 10] + _translate_train_sizes([5, 10], 10) -> [5, 10] + + Parameters + ---------- + train_sizes : array-like of shape (n_ticks,) + Numbers of training examples that will be used to generate the + learning curve. If the dtype is float, it is regarded as a + fraction of 'n_max_training_samples', i.e. it has to be within (0, 1]. + + n_max_training_samples : int + Maximum number of training samples (upper bound of 'train_sizes'). + + Returns + ------- + train_sizes_abs : array of shape (n_unique_ticks,) + Numbers of training examples that will be used to generate the + learning curve. Note that the number of ticks might be less + than n_ticks because duplicate entries will be removed. + """ + train_sizes_abs = np.asarray(train_sizes) + n_ticks = train_sizes_abs.shape[0] + n_min_required_samples = np.min(train_sizes_abs) + n_max_required_samples = np.max(train_sizes_abs) + if np.issubdtype(train_sizes_abs.dtype, np.floating): + if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: + raise ValueError( + "train_sizes has been interpreted as fractions " + "of the maximum number of training samples and " + "must be within (0, 1], but is within [%f, %f]." 
+ % (n_min_required_samples, n_max_required_samples) + ) + train_sizes_abs = (train_sizes_abs * n_max_training_samples).astype( + dtype=int, copy=False + ) + train_sizes_abs = np.clip(train_sizes_abs, 1, n_max_training_samples) + else: + if ( + n_min_required_samples <= 0 + or n_max_required_samples > n_max_training_samples + ): + raise ValueError( + "train_sizes has been interpreted as absolute " + "numbers of training samples and must be within " + "(0, %d], but is within [%d, %d]." + % ( + n_max_training_samples, + n_min_required_samples, + n_max_required_samples, + ) + ) + + train_sizes_abs = np.unique(train_sizes_abs) + if n_ticks > train_sizes_abs.shape[0]: + warnings.warn( + "Removed duplicate entries from 'train_sizes'. Number " + "of ticks will be less than the size of " + "'train_sizes': %d instead of %d." % (train_sizes_abs.shape[0], n_ticks), + RuntimeWarning, + ) + + return train_sizes_abs + + +def _incremental_fit_estimator( + estimator, + X, + y, + classes, + train, + test, + train_sizes, + scorer, + return_times, + error_score, + fit_params, + score_params, +): + """Train estimator on training subsets incrementally and compute scores.""" + train_scores, test_scores, fit_times, score_times = [], [], [], [] + partitions = zip(train_sizes, np.split(train, train_sizes)[:-1]) + if fit_params is None: + fit_params = {} + if classes is None: + partial_fit_func = partial(estimator.partial_fit, **fit_params) + else: + partial_fit_func = partial(estimator.partial_fit, classes=classes, **fit_params) + score_params = score_params if score_params is not None else {} + score_params_train = _check_method_params(X, params=score_params, indices=train) + score_params_test = _check_method_params(X, params=score_params, indices=test) + + for n_train_samples, partial_train in partitions: + train_subset = train[:n_train_samples] + X_train, y_train = _safe_split(estimator, X, y, train_subset) + X_partial_train, y_partial_train = _safe_split(estimator, X, y, partial_train) + X_test, y_test = _safe_split(estimator, X, y, test, train_subset) + start_fit = time.time() + if y_partial_train is None: + partial_fit_func(X_partial_train) + else: + partial_fit_func(X_partial_train, y_partial_train) + fit_time = time.time() - start_fit + fit_times.append(fit_time) + + start_score = time.time() + + test_scores.append( + _score( + estimator, + X_test, + y_test, + scorer, + score_params=score_params_test, + error_score=error_score, + ) + ) + train_scores.append( + _score( + estimator, + X_train, + y_train, + scorer, + score_params=score_params_train, + error_score=error_score, + ) + ) + score_time = time.time() - start_score + score_times.append(score_time) + + ret = ( + (train_scores, test_scores, fit_times, score_times) + if return_times + else (train_scores, test_scores) + ) + + return np.array(ret).T + + +@validate_params( + { + "estimator": [HasMethods(["fit"])], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", None], + "param_name": [str], + "param_range": ["array-like"], + "groups": ["array-like", None], + "cv": ["cv_object"], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "n_jobs": [Integral, None], + "pre_dispatch": [Integral, str], + "verbose": ["verbose"], + "error_score": [StrOptions({"raise"}), Real], + "fit_params": [dict, None], + "params": [dict, None], + }, + prefer_skip_nested_validation=False, # estimator is not validated yet +) +def validation_curve( + estimator, + X, + y, + *, + param_name, + param_range, + groups=None, + cv=None, + scoring=None, + 
n_jobs=None, + pre_dispatch="all", + verbose=0, + error_score=np.nan, + fit_params=None, + params=None, +): + """Validation curve. + + Determine training and test scores for varying parameter values. + + Compute scores for an estimator with different values of a specified + parameter. This is similar to grid search with one parameter. However, this + will also compute training scores and is merely a utility for plotting the + results. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : object type that implements the "fit" method + An object of that type which is cloned for each validation. It must + also implement "predict" unless `scoring` is a callable that doesn't + rely on "predict" to compute a score. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + param_name : str + Name of the parameter that will be varied. + + param_range : array-like of shape (n_values,) + The values of the parameter that will be evaluated. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + .. versionchanged:: 1.6 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``validation_curve(..., params={'groups': groups})``. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + scoring : str or callable, default=None + A str (see :ref:`scoring_parameter`) or a scorer callable object / function with + signature ``scorer(estimator, X, y)``. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and computing + the score are parallelized over the combinations of each parameter + value and each cross-validation split. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. 
+ + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. + If a numeric value is given, FitFailedWarning is raised. + + .. versionadded:: 0.20 + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + .. deprecated:: 1.6 + This parameter is deprecated and will be removed in version 1.8. Use + ``params`` instead. + + params : dict, default=None + Parameters to pass to the estimator, scorer and cross-validation object. + + - If `enable_metadata_routing=False` (default): Parameters directly passed to + the `fit` method of the estimator. + + - If `enable_metadata_routing=True`: Parameters safely routed to the `fit` + method of the estimator, to the scorer and to the cross-validation object. + See :ref:`Metadata Routing User Guide ` for more details. + + .. versionadded:: 1.6 + + Returns + ------- + train_scores : array of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : array of shape (n_ticks, n_cv_folds) + Scores on test set. + + Notes + ----- + See :ref:`sphx_glr_auto_examples_model_selection_plot_train_error_vs_test_error.py` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import validation_curve + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> train_scores, test_scores = validation_curve( + ... logistic_regression, X, y, param_name=param_name, param_range=param_range + ... ) + >>> print(f"The average train accuracy is {train_scores.mean():.2f}") + The average train accuracy is 0.81 + >>> print(f"The average test accuracy is {test_scores.mean():.2f}") + The average test accuracy is 0.81 + """ + params = _check_params_groups_deprecation(fit_params, params, groups, "1.8") + X, y, groups = indexable(X, y, groups) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + scorer = check_scoring(estimator, scoring=scoring) + + if _routing_enabled(): + router = ( + MetadataRouter(owner="validation_curve") + .add( + estimator=estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + scorer=scorer, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + unrequested_params = sorted(e.unrequested_params) + raise UnsetMetadataPassedError( + message=( + f"{unrequested_params} are passed to `validation_curve` but are not" + " explicitly set as requested or not requested for" + f" validation_curve's estimator: {estimator.__class__.__name__}." + " Call `.set_fit_request({{metadata}}=True)` on the estimator for" + f" each metadata in {unrequested_params} that you" + " want to use and `metadata=False` for not using it. See the" + " Metadata Routing User guide" + " for more" + " information." 
+                ),
+                unrequested_params=e.unrequested_params,
+                routed_params=e.routed_params,
+            )
+
+    else:
+        routed_params = Bunch()
+        routed_params.estimator = Bunch(fit=params)
+        routed_params.splitter = Bunch(split={"groups": groups})
+        routed_params.scorer = Bunch(score={})
+
+    parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose)
+    results = parallel(
+        delayed(_fit_and_score)(
+            clone(estimator),
+            X,
+            y,
+            scorer=scorer,
+            train=train,
+            test=test,
+            verbose=verbose,
+            parameters={param_name: v},
+            fit_params=routed_params.estimator.fit,
+            score_params=routed_params.scorer.score,
+            return_train_score=True,
+            error_score=error_score,
+        )
+        # NOTE do not change order of iteration to allow one time cv splitters
+        for train, test in cv.split(X, y, **routed_params.splitter.split)
+        for v in param_range
+    )
+    n_params = len(param_range)
+
+    results = _aggregate_score_dicts(results)
+    train_scores = results["train_scores"].reshape(-1, n_params).T
+    test_scores = results["test_scores"].reshape(-1, n_params).T
+
+    return train_scores, test_scores
+
+
+def _aggregate_score_dicts(scores):
+    """Aggregate a list of dicts into a dict of np ndarrays.
+
+    The input is a flat list of per-split score dicts of the form
+    [{'prec': 0.1, 'acc': 1.0}, {'prec': 0.1, 'acc': 1.0}, ...]; it is
+    converted to a dict of arrays {'prec': np.array([0.1 ...]), ...}.
+
+    Parameters
+    ----------
+    scores : list of dict
+        List of dicts of the scores for all scorers. This is a flat list,
+        assumed originally to be of row major order.
+
+    Examples
+    --------
+    >>> scores = [{'a': 1, 'b': 10}, {'a': 2, 'b': 2}, {'a': 3, 'b': 3},
+    ...           {'a': 10, 'b': 10}]  # doctest: +SKIP
+    >>> _aggregate_score_dicts(scores)  # doctest: +SKIP
+    {'a': array([1, 2, 3, 10]),
+     'b': array([10, 2, 3, 10])}
+    """
+    return {
+        key: (
+            np.asarray([score[key] for score in scores])
+            if isinstance(scores[0][key], numbers.Number)
+            else [score[key] for score in scores]
+        )
+        for key in scores[0]
+    }
diff --git a/.venv/Lib/site-packages/sklearn/model_selection/tests/__init__.py b/.venv/Lib/site-packages/sklearn/model_selection/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/Lib/site-packages/sklearn/model_selection/tests/common.py b/.venv/Lib/site-packages/sklearn/model_selection/tests/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..007843822febc5f4cf9145f41e667cef17b9e988
--- /dev/null
+++ b/.venv/Lib/site-packages/sklearn/model_selection/tests/common.py
@@ -0,0 +1,24 @@
+"""
+Common utilities for testing model selection.
+""" + +import numpy as np + +from sklearn.model_selection import KFold + + +class OneTimeSplitter: + """A wrapper to make KFold single entry cv iterator""" + + def __init__(self, n_splits=4, n_samples=99): + self.n_splits = n_splits + self.n_samples = n_samples + self.indices = iter(KFold(n_splits=n_splits).split(np.ones(n_samples))) + + def split(self, X=None, y=None, groups=None): + """Split can be called only once""" + for index in self.indices: + yield index + + def get_n_splits(self, X=None, y=None, groups=None): + return self.n_splits diff --git a/.venv/Lib/site-packages/sklearn/model_selection/tests/test_classification_threshold.py b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_classification_threshold.py new file mode 100644 index 0000000000000000000000000000000000000000..2920bc7a68ab9cd6ff1cb8b5b736b9f647d65e46 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_classification_threshold.py @@ -0,0 +1,618 @@ +import numpy as np +import pytest + +from sklearn import config_context +from sklearn.base import clone +from sklearn.datasets import ( + load_breast_cancer, + load_iris, + make_classification, + make_multilabel_classification, +) +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import ( + balanced_accuracy_score, + f1_score, + fbeta_score, + make_scorer, +) +from sklearn.metrics._scorer import _CurveScorer +from sklearn.model_selection import ( + FixedThresholdClassifier, + StratifiedShuffleSplit, + TunedThresholdClassifierCV, +) +from sklearn.model_selection._classification_threshold import ( + _fit_and_score_over_thresholds, +) +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils._mocking import CheckingClassifier +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_array_equal, +) + + +def test_fit_and_score_over_thresholds_curve_scorers(): + """Check that `_fit_and_score_over_thresholds` returns thresholds in ascending order + for the different accepted curve scorers.""" + X, y = make_classification(n_samples=100, random_state=0) + train_idx, val_idx = np.arange(50), np.arange(50, 100) + classifier = LogisticRegression() + + curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) + scores, thresholds = _fit_and_score_over_thresholds( + classifier, + X, + y, + fit_params={}, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=curve_scorer, + score_params={}, + ) + + assert np.all(thresholds[:-1] <= thresholds[1:]) + assert isinstance(scores, np.ndarray) + assert np.logical_and(scores >= 0, scores <= 1).all() + + +def test_fit_and_score_over_thresholds_prefit(): + """Check the behaviour with a prefit classifier.""" + X, y = make_classification(n_samples=100, random_state=0) + + # `train_idx is None` to indicate that the classifier is prefit + train_idx, val_idx = None, np.arange(50, 100) + classifier = DecisionTreeClassifier(random_state=0).fit(X, y) + # make sure that the classifier memorized the full dataset such that + # we get perfect predictions and thus match the expected score + assert classifier.score(X[val_idx], y[val_idx]) == pytest.approx(1.0) + + curve_scorer = _CurveScorer( + 
score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=2, + kwargs={}, + ) + scores, thresholds = _fit_and_score_over_thresholds( + classifier, + X, + y, + fit_params={}, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=curve_scorer, + score_params={}, + ) + assert np.all(thresholds[:-1] <= thresholds[1:]) + assert_allclose(scores, [0.5, 1.0]) + + +@config_context(enable_metadata_routing=True) +def test_fit_and_score_over_thresholds_sample_weight(): + """Check that we dispatch the sample-weight to fit and score the classifier.""" + X, y = load_iris(return_X_y=True) + X, y = X[:100], y[:100] # only 2 classes + + # create a dataset and repeat twice the sample of class #0 + X_repeated, y_repeated = np.vstack([X, X[y == 0]]), np.hstack([y, y[y == 0]]) + # create a sample weight vector that is equivalent to the repeated dataset + sample_weight = np.ones_like(y) + sample_weight[:50] *= 2 + + classifier = LogisticRegression() + train_repeated_idx = np.arange(X_repeated.shape[0]) + val_repeated_idx = np.arange(X_repeated.shape[0]) + curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) + scores_repeated, thresholds_repeated = _fit_and_score_over_thresholds( + classifier, + X_repeated, + y_repeated, + fit_params={}, + train_idx=train_repeated_idx, + val_idx=val_repeated_idx, + curve_scorer=curve_scorer, + score_params={}, + ) + + train_idx, val_idx = np.arange(X.shape[0]), np.arange(X.shape[0]) + scores, thresholds = _fit_and_score_over_thresholds( + classifier.set_fit_request(sample_weight=True), + X, + y, + fit_params={"sample_weight": sample_weight}, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=curve_scorer.set_score_request(sample_weight=True), + score_params={"sample_weight": sample_weight}, + ) + + assert_allclose(thresholds_repeated, thresholds) + assert_allclose(scores_repeated, scores) + + +@pytest.mark.parametrize("fit_params_type", ["list", "array"]) +@config_context(enable_metadata_routing=True) +def test_fit_and_score_over_thresholds_fit_params(fit_params_type): + """Check that we pass `fit_params` to the classifier when calling `fit`.""" + X, y = make_classification(n_samples=100, random_state=0) + fit_params = { + "a": _convert_container(y, fit_params_type), + "b": _convert_container(y, fit_params_type), + } + + classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) + classifier.set_fit_request(a=True, b=True) + train_idx, val_idx = np.arange(50), np.arange(50, 100) + + curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) + _fit_and_score_over_thresholds( + classifier, + X, + y, + fit_params=fit_params, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=curve_scorer, + score_params={}, + ) + + +@pytest.mark.parametrize( + "data", + [ + make_classification(n_classes=3, n_clusters_per_class=1, random_state=0), + make_multilabel_classification(random_state=0), + ], +) +def test_tuned_threshold_classifier_no_binary(data): + """Check that we raise an informative error message for non-binary problem.""" + err_msg = "Only binary classification is supported." 
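The two metadata-routing tests above rely on the convention exercised throughout this file: once `enable_metadata_routing=True`, metadata such as `sample_weight` or custom fit parameters only reach a consumer that explicitly requested it via `set_fit_request` / `set_score_request`. A minimal, self-contained sketch of that pattern (the `*_demo` names are illustrative and it assumes scikit-learn >= 1.4 for the `params` argument of `cross_validate`; this is not part of the test suite):

```python
import numpy as np

from sklearn import config_context
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

X_demo, y_demo = make_classification(n_samples=60, random_state=0)
weights = np.ones(60)
weights[:10] = 2.0  # up-weight the first ten samples

with config_context(enable_metadata_routing=True):
    # The estimator opts in to receiving the weights at fit time, while the
    # default scorer explicitly declines them; with routing enabled, passing
    # metadata that a consumer has no request set for raises an error instead
    # of being silently dropped.
    clf = (
        LogisticRegression()
        .set_fit_request(sample_weight=True)
        .set_score_request(sample_weight=False)
    )
    cv_res = cross_validate(clf, X_demo, y_demo, params={"sample_weight": weights})

print(cv_res["test_score"])
```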
+ with pytest.raises(ValueError, match=err_msg): + TunedThresholdClassifierCV(LogisticRegression()).fit(*data) + + +@pytest.mark.parametrize( + "params, err_type, err_msg", + [ + ( + {"cv": "prefit", "refit": True}, + ValueError, + "When cv='prefit', refit cannot be True.", + ), + ( + {"cv": 10, "refit": False}, + ValueError, + "When cv has several folds, refit cannot be False.", + ), + ( + {"cv": "prefit", "refit": False}, + NotFittedError, + "`estimator` must be fitted.", + ), + ], +) +def test_tuned_threshold_classifier_conflict_cv_refit(params, err_type, err_msg): + """Check that we raise an informative error message when `cv` and `refit` + cannot be used together. + """ + X, y = make_classification(n_samples=100, random_state=0) + with pytest.raises(err_type, match=err_msg): + TunedThresholdClassifierCV(LogisticRegression(), **params).fit(X, y) + + +@pytest.mark.parametrize( + "estimator", + [LogisticRegression(), SVC(), GradientBoostingClassifier(n_estimators=4)], +) +@pytest.mark.parametrize( + "response_method", ["predict_proba", "predict_log_proba", "decision_function"] +) +@pytest.mark.parametrize( + "ThresholdClassifier", [FixedThresholdClassifier, TunedThresholdClassifierCV] +) +def test_threshold_classifier_estimator_response_methods( + ThresholdClassifier, estimator, response_method +): + """Check that `TunedThresholdClassifierCV` exposes the same response methods as the + underlying estimator. + """ + X, y = make_classification(n_samples=100, random_state=0) + + model = ThresholdClassifier(estimator=estimator) + assert hasattr(model, response_method) == hasattr(estimator, response_method) + + model.fit(X, y) + assert hasattr(model, response_method) == hasattr(estimator, response_method) + + if hasattr(model, response_method): + y_pred_cutoff = getattr(model, response_method)(X) + y_pred_underlying_estimator = getattr(model.estimator_, response_method)(X) + + assert_allclose(y_pred_cutoff, y_pred_underlying_estimator) + + +@pytest.mark.parametrize( + "response_method", ["auto", "decision_function", "predict_proba"] +) +def test_tuned_threshold_classifier_without_constraint_value(response_method): + """Check that `TunedThresholdClassifierCV` is optimizing a given objective + metric.""" + X, y = load_breast_cancer(return_X_y=True) + # remove feature to degrade performances + X = X[:, :5] + + # make the problem completely imbalanced such that the balanced accuracy is low + indices_pos = np.flatnonzero(y == 1) + indices_pos = indices_pos[: indices_pos.size // 50] + indices_neg = np.flatnonzero(y == 0) + + X = np.vstack([X[indices_neg], X[indices_pos]]) + y = np.hstack([y[indices_neg], y[indices_pos]]) + + lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) + thresholds = 100 + model = TunedThresholdClassifierCV( + estimator=lr, + scoring="balanced_accuracy", + response_method=response_method, + thresholds=thresholds, + store_cv_results=True, + ) + score_optimized = balanced_accuracy_score(y, model.fit(X, y).predict(X)) + score_baseline = balanced_accuracy_score(y, lr.predict(X)) + assert score_optimized > score_baseline + assert model.cv_results_["thresholds"].shape == (thresholds,) + assert model.cv_results_["scores"].shape == (thresholds,) + + +def test_tuned_threshold_classifier_metric_with_parameter(): + """Check that we can pass a metric with a parameter in addition check that + `f_beta` with `beta=1` is equivalent to `f1` and different from `f_beta` with + `beta=2`. 
+ """ + X, y = load_breast_cancer(return_X_y=True) + lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) + model_fbeta_1 = TunedThresholdClassifierCV( + estimator=lr, scoring=make_scorer(fbeta_score, beta=1) + ).fit(X, y) + model_fbeta_2 = TunedThresholdClassifierCV( + estimator=lr, scoring=make_scorer(fbeta_score, beta=2) + ).fit(X, y) + model_f1 = TunedThresholdClassifierCV( + estimator=lr, scoring=make_scorer(f1_score) + ).fit(X, y) + + assert model_fbeta_1.best_threshold_ == pytest.approx(model_f1.best_threshold_) + assert model_fbeta_1.best_threshold_ != pytest.approx(model_fbeta_2.best_threshold_) + + +@pytest.mark.parametrize( + "response_method", ["auto", "decision_function", "predict_proba"] +) +@pytest.mark.parametrize( + "metric", + [ + make_scorer(balanced_accuracy_score), + make_scorer(f1_score, pos_label="cancer"), + ], +) +def test_tuned_threshold_classifier_with_string_targets(response_method, metric): + """Check that targets represented by str are properly managed. + Also, check with several metrics to be sure that `pos_label` is properly + dispatched. + """ + X, y = load_breast_cancer(return_X_y=True) + # Encode numeric targets by meaningful strings. We purposely designed the class + # names such that the `pos_label` is the first alphabetically sorted class and thus + # encoded as 0. + classes = np.array(["cancer", "healthy"], dtype=object) + y = classes[y] + model = TunedThresholdClassifierCV( + estimator=make_pipeline(StandardScaler(), LogisticRegression()), + scoring=metric, + response_method=response_method, + thresholds=100, + ).fit(X, y) + assert_array_equal(model.classes_, np.sort(classes)) + y_pred = model.predict(X) + assert_array_equal(np.unique(y_pred), np.sort(classes)) + + +@pytest.mark.parametrize("with_sample_weight", [True, False]) +@config_context(enable_metadata_routing=True) +def test_tuned_threshold_classifier_refit(with_sample_weight, global_random_seed): + """Check the behaviour of the `refit` parameter.""" + rng = np.random.RandomState(global_random_seed) + X, y = make_classification(n_samples=100, random_state=0) + if with_sample_weight: + sample_weight = rng.randn(X.shape[0]) + sample_weight = np.abs(sample_weight, out=sample_weight) + else: + sample_weight = None + + # check that `estimator_` if fitted on the full dataset when `refit=True` + estimator = LogisticRegression().set_fit_request(sample_weight=True) + model = TunedThresholdClassifierCV(estimator, refit=True).fit( + X, y, sample_weight=sample_weight + ) + + assert model.estimator_ is not estimator + estimator.fit(X, y, sample_weight=sample_weight) + assert_allclose(model.estimator_.coef_, estimator.coef_) + assert_allclose(model.estimator_.intercept_, estimator.intercept_) + + # check that `estimator_` was not altered when `refit=False` and `cv="prefit"` + estimator = LogisticRegression().set_fit_request(sample_weight=True) + estimator.fit(X, y, sample_weight=sample_weight) + coef = estimator.coef_.copy() + model = TunedThresholdClassifierCV(estimator, cv="prefit", refit=False).fit( + X, y, sample_weight=sample_weight + ) + + assert model.estimator_ is estimator + assert_allclose(model.estimator_.coef_, coef) + + # check that we train `estimator_` on the training split of a given cross-validation + estimator = LogisticRegression().set_fit_request(sample_weight=True) + cv = [ + (np.arange(50), np.arange(50, 100)), + ] # single split + model = TunedThresholdClassifierCV(estimator, cv=cv, refit=False).fit( + X, y, sample_weight=sample_weight + ) + + assert model.estimator_ 
is not estimator + if with_sample_weight: + sw_train = sample_weight[cv[0][0]] + else: + sw_train = None + estimator.fit(X[cv[0][0]], y[cv[0][0]], sample_weight=sw_train) + assert_allclose(model.estimator_.coef_, estimator.coef_) + + +@pytest.mark.parametrize("fit_params_type", ["list", "array"]) +@config_context(enable_metadata_routing=True) +def test_tuned_threshold_classifier_fit_params(fit_params_type): + """Check that we pass `fit_params` to the classifier when calling `fit`.""" + X, y = make_classification(n_samples=100, random_state=0) + fit_params = { + "a": _convert_container(y, fit_params_type), + "b": _convert_container(y, fit_params_type), + } + + classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) + classifier.set_fit_request(a=True, b=True) + model = TunedThresholdClassifierCV(classifier) + model.fit(X, y, **fit_params) + + +@config_context(enable_metadata_routing=True) +def test_tuned_threshold_classifier_cv_zeros_sample_weights_equivalence(): + """Check that passing removing some sample from the dataset `X` is + equivalent to passing a `sample_weight` with a factor 0.""" + X, y = load_iris(return_X_y=True) + # Scale the data to avoid any convergence issue + X = StandardScaler().fit_transform(X) + # Only use 2 classes and select samples such that 2-fold cross-validation + # split will lead to an equivalence with a `sample_weight` of 0 + X = np.vstack((X[:40], X[50:90])) + y = np.hstack((y[:40], y[50:90])) + sample_weight = np.zeros_like(y) + sample_weight[::2] = 1 + + estimator = LogisticRegression().set_fit_request(sample_weight=True) + model_without_weights = TunedThresholdClassifierCV(estimator, cv=2) + model_with_weights = clone(model_without_weights) + + model_with_weights.fit(X, y, sample_weight=sample_weight) + model_without_weights.fit(X[::2], y[::2]) + + assert_allclose( + model_with_weights.estimator_.coef_, model_without_weights.estimator_.coef_ + ) + + y_pred_with_weights = model_with_weights.predict_proba(X) + y_pred_without_weights = model_without_weights.predict_proba(X) + assert_allclose(y_pred_with_weights, y_pred_without_weights) + + +def test_tuned_threshold_classifier_thresholds_array(): + """Check that we can pass an array to `thresholds` and it is used as candidate + threshold internally.""" + X, y = make_classification(random_state=0) + estimator = LogisticRegression() + thresholds = np.linspace(0, 1, 11) + tuned_model = TunedThresholdClassifierCV( + estimator, + thresholds=thresholds, + response_method="predict_proba", + store_cv_results=True, + ).fit(X, y) + assert_allclose(tuned_model.cv_results_["thresholds"], thresholds) + + +@pytest.mark.parametrize("store_cv_results", [True, False]) +def test_tuned_threshold_classifier_store_cv_results(store_cv_results): + """Check that if `cv_results_` exists depending on `store_cv_results`.""" + X, y = make_classification(random_state=0) + estimator = LogisticRegression() + tuned_model = TunedThresholdClassifierCV( + estimator, store_cv_results=store_cv_results + ).fit(X, y) + if store_cv_results: + assert hasattr(tuned_model, "cv_results_") + else: + assert not hasattr(tuned_model, "cv_results_") + + +def test_tuned_threshold_classifier_cv_float(): + """Check the behaviour when `cv` is set to a float.""" + X, y = make_classification(random_state=0) + + # case where `refit=False` and cv is a float: the underlying estimator will be fit + # on the training set given by a ShuffleSplit. We check that we get the same model + # coefficients. 
+ test_size = 0.3
+ estimator = LogisticRegression()
+ tuned_model = TunedThresholdClassifierCV(
+ estimator, cv=test_size, refit=False, random_state=0
+ ).fit(X, y)
+ tuned_model.fit(X, y)
+
+ cv = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=0)
+ train_idx, val_idx = next(cv.split(X, y))
+ cloned_estimator = clone(estimator).fit(X[train_idx], y[train_idx])
+
+ assert_allclose(tuned_model.estimator_.coef_, cloned_estimator.coef_)
+
+ # case where `refit=True`, then the underlying estimator is fitted on the full
+ # dataset.
+ tuned_model.set_params(refit=True).fit(X, y)
+ cloned_estimator = clone(estimator).fit(X, y)
+
+ assert_allclose(tuned_model.estimator_.coef_, cloned_estimator.coef_)
+
+
+def test_tuned_threshold_classifier_error_constant_predictor():
+ """Check that we raise a ValueError if the underlying classifier returns constant
+ probabilities such that we cannot find any threshold.
+ """
+ X, y = make_classification(random_state=0)
+ estimator = DummyClassifier(strategy="constant", constant=1)
+ tuned_model = TunedThresholdClassifierCV(estimator, response_method="predict_proba")
+ err_msg = "The provided estimator makes constant predictions"
+ with pytest.raises(ValueError, match=err_msg):
+ tuned_model.fit(X, y)
+
+
+@pytest.mark.parametrize(
+ "response_method", ["auto", "predict_proba", "decision_function"]
+)
+def test_fixed_threshold_classifier_equivalence_default(response_method):
+ """Check that `FixedThresholdClassifier` has the same behaviour as the vanilla
+ classifier.
+ """
+ X, y = make_classification(random_state=0)
+ classifier = LogisticRegression().fit(X, y)
+ classifier_default_threshold = FixedThresholdClassifier(
+ estimator=clone(classifier), response_method=response_method
+ )
+ classifier_default_threshold.fit(X, y)
+
+ # emulate the response method that should take into account the `pos_label`
+ if response_method in ("auto", "predict_proba"):
+ y_score = classifier_default_threshold.predict_proba(X)[:, 1]
+ threshold = 0.5
+ else: # response_method == "decision_function"
+ y_score = classifier_default_threshold.decision_function(X)
+ threshold = 0.0
+
+ y_pred_lr = (y_score >= threshold).astype(int)
+ assert_allclose(classifier_default_threshold.predict(X), y_pred_lr)
+
+
+@pytest.mark.parametrize(
+ "response_method, threshold", [("predict_proba", 0.7), ("decision_function", 2.0)]
+)
+@pytest.mark.parametrize("pos_label", [0, 1])
+def test_fixed_threshold_classifier(response_method, threshold, pos_label):
+ """Check that applying `predict` leads to the same prediction as applying the
+ threshold to the output of the response method.
+ """ + X, y = make_classification(n_samples=50, random_state=0) + logistic_regression = LogisticRegression().fit(X, y) + model = FixedThresholdClassifier( + estimator=clone(logistic_regression), + threshold=threshold, + response_method=response_method, + pos_label=pos_label, + ).fit(X, y) + + # check that the underlying estimator is the same + assert_allclose(model.estimator_.coef_, logistic_regression.coef_) + + # emulate the response method that should take into account the `pos_label` + if response_method == "predict_proba": + y_score = model.predict_proba(X)[:, pos_label] + else: # response_method == "decision_function" + y_score = model.decision_function(X) + y_score = y_score if pos_label == 1 else -y_score + + # create a mapping from boolean values to class labels + map_to_label = np.array([0, 1]) if pos_label == 1 else np.array([1, 0]) + y_pred_lr = map_to_label[(y_score >= threshold).astype(int)] + assert_allclose(model.predict(X), y_pred_lr) + + for method in ("predict_proba", "predict_log_proba", "decision_function"): + assert_allclose( + getattr(model, method)(X), getattr(logistic_regression, method)(X) + ) + assert_allclose( + getattr(model.estimator_, method)(X), + getattr(logistic_regression, method)(X), + ) + + +@config_context(enable_metadata_routing=True) +def test_fixed_threshold_classifier_metadata_routing(): + """Check that everything works with metadata routing.""" + X, y = make_classification(random_state=0) + sample_weight = np.ones_like(y) + sample_weight[::2] = 2 + classifier = LogisticRegression().set_fit_request(sample_weight=True) + classifier.fit(X, y, sample_weight=sample_weight) + classifier_default_threshold = FixedThresholdClassifier(estimator=clone(classifier)) + classifier_default_threshold.fit(X, y, sample_weight=sample_weight) + assert_allclose(classifier_default_threshold.estimator_.coef_, classifier.coef_) + + +@pytest.mark.parametrize( + "method", ["predict_proba", "decision_function", "predict", "predict_log_proba"] +) +def test_fixed_threshold_classifier_fitted_estimator(method): + """Check that if the underlying estimator is already fitted, no fit is required.""" + X, y = make_classification(random_state=0) + classifier = LogisticRegression().fit(X, y) + fixed_threshold_classifier = FixedThresholdClassifier(estimator=classifier) + # This should not raise an error + getattr(fixed_threshold_classifier, method)(X) + + +def test_fixed_threshold_classifier_classes_(): + """Check that the classes_ attribute is properly set.""" + X, y = make_classification(random_state=0) + with pytest.raises( + AttributeError, match="The underlying estimator is not fitted yet." 
+ ): + FixedThresholdClassifier(estimator=LogisticRegression()).classes_ + + classifier = LogisticRegression().fit(X, y) + fixed_threshold_classifier = FixedThresholdClassifier(estimator=classifier) + assert_array_equal(fixed_threshold_classifier.classes_, classifier.classes_) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/tests/test_plot.py b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_plot.py new file mode 100644 index 0000000000000000000000000000000000000000..1566f02acb3d361c2ca783b899a452e2e1d45418 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_plot.py @@ -0,0 +1,572 @@ +import numpy as np +import pytest + +from sklearn.datasets import load_iris +from sklearn.model_selection import ( + LearningCurveDisplay, + ValidationCurveDisplay, + learning_curve, + validation_curve, +) +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils import shuffle +from sklearn.utils._testing import assert_allclose, assert_array_equal + + +@pytest.fixture +def data(): + return shuffle(*load_iris(return_X_y=True), random_state=0) + + +@pytest.mark.parametrize( + "params, err_type, err_msg", + [ + ({"std_display_style": "invalid"}, ValueError, "Unknown std_display_style:"), + ({"score_type": "invalid"}, ValueError, "Unknown score_type:"), + ], +) +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_parameters_validation( + pyplot, data, params, err_type, err_msg, CurveDisplay, specific_params +): + """Check that we raise a proper error when passing invalid parameters.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + with pytest.raises(err_type, match=err_msg): + CurveDisplay.from_estimator(estimator, X, y, **specific_params, **params) + + +def test_learning_curve_display_default_usage(pyplot, data): + """Check the default usage of the LearningCurveDisplay class.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + train_sizes = [0.3, 0.6, 0.9] + display = LearningCurveDisplay.from_estimator( + estimator, X, y, train_sizes=train_sizes + ) + + import matplotlib as mpl + + assert display.errorbar_ is None + + assert isinstance(display.lines_, list) + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + + assert isinstance(display.fill_between_, list) + for fill in display.fill_between_: + assert isinstance(fill, mpl.collections.PolyCollection) + assert fill.get_alpha() == 0.5 + + assert display.score_name == "Score" + assert display.ax_.get_xlabel() == "Number of samples in the training set" + assert display.ax_.get_ylabel() == "Score" + + _, legend_labels = display.ax_.get_legend_handles_labels() + assert legend_labels == ["Train", "Test"] + + train_sizes_abs, train_scores, test_scores = learning_curve( + estimator, X, y, train_sizes=train_sizes + ) + + assert_array_equal(display.train_sizes, train_sizes_abs) + assert_allclose(display.train_scores, train_scores) + assert_allclose(display.test_scores, test_scores) + + +def test_validation_curve_display_default_usage(pyplot, data): + """Check the default usage of the ValidationCurveDisplay class.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + param_name, param_range = "max_depth", [1, 3, 5] + display = ValidationCurveDisplay.from_estimator( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + 
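For orientation, the API under test here is a one-call convenience: `ValidationCurveDisplay.from_estimator` runs cross-validation over the parameter range, plots the mean train/test curves, and keeps the Matplotlib artists as attributes. A standalone sketch (requires Matplotlib; the `*_demo` names are illustrative):

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import ValidationCurveDisplay
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = load_iris(return_X_y=True)
display = ValidationCurveDisplay.from_estimator(
    DecisionTreeClassifier(random_state=0),
    X_demo,
    y_demo,
    param_name="max_depth",
    param_range=[1, 3, 5],
)
# The drawn objects stay accessible for later customization.
display.ax_.set_title("Validation curve for DecisionTreeClassifier")
print(display.train_scores.shape)  # (n_param_values, n_cv_folds)
```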
import matplotlib as mpl + + assert display.errorbar_ is None + + assert isinstance(display.lines_, list) + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + + assert isinstance(display.fill_between_, list) + for fill in display.fill_between_: + assert isinstance(fill, mpl.collections.PolyCollection) + assert fill.get_alpha() == 0.5 + + assert display.score_name == "Score" + assert display.ax_.get_xlabel() == f"{param_name}" + assert display.ax_.get_ylabel() == "Score" + + _, legend_labels = display.ax_.get_legend_handles_labels() + assert legend_labels == ["Train", "Test"] + + train_scores, test_scores = validation_curve( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + assert_array_equal(display.param_range, param_range) + assert_allclose(display.train_scores, train_scores) + assert_allclose(display.test_scores, test_scores) + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_negate_score(pyplot, data, CurveDisplay, specific_params): + """Check the behaviour of the `negate_score` parameter calling `from_estimator` and + `plot`. + """ + X, y = data + estimator = DecisionTreeClassifier(max_depth=1, random_state=0) + + negate_score = False + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score + ) + + positive_scores = display.lines_[0].get_data()[1] + assert (positive_scores >= 0).all() + assert display.ax_.get_ylabel() == "Score" + + negate_score = True + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score + ) + + negative_scores = display.lines_[0].get_data()[1] + assert (negative_scores <= 0).all() + assert_allclose(negative_scores, -positive_scores) + assert display.ax_.get_ylabel() == "Negative score" + + negate_score = False + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score + ) + assert display.ax_.get_ylabel() == "Score" + display.plot(negate_score=not negate_score) + assert display.ax_.get_ylabel() == "Score" + assert (display.lines_[0].get_data()[1] < 0).all() + + +@pytest.mark.parametrize( + "score_name, ylabel", [(None, "Score"), ("Accuracy", "Accuracy")] +) +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_score_name( + pyplot, data, score_name, ylabel, CurveDisplay, specific_params +): + """Check that we can overwrite the default score name shown on the y-axis.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, score_name=score_name + ) + + assert display.ax_.get_ylabel() == ylabel + X, y = data + estimator = DecisionTreeClassifier(max_depth=1, random_state=0) + + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, score_name=score_name + ) + + assert display.score_name == ylabel + + +@pytest.mark.parametrize("std_display_style", (None, "errorbar")) +def test_learning_curve_display_score_type(pyplot, data, std_display_style): + """Check the behaviour of setting the `score_type` parameter.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + train_sizes = 
[0.3, 0.6, 0.9] + train_sizes_abs, train_scores, test_scores = learning_curve( + estimator, X, y, train_sizes=train_sizes + ) + + score_type = "train" + display = LearningCurveDisplay.from_estimator( + estimator, + X, + y, + train_sizes=train_sizes, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, train_sizes_abs) + assert_allclose(y_data, train_scores.mean(axis=1)) + + score_type = "test" + display = LearningCurveDisplay.from_estimator( + estimator, + X, + y, + train_sizes=train_sizes, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Test"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, train_sizes_abs) + assert_allclose(y_data, test_scores.mean(axis=1)) + + score_type = "both" + display = LearningCurveDisplay.from_estimator( + estimator, + X, + y, + train_sizes=train_sizes, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train", "Test"] + + if std_display_style is None: + assert len(display.lines_) == 2 + assert display.errorbar_ is None + x_data_train, y_data_train = display.lines_[0].get_data() + x_data_test, y_data_test = display.lines_[1].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 2 + x_data_train, y_data_train = display.errorbar_[0].lines[0].get_data() + x_data_test, y_data_test = display.errorbar_[1].lines[0].get_data() + + assert_array_equal(x_data_train, train_sizes_abs) + assert_allclose(y_data_train, train_scores.mean(axis=1)) + assert_array_equal(x_data_test, train_sizes_abs) + assert_allclose(y_data_test, test_scores.mean(axis=1)) + + +@pytest.mark.parametrize("std_display_style", (None, "errorbar")) +def test_validation_curve_display_score_type(pyplot, data, std_display_style): + """Check the behaviour of setting the `score_type` parameter.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + param_name, param_range = "max_depth", [1, 3, 5] + train_scores, test_scores = validation_curve( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + score_type = "train" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, param_range) + 
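The assertions in this test all reduce to one shape convention, defined by the underlying `validation_curve` function: scores come back as `(n_param_values, n_cv_folds)` arrays and the display plots the per-parameter average over folds. A short standalone sketch of that convention (illustrative names only):

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import validation_curve
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = load_iris(return_X_y=True)
train_sc, test_sc = validation_curve(
    DecisionTreeClassifier(random_state=0),
    X_demo,
    y_demo,
    param_name="max_depth",
    param_range=[1, 3, 5],
    cv=5,
)
assert train_sc.shape == (3, 5)  # (n_param_values, n_cv_folds)
# These fold averages are what the display classes draw as curves.
print(train_sc.mean(axis=1), test_sc.mean(axis=1))
```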
assert_allclose(y_data, train_scores.mean(axis=1)) + + score_type = "test" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Test"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, param_range) + assert_allclose(y_data, test_scores.mean(axis=1)) + + score_type = "both" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train", "Test"] + + if std_display_style is None: + assert len(display.lines_) == 2 + assert display.errorbar_ is None + x_data_train, y_data_train = display.lines_[0].get_data() + x_data_test, y_data_test = display.lines_[1].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 2 + x_data_train, y_data_train = display.errorbar_[0].lines[0].get_data() + x_data_test, y_data_test = display.errorbar_[1].lines[0].get_data() + + assert_array_equal(x_data_train, param_range) + assert_allclose(y_data_train, train_scores.mean(axis=1)) + assert_array_equal(x_data_test, param_range) + assert_allclose(y_data_test, test_scores.mean(axis=1)) + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params, expected_xscale", + [ + ( + ValidationCurveDisplay, + {"param_name": "max_depth", "param_range": np.arange(1, 5)}, + "linear", + ), + (LearningCurveDisplay, {"train_sizes": np.linspace(0.1, 0.9, num=5)}, "linear"), + ( + ValidationCurveDisplay, + { + "param_name": "max_depth", + "param_range": np.round(np.logspace(0, 2, num=5)).astype(np.int64), + }, + "log", + ), + (LearningCurveDisplay, {"train_sizes": np.logspace(-1, 0, num=5)}, "log"), + ], +) +def test_curve_display_xscale_auto( + pyplot, data, CurveDisplay, specific_params, expected_xscale +): + """Check the behaviour of the x-axis scaling depending on the data provided.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + display = CurveDisplay.from_estimator(estimator, X, y, **specific_params) + assert display.ax_.get_xscale() == expected_xscale + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_std_display_style(pyplot, data, CurveDisplay, specific_params): + """Check the behaviour of the parameter `std_display_style`.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + import matplotlib as mpl + + std_display_style = None + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + ) + + assert len(display.lines_) == 2 + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + assert display.errorbar_ is None + assert display.fill_between_ is None + _, legend_label = display.ax_.get_legend_handles_labels() + assert len(legend_label) == 2 + + std_display_style = 
"fill_between" + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + ) + + assert len(display.lines_) == 2 + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + assert display.errorbar_ is None + assert len(display.fill_between_) == 2 + for fill_between in display.fill_between_: + assert isinstance(fill_between, mpl.collections.PolyCollection) + _, legend_label = display.ax_.get_legend_handles_labels() + assert len(legend_label) == 2 + + std_display_style = "errorbar" + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + ) + + assert display.lines_ is None + assert len(display.errorbar_) == 2 + for errorbar in display.errorbar_: + assert isinstance(errorbar, mpl.container.ErrorbarContainer) + assert display.fill_between_ is None + _, legend_label = display.ax_.get_legend_handles_labels() + assert len(legend_label) == 2 + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_plot_kwargs(pyplot, data, CurveDisplay, specific_params): + """Check the behaviour of the different plotting keyword arguments: `line_kw`, + `fill_between_kw`, and `errorbar_kw`.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + std_display_style = "fill_between" + line_kw = {"color": "red"} + fill_between_kw = {"color": "red", "alpha": 1.0} + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + ) + + assert display.lines_[0].get_color() == "red" + assert_allclose( + display.fill_between_[0].get_facecolor(), + [[1.0, 0.0, 0.0, 1.0]], # trust me, it's red + ) + + std_display_style = "errorbar" + errorbar_kw = {"color": "red"} + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + errorbar_kw=errorbar_kw, + ) + + assert display.errorbar_[0].lines[0].get_color() == "red" + + +@pytest.mark.parametrize( + "param_range, xscale", + [([5, 10, 15], "linear"), ([-50, 5, 50, 500], "symlog"), ([5, 50, 500], "log")], +) +def test_validation_curve_xscale_from_param_range_provided_as_a_list( + pyplot, data, param_range, xscale +): + """Check the induced xscale from the provided param_range values.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + param_name = "max_depth" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + ) + + assert display.ax_.get_xscale() == xscale + + +@pytest.mark.parametrize( + "Display, params", + [ + (LearningCurveDisplay, {}), + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + ], +) +def test_subclassing_displays(pyplot, data, Display, params): + """Check that named constructors return the correct type when subclassed. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/pull/27675 + """ + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + class SubclassOfDisplay(Display): + pass + + display = SubclassOfDisplay.from_estimator(estimator, X, y, **params) + assert isinstance(display, SubclassOfDisplay) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/tests/test_search.py b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_search.py new file mode 100644 index 0000000000000000000000000000000000000000..9c387d62b80d9321a3dcb9c595f471bfcdba4d70 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_search.py @@ -0,0 +1,2874 @@ +"""Test the search module""" + +import pickle +import re +import sys +import warnings +from collections.abc import Iterable, Sized +from functools import partial +from io import StringIO +from itertools import chain, product +from types import GeneratorType + +import numpy as np +import pytest +from scipy.stats import bernoulli, expon, uniform + +from sklearn import config_context +from sklearn.base import BaseEstimator, ClassifierMixin, is_classifier +from sklearn.cluster import KMeans +from sklearn.compose import ColumnTransformer +from sklearn.datasets import ( + make_blobs, + make_classification, + make_multilabel_classification, +) +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.exceptions import FitFailedWarning +from sklearn.experimental import enable_halving_search_cv # noqa +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.impute import SimpleImputer +from sklearn.linear_model import ( + LinearRegression, + LogisticRegression, + Ridge, + SGDClassifier, +) +from sklearn.metrics import ( + accuracy_score, + confusion_matrix, + f1_score, + make_scorer, + r2_score, + recall_score, + roc_auc_score, +) +from sklearn.metrics.pairwise import euclidean_distances +from sklearn.model_selection import ( + GridSearchCV, + GroupKFold, + GroupShuffleSplit, + HalvingGridSearchCV, + KFold, + LeaveOneGroupOut, + LeavePGroupsOut, + ParameterGrid, + ParameterSampler, + RandomizedSearchCV, + StratifiedKFold, + StratifiedShuffleSplit, + train_test_split, +) +from sklearn.model_selection._search import ( + BaseSearchCV, + _yield_masked_array_for_each_param, +) +from sklearn.model_selection.tests.common import OneTimeSplitter +from sklearn.naive_bayes import ComplementNB +from sklearn.neighbors import KernelDensity, KNeighborsClassifier, LocalOutlierFactor +from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.preprocessing import ( + OneHotEncoder, + OrdinalEncoder, + SplineTransformer, + StandardScaler, +) +from sklearn.svm import SVC, LinearSVC +from sklearn.tests.metadata_routing_common import ( + ConsumingScorer, + _Registry, + check_recorded_metadata, +) +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils._array_api import yield_namespace_device_dtype_combinations +from sklearn.utils._mocking import CheckingClassifier, MockDataFrame +from sklearn.utils._testing import ( + MinimalClassifier, + MinimalRegressor, + MinimalTransformer, + _array_api_for_tests, + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS +from sklearn.utils.validation import _num_samples + + +# Neither of the following two estimators 
is a full-fledged classifier; both are kept deliberately minimal
+# to exercise hyperparameter search on user-defined classifiers.
+class MockClassifier(ClassifierMixin, BaseEstimator):
+ """Dummy classifier to test the parameter search algorithms"""
+
+ def __init__(self, foo_param=0):
+ self.foo_param = foo_param
+
+ def fit(self, X, Y):
+ assert len(X) == len(Y)
+ self.classes_ = np.unique(Y)
+ return self
+
+ def predict(self, T):
+ return T.shape[0]
+
+ def transform(self, X):
+ return X + self.foo_param
+
+ def inverse_transform(self, X):
+ return X - self.foo_param
+
+ predict_proba = predict
+ predict_log_proba = predict
+ decision_function = predict
+
+ def score(self, X=None, Y=None):
+ if self.foo_param > 1:
+ score = 1.0
+ else:
+ score = 0.0
+ return score
+
+ def get_params(self, deep=False):
+ return {"foo_param": self.foo_param}
+
+ def set_params(self, **params):
+ self.foo_param = params["foo_param"]
+ return self
+
+
+class LinearSVCNoScore(LinearSVC):
+ """A LinearSVC classifier that has no score method."""
+
+ @property
+ def score(self):
+ raise AttributeError
+
+
+X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
+y = np.array([1, 1, 2, 2])
+
+
+def assert_grid_iter_equals_getitem(grid):
+ assert list(grid) == [grid[i] for i in range(len(grid))]
+
+
+@pytest.mark.parametrize("klass", [ParameterGrid, partial(ParameterSampler, n_iter=10)])
+@pytest.mark.parametrize(
+ "input, error_type, error_message",
+ [
+ (0, TypeError, r"Parameter .* a dict or a list, got: 0 of type int"),
+ ([{"foo": [0]}, 0], TypeError, r"Parameter .* is not a dict \(0\)"),
+ (
+ {"foo": 0},
+ TypeError,
+ r"Parameter (grid|distribution) for parameter 'foo' (is not|needs to be) "
+ r"(a list or a numpy array|iterable or a distribution).*",
+ ),
+ ],
+)
+def test_validate_parameter_input(klass, input, error_type, error_message):
+ with pytest.raises(error_type, match=error_message):
+ klass(input)
+
+
+def test_parameter_grid():
+ # Test basic properties of ParameterGrid.
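Before the detailed property checks below, a compact standalone reminder of the `ParameterGrid` semantics being tested: it materializes the Cartesian product of the value lists as a sequence of dicts supporting `len`, iteration, and integer indexing, with keys handled in sorted order (illustrative sketch, not part of the test suite):

```python
from sklearn.model_selection import ParameterGrid

grid = ParameterGrid({"kernel": ["rbf", "linear"], "C": [1, 10]})
assert len(grid) == 4  # 2 values of C x 2 kernels
# Keys are sorted ("C" before "kernel") and the last key varies fastest.
assert grid[0] == {"C": 1, "kernel": "rbf"}
assert list(grid)[-1] == {"C": 10, "kernel": "linear"}
```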
+ params1 = {"foo": [1, 2, 3]} + grid1 = ParameterGrid(params1) + assert isinstance(grid1, Iterable) + assert isinstance(grid1, Sized) + assert len(grid1) == 3 + assert_grid_iter_equals_getitem(grid1) + + params2 = {"foo": [4, 2], "bar": ["ham", "spam", "eggs"]} + grid2 = ParameterGrid(params2) + assert len(grid2) == 6 + + # loop to assert we can iterate over the grid multiple times + for i in range(2): + # tuple + chain transforms {"a": 1, "b": 2} to ("a", 1, "b", 2) + points = set(tuple(chain(*(sorted(p.items())))) for p in grid2) + assert points == set( + ("bar", x, "foo", y) for x, y in product(params2["bar"], params2["foo"]) + ) + assert_grid_iter_equals_getitem(grid2) + + # Special case: empty grid (useful to get default estimator settings) + empty = ParameterGrid({}) + assert len(empty) == 1 + assert list(empty) == [{}] + assert_grid_iter_equals_getitem(empty) + with pytest.raises(IndexError): + empty[1] + + has_empty = ParameterGrid([{"C": [1, 10]}, {}, {"C": [0.5]}]) + assert len(has_empty) == 4 + assert list(has_empty) == [{"C": 1}, {"C": 10}, {}, {"C": 0.5}] + assert_grid_iter_equals_getitem(has_empty) + + +def test_grid_search(): + # Test that the best estimator contains the right value for foo_param + clf = MockClassifier() + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=2, verbose=3) + # make sure it selects the smallest parameter in case of ties + old_stdout = sys.stdout + sys.stdout = StringIO() + grid_search.fit(X, y) + sys.stdout = old_stdout + assert grid_search.best_estimator_.foo_param == 2 + + assert_array_equal(grid_search.cv_results_["param_foo_param"].data, [1, 2, 3]) + + # Smoke test the score etc: + grid_search.score(X, y) + grid_search.predict_proba(X) + grid_search.decision_function(X) + grid_search.transform(X) + + # Test exception handling on scoring + grid_search.scoring = "sklearn" + with pytest.raises(ValueError): + grid_search.fit(X, y) + + +def test_grid_search_pipeline_steps(): + # check that parameters that are estimators are cloned before fitting + pipe = Pipeline([("regressor", LinearRegression())]) + param_grid = {"regressor": [LinearRegression(), Ridge()]} + grid_search = GridSearchCV(pipe, param_grid, cv=2) + grid_search.fit(X, y) + regressor_results = grid_search.cv_results_["param_regressor"] + assert isinstance(regressor_results[0], LinearRegression) + assert isinstance(regressor_results[1], Ridge) + assert not hasattr(regressor_results[0], "coef_") + assert not hasattr(regressor_results[1], "coef_") + assert regressor_results[0] is not grid_search.best_estimator_ + assert regressor_results[1] is not grid_search.best_estimator_ + # check that we didn't modify the parameter grid that was passed + assert not hasattr(param_grid["regressor"][0], "coef_") + assert not hasattr(param_grid["regressor"][1], "coef_") + + +@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV]) +def test_SearchCV_with_fit_params(SearchCV): + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + clf = CheckingClassifier(expected_fit_params=["spam", "eggs"]) + searcher = SearchCV(clf, {"foo_param": [1, 2, 3]}, cv=2, error_score="raise") + + # The CheckingClassifier generates an assertion error if + # a parameter is missing or has length != len(X). + err_msg = r"Expected fit parameter\(s\) \['eggs'\] not seen." 
+ with pytest.raises(AssertionError, match=err_msg): + searcher.fit(X, y, spam=np.ones(10)) + + err_msg = "Fit parameter spam has length 1; expected" + with pytest.raises(AssertionError, match=err_msg): + searcher.fit(X, y, spam=np.ones(1), eggs=np.zeros(10)) + searcher.fit(X, y, spam=np.ones(10), eggs=np.zeros(10)) + + +def test_grid_search_no_score(): + # Test grid-search on classifier that has no score function. + clf = LinearSVC(random_state=0) + X, y = make_blobs(random_state=0, centers=2) + Cs = [0.1, 1, 10] + clf_no_score = LinearSVCNoScore(random_state=0) + grid_search = GridSearchCV(clf, {"C": Cs}, scoring="accuracy") + grid_search.fit(X, y) + + grid_search_no_score = GridSearchCV(clf_no_score, {"C": Cs}, scoring="accuracy") + # smoketest grid search + grid_search_no_score.fit(X, y) + + # check that best params are equal + assert grid_search_no_score.best_params_ == grid_search.best_params_ + # check that we can call score and that it gives the correct result + assert grid_search.score(X, y) == grid_search_no_score.score(X, y) + + # giving no scoring function raises an error + grid_search_no_score = GridSearchCV(clf_no_score, {"C": Cs}) + with pytest.raises(TypeError, match="no scoring"): + grid_search_no_score.fit([[1]]) + + +def test_grid_search_score_method(): + X, y = make_classification(n_samples=100, n_classes=2, flip_y=0.2, random_state=0) + clf = LinearSVC(random_state=0) + grid = {"C": [0.1]} + + search_no_scoring = GridSearchCV(clf, grid, scoring=None).fit(X, y) + search_accuracy = GridSearchCV(clf, grid, scoring="accuracy").fit(X, y) + search_no_score_method_auc = GridSearchCV( + LinearSVCNoScore(), grid, scoring="roc_auc" + ).fit(X, y) + search_auc = GridSearchCV(clf, grid, scoring="roc_auc").fit(X, y) + + # Check warning only occurs in situation where behavior changed: + # estimator requires score method to compete with scoring parameter + score_no_scoring = search_no_scoring.score(X, y) + score_accuracy = search_accuracy.score(X, y) + score_no_score_auc = search_no_score_method_auc.score(X, y) + score_auc = search_auc.score(X, y) + + # ensure the test is sane + assert score_auc < 1.0 + assert score_accuracy < 1.0 + assert score_auc != score_accuracy + + assert_almost_equal(score_accuracy, score_no_scoring) + assert_almost_equal(score_auc, score_no_score_auc) + + +def test_grid_search_groups(): + # Check if ValueError (when groups is None) propagates to GridSearchCV + # And also check if groups is correctly passed to the cv object + rng = np.random.RandomState(0) + + X, y = make_classification(n_samples=15, n_classes=2, random_state=0) + groups = rng.randint(0, 3, 15) + + clf = LinearSVC(random_state=0) + grid = {"C": [1]} + + group_cvs = [ + LeaveOneGroupOut(), + LeavePGroupsOut(2), + GroupKFold(n_splits=3), + GroupShuffleSplit(), + ] + error_msg = "The 'groups' parameter should not be None." 
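As the loop below demonstrates, group-aware splitters cannot produce splits without group labels, so `groups` must be forwarded through `fit`; omitting it triggers the ValueError checked here. A standalone sketch of the working call (illustrative data):

```python
import numpy as np

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.svm import LinearSVC

X_demo, y_demo = make_classification(n_samples=15, random_state=0)
groups_demo = np.random.RandomState(0).randint(0, 3, 15)  # three groups

search = GridSearchCV(LinearSVC(), {"C": [1]}, cv=GroupKFold(n_splits=3))
# Every split keeps whole groups together, so samples from the same group
# never appear in both the train and the test fold.
search.fit(X_demo, y_demo, groups=groups_demo)
print(search.best_score_)
```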
+ for cv in group_cvs: + gs = GridSearchCV(clf, grid, cv=cv) + with pytest.raises(ValueError, match=error_msg): + gs.fit(X, y) + gs.fit(X, y, groups=groups) + + non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()] + for cv in non_group_cvs: + gs = GridSearchCV(clf, grid, cv=cv) + # Should not raise an error + gs.fit(X, y) + + +def test_classes__property(): + # Test that classes_ property matches best_estimator_.classes_ + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + Cs = [0.1, 1, 10] + + grid_search = GridSearchCV(LinearSVC(random_state=0), {"C": Cs}) + grid_search.fit(X, y) + assert_array_equal(grid_search.best_estimator_.classes_, grid_search.classes_) + + # Test that regressors do not have a classes_ attribute + grid_search = GridSearchCV(Ridge(), {"alpha": [1.0, 2.0]}) + grid_search.fit(X, y) + assert not hasattr(grid_search, "classes_") + + # Test that the grid searcher has no classes_ attribute before it's fit + grid_search = GridSearchCV(LinearSVC(random_state=0), {"C": Cs}) + assert not hasattr(grid_search, "classes_") + + # Test that the grid searcher has no classes_ attribute without a refit + grid_search = GridSearchCV(LinearSVC(random_state=0), {"C": Cs}, refit=False) + grid_search.fit(X, y) + assert not hasattr(grid_search, "classes_") + + +def test_trivial_cv_results_attr(): + # Test search over a "grid" with only one point. + clf = MockClassifier() + grid_search = GridSearchCV(clf, {"foo_param": [1]}, cv=2) + grid_search.fit(X, y) + assert hasattr(grid_search, "cv_results_") + + random_search = RandomizedSearchCV(clf, {"foo_param": [0]}, n_iter=1, cv=2) + random_search.fit(X, y) + assert hasattr(grid_search, "cv_results_") + + +def test_no_refit(): + # Test that GSCV can be used for model selection alone without refitting + clf = MockClassifier() + for scoring in [None, ["accuracy", "precision"]]: + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, refit=False, cv=2) + grid_search.fit(X, y) + assert ( + not hasattr(grid_search, "best_estimator_") + and hasattr(grid_search, "best_index_") + and hasattr(grid_search, "best_params_") + ) + + # Make sure the functions predict/transform etc. raise meaningful + # error messages + for fn_name in ( + "predict", + "predict_proba", + "predict_log_proba", + "transform", + "inverse_transform", + ): + outer_msg = f"has no attribute '{fn_name}'" + inner_msg = ( + f"`refit=False`. 
{fn_name} is available only after " + "refitting on the best parameters" + ) + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + getattr(grid_search, fn_name)(X) + + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + # Test that an invalid refit param raises appropriate error messages + error_msg = ( + "For multi-metric scoring, the parameter refit must be set to a scorer key" + ) + for refit in [True, "recall", "accuracy"]: + with pytest.raises(ValueError, match=error_msg): + GridSearchCV( + clf, {}, refit=refit, scoring={"acc": "accuracy", "prec": "precision"} + ).fit(X, y) + + +def test_grid_search_error(): + # Test that grid search will capture errors on data with different length + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + with pytest.raises(ValueError): + cv.fit(X_[:180], y_) + + +def test_grid_search_one_grid_point(): + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]} + + clf = SVC(gamma="auto") + cv = GridSearchCV(clf, param_dict) + cv.fit(X_, y_) + + clf = SVC(C=1.0, kernel="rbf", gamma=0.1) + clf.fit(X_, y_) + + assert_array_equal(clf.dual_coef_, cv.best_estimator_.dual_coef_) + + +def test_grid_search_when_param_grid_includes_range(): + # Test that the best estimator contains the right value for foo_param + clf = MockClassifier() + grid_search = None + grid_search = GridSearchCV(clf, {"foo_param": range(1, 4)}, cv=2) + grid_search.fit(X, y) + assert grid_search.best_estimator_.foo_param == 2 + + +def test_grid_search_bad_param_grid(): + X, y = make_classification(n_samples=10, n_features=5, random_state=0) + param_dict = {"C": 1} + clf = SVC(gamma="auto") + error_msg = re.escape( + "Parameter grid for parameter 'C' needs to be a list or " + "a numpy array, but got 1 (of type int) instead. Single " + "values need to be wrapped in a list with one element." + ) + search = GridSearchCV(clf, param_dict) + with pytest.raises(TypeError, match=error_msg): + search.fit(X, y) + + param_dict = {"C": []} + clf = SVC() + error_msg = re.escape( + "Parameter grid for parameter 'C' need to be a non-empty sequence, got: []" + ) + search = GridSearchCV(clf, param_dict) + with pytest.raises(ValueError, match=error_msg): + search.fit(X, y) + + param_dict = {"C": "1,2,3"} + clf = SVC(gamma="auto") + error_msg = re.escape( + "Parameter grid for parameter 'C' needs to be a list or a numpy array, " + "but got '1,2,3' (of type str) instead. Single values need to be " + "wrapped in a list with one element." 
+ ) + search = GridSearchCV(clf, param_dict) + with pytest.raises(TypeError, match=error_msg): + search.fit(X, y) + + param_dict = {"C": np.ones((3, 2))} + clf = SVC() + search = GridSearchCV(clf, param_dict) + with pytest.raises(ValueError): + search.fit(X, y) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_grid_search_sparse(csr_container): + # Test that grid search works with both dense and sparse matrices + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + cv.fit(X_[:180], y_[:180]) + y_pred = cv.predict(X_[180:]) + C = cv.best_estimator_.C + + X_ = csr_container(X_) + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + cv.fit(X_[:180].tocoo(), y_[:180]) + y_pred2 = cv.predict(X_[180:]) + C2 = cv.best_estimator_.C + + assert np.mean(y_pred == y_pred2) >= 0.9 + assert C == C2 + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_grid_search_sparse_scoring(csr_container): + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}, scoring="f1") + cv.fit(X_[:180], y_[:180]) + y_pred = cv.predict(X_[180:]) + C = cv.best_estimator_.C + + X_ = csr_container(X_) + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}, scoring="f1") + cv.fit(X_[:180], y_[:180]) + y_pred2 = cv.predict(X_[180:]) + C2 = cv.best_estimator_.C + + assert_array_equal(y_pred, y_pred2) + assert C == C2 + # Smoke test the score + # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]), + # cv.score(X_[:180], y[:180])) + + # test loss where greater is worse + def f1_loss(y_true_, y_pred_): + return -f1_score(y_true_, y_pred_) + + F1Loss = make_scorer(f1_loss, greater_is_better=False) + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}, scoring=F1Loss) + cv.fit(X_[:180], y_[:180]) + y_pred3 = cv.predict(X_[180:]) + C3 = cv.best_estimator_.C + + assert C == C3 + assert_array_equal(y_pred, y_pred3) + + +def test_grid_search_precomputed_kernel(): + # Test that grid search works when the input features are given in the + # form of a precomputed kernel matrix + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + + # compute the training kernel matrix corresponding to the linear kernel + K_train = np.dot(X_[:180], X_[:180].T) + y_train = y_[:180] + + clf = SVC(kernel="precomputed") + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + cv.fit(K_train, y_train) + + assert cv.best_score_ >= 0 + + # compute the test kernel matrix + K_test = np.dot(X_[180:], X_[:180].T) + y_test = y_[180:] + + y_pred = cv.predict(K_test) + + assert np.mean(y_pred == y_test) >= 0 + + # test error is raised when the precomputed kernel is not array-like + # or sparse + with pytest.raises(ValueError): + cv.fit(K_train.tolist(), y_train) + + +def test_grid_search_precomputed_kernel_error_nonsquare(): + # Test that grid search returns an error with a non-square precomputed + # training kernel matrix + K_train = np.zeros((10, 20)) + y_train = np.ones((10,)) + clf = SVC(kernel="precomputed") + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + with pytest.raises(ValueError): + cv.fit(K_train, y_train) + + +class BrokenClassifier(BaseEstimator): + """Broken classifier that cannot be fit twice""" + + def __init__(self, parameter=None): + self.parameter = parameter + + def fit(self, X, y): + assert not hasattr(self, "has_been_fit_") + self.has_been_fit_ = True + + def predict(self, X): + return 
np.zeros(X.shape[0]) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.UndefinedMetricWarning") +def test_refit(): + # Regression test for bug in refitting + # Simulates re-fitting a broken estimator; this used to break with + # sparse SVMs. + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + + clf = GridSearchCV( + BrokenClassifier(), [{"parameter": [0, 1]}], scoring="precision", refit=True + ) + clf.fit(X, y) + + +def test_refit_callable(): + """ + Test refit=callable, which adds flexibility in identifying the + "best" estimator. + """ + + def refit_callable(cv_results): + """ + A dummy function tests `refit=callable` interface. + Return the index of a model that has the least + `mean_test_score`. + """ + # Fit a dummy clf with `refit=True` to get a list of keys in + # clf.cv_results_. + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.01, 0.1, 1]}, + scoring="precision", + refit=True, + ) + clf.fit(X, y) + # Ensure that `best_index_ != 0` for this dummy clf + assert clf.best_index_ != 0 + + # Assert every key matches those in `cv_results` + for key in clf.cv_results_.keys(): + assert key in cv_results + + return cv_results["mean_test_score"].argmin() + + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.01, 0.1, 1]}, + scoring="precision", + refit=refit_callable, + ) + clf.fit(X, y) + + assert clf.best_index_ == 0 + # Ensure `best_score_` is disabled when using `refit=callable` + assert not hasattr(clf, "best_score_") + + +def test_refit_callable_invalid_type(): + """ + Test implementation catches the errors when 'best_index_' returns an + invalid result. + """ + + def refit_callable_invalid_type(cv_results): + """ + A dummy function tests when returned 'best_index_' is not integer. + """ + return None + + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.1, 1]}, + scoring="precision", + refit=refit_callable_invalid_type, + ) + with pytest.raises(TypeError, match="best_index_ returned is not an integer"): + clf.fit(X, y) + + +@pytest.mark.parametrize("out_bound_value", [-1, 2]) +@pytest.mark.parametrize("search_cv", [RandomizedSearchCV, GridSearchCV]) +def test_refit_callable_out_bound(out_bound_value, search_cv): + """ + Test implementation catches the errors when 'best_index_' returns an + out of bound result. + """ + + def refit_callable_out_bound(cv_results): + """ + A dummy function tests when returned 'best_index_' is out of bounds. + """ + return out_bound_value + + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + + clf = search_cv( + LinearSVC(random_state=42), + {"C": [0.1, 1]}, + scoring="precision", + refit=refit_callable_out_bound, + ) + with pytest.raises(IndexError, match="best_index_ index out of range"): + clf.fit(X, y) + + +def test_refit_callable_multi_metric(): + """ + Test refit=callable in multiple metric evaluation setting + """ + + def refit_callable(cv_results): + """ + A dummy function tests `refit=callable` interface. + Return the index of a model that has the least + `mean_test_prec`. 
+ """ + assert "mean_test_prec" in cv_results + return cv_results["mean_test_prec"].argmin() + + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + scoring = {"Accuracy": make_scorer(accuracy_score), "prec": "precision"} + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.01, 0.1, 1]}, + scoring=scoring, + refit=refit_callable, + ) + clf.fit(X, y) + + assert clf.best_index_ == 0 + # Ensure `best_score_` is disabled when using `refit=callable` + assert not hasattr(clf, "best_score_") + + +def test_gridsearch_nd(): + # Pass X as list in GridSearchCV + X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2) + y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11) + + def check_X(x): + return x.shape[1:] == (5, 3, 2) + + def check_y(x): + return x.shape[1:] == (7, 11) + + clf = CheckingClassifier( + check_X=check_X, + check_y=check_y, + methods_to_check=["fit"], + ) + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}) + grid_search.fit(X_4d, y_3d).score(X, y) + assert hasattr(grid_search, "cv_results_") + + +def test_X_as_list(): + # Pass X as list in GridSearchCV + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + + clf = CheckingClassifier( + check_X=lambda x: isinstance(x, list), + methods_to_check=["fit"], + ) + cv = KFold(n_splits=3) + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=cv) + grid_search.fit(X.tolist(), y).score(X, y) + assert hasattr(grid_search, "cv_results_") + + +def test_y_as_list(): + # Pass y as list in GridSearchCV + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + + clf = CheckingClassifier( + check_y=lambda x: isinstance(x, list), + methods_to_check=["fit"], + ) + cv = KFold(n_splits=3) + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=cv) + grid_search.fit(X, y.tolist()).score(X, y) + assert hasattr(grid_search, "cv_results_") + + +def test_pandas_input(): + # check cross_val_score doesn't destroy pandas dataframe + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((DataFrame, Series)) + except ImportError: + pass + + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + + for InputFeatureType, TargetType in types: + # X dataframe, y series + X_df, y_ser = InputFeatureType(X), TargetType(y) + + def check_df(x): + return isinstance(x, InputFeatureType) + + def check_series(x): + return isinstance(x, TargetType) + + clf = CheckingClassifier(check_X=check_df, check_y=check_series) + + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}) + grid_search.fit(X_df, y_ser).score(X_df, y_ser) + grid_search.predict(X_df) + assert hasattr(grid_search, "cv_results_") + + +def test_unsupervised_grid_search(): + # test grid-search with unsupervised estimator + X, y = make_blobs(n_samples=50, random_state=0) + km = KMeans(random_state=0, init="random", n_init=1) + + # Multi-metric evaluation unsupervised + scoring = ["adjusted_rand_score", "fowlkes_mallows_score"] + for refit in ["adjusted_rand_score", "fowlkes_mallows_score"]: + grid_search = GridSearchCV( + km, param_grid=dict(n_clusters=[2, 3, 4]), scoring=scoring, refit=refit + ) + grid_search.fit(X, y) + # Both ARI and FMS can find the right number :) + assert grid_search.best_params_["n_clusters"] == 3 + + # Single metric evaluation unsupervised + grid_search = GridSearchCV( + km, param_grid=dict(n_clusters=[2, 3, 4]), scoring="fowlkes_mallows_score" + ) + grid_search.fit(X, y) + assert grid_search.best_params_["n_clusters"] == 3 + + # Now without a 
score, and without y + grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4])) + grid_search.fit(X) + assert grid_search.best_params_["n_clusters"] == 4 + + +def test_gridsearch_no_predict(): + # test grid-search with an estimator without predict. + # slight duplication of a test from KDE + def custom_scoring(estimator, X): + return 42 if estimator.bandwidth == 0.1 else 0 + + X, _ = make_blobs(cluster_std=0.1, random_state=1, centers=[[0, 1], [1, 0], [0, 0]]) + search = GridSearchCV( + KernelDensity(), + param_grid=dict(bandwidth=[0.01, 0.1, 1]), + scoring=custom_scoring, + ) + search.fit(X) + assert search.best_params_["bandwidth"] == 0.1 + assert search.best_score_ == 42 + + +def test_param_sampler(): + # test basic properties of param sampler + param_distributions = {"kernel": ["rbf", "linear"], "C": uniform(0, 1)} + sampler = ParameterSampler( + param_distributions=param_distributions, n_iter=10, random_state=0 + ) + samples = [x for x in sampler] + assert len(samples) == 10 + for sample in samples: + assert sample["kernel"] in ["rbf", "linear"] + assert 0 <= sample["C"] <= 1 + + # test that repeated calls yield identical parameters + param_distributions = {"C": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]} + sampler = ParameterSampler( + param_distributions=param_distributions, n_iter=3, random_state=0 + ) + assert [x for x in sampler] == [x for x in sampler] + + param_distributions = {"C": uniform(0, 1)} + sampler = ParameterSampler( + param_distributions=param_distributions, n_iter=10, random_state=0 + ) + assert [x for x in sampler] == [x for x in sampler] + + +def check_cv_results_array_types( + search, param_keys, score_keys, expected_cv_results_kinds +): + # Check if the search `cv_results`'s array are of correct types + cv_results = search.cv_results_ + assert all(isinstance(cv_results[param], np.ma.MaskedArray) for param in param_keys) + assert { + key: cv_results[key].dtype.kind for key in param_keys + } == expected_cv_results_kinds + assert not any(isinstance(cv_results[key], np.ma.MaskedArray) for key in score_keys) + assert all( + cv_results[key].dtype == np.float64 + for key in score_keys + if not key.startswith("rank") + ) + + scorer_keys = search.scorer_.keys() if search.multimetric_ else ["score"] + + for key in scorer_keys: + assert cv_results["rank_test_%s" % key].dtype == np.int32 + + +def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand, extra_keys=()): + # Test the search.cv_results_ contains all the required results + all_keys = param_keys + score_keys + extra_keys + assert_array_equal(sorted(cv_results.keys()), sorted(all_keys + ("params",))) + assert all(cv_results[key].shape == (n_cand,) for key in param_keys + score_keys) + + +def test_grid_search_cv_results(): + X, y = make_classification(n_samples=50, n_features=4, random_state=42) + + n_grid_points = 6 + params = [ + dict( + kernel=[ + "rbf", + ], + C=[1, 10], + gamma=[0.1, 1], + ), + dict( + kernel=[ + "poly", + ], + degree=[1, 2], + ), + ] + + param_keys = ("param_C", "param_degree", "param_gamma", "param_kernel") + score_keys = ( + "mean_test_score", + "mean_train_score", + "rank_test_score", + "split0_test_score", + "split1_test_score", + "split2_test_score", + "split0_train_score", + "split1_train_score", + "split2_train_score", + "std_test_score", + "std_train_score", + "mean_fit_time", + "std_fit_time", + "mean_score_time", + "std_score_time", + ) + n_candidates = n_grid_points + + search = GridSearchCV(SVC(), cv=3, param_grid=params, return_train_score=True) + search.fit(X, y) + 
cv_results = search.cv_results_
+    # Check if score and timing are reasonable
+    assert all(cv_results["rank_test_score"] >= 1)
+    assert all(
+        np.all(cv_results[k] >= 0) for k in score_keys if k != "rank_test_score"
+    )
+    assert all(
+        np.all(cv_results[k] <= 1)
+        for k in score_keys
+        if "time" not in k and k != "rank_test_score"
+    )
+    # Check cv_results structure
+    expected_cv_results_kinds = {
+        "param_C": "i",
+        "param_degree": "i",
+        "param_gamma": "f",
+        "param_kernel": "O",
+    }
+    check_cv_results_array_types(
+        search, param_keys, score_keys, expected_cv_results_kinds
+    )
+    check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
+    # Check masking
+    cv_results = search.cv_results_
+
+    poly_results = [
+        (
+            cv_results["param_C"].mask[i]
+            and cv_results["param_gamma"].mask[i]
+            and not cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "poly"
+    ]
+    assert all(poly_results)
+    assert len(poly_results) == 2
+
+    rbf_results = [
+        (
+            not cv_results["param_C"].mask[i]
+            and not cv_results["param_gamma"].mask[i]
+            and cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "rbf"
+    ]
+    assert all(rbf_results)
+    assert len(rbf_results) == 4
+
+
+def test_random_search_cv_results():
+    X, y = make_classification(n_samples=50, n_features=4, random_state=42)
+
+    n_search_iter = 30
+
+    params = [
+        {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)},
+        {"kernel": ["poly"], "degree": [2, 3]},
+    ]
+    param_keys = ("param_C", "param_degree", "param_gamma", "param_kernel")
+    score_keys = (
+        "mean_test_score",
+        "mean_train_score",
+        "rank_test_score",
+        "split0_test_score",
+        "split1_test_score",
+        "split2_test_score",
+        "split0_train_score",
+        "split1_train_score",
+        "split2_train_score",
+        "std_test_score",
+        "std_train_score",
+        "mean_fit_time",
+        "std_fit_time",
+        "mean_score_time",
+        "std_score_time",
+    )
+    n_candidates = n_search_iter
+
+    search = RandomizedSearchCV(
+        SVC(),
+        n_iter=n_search_iter,
+        cv=3,
+        param_distributions=params,
+        return_train_score=True,
+    )
+    search.fit(X, y)
+    cv_results = search.cv_results_
+    # Check results structure
+    expected_cv_results_kinds = {
+        "param_C": "f",
+        "param_degree": "i",
+        "param_gamma": "f",
+        "param_kernel": "O",
+    }
+    check_cv_results_array_types(
+        search, param_keys, score_keys, expected_cv_results_kinds
+    )
+    check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
+    assert all(
+        (
+            cv_results["param_C"].mask[i]
+            and cv_results["param_gamma"].mask[i]
+            and not cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "poly"
+    )
+    assert all(
+        (
+            not cv_results["param_C"].mask[i]
+            and not cv_results["param_gamma"].mask[i]
+            and cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "rbf"
+    )
+
+
+@pytest.mark.parametrize(
+    "SearchCV, specialized_params",
+    [
+        (GridSearchCV, {"param_grid": {"C": [1, 10]}}),
+        (RandomizedSearchCV, {"param_distributions": {"C": [1, 10]}, "n_iter": 2}),
+    ],
+)
+def test_search_default_iid(SearchCV, specialized_params):
+    # Check that the mean/std of the cv scores are computed as plain,
+    # unweighted aggregates over the splits (historically this checked the
+    # removed `iid` weighting behavior).
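+    # The two custom folds built below have different sizes (60 and 20
+    # samples), so a fold-size-weighted mean of the split scores would differ
+    # from the plain unweighted mean asserted at the end of this test.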
+ # noise-free simple 2d-data + X, y = make_blobs( + centers=[[0, 0], [1, 0], [0, 1], [1, 1]], + random_state=0, + cluster_std=0.1, + shuffle=False, + n_samples=80, + ) + # split dataset into two folds that are not iid + # first one contains data of all 4 blobs, second only from two. + mask = np.ones(X.shape[0], dtype=bool) + mask[np.where(y == 1)[0][::2]] = 0 + mask[np.where(y == 2)[0][::2]] = 0 + # this leads to perfect classification on one fold and a score of 1/3 on + # the other + # create "cv" for splits + cv = [[mask, ~mask], [~mask, mask]] + + common_params = {"estimator": SVC(), "cv": cv, "return_train_score": True} + search = SearchCV(**common_params, **specialized_params) + search.fit(X, y) + + test_cv_scores = np.array( + [ + search.cv_results_["split%d_test_score" % s][0] + for s in range(search.n_splits_) + ] + ) + test_mean = search.cv_results_["mean_test_score"][0] + test_std = search.cv_results_["std_test_score"][0] + + train_cv_scores = np.array( + [ + search.cv_results_["split%d_train_score" % s][0] + for s in range(search.n_splits_) + ] + ) + train_mean = search.cv_results_["mean_train_score"][0] + train_std = search.cv_results_["std_train_score"][0] + + assert search.cv_results_["param_C"][0] == 1 + # scores are the same as above + assert_allclose(test_cv_scores, [1, 1.0 / 3.0]) + assert_allclose(train_cv_scores, [1, 1]) + # Unweighted mean/std is used + assert test_mean == pytest.approx(np.mean(test_cv_scores)) + assert test_std == pytest.approx(np.std(test_cv_scores)) + + # For the train scores, we do not take a weighted mean irrespective of + # i.i.d. or not + assert train_mean == pytest.approx(1) + assert train_std == pytest.approx(0) + + +def test_grid_search_cv_results_multimetric(): + X, y = make_classification(n_samples=50, n_features=4, random_state=42) + + n_splits = 3 + params = [ + dict( + kernel=[ + "rbf", + ], + C=[1, 10], + gamma=[0.1, 1], + ), + dict( + kernel=[ + "poly", + ], + degree=[1, 2], + ), + ] + + grid_searches = [] + for scoring in ( + {"accuracy": make_scorer(accuracy_score), "recall": make_scorer(recall_score)}, + "accuracy", + "recall", + ): + grid_search = GridSearchCV( + SVC(), cv=n_splits, param_grid=params, scoring=scoring, refit=False + ) + grid_search.fit(X, y) + grid_searches.append(grid_search) + + compare_cv_results_multimetric_with_single(*grid_searches) + + +def test_random_search_cv_results_multimetric(): + X, y = make_classification(n_samples=50, n_features=4, random_state=42) + + n_splits = 3 + n_search_iter = 30 + + # Scipy 0.12's stats dists do not accept seed, hence we use param grid + params = dict(C=np.logspace(-4, 1, 3), gamma=np.logspace(-5, 0, 3, base=0.1)) + for refit in (True, False): + random_searches = [] + for scoring in (("accuracy", "recall"), "accuracy", "recall"): + # If True, for multi-metric pass refit='accuracy' + if refit: + probability = True + refit = "accuracy" if isinstance(scoring, tuple) else refit + else: + probability = False + clf = SVC(probability=probability, random_state=42) + random_search = RandomizedSearchCV( + clf, + n_iter=n_search_iter, + cv=n_splits, + param_distributions=params, + scoring=scoring, + refit=refit, + random_state=0, + ) + random_search.fit(X, y) + random_searches.append(random_search) + + compare_cv_results_multimetric_with_single(*random_searches) + compare_refit_methods_when_refit_with_acc( + random_searches[0], random_searches[1], refit + ) + + +def compare_cv_results_multimetric_with_single(search_multi, search_acc, search_rec): + """Compare multi-metric cv_results 
with the ensemble of multiple + single metric cv_results from single metric grid/random search""" + + assert search_multi.multimetric_ + assert_array_equal(sorted(search_multi.scorer_), ("accuracy", "recall")) + + cv_results_multi = search_multi.cv_results_ + cv_results_acc_rec = { + re.sub("_score$", "_accuracy", k): v for k, v in search_acc.cv_results_.items() + } + cv_results_acc_rec.update( + {re.sub("_score$", "_recall", k): v for k, v in search_rec.cv_results_.items()} + ) + + # Check if score and timing are reasonable, also checks if the keys + # are present + assert all( + ( + np.all(cv_results_multi[k] <= 1) + for k in ( + "mean_score_time", + "std_score_time", + "mean_fit_time", + "std_fit_time", + ) + ) + ) + + # Compare the keys, other than time keys, among multi-metric and + # single metric grid search results. np.testing.assert_equal performs a + # deep nested comparison of the two cv_results dicts + np.testing.assert_equal( + {k: v for k, v in cv_results_multi.items() if not k.endswith("_time")}, + {k: v for k, v in cv_results_acc_rec.items() if not k.endswith("_time")}, + ) + + +def compare_refit_methods_when_refit_with_acc(search_multi, search_acc, refit): + """Compare refit multi-metric search methods with single metric methods""" + assert search_acc.refit == refit + if refit: + assert search_multi.refit == "accuracy" + else: + assert not search_multi.refit + return # search cannot predict/score without refit + + X, y = make_blobs(n_samples=100, n_features=4, random_state=42) + for method in ("predict", "predict_proba", "predict_log_proba"): + assert_almost_equal( + getattr(search_multi, method)(X), getattr(search_acc, method)(X) + ) + assert_almost_equal(search_multi.score(X, y), search_acc.score(X, y)) + for key in ("best_index_", "best_score_", "best_params_"): + assert getattr(search_multi, key) == getattr(search_acc, key) + + +@pytest.mark.parametrize( + "search_cv", + [ + RandomizedSearchCV( + estimator=DecisionTreeClassifier(), + param_distributions={"max_depth": [5, 10]}, + ), + GridSearchCV( + estimator=DecisionTreeClassifier(), param_grid={"max_depth": [5, 10]} + ), + ], +) +def test_search_cv_score_samples_error(search_cv): + X, y = make_blobs(n_samples=100, n_features=4, random_state=42) + search_cv.fit(X, y) + + # Make sure to error out when underlying estimator does not implement + # the method `score_samples` + outer_msg = f"'{search_cv.__class__.__name__}' has no attribute 'score_samples'" + inner_msg = "'DecisionTreeClassifier' object has no attribute 'score_samples'" + + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + search_cv.score_samples(X) + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg == str(exec_info.value.__cause__) + + +@pytest.mark.parametrize( + "search_cv", + [ + RandomizedSearchCV( + estimator=LocalOutlierFactor(novelty=True), + param_distributions={"n_neighbors": [5, 10]}, + scoring="precision", + ), + GridSearchCV( + estimator=LocalOutlierFactor(novelty=True), + param_grid={"n_neighbors": [5, 10]}, + scoring="precision", + ), + ], +) +def test_search_cv_score_samples_method(search_cv): + # Set parameters + rng = np.random.RandomState(42) + n_samples = 300 + outliers_fraction = 0.15 + n_outliers = int(outliers_fraction * n_samples) + n_inliers = n_samples - n_outliers + + # Create dataset + X = make_blobs( + n_samples=n_inliers, + n_features=2, + centers=[[0, 0], [0, 0]], + cluster_std=0.5, + random_state=0, + )[0] + # Add some noisy points + X = np.concatenate([X, rng.uniform(low=-6, 
high=6, size=(n_outliers, 2))], axis=0) + + # Define labels to be able to score the estimator with `search_cv` + y_true = np.array([1] * n_samples) + y_true[-n_outliers:] = -1 + + # Fit on data + search_cv.fit(X, y_true) + + # Verify that the stand alone estimator yields the same results + # as the ones obtained with *SearchCV + assert_allclose( + search_cv.score_samples(X), search_cv.best_estimator_.score_samples(X) + ) + + +def test_search_cv_results_rank_tie_breaking(): + X, y = make_blobs(n_samples=50, random_state=42) + + # The two C values are close enough to give similar models + # which would result in a tie of their mean cv-scores + param_grid = {"C": [1, 1.001, 0.001]} + + grid_search = GridSearchCV(SVC(), param_grid=param_grid, return_train_score=True) + random_search = RandomizedSearchCV( + SVC(), n_iter=3, param_distributions=param_grid, return_train_score=True + ) + + for search in (grid_search, random_search): + search.fit(X, y) + cv_results = search.cv_results_ + # Check tie breaking strategy - + # Check that there is a tie in the mean scores between + # candidates 1 and 2 alone + assert_almost_equal( + cv_results["mean_test_score"][0], cv_results["mean_test_score"][1] + ) + assert_almost_equal( + cv_results["mean_train_score"][0], cv_results["mean_train_score"][1] + ) + assert not np.allclose( + cv_results["mean_test_score"][1], cv_results["mean_test_score"][2] + ) + assert not np.allclose( + cv_results["mean_train_score"][1], cv_results["mean_train_score"][2] + ) + # 'min' rank should be assigned to the tied candidates + assert_almost_equal(search.cv_results_["rank_test_score"], [1, 1, 3]) + + +def test_search_cv_results_none_param(): + X, y = [[1], [2], [3], [4], [5]], [0, 0, 0, 0, 1] + estimators = (DecisionTreeRegressor(), DecisionTreeClassifier()) + est_parameters = {"random_state": [0, None]} + cv = KFold() + + for est in estimators: + grid_search = GridSearchCV( + est, + est_parameters, + cv=cv, + ).fit(X, y) + assert_array_equal(grid_search.cv_results_["param_random_state"], [0, None]) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.FitFailedWarning") +def test_search_cv_timing(): + svc = LinearSVC(random_state=0) + + X = [ + [ + 1, + ], + [ + 2, + ], + [ + 3, + ], + [ + 4, + ], + ] + y = [0, 1, 1, 0] + + gs = GridSearchCV(svc, {"C": [0, 1]}, cv=2, error_score=0) + rs = RandomizedSearchCV(svc, {"C": [0, 1]}, cv=2, error_score=0, n_iter=2) + + for search in (gs, rs): + search.fit(X, y) + for key in ["mean_fit_time", "std_fit_time"]: + # NOTE The precision of time.time in windows is not high + # enough for the fit/score times to be non-zero for trivial X and y + assert np.all(search.cv_results_[key] >= 0) + assert np.all(search.cv_results_[key] < 1) + + for key in ["mean_score_time", "std_score_time"]: + assert search.cv_results_[key][1] >= 0 + assert search.cv_results_[key][0] == 0.0 + assert np.all(search.cv_results_[key] < 1) + + assert hasattr(search, "refit_time_") + assert isinstance(search.refit_time_, float) + assert search.refit_time_ >= 0 + + +def test_grid_search_correct_score_results(): + # test that correct scores are used + n_splits = 3 + clf = LinearSVC(random_state=0) + X, y = make_blobs(random_state=0, centers=2) + Cs = [0.1, 1, 10] + for score in ["f1", "roc_auc"]: + grid_search = GridSearchCV(clf, {"C": Cs}, scoring=score, cv=n_splits) + cv_results = grid_search.fit(X, y).cv_results_ + + # Test scorer names + result_keys = list(cv_results.keys()) + expected_keys = ("mean_test_score", "rank_test_score") + tuple( + "split%d_test_score" % 
cv_i for cv_i in range(n_splits) + ) + assert all(np.isin(expected_keys, result_keys)) + + cv = StratifiedKFold(n_splits=n_splits) + n_splits = grid_search.n_splits_ + for candidate_i, C in enumerate(Cs): + clf.set_params(C=C) + cv_scores = np.array( + [ + grid_search.cv_results_["split%d_test_score" % s][candidate_i] + for s in range(n_splits) + ] + ) + for i, (train, test) in enumerate(cv.split(X, y)): + clf.fit(X[train], y[train]) + if score == "f1": + correct_score = f1_score(y[test], clf.predict(X[test])) + elif score == "roc_auc": + dec = clf.decision_function(X[test]) + correct_score = roc_auc_score(y[test], dec) + assert_almost_equal(correct_score, cv_scores[i]) + + +def test_pickle(): + # Test that a fit search can be pickled + clf = MockClassifier() + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, refit=True, cv=2) + grid_search.fit(X, y) + grid_search_pickled = pickle.loads(pickle.dumps(grid_search)) + assert_array_almost_equal(grid_search.predict(X), grid_search_pickled.predict(X)) + + random_search = RandomizedSearchCV( + clf, {"foo_param": [1, 2, 3]}, refit=True, n_iter=3, cv=2 + ) + random_search.fit(X, y) + random_search_pickled = pickle.loads(pickle.dumps(random_search)) + assert_array_almost_equal( + random_search.predict(X), random_search_pickled.predict(X) + ) + + +def test_grid_search_with_multioutput_data(): + # Test search with multi-output estimator + + X, y = make_multilabel_classification(return_indicator=True, random_state=0) + + est_parameters = {"max_depth": [1, 2, 3, 4]} + cv = KFold() + + estimators = [ + DecisionTreeRegressor(random_state=0), + DecisionTreeClassifier(random_state=0), + ] + + # Test with grid search cv + for est in estimators: + grid_search = GridSearchCV(est, est_parameters, cv=cv) + grid_search.fit(X, y) + res_params = grid_search.cv_results_["params"] + for cand_i in range(len(res_params)): + est.set_params(**res_params[cand_i]) + + for i, (train, test) in enumerate(cv.split(X, y)): + est.fit(X[train], y[train]) + correct_score = est.score(X[test], y[test]) + assert_almost_equal( + correct_score, + grid_search.cv_results_["split%d_test_score" % i][cand_i], + ) + + # Test with a randomized search + for est in estimators: + random_search = RandomizedSearchCV(est, est_parameters, cv=cv, n_iter=3) + random_search.fit(X, y) + res_params = random_search.cv_results_["params"] + for cand_i in range(len(res_params)): + est.set_params(**res_params[cand_i]) + + for i, (train, test) in enumerate(cv.split(X, y)): + est.fit(X[train], y[train]) + correct_score = est.score(X[test], y[test]) + assert_almost_equal( + correct_score, + random_search.cv_results_["split%d_test_score" % i][cand_i], + ) + + +def test_predict_proba_disabled(): + # Test predict_proba when disabled on estimator. 
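+    # SVC exposes `predict_proba` only when constructed with
+    # `probability=True`; the search delegates the attribute lookup to the
+    # underlying estimator, so the fitted search lacks the method here too.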
+ X = np.arange(20).reshape(5, -1) + y = [0, 0, 1, 1, 1] + clf = SVC(probability=False) + gs = GridSearchCV(clf, {}, cv=2).fit(X, y) + assert not hasattr(gs, "predict_proba") + + +def test_grid_search_allows_nans(): + # Test GridSearchCV with SimpleImputer + X = np.arange(20, dtype=np.float64).reshape(5, -1) + X[2, :] = np.nan + y = [0, 0, 1, 1, 1] + p = Pipeline( + [ + ("imputer", SimpleImputer(strategy="mean", missing_values=np.nan)), + ("classifier", MockClassifier()), + ] + ) + GridSearchCV(p, {"classifier__foo_param": [1, 2, 3]}, cv=2).fit(X, y) + + +class FailingClassifier(BaseEstimator): + """Classifier that raises a ValueError on fit()""" + + FAILING_PARAMETER = 2 + + def __init__(self, parameter=None): + self.parameter = parameter + + def fit(self, X, y=None): + if self.parameter == FailingClassifier.FAILING_PARAMETER: + raise ValueError("Failing classifier failed as required") + + def predict(self, X): + return np.zeros(X.shape[0]) + + def score(self, X=None, Y=None): + return 0.0 + + +def test_grid_search_failing_classifier(): + # GridSearchCV with on_error != 'raise' + # Ensures that a warning is raised and score reset where appropriate. + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + # refit=False because we only want to check that errors caused by fits + # to individual folds will be caught and warnings raised instead. If + # refit was done, then an exception would be raised on refit and not + # caught by grid_search (expected behavior), and this would cause an + # error in this test. + gs = GridSearchCV( + clf, + [{"parameter": [0, 1, 2]}], + scoring="accuracy", + refit=False, + error_score=0.0, + ) + + warning_message = re.compile( + "5 fits failed.+total of 15.+The score on these" + r" train-test partitions for these parameters will be set to 0\.0.+" + "5 fits failed with the following error.+ValueError.+Failing classifier failed" + " as required", + flags=re.DOTALL, + ) + with pytest.warns(FitFailedWarning, match=warning_message): + gs.fit(X, y) + n_candidates = len(gs.cv_results_["params"]) + + # Ensure that grid scores were set to zero as required for those fits + # that are expected to fail. 
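+    # With a numeric error_score, the ValueError raised by fit() is caught,
+    # the affected split scores are recorded as error_score (0.0 here), and
+    # the failures are summarized in the FitFailedWarning matched above.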
+ def get_cand_scores(i): + return np.array( + [gs.cv_results_["split%d_test_score" % s][i] for s in range(gs.n_splits_)] + ) + + assert all( + ( + np.all(get_cand_scores(cand_i) == 0.0) + for cand_i in range(n_candidates) + if gs.cv_results_["param_parameter"][cand_i] + == FailingClassifier.FAILING_PARAMETER + ) + ) + + gs = GridSearchCV( + clf, + [{"parameter": [0, 1, 2]}], + scoring="accuracy", + refit=False, + error_score=float("nan"), + ) + warning_message = re.compile( + "5 fits failed.+total of 15.+The score on these" + r" train-test partitions for these parameters will be set to nan.+" + "5 fits failed with the following error.+ValueError.+Failing classifier failed" + " as required", + flags=re.DOTALL, + ) + with pytest.warns(FitFailedWarning, match=warning_message): + gs.fit(X, y) + n_candidates = len(gs.cv_results_["params"]) + assert all( + np.all(np.isnan(get_cand_scores(cand_i))) + for cand_i in range(n_candidates) + if gs.cv_results_["param_parameter"][cand_i] + == FailingClassifier.FAILING_PARAMETER + ) + + ranks = gs.cv_results_["rank_test_score"] + + # Check that succeeded estimators have lower ranks + assert ranks[0] <= 2 and ranks[1] <= 2 + # Check that failed estimator has the highest rank + assert ranks[clf.FAILING_PARAMETER] == 3 + assert gs.best_index_ != clf.FAILING_PARAMETER + + +def test_grid_search_classifier_all_fits_fail(): + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + gs = GridSearchCV( + clf, + [{"parameter": [FailingClassifier.FAILING_PARAMETER] * 3}], + error_score=0.0, + ) + + warning_message = re.compile( + ( + "All the 15 fits failed.+15 fits failed with the following" + " error.+ValueError.+Failing classifier failed as required" + ), + flags=re.DOTALL, + ) + with pytest.raises(ValueError, match=warning_message): + gs.fit(X, y) + + +def test_grid_search_failing_classifier_raise(): + # GridSearchCV with on_error == 'raise' raises the error + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + # refit=False because we want to test the behaviour of the grid search part + gs = GridSearchCV( + clf, + [{"parameter": [0, 1, 2]}], + scoring="accuracy", + refit=False, + error_score="raise", + ) + + # FailingClassifier issues a ValueError so this is what we look for. + with pytest.raises(ValueError): + gs.fit(X, y) + + +def test_parameters_sampler_replacement(): + # raise warning if n_iter is bigger than total parameter space + params = [ + {"first": [0, 1], "second": ["a", "b", "c"]}, + {"third": ["two", "values"]}, + ] + sampler = ParameterSampler(params, n_iter=9) + n_iter = 9 + grid_size = 8 + expected_warning = ( + "The total space of parameters %d is smaller " + "than n_iter=%d. Running %d iterations. For " + "exhaustive searches, use GridSearchCV." 
% (grid_size, n_iter, grid_size)
+    )
+    with pytest.warns(UserWarning, match=expected_warning):
+        list(sampler)
+
+    # degenerates to GridSearchCV if n_iter is the same as grid_size
+    sampler = ParameterSampler(params, n_iter=8)
+    samples = list(sampler)
+    assert len(samples) == 8
+    for values in ParameterGrid(params):
+        assert values in samples
+    assert len(ParameterSampler(params, n_iter=1000)) == 8
+
+    # test sampling without replacement in a large grid
+    params = {"a": range(10), "b": range(10), "c": range(10)}
+    sampler = ParameterSampler(params, n_iter=99, random_state=42)
+    samples = list(sampler)
+    assert len(samples) == 99
+    hashable_samples = ["a%db%dc%d" % (p["a"], p["b"], p["c"]) for p in samples]
+    assert len(set(hashable_samples)) == 99
+
+    # doesn't go into infinite loops
+    params_distribution = {"first": bernoulli(0.5), "second": ["a", "b", "c"]}
+    sampler = ParameterSampler(params_distribution, n_iter=7)
+    samples = list(sampler)
+    assert len(samples) == 7
+
+
+def test_stochastic_gradient_loss_param():
+    # Make sure predict_proba works when the loss is specified
+    # as one of the parameters in the param_grid.
+    param_grid = {
+        "loss": ["log_loss"],
+    }
+    X = np.arange(24).reshape(6, -1)
+    y = [0, 0, 0, 1, 1, 1]
+    clf = GridSearchCV(
+        estimator=SGDClassifier(loss="hinge"), param_grid=param_grid, cv=3
+    )
+
+    # When the estimator is not fitted, `predict_proba` is not available as the
+    # loss is 'hinge'.
+    assert not hasattr(clf, "predict_proba")
+    clf.fit(X, y)
+    clf.predict_proba(X)
+    clf.predict_log_proba(X)
+
+    # Make sure `predict_proba` is not available when setting loss=['hinge']
+    # in param_grid
+    param_grid = {
+        "loss": ["hinge"],
+    }
+    clf = GridSearchCV(
+        estimator=SGDClassifier(loss="hinge"), param_grid=param_grid, cv=3
+    )
+    assert not hasattr(clf, "predict_proba")
+    clf.fit(X, y)
+    assert not hasattr(clf, "predict_proba")
+
+
+def test_search_train_scores_set_to_false():
+    X = np.arange(6).reshape(6, -1)
+    y = [0, 0, 0, 1, 1, 1]
+    clf = LinearSVC(random_state=0)
+
+    gs = GridSearchCV(clf, param_grid={"C": [0.1, 0.2]}, cv=3)
+    gs.fit(X, y)
+
+    # With the default return_train_score=False, no train-score keys should
+    # appear in cv_results_.
+    assert not any(key.endswith("_train_score") for key in gs.cv_results_)
+
+
+def test_grid_search_cv_splits_consistency():
+    # Check that a one-time iterable is accepted as a cv parameter.
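+    # `OneTimeSplitter` comes from the test helpers; roughly, it wraps a KFold
+    # split in a one-shot iterator (a sketch of the idea, not the exact
+    # implementation):
+    #
+    #     class OneTimeSplitter:
+    #         def __init__(self, n_splits, n_samples):
+    #             self.n_splits = n_splits
+    #             self._iter = iter(KFold(n_splits).split(np.ones(n_samples)))
+    #
+    #         def split(self, X=None, y=None, groups=None):
+    #             yield from self._iter  # exhausted after one full pass
+    #
+    #         def get_n_splits(self, X=None, y=None, groups=None):
+    #             return self.n_splits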
+ n_samples = 100 + n_splits = 5 + X, y = make_classification(n_samples=n_samples, random_state=0) + + gs = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.2, 0.3]}, + cv=OneTimeSplitter(n_splits=n_splits, n_samples=n_samples), + return_train_score=True, + ) + gs.fit(X, y) + + gs2 = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.2, 0.3]}, + cv=KFold(n_splits=n_splits), + return_train_score=True, + ) + gs2.fit(X, y) + + # Give generator as a cv parameter + assert isinstance( + KFold(n_splits=n_splits, shuffle=True, random_state=0).split(X, y), + GeneratorType, + ) + gs3 = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.2, 0.3]}, + cv=KFold(n_splits=n_splits, shuffle=True, random_state=0).split(X, y), + return_train_score=True, + ) + gs3.fit(X, y) + + gs4 = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.2, 0.3]}, + cv=KFold(n_splits=n_splits, shuffle=True, random_state=0), + return_train_score=True, + ) + gs4.fit(X, y) + + def _pop_time_keys(cv_results): + for key in ( + "mean_fit_time", + "std_fit_time", + "mean_score_time", + "std_score_time", + ): + cv_results.pop(key) + return cv_results + + # Check if generators are supported as cv and + # that the splits are consistent + np.testing.assert_equal( + _pop_time_keys(gs3.cv_results_), _pop_time_keys(gs4.cv_results_) + ) + + # OneTimeSplitter is a non-re-entrant cv where split can be called only + # once if ``cv.split`` is called once per param setting in GridSearchCV.fit + # the 2nd and 3rd parameter will not be evaluated as no train/test indices + # will be generated for the 2nd and subsequent cv.split calls. + # This is a check to make sure cv.split is not called once per param + # setting. + np.testing.assert_equal( + {k: v for k, v in gs.cv_results_.items() if not k.endswith("_time")}, + {k: v for k, v in gs2.cv_results_.items() if not k.endswith("_time")}, + ) + + # Check consistency of folds across the parameters + gs = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.1, 0.2, 0.2]}, + cv=KFold(n_splits=n_splits, shuffle=True), + return_train_score=True, + ) + gs.fit(X, y) + + # As the first two param settings (C=0.1) and the next two param + # settings (C=0.2) are same, the test and train scores must also be + # same as long as the same train/test indices are generated for all + # the cv splits, for both param setting + for score_type in ("train", "test"): + per_param_scores = {} + for param_i in range(4): + per_param_scores[param_i] = [ + gs.cv_results_["split%d_%s_score" % (s, score_type)][param_i] + for s in range(5) + ] + + assert_array_almost_equal(per_param_scores[0], per_param_scores[1]) + assert_array_almost_equal(per_param_scores[2], per_param_scores[3]) + + +def test_transform_inverse_transform_round_trip(): + clf = MockClassifier() + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=2, verbose=3) + + grid_search.fit(X, y) + X_round_trip = grid_search.inverse_transform(grid_search.transform(X)) + assert_array_equal(X, X_round_trip) + + +def test_custom_run_search(): + def check_results(results, gscv): + exp_results = gscv.cv_results_ + assert sorted(results.keys()) == sorted(exp_results) + for k in results: + if not k.endswith("_time"): + # XXX: results['params'] is a list :| + results[k] = np.asanyarray(results[k]) + if results[k].dtype.kind == "O": + assert_array_equal( + exp_results[k], results[k], err_msg="Checking " + k + ) + else: + assert_allclose(exp_results[k], results[k], err_msg="Checking " + 
k) + + def fit_grid(param_grid): + return GridSearchCV(clf, param_grid, return_train_score=True).fit(X, y) + + class CustomSearchCV(BaseSearchCV): + def __init__(self, estimator, **kwargs): + super().__init__(estimator, **kwargs) + + def _run_search(self, evaluate): + results = evaluate([{"max_depth": 1}, {"max_depth": 2}]) + check_results(results, fit_grid({"max_depth": [1, 2]})) + results = evaluate([{"min_samples_split": 5}, {"min_samples_split": 10}]) + check_results( + results, + fit_grid([{"max_depth": [1, 2]}, {"min_samples_split": [5, 10]}]), + ) + + # Using regressor to make sure each score differs + clf = DecisionTreeRegressor(random_state=0) + X, y = make_classification(n_samples=100, n_informative=4, random_state=0) + mycv = CustomSearchCV(clf, return_train_score=True).fit(X, y) + gscv = fit_grid([{"max_depth": [1, 2]}, {"min_samples_split": [5, 10]}]) + + results = mycv.cv_results_ + check_results(results, gscv) + for attr in dir(gscv): + if ( + attr[0].islower() + and attr[-1:] == "_" + and attr + not in { + "cv_results_", + "best_estimator_", + "refit_time_", + "classes_", + "scorer_", + } + ): + assert getattr(gscv, attr) == getattr(mycv, attr), ( + "Attribute %s not equal" % attr + ) + + +def test__custom_fit_no_run_search(): + class NoRunSearchSearchCV(BaseSearchCV): + def __init__(self, estimator, **kwargs): + super().__init__(estimator, **kwargs) + + def fit(self, X, y=None, groups=None, **fit_params): + return self + + # this should not raise any exceptions + NoRunSearchSearchCV(SVC()).fit(X, y) + + class BadSearchCV(BaseSearchCV): + def __init__(self, estimator, **kwargs): + super().__init__(estimator, **kwargs) + + with pytest.raises(NotImplementedError, match="_run_search not implemented."): + # this should raise a NotImplementedError + BadSearchCV(SVC()).fit(X, y) + + +def test_empty_cv_iterator_error(): + # Use global X, y + + # create cv + cv = KFold(n_splits=3).split(X) + + # pop all of it, this should cause the expected ValueError + [u for u in cv] + # cv is empty now + + train_size = 100 + ridge = RandomizedSearchCV(Ridge(), {"alpha": [1e-3, 1e-2, 1e-1]}, cv=cv, n_jobs=4) + + # assert that this raises an error + with pytest.raises( + ValueError, + match=( + "No fits were performed. " + "Was the CV iterator empty\\? " + "Were there no candidates\\?" + ), + ): + ridge.fit(X[:train_size], y[:train_size]) + + +def test_random_search_bad_cv(): + # Use global X, y + + class BrokenKFold(KFold): + def get_n_splits(self, *args, **kw): + return 1 + + # create bad cv + cv = BrokenKFold(n_splits=3) + + train_size = 100 + ridge = RandomizedSearchCV(Ridge(), {"alpha": [1e-3, 1e-2, 1e-1]}, cv=cv, n_jobs=4) + + # assert that this raises an error + with pytest.raises( + ValueError, + match=( + "cv.split and cv.get_n_splits returned " + "inconsistent results. 
Expected \\d+ " + "splits, got \\d+" + ), + ): + ridge.fit(X[:train_size], y[:train_size]) + + +@pytest.mark.parametrize("return_train_score", [False, True]) +@pytest.mark.parametrize( + "SearchCV, specialized_params", + [ + (GridSearchCV, {"param_grid": {"max_depth": [2, 3, 5, 8]}}), + ( + RandomizedSearchCV, + {"param_distributions": {"max_depth": [2, 3, 5, 8]}, "n_iter": 4}, + ), + ], +) +def test_searchcv_raise_warning_with_non_finite_score( + SearchCV, specialized_params, return_train_score +): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/10529 + # Check that we raise a UserWarning when a non-finite score is + # computed in the SearchCV + X, y = make_classification(n_classes=2, random_state=0) + + class FailingScorer: + """Scorer that will fail for some split but not all.""" + + def __init__(self): + self.n_counts = 0 + + def __call__(self, estimator, X, y): + self.n_counts += 1 + if self.n_counts % 5 == 0: + return np.nan + return 1 + + grid = SearchCV( + DecisionTreeClassifier(), + scoring=FailingScorer(), + cv=3, + return_train_score=return_train_score, + **specialized_params, + ) + + with pytest.warns(UserWarning) as warn_msg: + grid.fit(X, y) + + set_with_warning = ["test", "train"] if return_train_score else ["test"] + assert len(warn_msg) == len(set_with_warning) + for msg, dataset in zip(warn_msg, set_with_warning): + assert f"One or more of the {dataset} scores are non-finite" in str(msg.message) + + # all non-finite scores should be equally ranked last + last_rank = grid.cv_results_["rank_test_score"].max() + non_finite_mask = np.isnan(grid.cv_results_["mean_test_score"]) + assert_array_equal(grid.cv_results_["rank_test_score"][non_finite_mask], last_rank) + # all finite scores should be better ranked than the non-finite scores + assert np.all(grid.cv_results_["rank_test_score"][~non_finite_mask] < last_rank) + + +def test_callable_multimetric_confusion_matrix(): + # Test callable with many metrics inserts the correct names and metrics + # into the search cv object + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + cm = confusion_matrix(y, y_pred) + return {"tn": cm[0, 0], "fp": cm[0, 1], "fn": cm[1, 0], "tp": cm[1, 1]} + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + est = LinearSVC(random_state=42) + search = GridSearchCV(est, {"C": [0.1, 1]}, scoring=custom_scorer, refit="fp") + + search.fit(X, y) + + score_names = ["tn", "fp", "fn", "tp"] + for name in score_names: + assert "mean_test_{}".format(name) in search.cv_results_ + + y_pred = search.predict(X) + cm = confusion_matrix(y, y_pred) + assert search.score(X, y) == pytest.approx(cm[0, 1]) + + +def test_callable_multimetric_same_as_list_of_strings(): + # Test callable multimetric is the same as a list of strings + def custom_scorer(est, X, y): + y_pred = est.predict(X) + return { + "recall": recall_score(y, y_pred), + "accuracy": accuracy_score(y, y_pred), + } + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + est = LinearSVC(random_state=42) + search_callable = GridSearchCV( + est, {"C": [0.1, 1]}, scoring=custom_scorer, refit="recall" + ) + search_str = GridSearchCV( + est, {"C": [0.1, 1]}, scoring=["recall", "accuracy"], refit="recall" + ) + + search_callable.fit(X, y) + search_str.fit(X, y) + + assert search_callable.best_score_ == pytest.approx(search_str.best_score_) + assert search_callable.best_index_ == search_str.best_index_ + assert search_callable.score(X, y) == pytest.approx(search_str.score(X, y)) + + 
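+# A callable scorer receives ``(estimator, X, y)`` and returns either a single
+# float or a dict mapping score names to floats; with a dict, each key ``k``
+# surfaces in ``cv_results_`` as ``mean_test_k``, ``rank_test_k``, and so on.
+# A minimal illustrative helper (not used by any test; the name is ours):
+def _example_multimetric_scorer(estimator, X, y):
+    """Return two named scores for use as a multi-metric callable."""
+    y_pred = estimator.predict(X)
+    return {
+        "recall": recall_score(y, y_pred),
+        "accuracy": accuracy_score(y, y_pred),
+    }
+
+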
+def test_callable_single_metric_same_as_single_string(): + # Tests callable scorer is the same as scoring with a single string + def custom_scorer(est, X, y): + y_pred = est.predict(X) + return recall_score(y, y_pred) + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + est = LinearSVC(random_state=42) + search_callable = GridSearchCV( + est, {"C": [0.1, 1]}, scoring=custom_scorer, refit=True + ) + search_str = GridSearchCV(est, {"C": [0.1, 1]}, scoring="recall", refit="recall") + search_list_str = GridSearchCV( + est, {"C": [0.1, 1]}, scoring=["recall"], refit="recall" + ) + search_callable.fit(X, y) + search_str.fit(X, y) + search_list_str.fit(X, y) + + assert search_callable.best_score_ == pytest.approx(search_str.best_score_) + assert search_callable.best_index_ == search_str.best_index_ + assert search_callable.score(X, y) == pytest.approx(search_str.score(X, y)) + + assert search_list_str.best_score_ == pytest.approx(search_str.best_score_) + assert search_list_str.best_index_ == search_str.best_index_ + assert search_list_str.score(X, y) == pytest.approx(search_str.score(X, y)) + + +def test_callable_multimetric_error_on_invalid_key(): + # Raises when the callable scorer does not return a dict with `refit` key. + def bad_scorer(est, X, y): + return {"bad_name": 1} + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.1, 1]}, + scoring=bad_scorer, + refit="good_name", + ) + + msg = ( + "For multi-metric scoring, the parameter refit must be set to a " + "scorer key or a callable to refit" + ) + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) + + +def test_callable_multimetric_error_failing_clf(): + # Warns when there is an estimator the fails to fit with a float + # error_score + def custom_scorer(est, X, y): + return {"acc": 1} + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + gs = GridSearchCV( + clf, + [{"parameter": [0, 1, 2]}], + scoring=custom_scorer, + refit=False, + error_score=0.1, + ) + + warning_message = re.compile( + "5 fits failed.+total of 15.+The score on these" + r" train-test partitions for these parameters will be set to 0\.1", + flags=re.DOTALL, + ) + with pytest.warns(FitFailedWarning, match=warning_message): + gs.fit(X, y) + + assert_allclose(gs.cv_results_["mean_test_acc"], [1, 1, 0.1]) + + +def test_callable_multimetric_clf_all_fits_fail(): + # Warns and raises when all estimator fails to fit. 
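+    # Unlike the partial-failure tests above, every candidate fails to fit
+    # here, so the search raises a ValueError rather than merely warning:
+    # there is no successfully fitted candidate left to rank or refit.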
+ def custom_scorer(est, X, y): + return {"acc": 1} + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + gs = GridSearchCV( + clf, + [{"parameter": [FailingClassifier.FAILING_PARAMETER] * 3}], + scoring=custom_scorer, + refit=False, + error_score=0.1, + ) + + individual_fit_error_message = "ValueError: Failing classifier failed as required" + error_message = re.compile( + ( + "All the 15 fits failed.+your model is misconfigured.+" + f"{individual_fit_error_message}" + ), + flags=re.DOTALL, + ) + + with pytest.raises(ValueError, match=error_message): + gs.fit(X, y) + + +def test_n_features_in(): + # make sure grid search and random search delegate n_features_in to the + # best estimator + n_features = 4 + X, y = make_classification(n_features=n_features) + gbdt = HistGradientBoostingClassifier() + param_grid = {"max_iter": [3, 4]} + gs = GridSearchCV(gbdt, param_grid) + rs = RandomizedSearchCV(gbdt, param_grid, n_iter=1) + assert not hasattr(gs, "n_features_in_") + assert not hasattr(rs, "n_features_in_") + gs.fit(X, y) + rs.fit(X, y) + assert gs.n_features_in_ == n_features + assert rs.n_features_in_ == n_features + + +@pytest.mark.parametrize("pairwise", [True, False]) +def test_search_cv_pairwise_property_delegated_to_base_estimator(pairwise): + """ + Test implementation of BaseSearchCV has the pairwise tag + which matches the pairwise tag of its estimator. + This test make sure pairwise tag is delegated to the base estimator. + + Non-regression test for issue #13920. + """ + + class TestEstimator(BaseEstimator): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = pairwise + return tags + + est = TestEstimator() + attr_message = "BaseSearchCV pairwise tag must match estimator" + cv = GridSearchCV(est, {"n_neighbors": [10]}) + assert pairwise == cv.__sklearn_tags__().input_tags.pairwise, attr_message + + +def test_search_cv__pairwise_property_delegated_to_base_estimator(): + """ + Test implementation of BaseSearchCV has the pairwise property + which matches the pairwise tag of its estimator. + This test make sure pairwise tag is delegated to the base estimator. + + Non-regression test for issue #13920. + """ + + class EstimatorPairwise(BaseEstimator): + def __init__(self, pairwise=True): + self.pairwise = pairwise + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.pairwise + return tags + + est = EstimatorPairwise() + attr_message = "BaseSearchCV _pairwise property must match estimator" + + for _pairwise_setting in [True, False]: + est.set_params(pairwise=_pairwise_setting) + cv = GridSearchCV(est, {"n_neighbors": [10]}) + assert ( + _pairwise_setting == cv.__sklearn_tags__().input_tags.pairwise + ), attr_message + + +def test_search_cv_pairwise_property_equivalence_of_precomputed(): + """ + Test implementation of BaseSearchCV has the pairwise tag + which matches the pairwise tag of its estimator. + This test ensures the equivalence of 'precomputed'. + + Non-regression test for issue #13920. 
+ """ + n_samples = 50 + n_splits = 2 + X, y = make_classification(n_samples=n_samples, random_state=0) + grid_params = {"n_neighbors": [10]} + + # defaults to euclidean metric (minkowski p = 2) + clf = KNeighborsClassifier() + cv = GridSearchCV(clf, grid_params, cv=n_splits) + cv.fit(X, y) + preds_original = cv.predict(X) + + # precompute euclidean metric to validate pairwise is working + X_precomputed = euclidean_distances(X) + clf = KNeighborsClassifier(metric="precomputed") + cv = GridSearchCV(clf, grid_params, cv=n_splits) + cv.fit(X_precomputed, y) + preds_precomputed = cv.predict(X_precomputed) + + attr_message = "GridSearchCV not identical with precomputed metric" + assert (preds_original == preds_precomputed).all(), attr_message + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [(GridSearchCV, {"a": [0.1, 0.01]}), (RandomizedSearchCV, {"a": uniform(1, 3)})], +) +def test_scalar_fit_param(SearchCV, param_search): + # unofficially sanctioned tolerance for scalar values in fit_params + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15805 + class TestEstimator(ClassifierMixin, BaseEstimator): + def __init__(self, a=None): + self.a = a + + def fit(self, X, y, r=None): + self.r_ = r + + def predict(self, X): + return np.zeros(shape=(len(X))) + + model = SearchCV(TestEstimator(), param_search) + X, y = make_classification(random_state=42) + model.fit(X, y, r=42) + assert model.best_estimator_.r_ == 42 + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, {"alpha": [0.1, 0.01]}), + (RandomizedSearchCV, {"alpha": uniform(0.01, 0.1)}), + ], +) +def test_scalar_fit_param_compat(SearchCV, param_search): + # check support for scalar values in fit_params, for instance in LightGBM + # that do not exactly respect the scikit-learn API contract but that we do + # not want to break without an explicit deprecation cycle and API + # recommendations for implementing early stopping with a user provided + # validation set. non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15805 + X_train, X_valid, y_train, y_valid = train_test_split( + *make_classification(random_state=42), random_state=42 + ) + + class _FitParamClassifier(SGDClassifier): + def fit( + self, + X, + y, + sample_weight=None, + tuple_of_arrays=None, + scalar_param=None, + callable_param=None, + ): + super().fit(X, y, sample_weight=sample_weight) + assert scalar_param > 0 + assert callable(callable_param) + + # The tuple of arrays should be preserved as tuple. + assert isinstance(tuple_of_arrays, tuple) + assert tuple_of_arrays[0].ndim == 2 + assert tuple_of_arrays[1].ndim == 1 + return self + + def _fit_param_callable(): + pass + + model = SearchCV(_FitParamClassifier(), param_search) + + # NOTE: `fit_params` should be data dependent (e.g. `sample_weight`) which + # is not the case for the following parameters. But this abuse is common in + # popular third-party libraries and we should tolerate this behavior for + # now and be careful not to break support for those without following + # proper deprecation cycle. + fit_params = { + "tuple_of_arrays": (X_valid, y_valid), + "callable_param": _fit_param_callable, + "scalar_param": 42, + } + model.fit(X_train, y_train, **fit_params) + + +# FIXME: Replace this test with a full `check_estimator` once we have API only +# checks. 
+@pytest.mark.filterwarnings("ignore:The total space of parameters 4 is") +@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV]) +@pytest.mark.parametrize("Predictor", [MinimalRegressor, MinimalClassifier]) +def test_search_cv_using_minimal_compatible_estimator(SearchCV, Predictor): + # Check that third-party library can run tests without inheriting from + # BaseEstimator. + rng = np.random.RandomState(0) + X, y = rng.randn(25, 2), np.array([0] * 5 + [1] * 20) + + model = Pipeline( + [("transformer", MinimalTransformer()), ("predictor", Predictor())] + ) + + params = { + "transformer__param": [1, 10], + "predictor__parama": [1, 10], + } + search = SearchCV(model, params, error_score="raise") + search.fit(X, y) + + assert search.best_params_.keys() == params.keys() + + y_pred = search.predict(X) + if is_classifier(search): + assert_array_equal(y_pred, 1) + assert search.score(X, y) == pytest.approx(accuracy_score(y, y_pred)) + else: + assert_allclose(y_pred, y.mean()) + assert search.score(X, y) == pytest.approx(r2_score(y, y_pred)) + + +@pytest.mark.parametrize("return_train_score", [True, False]) +def test_search_cv_verbose_3(capsys, return_train_score): + """Check that search cv with verbose>2 shows the score for single + metrics. non-regression test for #19658.""" + X, y = make_classification(n_samples=100, n_classes=2, flip_y=0.2, random_state=0) + clf = LinearSVC(random_state=0) + grid = {"C": [0.1]} + + GridSearchCV( + clf, + grid, + scoring="accuracy", + verbose=3, + cv=3, + return_train_score=return_train_score, + ).fit(X, y) + captured = capsys.readouterr().out + if return_train_score: + match = re.findall(r"score=\(train=[\d\.]+, test=[\d.]+\)", captured) + else: + match = re.findall(r"score=[\d\.]+", captured) + assert len(match) == 3 + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, "param_grid"), + (RandomizedSearchCV, "param_distributions"), + (HalvingGridSearchCV, "param_grid"), + ], +) +def test_search_estimator_param(SearchCV, param_search): + # test that SearchCV object doesn't change the object given in the parameter grid + X, y = make_classification(random_state=42) + + params = {"clf": [LinearSVC()], "clf__C": [0.01]} + orig_C = params["clf"][0].C + + pipe = Pipeline([("trs", MinimalTransformer()), ("clf", None)]) + + param_grid_search = {param_search: params} + gs = SearchCV(pipe, refit=True, cv=2, scoring="accuracy", **param_grid_search).fit( + X, y + ) + + # testing that the original object in params is not changed + assert params["clf"][0].C == orig_C + # testing that the GS is setting the parameter of the step correctly + assert gs.best_estimator_.named_steps["clf"].C == 0.01 + + +def test_search_with_2d_array(): + parameter_grid = { + "vect__ngram_range": ((1, 1), (1, 2)), # unigrams or bigrams + "vect__norm": ("l1", "l2"), + } + pipeline = Pipeline( + [ + ("vect", TfidfVectorizer()), + ("clf", ComplementNB()), + ] + ) + random_search = RandomizedSearchCV( + estimator=pipeline, + param_distributions=parameter_grid, + n_iter=3, + random_state=0, + n_jobs=2, + verbose=1, + cv=3, + ) + data_train = ["one", "two", "three", "four", "five"] + data_target = [0, 0, 1, 0, 1] + random_search.fit(data_train, data_target) + result = random_search.cv_results_["param_vect__ngram_range"] + expected_data = np.empty(3, dtype=object) + expected_data[:] = [(1, 2), (1, 2), (1, 1)] + np.testing.assert_array_equal(result.data, expected_data) + + +def test_search_html_repr(): + """Test different HTML representations for GridSearchCV.""" 
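+    # With display="diagram", `_repr_html_` embeds each estimator's repr in
+    # <pre> tags, which is what the substring assertions below rely on.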
+ X, y = make_classification(random_state=42) + + pipeline = Pipeline([("scale", StandardScaler()), ("clf", DummyClassifier())]) + param_grid = {"clf": [DummyClassifier(), LogisticRegression()]} + + # Unfitted shows the original pipeline + search_cv = GridSearchCV(pipeline, param_grid=param_grid, refit=False) + with config_context(display="diagram"): + repr_html = search_cv._repr_html_() + assert "
<pre>DummyClassifier()</pre>
" in repr_html + + # Fitted with `refit=False` shows the original pipeline + search_cv.fit(X, y) + with config_context(display="diagram"): + repr_html = search_cv._repr_html_() + assert "
<pre>DummyClassifier()</pre>
" in repr_html + + # Fitted with `refit=True` shows the best estimator + search_cv = GridSearchCV(pipeline, param_grid=param_grid, refit=True) + search_cv.fit(X, y) + with config_context(display="diagram"): + repr_html = search_cv._repr_html_() + assert "
<pre>DummyClassifier()</pre>
" not in repr_html + assert "
<pre>LogisticRegression()</pre>
" in repr_html + + +# TODO(1.7): remove this test +@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV]) +def test_inverse_transform_Xt_deprecation(SearchCV): + clf = MockClassifier() + search = SearchCV(clf, {"foo_param": [1, 2, 3]}, cv=2, verbose=3) + + X2 = search.fit(X, y).transform(X) + + with pytest.raises(TypeError, match="Missing required positional argument"): + search.inverse_transform() + + with pytest.raises(TypeError, match="Cannot use both X and Xt. Use X only"): + search.inverse_transform(X=X2, Xt=X2) + + with warnings.catch_warnings(record=True): + warnings.simplefilter("error") + search.inverse_transform(X2) + + with pytest.warns(FutureWarning, match="Xt was renamed X in version 1.5"): + search.inverse_transform(Xt=X2) + + +# Metadata Routing Tests +# ====================== + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, "param_grid"), + (RandomizedSearchCV, "param_distributions"), + ], +) +@config_context(enable_metadata_routing=True) +def test_multi_metric_search_forwards_metadata(SearchCV, param_search): + """Test that *SearchCV forwards metadata correctly when passed multiple metrics.""" + X, y = make_classification(random_state=42) + n_samples = _num_samples(X) + rng = np.random.RandomState(0) + score_weights = rng.rand(n_samples) + score_metadata = rng.rand(n_samples) + + est = LinearSVC() + param_grid_search = {param_search: {"C": [1]}} + + scorer_registry = _Registry() + scorer = ConsumingScorer(registry=scorer_registry).set_score_request( + sample_weight="score_weights", metadata="score_metadata" + ) + scoring = dict(my_scorer=scorer, accuracy="accuracy") + SearchCV(est, refit="accuracy", cv=2, scoring=scoring, **param_grid_search).fit( + X, y, score_weights=score_weights, score_metadata=score_metadata + ) + assert len(scorer_registry) + for _scorer in scorer_registry: + check_recorded_metadata( + obj=_scorer, + method="score", + parent="_score", + split_params=("sample_weight", "metadata"), + sample_weight=score_weights, + metadata=score_metadata, + ) + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, "param_grid"), + (RandomizedSearchCV, "param_distributions"), + (HalvingGridSearchCV, "param_grid"), + ], +) +def test_score_rejects_params_with_no_routing_enabled(SearchCV, param_search): + """*SearchCV should reject **params when metadata routing is not enabled + since this is added only when routing is enabled.""" + X, y = make_classification(random_state=42) + est = LinearSVC() + param_grid_search = {param_search: {"C": [1]}} + + gs = SearchCV(est, cv=2, **param_grid_search).fit(X, y) + + with pytest.raises(ValueError, match="is only supported if"): + gs.score(X, y, metadata=1) + + +# End of Metadata Routing Tests +# ============================= + + +def test_cv_results_dtype_issue_29074(): + """Non-regression test for https://github.com/scikit-learn/scikit-learn/issues/29074""" + + class MetaEstimator(BaseEstimator, ClassifierMixin): + def __init__( + self, + base_clf, + parameter1=None, + parameter2=None, + parameter3=None, + parameter4=None, + ): + self.base_clf = base_clf + self.parameter1 = parameter1 + self.parameter2 = parameter2 + self.parameter3 = parameter3 + self.parameter4 = parameter4 + + def fit(self, X, y=None): + self.base_clf.fit(X, y) + return self + + def score(self, X, y): + return self.base_clf.score(X, y) + + # Values of param_grid are such that np.result_type gives slightly + # different errors, in particular ValueError and TypeError + param_grid = { + 
"parameter1": [None, {"option": "A"}, {"option": "B"}], + "parameter2": [None, [1, 2]], + "parameter3": [{"a": 1}], + "parameter4": ["str1", "str2"], + } + grid_search = GridSearchCV( + estimator=MetaEstimator(LogisticRegression()), + param_grid=param_grid, + cv=3, + ) + + X, y = make_blobs(random_state=0) + grid_search.fit(X, y) + for param in param_grid: + assert grid_search.cv_results_[f"param_{param}"].dtype == object + + +def test_search_with_estimators_issue_29157(): + """Check cv_results_ for estimators with a `dtype` parameter, e.g. OneHotEncoder.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame( + { + "numeric_1": [1, 2, 3, 4, 5], + "object_1": ["a", "a", "a", "a", "a"], + "target": [1.0, 4.1, 2.0, 3.0, 1.0], + } + ) + X = df.drop("target", axis=1) + y = df["target"] + enc = ColumnTransformer( + [("enc", OneHotEncoder(sparse_output=False), ["object_1"])], + remainder="passthrough", + ) + pipe = Pipeline( + [ + ("enc", enc), + ("regressor", LinearRegression()), + ] + ) + grid_params = { + "enc__enc": [ + OneHotEncoder(sparse_output=False), + OrdinalEncoder(), + ] + } + grid_search = GridSearchCV(pipe, grid_params, cv=2) + grid_search.fit(X, y) + assert grid_search.cv_results_["param_enc__enc"].dtype == object + + +def test_cv_results_multi_size_array(): + """Check that GridSearchCV works with params that are arrays of different sizes. + + Non-regression test for #29277. + """ + n_features = 10 + X, y = make_classification(n_features=10) + + spline_reg_pipe = make_pipeline( + SplineTransformer(extrapolation="periodic"), + LogisticRegression(), + ) + + n_knots_list = [n_features * i for i in [10, 11, 12]] + knots_list = [ + np.linspace(0, np.pi * 2, n_knots).reshape((-1, n_features)) + for n_knots in n_knots_list + ] + spline_reg_pipe_cv = GridSearchCV( + estimator=spline_reg_pipe, + param_grid={ + "splinetransformer__knots": knots_list, + }, + ) + + spline_reg_pipe_cv.fit(X, y) + assert ( + spline_reg_pipe_cv.cv_results_["param_splinetransformer__knots"].dtype == object + ) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype", yield_namespace_device_dtype_combinations() +) +@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV]) +def test_array_api_search_cv_classifier(SearchCV, array_namespace, device, dtype): + xp = _array_api_for_tests(array_namespace, device) + + X = np.arange(100).reshape((10, 10)) + X_np = X.astype(dtype) + X_xp = xp.asarray(X_np, device=device) + + # y should always be an integer, no matter what `dtype` is + y_np = np.array([0] * 5 + [1] * 5) + y_xp = xp.asarray(y_np, device=device) + + with config_context(array_api_dispatch=True): + searcher = SearchCV( + LinearDiscriminantAnalysis(), + {"tol": [1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]}, + cv=2, + error_score="raise", + ) + searcher.fit(X_xp, y_xp) + searcher.score(X_xp, y_xp) + + +# Construct these outside the tests so that the same object is used +# for both input and `expected` +one_hot_encoder = OneHotEncoder() +ordinal_encoder = OrdinalEncoder() + +# If we construct this directly via `MaskedArray`, the list of tuples +# gets auto-converted to a 2D array. 
+ma_with_tuples = np.ma.MaskedArray(np.empty(2), mask=True, dtype=object) +ma_with_tuples[0] = (1, 2) +ma_with_tuples[1] = (3, 4) + + +@pytest.mark.parametrize( + ("candidate_params", "expected"), + [ + pytest.param( + [{"foo": 1}, {"foo": 2}], + [ + ("param_foo", np.ma.MaskedArray(np.array([1, 2]))), + ], + id="simple numeric, single param", + ), + pytest.param( + [{"foo": 1, "bar": 3}, {"foo": 2, "bar": 4}, {"foo": 3}], + [ + ("param_foo", np.ma.MaskedArray(np.array([1, 2, 3]))), + ( + "param_bar", + np.ma.MaskedArray(np.array([3, 4, 0]), mask=[False, False, True]), + ), + ], + id="simple numeric, one param is missing in one round", + ), + pytest.param( + [{"foo": [[1], [2], [3]]}, {"foo": [[1], [2]]}], + [ + ( + "param_foo", + np.ma.MaskedArray([[[1], [2], [3]], [[1], [2]]], dtype=object), + ), + ], + id="lists of different lengths", + ), + pytest.param( + [{"foo": (1, 2)}, {"foo": (3, 4)}], + [ + ( + "param_foo", + ma_with_tuples, + ), + ], + id="lists tuples", + ), + pytest.param( + [{"foo": ordinal_encoder}, {"foo": one_hot_encoder}], + [ + ( + "param_foo", + np.ma.MaskedArray([ordinal_encoder, one_hot_encoder], dtype=object), + ), + ], + id="estimators", + ), + ], +) +def test_yield_masked_array_for_each_param(candidate_params, expected): + result = list(_yield_masked_array_for_each_param(candidate_params)) + for (key, value), (expected_key, expected_value) in zip(result, expected): + assert key == expected_key + assert value.dtype == expected_value.dtype + np.testing.assert_array_equal(value, expected_value) + np.testing.assert_array_equal(value.mask, expected_value.mask) + + +def test_yield_masked_array_no_runtime_warning(): + # non-regression test for https://github.com/scikit-learn/scikit-learn/issues/29929 + candidate_params = [{"param": i} for i in range(1000)] + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + list(_yield_masked_array_for_each_param(candidate_params)) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/tests/test_split.py b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_split.py new file mode 100644 index 0000000000000000000000000000000000000000..b333d49f5058efcde5edce15d27f0b8c9f0e053f --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_split.py @@ -0,0 +1,2099 @@ +"""Test the split module""" + +import re +import warnings +from itertools import combinations, combinations_with_replacement, permutations + +import numpy as np +import pytest +from scipy import stats +from scipy.sparse import issparse +from scipy.special import comb + +from sklearn import config_context +from sklearn.datasets import load_digits, make_classification +from sklearn.dummy import DummyClassifier +from sklearn.model_selection import ( + GridSearchCV, + GroupKFold, + GroupShuffleSplit, + KFold, + LeaveOneGroupOut, + LeaveOneOut, + LeavePGroupsOut, + LeavePOut, + PredefinedSplit, + RepeatedKFold, + RepeatedStratifiedKFold, + ShuffleSplit, + StratifiedGroupKFold, + StratifiedKFold, + StratifiedShuffleSplit, + TimeSeriesSplit, + check_cv, + cross_val_score, + train_test_split, +) +from sklearn.model_selection._split import ( + _build_repr, + _validate_shuffle_split, + _yields_constant_splits, +) +from sklearn.svm import SVC +from sklearn.tests.metadata_routing_common import assert_request_is_empty +from sklearn.utils._array_api import ( + _convert_to_numpy, + get_namespace, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._array_api import ( + device as array_api_device, +) +from 
sklearn.utils._mocking import MockDataFrame +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.estimator_checks import ( + _array_api_for_tests, +) +from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS +from sklearn.utils.validation import _num_samples + +NO_GROUP_SPLITTERS = [ + KFold(), + StratifiedKFold(), + TimeSeriesSplit(), + LeaveOneOut(), + LeavePOut(p=2), + ShuffleSplit(), + StratifiedShuffleSplit(test_size=0.5), + PredefinedSplit([1, 1, 2, 2]), + RepeatedKFold(), + RepeatedStratifiedKFold(), +] + +GROUP_SPLITTERS = [ + GroupKFold(), + LeavePGroupsOut(n_groups=1), + StratifiedGroupKFold(), + LeaveOneGroupOut(), + GroupShuffleSplit(), +] +GROUP_SPLITTER_NAMES = set(splitter.__class__.__name__ for splitter in GROUP_SPLITTERS) + +ALL_SPLITTERS = NO_GROUP_SPLITTERS + GROUP_SPLITTERS # type: ignore + +SPLITTERS_REQUIRING_TARGET = [ + StratifiedKFold(), + StratifiedShuffleSplit(), + RepeatedStratifiedKFold(), +] + +X = np.ones(10) +y = np.arange(10) // 2 +test_groups = ( + np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), + np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]), + np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), + [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3], + ["1", "1", "1", "1", "2", "2", "2", "3", "3", "3", "3", "3"], +) +digits = load_digits() + +pytestmark = pytest.mark.filterwarnings( + "error:The groups parameter:UserWarning:sklearn.*" +) + + +def _split(splitter, X, y, groups): + if splitter.__class__.__name__ in GROUP_SPLITTER_NAMES: + return splitter.split(X, y, groups=groups) + else: + return splitter.split(X, y) + + +def test_cross_validator_with_default_params(): + n_samples = 4 + n_unique_groups = 4 + n_splits = 2 + p = 2 + n_shuffle_splits = 10 # (the default value) + + X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + X_1d = np.array([1, 2, 3, 4]) + y = np.array([1, 1, 2, 2]) + groups = np.array([1, 2, 3, 4]) + loo = LeaveOneOut() + lpo = LeavePOut(p) + kf = KFold(n_splits) + skf = StratifiedKFold(n_splits) + lolo = LeaveOneGroupOut() + lopo = LeavePGroupsOut(p) + ss = ShuffleSplit(random_state=0) + ps = PredefinedSplit([1, 1, 2, 2]) # n_splits = np of unique folds = 2 + sgkf = StratifiedGroupKFold(n_splits) + + loo_repr = "LeaveOneOut()" + lpo_repr = "LeavePOut(p=2)" + kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)" + skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)" + lolo_repr = "LeaveOneGroupOut()" + lopo_repr = "LeavePGroupsOut(n_groups=2)" + ss_repr = ( + "ShuffleSplit(n_splits=10, random_state=0, test_size=None, train_size=None)" + ) + ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))" + sgkf_repr = "StratifiedGroupKFold(n_splits=2, random_state=None, shuffle=False)" + + n_splits_expected = [ + n_samples, + comb(n_samples, p), + n_splits, + n_splits, + n_unique_groups, + comb(n_unique_groups, p), + n_shuffle_splits, + 2, + n_splits, + ] + + for i, (cv, cv_repr) in enumerate( + zip( + [loo, lpo, kf, skf, lolo, lopo, ss, ps, sgkf], + [ + loo_repr, + lpo_repr, + kf_repr, + skf_repr, + lolo_repr, + lopo_repr, + ss_repr, + ps_repr, + sgkf_repr, + ], + ) + ): + # Test if get_n_splits works correctly + assert n_splits_expected[i] == cv.get_n_splits(X, y, groups) + + # Test if the cross-validator works as expected even if + # the data is 1d + np.testing.assert_equal( + list(_split(cv, X, y, groups)), list(_split(cv, X_1d, y, groups)) 
+ ) + # Test that train, test indices returned are integers + for train, test in _split(cv, X, y, groups): + assert np.asarray(train).dtype.kind == "i" + assert np.asarray(test).dtype.kind == "i" + + # Test if the repr works without any errors + assert cv_repr == repr(cv) + + # ValueError for get_n_splits methods + msg = "The 'X' parameter should not be None." + with pytest.raises(ValueError, match=msg): + loo.get_n_splits(None, y, groups) + with pytest.raises(ValueError, match=msg): + lpo.get_n_splits(None, y, groups) + + +def test_2d_y(): + # smoke test for 2d y and multi-label + n_samples = 30 + rng = np.random.RandomState(1) + X = rng.randint(0, 3, size=(n_samples, 2)) + y = rng.randint(0, 3, size=(n_samples,)) + y_2d = y.reshape(-1, 1) + y_multilabel = rng.randint(0, 2, size=(n_samples, 3)) + groups = rng.randint(0, 3, size=(n_samples,)) + splitters = [ + LeaveOneOut(), + LeavePOut(p=2), + KFold(), + StratifiedKFold(), + RepeatedKFold(), + RepeatedStratifiedKFold(), + StratifiedGroupKFold(), + ShuffleSplit(), + StratifiedShuffleSplit(test_size=0.5), + GroupShuffleSplit(), + LeaveOneGroupOut(), + LeavePGroupsOut(n_groups=2), + GroupKFold(n_splits=3), + TimeSeriesSplit(), + PredefinedSplit(test_fold=groups), + ] + for splitter in splitters: + list(_split(splitter, X, y, groups=groups)) + list(_split(splitter, X, y_2d, groups=groups)) + try: + list(_split(splitter, X, y_multilabel, groups=groups)) + except ValueError as e: + allowed_target_types = ("binary", "multiclass") + msg = "Supported target types are: {}. Got 'multilabel".format( + allowed_target_types + ) + assert msg in str(e) + + +def check_valid_split(train, test, n_samples=None): + # Use Python sets to get more informative assertion failure messages + train, test = set(train), set(test) + + # Train and test split should not overlap + assert train.intersection(test) == set() + + if n_samples is not None: + # Check that the union of the train and test splits covers all the indices + assert train.union(test) == set(range(n_samples)) + + +def check_cv_coverage(cv, X, y, groups, expected_n_splits): + n_samples = _num_samples(X) + # Check that all the samples appear at least once in a test fold + assert cv.get_n_splits(X, y, groups) == expected_n_splits + + collected_test_samples = set() + iterations = 0 + for train, test in cv.split(X, y, groups): + check_valid_split(train, test, n_samples=n_samples) + iterations += 1 + collected_test_samples.update(test) + + # Check that the accumulated test samples cover the whole dataset + assert iterations == expected_n_splits + if n_samples is not None: + assert collected_test_samples == set(range(n_samples)) + + +def test_kfold_valueerrors(): + X1 = np.array([[1, 2], [3, 4], [5, 6]]) + X2 = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]) + # Check that errors are raised if there are not enough samples + with pytest.raises(ValueError): + next(KFold(4).split(X1)) + + # Check that a warning is raised if the least populated class has too few + members.
+ y = np.array([3, 3, -1, -1, 3]) + + skf_3 = StratifiedKFold(3) + with pytest.warns(Warning, match="The least populated class"): + next(skf_3.split(X2, y)) + + sgkf_3 = StratifiedGroupKFold(3) + naive_groups = np.arange(len(y)) + with pytest.warns(Warning, match="The least populated class"): + next(sgkf_3.split(X2, y, naive_groups)) + + # Check that despite the warning the folds are still computed even + though all the classes are not necessarily represented on each + side of each split + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + check_cv_coverage(skf_3, X2, y, groups=None, expected_n_splits=3) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + check_cv_coverage(sgkf_3, X2, y, groups=naive_groups, expected_n_splits=3) + + # Check that errors are raised if all n_groups for individual + classes are less than n_splits. + y = np.array([3, 3, -1, -1, 2]) + + with pytest.raises(ValueError): + next(skf_3.split(X2, y)) + with pytest.raises(ValueError): + next(sgkf_3.split(X2, y)) + + # Error when number of folds is <= 1 + with pytest.raises(ValueError): + KFold(0) + with pytest.raises(ValueError): + KFold(1) + error_string = "k-fold cross-validation requires at least one train/test split" + with pytest.raises(ValueError, match=error_string): + StratifiedKFold(0) + with pytest.raises(ValueError, match=error_string): + StratifiedKFold(1) + with pytest.raises(ValueError, match=error_string): + StratifiedGroupKFold(0) + with pytest.raises(ValueError, match=error_string): + StratifiedGroupKFold(1) + + # When n_splits is not an integer: + with pytest.raises(ValueError): + KFold(1.5) + with pytest.raises(ValueError): + KFold(2.0) + with pytest.raises(ValueError): + StratifiedKFold(1.5) + with pytest.raises(ValueError): + StratifiedKFold(2.0) + with pytest.raises(ValueError): + StratifiedGroupKFold(1.5) + with pytest.raises(ValueError): + StratifiedGroupKFold(2.0) + + # When shuffle is not a bool: + with pytest.raises(TypeError): + KFold(n_splits=4, shuffle=None) + + +def test_kfold_indices(): + # Check all indices are returned in the test folds + X1 = np.ones(18) + kf = KFold(3) + check_cv_coverage(kf, X1, y=None, groups=None, expected_n_splits=3) + + # Check all indices are returned in the test folds even when equal-sized + folds are not possible + X2 = np.ones(17) + kf = KFold(3) + check_cv_coverage(kf, X2, y=None, groups=None, expected_n_splits=3) + + # Check if get_n_splits returns the number of folds + assert 5 == KFold(5).get_n_splits(X2) + + +def test_kfold_no_shuffle(): + # Manually check that KFold preserves the data ordering on toy datasets + X2 = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] + + splits = KFold(2).split(X2[:-1]) + train, test = next(splits) + assert_array_equal(test, [0, 1]) + assert_array_equal(train, [2, 3]) + + train, test = next(splits) + assert_array_equal(test, [2, 3]) + assert_array_equal(train, [0, 1]) + + splits = KFold(2).split(X2) + train, test = next(splits) + assert_array_equal(test, [0, 1, 2]) + assert_array_equal(train, [3, 4]) + + train, test = next(splits) + assert_array_equal(test, [3, 4]) + assert_array_equal(train, [0, 1, 2]) + + +def test_stratified_kfold_no_shuffle(): + # Manually check that StratifiedKFold preserves the data ordering as much + as possible on toy datasets in order to avoid hiding sample dependencies + when possible + X, y = np.ones(4), [1, 1, 0, 0] + splits = StratifiedKFold(2).split(X, y) + train, test = next(splits) + assert_array_equal(test, [0, 2]) +
assert_array_equal(train, [1, 3]) + + train, test = next(splits) + assert_array_equal(test, [1, 3]) + assert_array_equal(train, [0, 2]) + + X, y = np.ones(7), [1, 1, 1, 0, 0, 0, 0] + splits = StratifiedKFold(2).split(X, y) + train, test = next(splits) + assert_array_equal(test, [0, 1, 3, 4]) + assert_array_equal(train, [2, 5, 6]) + + train, test = next(splits) + assert_array_equal(test, [2, 5, 6]) + assert_array_equal(train, [0, 1, 3, 4]) + + # Check if get_n_splits returns the number of folds + assert 5 == StratifiedKFold(5).get_n_splits(X, y) + + # Make sure string labels are also supported + X = np.ones(7) + y1 = ["1", "1", "1", "0", "0", "0", "0"] + y2 = [1, 1, 1, 0, 0, 0, 0] + np.testing.assert_equal( + list(StratifiedKFold(2).split(X, y1)), list(StratifiedKFold(2).split(X, y2)) + ) + + # Check equivalence to KFold + y = [0, 1, 0, 1, 0, 1, 0, 1] + X = np.ones_like(y) + np.testing.assert_equal( + list(StratifiedKFold(3).split(X, y)), list(KFold(3).split(X, y)) + ) + + +@pytest.mark.parametrize("shuffle", [False, True]) +@pytest.mark.parametrize("k", [4, 5, 6, 7, 8, 9, 10]) +@pytest.mark.parametrize("kfold", [StratifiedKFold, StratifiedGroupKFold]) +def test_stratified_kfold_ratios(k, shuffle, kfold): + # Check that stratified kfold preserves class ratios in individual splits + # Repeat with shuffling turned off and on + n_samples = 1000 + X = np.ones(n_samples) + y = np.array( + [4] * int(0.10 * n_samples) + + [0] * int(0.89 * n_samples) + + [1] * int(0.01 * n_samples) + ) + # ensure perfect stratification with StratifiedGroupKFold + groups = np.arange(len(y)) + distr = np.bincount(y) / len(y) + + test_sizes = [] + random_state = None if not shuffle else 0 + skf = kfold(k, random_state=random_state, shuffle=shuffle) + for train, test in _split(skf, X, y, groups=groups): + assert_allclose(np.bincount(y[train]) / len(train), distr, atol=0.02) + assert_allclose(np.bincount(y[test]) / len(test), distr, atol=0.02) + test_sizes.append(len(test)) + assert np.ptp(test_sizes) <= 1 + + +@pytest.mark.parametrize("shuffle", [False, True]) +@pytest.mark.parametrize("k", [4, 6, 7]) +@pytest.mark.parametrize("kfold", [StratifiedKFold, StratifiedGroupKFold]) +def test_stratified_kfold_label_invariance(k, shuffle, kfold): + # Check that stratified kfold gives the same indices regardless of labels + n_samples = 100 + y = np.array( + [2] * int(0.10 * n_samples) + + [0] * int(0.89 * n_samples) + + [1] * int(0.01 * n_samples) + ) + X = np.ones(len(y)) + # ensure perfect stratification with StratifiedGroupKFold + groups = np.arange(len(y)) + + def get_splits(y): + random_state = None if not shuffle else 0 + return [ + (list(train), list(test)) + for train, test in _split( + kfold(k, random_state=random_state, shuffle=shuffle), + X, + y, + groups=groups, + ) + ] + + splits_base = get_splits(y) + for perm in permutations([0, 1, 2]): + y_perm = np.take(perm, y) + splits_perm = get_splits(y_perm) + assert splits_perm == splits_base + + +def test_kfold_balance(): + # Check that KFold returns folds with balanced sizes + for i in range(11, 17): + kf = KFold(5).split(X=np.ones(i)) + sizes = [len(test) for _, test in kf] + + assert (np.max(sizes) - np.min(sizes)) <= 1 + assert np.sum(sizes) == i + + +@pytest.mark.parametrize("kfold", [StratifiedKFold, StratifiedGroupKFold]) +def test_stratifiedkfold_balance(kfold): + # Check that KFold returns folds with balanced sizes (only when + # stratification is possible) + # Repeat with shuffling turned off and on + X = np.ones(17) + y = [0] * 3 + [1] * 14 + # ensure perfect 
stratification with StratifiedGroupKFold + groups = np.arange(len(y)) + + for shuffle in (True, False): + cv = kfold(3, shuffle=shuffle) + for i in range(11, 17): + skf = _split(cv, X[:i], y[:i], groups[:i]) + sizes = [len(test) for _, test in skf] + + assert (np.max(sizes) - np.min(sizes)) <= 1 + assert np.sum(sizes) == i + + +def test_shuffle_kfold(): + # Check the indices are shuffled properly + kf = KFold(3) + kf2 = KFold(3, shuffle=True, random_state=0) + kf3 = KFold(3, shuffle=True, random_state=1) + + X = np.ones(300) + + all_folds = np.zeros(300) + for (tr1, te1), (tr2, te2), (tr3, te3) in zip( + kf.split(X), kf2.split(X), kf3.split(X) + ): + for tr_a, tr_b in combinations((tr1, tr2, tr3), 2): + # Assert that there is no complete overlap + assert len(np.intersect1d(tr_a, tr_b)) != len(tr1) + + # Set all test indices in successive iterations of kf2 to 1 + all_folds[te2] = 1 + + # Check that all indices are returned in the different test folds + assert sum(all_folds) == 300 + + +@pytest.mark.parametrize("kfold", [KFold, StratifiedKFold, StratifiedGroupKFold]) +def test_shuffle_kfold_stratifiedkfold_reproducibility(kfold): + X = np.ones(15) # Divisible by 3 + y = [0] * 7 + [1] * 8 + groups_1 = np.arange(len(y)) + X2 = np.ones(16) # Not divisible by 3 + y2 = [0] * 8 + [1] * 8 + groups_2 = np.arange(len(y2)) + + # Check that when the shuffle is True, multiple split calls produce the + # same split when random_state is int + kf = kfold(3, shuffle=True, random_state=0) + + np.testing.assert_equal( + list(_split(kf, X, y, groups_1)), list(_split(kf, X, y, groups_1)) + ) + + # Check that when the shuffle is True, multiple split calls often + # (not always) produce different splits when random_state is + # RandomState instance or None + kf = kfold(3, shuffle=True, random_state=np.random.RandomState(0)) + for data in zip((X, X2), (y, y2), (groups_1, groups_2)): + # Test if the two splits are different cv + for (_, test_a), (_, test_b) in zip(_split(kf, *data), _split(kf, *data)): + # cv.split(...) 
returns an array of tuples, each tuple + consisting of an array with train indices and test indices + # Ensure that the splits for data are not the same + when the random state is not set + with pytest.raises(AssertionError): + np.testing.assert_array_equal(test_a, test_b) + + +def test_shuffle_stratifiedkfold(): + # Check that shuffling is happening when requested, and for proper + sample coverage + X_40 = np.ones(40) + y = [0] * 20 + [1] * 20 + kf0 = StratifiedKFold(5, shuffle=True, random_state=0) + kf1 = StratifiedKFold(5, shuffle=True, random_state=1) + for (_, test0), (_, test1) in zip(kf0.split(X_40, y), kf1.split(X_40, y)): + assert set(test0) != set(test1) + check_cv_coverage(kf0, X_40, y, groups=None, expected_n_splits=5) + + # Ensure that we shuffle each class's samples with different + random_state in StratifiedKFold + # See https://github.com/scikit-learn/scikit-learn/pull/13124 + X = np.arange(10) + y = [0] * 5 + [1] * 5 + kf1 = StratifiedKFold(5, shuffle=True, random_state=0) + kf2 = StratifiedKFold(5, shuffle=True, random_state=1) + test_set1 = sorted([tuple(s[1]) for s in kf1.split(X, y)]) + test_set2 = sorted([tuple(s[1]) for s in kf2.split(X, y)]) + assert test_set1 != test_set2 + + +def test_shuffle_groupkfold(): + # Check that shuffling is happening when requested, and for proper + sample coverage + X = np.ones(40) + y = [0] * 20 + [1] * 20 + groups = np.arange(40) // 3 + gkf0 = GroupKFold(4, shuffle=True, random_state=0) + gkf1 = GroupKFold(4, shuffle=True, random_state=1) + + # Check that the groups are shuffled differently + test_groups0 = [ + set(groups[test_idx]) for _, test_idx in gkf0.split(X, None, groups) + ] + test_groups1 = [ + set(groups[test_idx]) for _, test_idx in gkf1.split(X, None, groups) + ] + for g0, g1 in zip(test_groups0, test_groups1): + assert g0 != g1, "Test groups should differ with different random states" + + # Check coverage and splits + check_cv_coverage(gkf0, X, y, groups, expected_n_splits=4) + check_cv_coverage(gkf1, X, y, groups, expected_n_splits=4) + + +def test_kfold_can_detect_dependent_samples_on_digits():  # see #2372 + # The digits samples are dependent: they are apparently grouped by authors + although we don't have any information on the groups segment locations + for this data. We can highlight this fact by computing k-fold cross- + validation with and without shuffling: we observe that the shuffling case + wrongly makes the IID assumption and is therefore too optimistic: it + estimates a much higher accuracy (around 0.93) than the + non-shuffling variant (around 0.81).
+ + X, y = digits.data[:600], digits.target[:600] + model = SVC(C=10, gamma=0.005) + + n_splits = 3 + + cv = KFold(n_splits=n_splits, shuffle=False) + mean_score = cross_val_score(model, X, y, cv=cv).mean() + assert 0.92 > mean_score + assert mean_score > 0.80 + + # Shuffling the data artificially breaks the dependency and hides the + # overfitting of the model with regards to the writing style of the authors + # by yielding a seriously overestimated score: + + cv = KFold(n_splits, shuffle=True, random_state=0) + mean_score = cross_val_score(model, X, y, cv=cv).mean() + assert mean_score > 0.92 + + cv = KFold(n_splits, shuffle=True, random_state=1) + mean_score = cross_val_score(model, X, y, cv=cv).mean() + assert mean_score > 0.92 + + # Similarly, StratifiedKFold should try to shuffle the data as little + # as possible (while respecting the balanced class constraints) + # and thus be able to detect the dependency by not overestimating + # the CV score either. As the digits dataset is approximately balanced + # the estimated mean score is close to the score measured with + # non-shuffled KFold + + cv = StratifiedKFold(n_splits) + mean_score = cross_val_score(model, X, y, cv=cv).mean() + assert 0.94 > mean_score + assert mean_score > 0.80 + + +def test_stratified_group_kfold_trivial(): + sgkf = StratifiedGroupKFold(n_splits=3) + # Trivial example - groups with the same distribution + y = np.array([1] * 6 + [0] * 12) + X = np.ones_like(y).reshape(-1, 1) + groups = np.asarray((1, 2, 3, 4, 5, 6, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6)) + distr = np.bincount(y) / len(y) + test_sizes = [] + for train, test in sgkf.split(X, y, groups): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + # check y distribution + assert_allclose(np.bincount(y[train]) / len(train), distr, atol=0.02) + assert_allclose(np.bincount(y[test]) / len(test), distr, atol=0.02) + test_sizes.append(len(test)) + assert np.ptp(test_sizes) <= 1 + + +def test_stratified_group_kfold_approximate(): + # Not perfect stratification (even though it is possible) because of + # iteration over groups + sgkf = StratifiedGroupKFold(n_splits=3) + y = np.array([1] * 6 + [0] * 12) + X = np.ones_like(y).reshape(-1, 1) + groups = np.array([1, 2, 3, 3, 4, 4, 1, 1, 2, 2, 3, 4, 5, 5, 5, 6, 6, 6]) + expected = np.asarray([[0.833, 0.166], [0.666, 0.333], [0.5, 0.5]]) + test_sizes = [] + for (train, test), expect_dist in zip(sgkf.split(X, y, groups), expected): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + split_dist = np.bincount(y[test]) / len(test) + assert_allclose(split_dist, expect_dist, atol=0.001) + test_sizes.append(len(test)) + assert np.ptp(test_sizes) <= 1 + + +@pytest.mark.parametrize( + "y, groups, expected", + [ + ( + np.array([0] * 6 + [1] * 6), + np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]), + np.asarray([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]), + ), + ( + np.array([0] * 9 + [1] * 3), + np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 6]), + np.asarray([[0.75, 0.25], [0.75, 0.25], [0.75, 0.25]]), + ), + ], +) +def test_stratified_group_kfold_homogeneous_groups(y, groups, expected): + sgkf = StratifiedGroupKFold(n_splits=3) + X = np.ones_like(y).reshape(-1, 1) + for (train, test), expect_dist in zip(sgkf.split(X, y, groups), expected): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + split_dist = np.bincount(y[test]) / len(test) + assert_allclose(split_dist, expect_dist, atol=0.001) + + 
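+# The guarantees exercised above can also be seen in a tiny standalone + sketch (illustrative only; it reuses the numpy and StratifiedGroupKFold + imports at the top of this module): every group stays on a single side of + each split, while each test fold approximately keeps the overall class + ratio. +def _demo_stratified_group_kfold():  # leading underscore: not collected + rng = np.random.RandomState(0) + y = rng.choice(2, size=600, p=[0.8, 0.2])  # imbalanced binary labels + groups = rng.randint(0, 30, size=600)  # 30 groups of dependent samples + X = np.ones((600, 1)) + for train, test in StratifiedGroupKFold(n_splits=3).split(X, y, groups): + # no group straddles the train/test boundary + assert np.intersect1d(groups[train], groups[test]).size == 0 + # each test fold roughly preserves the ~[0.8, 0.2] class ratio + print(np.bincount(y[test], minlength=2) / len(test)) + +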
+@pytest.mark.parametrize("cls_distr", [(0.4, 0.6), (0.3, 0.7), (0.2, 0.8), (0.8, 0.2)]) +@pytest.mark.parametrize("n_groups", [5, 30, 70]) +def test_stratified_group_kfold_against_group_kfold(cls_distr, n_groups): + # Check that given sufficient amount of samples StratifiedGroupKFold + # produces better stratified folds than regular GroupKFold + n_splits = 5 + sgkf = StratifiedGroupKFold(n_splits=n_splits) + gkf = GroupKFold(n_splits=n_splits) + rng = np.random.RandomState(0) + n_points = 1000 + y = rng.choice(2, size=n_points, p=cls_distr) + X = np.ones_like(y).reshape(-1, 1) + g = rng.choice(n_groups, n_points) + sgkf_folds = sgkf.split(X, y, groups=g) + gkf_folds = gkf.split(X, y, groups=g) + sgkf_entr = 0 + gkf_entr = 0 + for (sgkf_train, sgkf_test), (_, gkf_test) in zip(sgkf_folds, gkf_folds): + # check group constraint + assert np.intersect1d(g[sgkf_train], g[sgkf_test]).size == 0 + sgkf_distr = np.bincount(y[sgkf_test]) / len(sgkf_test) + gkf_distr = np.bincount(y[gkf_test]) / len(gkf_test) + sgkf_entr += stats.entropy(sgkf_distr, qk=cls_distr) + gkf_entr += stats.entropy(gkf_distr, qk=cls_distr) + sgkf_entr /= n_splits + gkf_entr /= n_splits + assert sgkf_entr <= gkf_entr + + +def test_shuffle_split(): + ss1 = ShuffleSplit(test_size=0.2, random_state=0).split(X) + ss2 = ShuffleSplit(test_size=2, random_state=0).split(X) + ss3 = ShuffleSplit(test_size=np.int32(2), random_state=0).split(X) + ss4 = ShuffleSplit(test_size=int(2), random_state=0).split(X) + for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4): + assert_array_equal(t1[0], t2[0]) + assert_array_equal(t2[0], t3[0]) + assert_array_equal(t3[0], t4[0]) + assert_array_equal(t1[1], t2[1]) + assert_array_equal(t2[1], t3[1]) + assert_array_equal(t3[1], t4[1]) + + +@pytest.mark.parametrize("split_class", [ShuffleSplit, StratifiedShuffleSplit]) +@pytest.mark.parametrize( + "train_size, exp_train, exp_test", [(None, 9, 1), (8, 8, 2), (0.8, 8, 2)] +) +def test_shuffle_split_default_test_size(split_class, train_size, exp_train, exp_test): + # Check that the default value has the expected behavior, i.e. 0.1 if both + # unspecified or complement train_size unless both are specified. + X = np.ones(10) + y = np.ones(10) + + X_train, X_test = next(split_class(train_size=train_size).split(X, y)) + + assert len(X_train) == exp_train + assert len(X_test) == exp_test + + +@pytest.mark.parametrize( + "train_size, exp_train, exp_test", [(None, 8, 2), (7, 7, 3), (0.7, 7, 3)] +) +def test_group_shuffle_split_default_test_size(train_size, exp_train, exp_test): + # Check that the default value has the expected behavior, i.e. 0.2 if both + # unspecified or complement train_size unless both are specified. 
+ X = np.ones(10) + y = np.ones(10) + groups = range(10) + + X_train, X_test = next(GroupShuffleSplit(train_size=train_size).split(X, y, groups)) + + assert len(X_train) == exp_train + assert len(X_test) == exp_test + + +def test_stratified_shuffle_split_init(): + X = np.arange(7) + y = np.asarray([0, 1, 1, 1, 2, 2, 2]) + # Check that an error is raised if there is a class with only one sample + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, test_size=0.2).split(X, y)) + + # Check that an error is raised if the test set size is smaller than n_classes + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, test_size=2).split(X, y)) + # Check that an error is raised if the train set size is smaller than + n_classes + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, test_size=3, train_size=2).split(X, y)) + + X = np.arange(9) + y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2]) + + # Train size or test size too small + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(train_size=2).split(X, y)) + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(test_size=2).split(X, y)) + + +def test_stratified_shuffle_split_respects_test_size(): + y = np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]) + test_size = 5 + train_size = 10 + sss = StratifiedShuffleSplit( + 6, test_size=test_size, train_size=train_size, random_state=0 + ).split(np.ones(len(y)), y) + for train, test in sss: + assert len(train) == train_size + assert len(test) == test_size + + +def test_stratified_shuffle_split_iter(): + ys = [ + np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), + np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2), + np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), + np.array([-1] * 800 + [1] * 50), + np.concatenate([[i] * (100 + i) for i in range(11)]), + [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3], + ["1", "1", "1", "1", "2", "2", "2", "3", "3", "3", "3", "3"], + ] + + for y in ys: + sss = StratifiedShuffleSplit(6, test_size=0.33, random_state=0).split( + np.ones(len(y)), y + ) + y = np.asanyarray(y)  # To make it indexable for y[train] + # this is how test-size is computed internally + in _validate_shuffle_split + test_size = np.ceil(0.33 * len(y)) + train_size = len(y) - test_size + for train, test in sss: + assert_array_equal(np.unique(y[train]), np.unique(y[test])) + # Check that folds keep class proportions + p_train = np.bincount(np.unique(y[train], return_inverse=True)[1]) / float( + len(y[train]) + ) + p_test = np.bincount(np.unique(y[test], return_inverse=True)[1]) / float( + len(y[test]) + ) + assert_array_almost_equal(p_train, p_test, 1) + assert len(train) + len(test) == y.size + assert len(train) == train_size + assert len(test) == test_size + assert_array_equal(np.intersect1d(train, test), []) + + +def test_stratified_shuffle_split_even(): + # Test the StratifiedShuffleSplit, indices are drawn with an + equal chance + n_folds = 5 + n_splits = 1000 + + def assert_counts_are_ok(idx_counts, p): + # Here we test that the distribution of the counts + per index is close enough to a binomial + threshold = 0.05 / n_splits + bf = stats.binom(n_splits, p) + for count in idx_counts: + prob = bf.pmf(count) + assert ( + prob > threshold + ), "An index is not drawn with chance corresponding to even draws" + + for n_samples in (6, 22): + groups = np.array((n_samples // 2) * [0, 1]) + splits = StratifiedShuffleSplit( + n_splits=n_splits, test_size=1.0 / n_folds, random_state=0 + ) + +
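# Tally, across n_splits draws, how often each sample index lands on the + train and test side; under even sampling these counts follow a binomial + distribution, which assert_counts_are_ok verifies. +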
train_counts = [0] * n_samples + test_counts = [0] * n_samples + n_splits_actual = 0 + for train, test in splits.split(X=np.ones(n_samples), y=groups): + n_splits_actual += 1 + for counter, ids in [(train_counts, train), (test_counts, test)]: + for id in ids: + counter[id] += 1 + assert n_splits_actual == n_splits + + n_train, n_test = _validate_shuffle_split( + n_samples, test_size=1.0 / n_folds, train_size=1.0 - (1.0 / n_folds) + ) + + assert len(train) == n_train + assert len(test) == n_test + assert len(set(train).intersection(test)) == 0 + + group_counts = np.unique(groups) + assert splits.test_size == 1.0 / n_folds + assert n_train + n_test == len(groups) + assert len(group_counts) == 2 + ex_test_p = float(n_test) / n_samples + ex_train_p = float(n_train) / n_samples + + assert_counts_are_ok(train_counts, ex_train_p) + assert_counts_are_ok(test_counts, ex_test_p) + + +def test_stratified_shuffle_split_overlap_train_test_bug(): + # See https://github.com/scikit-learn/scikit-learn/issues/6121 for + # the original bug report + y = [0, 1, 2, 3] * 3 + [4, 5] * 5 + X = np.ones_like(y) + + sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0) + + train, test = next(sss.split(X=X, y=y)) + + # no overlap + assert_array_equal(np.intersect1d(train, test), []) + + # complete partition + assert_array_equal(np.union1d(train, test), np.arange(len(y))) + + +def test_stratified_shuffle_split_multilabel(): + # fix for issue 9037 + for y in [ + np.array([[0, 1], [1, 0], [1, 0], [0, 1]]), + np.array([[0, 1], [1, 1], [1, 1], [0, 1]]), + ]: + X = np.ones_like(y) + sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0) + train, test = next(sss.split(X=X, y=y)) + y_train = y[train] + y_test = y[test] + + # no overlap + assert_array_equal(np.intersect1d(train, test), []) + + # complete partition + assert_array_equal(np.union1d(train, test), np.arange(len(y))) + + # correct stratification of entire rows + # (by design, here y[:, 0] uniquely determines the entire row of y) + expected_ratio = np.mean(y[:, 0]) + assert expected_ratio == np.mean(y_train[:, 0]) + assert expected_ratio == np.mean(y_test[:, 0]) + + +def test_stratified_shuffle_split_multilabel_many_labels(): + # fix in PR #9922: for multilabel data with > 1000 labels, str(row) + # truncates with an ellipsis for elements in positions 4 through + # len(row) - 4, so labels were not being correctly split using the powerset + # method for transforming a multilabel problem to a multiclass one; this + # test checks that this problem is fixed. + row_with_many_zeros = [1, 0, 1] + [0] * 1000 + [1, 0, 1] + row_with_many_ones = [1, 0, 1] + [1] * 1000 + [1, 0, 1] + y = np.array([row_with_many_zeros] * 10 + [row_with_many_ones] * 100) + X = np.ones_like(y) + + sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0) + train, test = next(sss.split(X=X, y=y)) + y_train = y[train] + y_test = y[test] + + # correct stratification of entire rows + # (by design, here y[:, 4] uniquely determines the entire row of y) + expected_ratio = np.mean(y[:, 4]) + assert expected_ratio == np.mean(y_train[:, 4]) + assert expected_ratio == np.mean(y_test[:, 4]) + + +def test_predefinedsplit_with_kfold_split(): + # Check that PredefinedSplit can reproduce a split generated by Kfold. 
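+ # Tag each sample with the index of the KFold split in which it appears + as a test sample; PredefinedSplit must then reproduce those exact splits.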
+ folds = np.full(10, -1.0) + kf_train = [] + kf_test = [] + for i, (train_ind, test_ind) in enumerate(KFold(5, shuffle=True).split(X)): + kf_train.append(train_ind) + kf_test.append(test_ind) + folds[test_ind] = i + ps = PredefinedSplit(folds) + # n_splits is simply the no of unique folds + assert len(np.unique(folds)) == ps.get_n_splits() + ps_train, ps_test = zip(*ps.split()) + assert_array_equal(ps_train, kf_train) + assert_array_equal(ps_test, kf_test) + + +def test_group_shuffle_split(): + for groups_i in test_groups: + X = y = np.ones(len(groups_i)) + n_splits = 6 + test_size = 1.0 / 3 + slo = GroupShuffleSplit(n_splits, test_size=test_size, random_state=0) + + # Make sure the repr works + repr(slo) + + # Test that the length is correct + assert slo.get_n_splits(X, y, groups=groups_i) == n_splits + + l_unique = np.unique(groups_i) + l = np.asarray(groups_i) + + for train, test in slo.split(X, y, groups=groups_i): + # First test: no train group is in the test set and vice versa + l_train_unique = np.unique(l[train]) + l_test_unique = np.unique(l[test]) + assert not np.any(np.isin(l[train], l_test_unique)) + assert not np.any(np.isin(l[test], l_train_unique)) + + # Second test: train and test add up to all the data + assert l[train].size + l[test].size == l.size + + # Third test: train and test are disjoint + assert_array_equal(np.intersect1d(train, test), []) + + # Fourth test: + # unique train and test groups are correct, +- 1 for rounding error + assert abs(len(l_test_unique) - round(test_size * len(l_unique))) <= 1 + assert ( + abs(len(l_train_unique) - round((1.0 - test_size) * len(l_unique))) <= 1 + ) + + +def test_leave_one_p_group_out(): + logo = LeaveOneGroupOut() + lpgo_1 = LeavePGroupsOut(n_groups=1) + lpgo_2 = LeavePGroupsOut(n_groups=2) + + # Make sure the repr works + assert repr(logo) == "LeaveOneGroupOut()" + assert repr(lpgo_1) == "LeavePGroupsOut(n_groups=1)" + assert repr(lpgo_2) == "LeavePGroupsOut(n_groups=2)" + assert repr(LeavePGroupsOut(n_groups=3)) == "LeavePGroupsOut(n_groups=3)" + + for j, (cv, p_groups_out) in enumerate(((logo, 1), (lpgo_1, 1), (lpgo_2, 2))): + for i, groups_i in enumerate(test_groups): + n_groups = len(np.unique(groups_i)) + n_splits = n_groups if p_groups_out == 1 else n_groups * (n_groups - 1) / 2 + X = y = np.ones(len(groups_i)) + + # Test that the length is correct + assert cv.get_n_splits(X, y, groups=groups_i) == n_splits + + groups_arr = np.asarray(groups_i) + + # Split using the original list / array / list of string groups_i + for train, test in cv.split(X, y, groups=groups_i): + # First test: no train group is in the test set and vice versa + assert_array_equal( + np.intersect1d(groups_arr[train], groups_arr[test]).tolist(), [] + ) + + # Second test: train and test add up to all the data + assert len(train) + len(test) == len(groups_i) + + # Third test: + # The number of groups in test must be equal to p_groups_out + assert np.unique(groups_arr[test]).shape[0], p_groups_out + + # check get_n_splits() with dummy parameters + assert logo.get_n_splits(None, None, ["a", "b", "c", "b", "c"]) == 3 + assert logo.get_n_splits(groups=[1.0, 1.1, 1.0, 1.2]) == 3 + assert lpgo_2.get_n_splits(None, None, np.arange(4)) == 6 + assert lpgo_1.get_n_splits(groups=np.arange(4)) == 4 + + # raise ValueError if a `groups` parameter is illegal + with pytest.raises(ValueError): + logo.get_n_splits(None, None, [0.0, np.nan, 0.0]) + with pytest.raises(ValueError): + lpgo_2.get_n_splits(None, None, [0.0, np.inf, 0.0]) + + msg = "The 'groups' parameter 
should not be None." + with pytest.raises(ValueError, match=msg): + logo.get_n_splits(None, None, None) + with pytest.raises(ValueError, match=msg): + lpgo_1.get_n_splits(None, None, None) + + +def test_leave_group_out_changing_groups(): + # Check that LeaveOneGroupOut and LeavePGroupsOut work normally if + # the groups variable is changed before calling split + groups = np.array([0, 1, 2, 1, 1, 2, 0, 0]) + X = np.ones(len(groups)) + groups_changing = np.array(groups, copy=True) + lolo = LeaveOneGroupOut().split(X, groups=groups) + lolo_changing = LeaveOneGroupOut().split(X, groups=groups) + lplo = LeavePGroupsOut(n_groups=2).split(X, groups=groups) + lplo_changing = LeavePGroupsOut(n_groups=2).split(X, groups=groups) + groups_changing[:] = 0 + for llo, llo_changing in [(lolo, lolo_changing), (lplo, lplo_changing)]: + for (train, test), (train_chan, test_chan) in zip(llo, llo_changing): + assert_array_equal(train, train_chan) + assert_array_equal(test, test_chan) + + # n_splits = no of 2 (p) group combinations of the unique groups = 3C2 = 3 + assert 3 == LeavePGroupsOut(n_groups=2).get_n_splits(X, y=X, groups=groups) + # n_splits = no of unique groups (C(uniq_lbls, 1) = n_unique_groups) + assert 3 == LeaveOneGroupOut().get_n_splits(X, y=X, groups=groups) + + +def test_leave_group_out_order_dependence(): + # Check that LeaveOneGroupOut orders the splits according to the index + # of the group left out. + groups = np.array([2, 2, 0, 0, 1, 1]) + X = np.ones(len(groups)) + + splits = iter(LeaveOneGroupOut().split(X, groups=groups)) + + expected_indices = [ + ([0, 1, 4, 5], [2, 3]), + ([0, 1, 2, 3], [4, 5]), + ([2, 3, 4, 5], [0, 1]), + ] + + for expected_train, expected_test in expected_indices: + train, test = next(splits) + assert_array_equal(train, expected_train) + assert_array_equal(test, expected_test) + + +def test_leave_one_p_group_out_error_on_fewer_number_of_groups(): + X = y = groups = np.ones(0) + msg = re.escape("Found array with 0 sample(s)") + with pytest.raises(ValueError, match=msg): + next(LeaveOneGroupOut().split(X, y, groups)) + + X = y = groups = np.ones(1) + msg = re.escape( + f"The groups parameter contains fewer than 2 unique groups ({groups})." + " LeaveOneGroupOut expects at least 2." + ) + with pytest.raises(ValueError, match=msg): + next(LeaveOneGroupOut().split(X, y, groups)) + + X = y = groups = np.ones(1) + msg = re.escape( + "The groups parameter contains fewer than (or equal to) n_groups " + f"(3) numbers of unique groups ({groups}). LeavePGroupsOut expects " + "that at least n_groups + 1 (4) unique groups " + "be present" + ) + with pytest.raises(ValueError, match=msg): + next(LeavePGroupsOut(n_groups=3).split(X, y, groups)) + + X = y = groups = np.arange(3) + msg = re.escape( + "The groups parameter contains fewer than (or equal to) n_groups " + f"(3) numbers of unique groups ({groups}). 
LeavePGroupsOut expects " + "that at least n_groups + 1 (4) unique groups " + "be present" + ) + with pytest.raises(ValueError, match=msg): + next(LeavePGroupsOut(n_groups=3).split(X, y, groups)) + + +def test_repeated_cv_value_errors(): + # n_repeats is not an integer or is <= 0 + for cv in (RepeatedKFold, RepeatedStratifiedKFold): + with pytest.raises(ValueError): + cv(n_repeats=0) + with pytest.raises(ValueError): + cv(n_repeats=1.5) + + +@pytest.mark.parametrize("RepeatedCV", [RepeatedKFold, RepeatedStratifiedKFold]) +def test_repeated_cv_repr(RepeatedCV): + n_splits, n_repeats = 2, 6 + repeated_cv = RepeatedCV(n_splits=n_splits, n_repeats=n_repeats) + repeated_cv_repr = "{}(n_repeats=6, n_splits=2, random_state=None)".format( + repeated_cv.__class__.__name__ + ) + assert repeated_cv_repr == repr(repeated_cv) + + +def test_repeated_kfold_deterministic_split(): + X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] + random_state = 258173307 + rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=random_state) + + # split should produce the same, deterministic splits on + each call + for _ in range(3): + splits = rkf.split(X) + train, test = next(splits) + assert_array_equal(train, [2, 4]) + assert_array_equal(test, [0, 1, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 3]) + assert_array_equal(test, [2, 4]) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [2, 3, 4]) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4]) + assert_array_equal(test, [0, 1]) + + with pytest.raises(StopIteration): + next(splits) + + +def test_get_n_splits_for_repeated_kfold(): + n_splits = 3 + n_repeats = 4 + rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats) + expected_n_splits = n_splits * n_repeats + assert expected_n_splits == rkf.get_n_splits() + + +def test_get_n_splits_for_repeated_stratified_kfold(): + n_splits = 3 + n_repeats = 4 + rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats) + expected_n_splits = n_splits * n_repeats + assert expected_n_splits == rskf.get_n_splits() + + +def test_repeated_stratified_kfold_deterministic_split(): + X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] + y = [1, 1, 1, 0, 0] + random_state = 1944695409 + rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2, random_state=random_state) + + # split should produce the same, deterministic splits on + each call + for _ in range(3): + splits = rskf.split(X, y) + train, test = next(splits) + assert_array_equal(train, [1, 4]) + assert_array_equal(test, [0, 2, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 2, 3]) + assert_array_equal(test, [1, 4]) + + train, test = next(splits) + assert_array_equal(train, [2, 3]) + assert_array_equal(test, [0, 1, 4]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 4]) + assert_array_equal(test, [2, 3]) + + with pytest.raises(StopIteration): + next(splits) + + +def test_train_test_split_errors(): + pytest.raises(ValueError, train_test_split) + + pytest.raises(ValueError, train_test_split, range(3), train_size=1.1) + + pytest.raises(ValueError, train_test_split, range(3), test_size=0.6, train_size=0.6) + pytest.raises( + ValueError, + train_test_split, + range(3), + test_size=np.float32(0.6), + train_size=np.float32(0.6), + ) + pytest.raises(ValueError, train_test_split, range(3), test_size="wrong_type") + pytest.raises(ValueError, train_test_split, range(3), test_size=2, train_size=4) + pytest.raises(TypeError, train_test_split, range(3), some_argument=1.1)
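+ # inputs with inconsistent numbers of samples (3 vs. 42) are rejected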
+ pytest.raises(ValueError, train_test_split, range(3), range(42)) + pytest.raises(ValueError, train_test_split, range(10), shuffle=False, stratify=True) + + with pytest.raises( + ValueError, + match=r"train_size=11 should be either positive and " + r"smaller than the number of samples 10 or a " + r"float in the \(0, 1\) range", + ): + train_test_split(range(10), train_size=11, test_size=1) + + +@pytest.mark.parametrize( + "train_size, exp_train, exp_test", [(None, 7, 3), (8, 8, 2), (0.8, 8, 2)] +) +def test_train_test_split_default_test_size(train_size, exp_train, exp_test): + # Check that the default value has the expected behavior, i.e. complement + # train_size unless both are specified. + X_train, X_test = train_test_split(X, train_size=train_size) + + assert len(X_train) == exp_train + assert len(X_test) == exp_test + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations() +) +@pytest.mark.parametrize( + "shuffle,stratify", + ( + (True, None), + (True, np.hstack((np.ones(6), np.zeros(4)))), + # stratification only works with shuffling + (False, None), + ), +) +def test_array_api_train_test_split( + shuffle, stratify, array_namespace, device, dtype_name +): + xp = _array_api_for_tests(array_namespace, device) + + X = np.arange(100).reshape((10, 10)) + y = np.arange(10) + + X_np = X.astype(dtype_name) + X_xp = xp.asarray(X_np, device=device) + + y_np = y.astype(dtype_name) + y_xp = xp.asarray(y_np, device=device) + + X_train_np, X_test_np, y_train_np, y_test_np = train_test_split( + X_np, y, random_state=0, shuffle=shuffle, stratify=stratify + ) + with config_context(array_api_dispatch=True): + if stratify is not None: + stratify_xp = xp.asarray(stratify) + else: + stratify_xp = stratify + X_train_xp, X_test_xp, y_train_xp, y_test_xp = train_test_split( + X_xp, y_xp, shuffle=shuffle, stratify=stratify_xp, random_state=0 + ) + + # Check that namespace is preserved, has to happen with + # array_api_dispatch enabled. 
+ assert get_namespace(X_train_xp)[0] == get_namespace(X_xp)[0] + assert get_namespace(X_test_xp)[0] == get_namespace(X_xp)[0] + assert get_namespace(y_train_xp)[0] == get_namespace(y_xp)[0] + assert get_namespace(y_test_xp)[0] == get_namespace(y_xp)[0] + + # Check device and dtype is preserved on output + assert array_api_device(X_train_xp) == array_api_device(X_xp) + assert array_api_device(y_train_xp) == array_api_device(y_xp) + assert array_api_device(X_test_xp) == array_api_device(X_xp) + assert array_api_device(y_test_xp) == array_api_device(y_xp) + + assert X_train_xp.dtype == X_xp.dtype + assert y_train_xp.dtype == y_xp.dtype + assert X_test_xp.dtype == X_xp.dtype + assert y_test_xp.dtype == y_xp.dtype + + assert_allclose( + _convert_to_numpy(X_train_xp, xp=xp), + X_train_np, + ) + assert_allclose( + _convert_to_numpy(X_test_xp, xp=xp), + X_test_np, + ) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_train_test_split(coo_container): + X = np.arange(100).reshape((10, 10)) + X_s = coo_container(X) + y = np.arange(10) + + # simple test + split = train_test_split(X, y, test_size=None, train_size=0.5) + X_train, X_test, y_train, y_test = split + assert len(y_test) == len(y_train) + # test correspondence of X and y + assert_array_equal(X_train[:, 0], y_train * 10) + assert_array_equal(X_test[:, 0], y_test * 10) + + # don't convert lists to anything else by default + split = train_test_split(X, X_s, y.tolist()) + X_train, X_test, X_s_train, X_s_test, y_train, y_test = split + assert isinstance(y_train, list) + assert isinstance(y_test, list) + + # allow nd-arrays + X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2) + y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11) + split = train_test_split(X_4d, y_3d) + assert split[0].shape == (7, 5, 3, 2) + assert split[1].shape == (3, 5, 3, 2) + assert split[2].shape == (7, 7, 11) + assert split[3].shape == (3, 7, 11) + + # test stratification option + y = np.array([1, 1, 1, 1, 2, 2, 2, 2]) + for test_size, exp_test_size in zip([2, 4, 0.25, 0.5, 0.75], [2, 4, 2, 4, 6]): + train, test = train_test_split( + y, test_size=test_size, stratify=y, random_state=0 + ) + assert len(test) == exp_test_size + assert len(test) + len(train) == len(y) + # check the 1:1 ratio of ones and twos in the data is preserved + assert np.sum(train == 1) == np.sum(train == 2) + + # test unshuffled split + y = np.arange(10) + for test_size in [2, 0.2]: + train, test = train_test_split(y, shuffle=False, test_size=test_size) + assert_array_equal(test, [8, 9]) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7]) + + +def test_train_test_split_32bit_overflow(): + """Check for integer overflow on 32-bit platforms. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/20774 + """ + + # A number 'n' big enough for the expression 'n * n * train_size' to cause + an overflow for a signed 32-bit integer + big_number = 100000 + + # The definition of 'y' is part of the reproduction: the population of at + least one class should be of the same order of magnitude as the size of X + X = np.arange(big_number) + y = X > (0.99 * big_number) + + split = train_test_split(X, y, stratify=y, train_size=0.25) + X_train, X_test, y_train, y_test = split + + assert X_train.size + X_test.size == big_number + assert y_train.size + y_test.size == big_number + + +def test_train_test_split_pandas(): + # check train_test_split doesn't destroy pandas dataframe + types = [MockDataFrame] + try: + from pandas import DataFrame + + types.append(DataFrame) + except ImportError: + pass + for InputFeatureType in types: + # X dataframe + X_df = InputFeatureType(X) + X_train, X_test = train_test_split(X_df) + assert isinstance(X_train, InputFeatureType) + assert isinstance(X_test, InputFeatureType) + + +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_train_test_split_sparse(sparse_container): + # check that train_test_split converts scipy sparse matrices + to csr, as stated in the documentation + X = np.arange(100).reshape((10, 10)) + X_s = sparse_container(X) + X_train, X_test = train_test_split(X_s) + assert issparse(X_train) and X_train.format == "csr" + assert issparse(X_test) and X_test.format == "csr" + + +def test_train_test_split_mock_pandas(): + # X mock dataframe + X_df = MockDataFrame(X) + X_train, X_test = train_test_split(X_df) + assert isinstance(X_train, MockDataFrame) + assert isinstance(X_test, MockDataFrame) + X_train_arr, X_test_arr = train_test_split(X_df) + + +def test_train_test_split_list_input(): + # Check that when y is a list / list of string labels, it works. + X = np.ones(7) + y1 = ["1"] * 4 + ["0"] * 3 + y2 = np.hstack((np.ones(4), np.zeros(3))) + y3 = y2.tolist() + + for stratify in (True, False): + X_train1, X_test1, y_train1, y_test1 = train_test_split( + X, y1, stratify=y1 if stratify else None, random_state=0 + ) + X_train2, X_test2, y_train2, y_test2 = train_test_split( + X, y2, stratify=y2 if stratify else None, random_state=0 + ) + X_train3, X_test3, y_train3, y_test3 = train_test_split( + X, y3, stratify=y3 if stratify else None, random_state=0 + ) + + np.testing.assert_equal(X_train1, X_train2) + np.testing.assert_equal(y_train2, y_train3) + np.testing.assert_equal(X_test1, X_test3) + np.testing.assert_equal(y_test3, y_test2) + + +@pytest.mark.parametrize( + "test_size, train_size", + [(2.0, None), (1.0, None), (0.1, 0.95), (None, 1j), (11, None), (10, None), (8, 3)], +) +def test_shufflesplit_errors(test_size, train_size): + with pytest.raises(ValueError): + next(ShuffleSplit(test_size=test_size, train_size=train_size).split(X)) + + +def test_shufflesplit_reproducible(): + # Check that iterating twice on the ShuffleSplit gives the same + sequence of train-test when the random_state is given + ss = ShuffleSplit(random_state=21) + assert_array_equal([a for a, b in ss.split(X)], [a for a, b in ss.split(X)]) + + +def test_stratifiedshufflesplit_list_input(): + # Check that when y is a list / list of string labels, it works.
+ sss = StratifiedShuffleSplit(test_size=2, random_state=42) + X = np.ones(7) + y1 = ["1"] * 4 + ["0"] * 3 + y2 = np.hstack((np.ones(4), np.zeros(3))) + y3 = y2.tolist() + + np.testing.assert_equal(list(sss.split(X, y1)), list(sss.split(X, y2))) + np.testing.assert_equal(list(sss.split(X, y3)), list(sss.split(X, y2))) + + +def test_train_test_split_allow_nans(): + # Check that train_test_split allows input data with NaNs + X = np.arange(200, dtype=np.float64).reshape(10, -1) + X[2, :] = np.nan + y = np.repeat([0, 1], X.shape[0] / 2) + train_test_split(X, y, test_size=0.2, random_state=42) + + +def test_check_cv(): + X = np.ones(9) + cv = check_cv(3, classifier=False) + # Use numpy.testing.assert_equal which recursively compares + # lists of lists + np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X))) + + y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1]) + cv = check_cv(3, y_binary, classifier=True) + np.testing.assert_equal( + list(StratifiedKFold(3).split(X, y_binary)), list(cv.split(X, y_binary)) + ) + + y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2]) + cv = check_cv(3, y_multiclass, classifier=True) + np.testing.assert_equal( + list(StratifiedKFold(3).split(X, y_multiclass)), list(cv.split(X, y_multiclass)) + ) + # also works with 2d multiclass + y_multiclass_2d = y_multiclass.reshape(-1, 1) + cv = check_cv(3, y_multiclass_2d, classifier=True) + np.testing.assert_equal( + list(StratifiedKFold(3).split(X, y_multiclass_2d)), + list(cv.split(X, y_multiclass_2d)), + ) + + assert not np.all( + next(StratifiedKFold(3).split(X, y_multiclass_2d))[0] + == next(KFold(3).split(X, y_multiclass_2d))[0] + ) + + X = np.ones(5) + y_multilabel = np.array( + [[0, 0, 0, 0], [0, 1, 1, 0], [0, 0, 0, 1], [1, 1, 0, 1], [0, 0, 1, 0]] + ) + cv = check_cv(3, y_multilabel, classifier=True) + np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X))) + + y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]]) + cv = check_cv(3, y_multioutput, classifier=True) + np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X))) + + with pytest.raises(ValueError): + check_cv(cv="lolo") + + +def test_cv_iterable_wrapper(): + kf_iter = KFold().split(X, y) + kf_iter_wrapped = check_cv(kf_iter) + # Since the wrapped iterable is enlisted and stored, + # split can be called any number of times to produce + # consistent results. 
+ np.testing.assert_equal( + list(kf_iter_wrapped.split(X, y)), list(kf_iter_wrapped.split(X, y)) + ) + # If the splits are randomized, successive calls to split yields different + # results + kf_randomized_iter = KFold(shuffle=True, random_state=0).split(X, y) + kf_randomized_iter_wrapped = check_cv(kf_randomized_iter) + # numpy's assert_array_equal properly compares nested lists + np.testing.assert_equal( + list(kf_randomized_iter_wrapped.split(X, y)), + list(kf_randomized_iter_wrapped.split(X, y)), + ) + + try: + splits_are_equal = True + np.testing.assert_equal( + list(kf_iter_wrapped.split(X, y)), + list(kf_randomized_iter_wrapped.split(X, y)), + ) + except AssertionError: + splits_are_equal = False + assert not splits_are_equal, ( + "If the splits are randomized, " + "successive calls to split should yield different results" + ) + + +@pytest.mark.parametrize("kfold", [GroupKFold, StratifiedGroupKFold]) +@pytest.mark.parametrize("shuffle", [True, False]) +def test_group_kfold(kfold, shuffle, global_random_seed): + rng = np.random.RandomState(global_random_seed) + + # Parameters of the test + n_groups = 15 + n_samples = 1000 + n_splits = 5 + + X = y = np.ones(n_samples) + + # Construct the test data + tolerance = 0.05 * n_samples # 5 percent error allowed + groups = rng.randint(0, n_groups, n_samples) + + ideal_n_groups_per_fold = n_samples // n_splits + + len(np.unique(groups)) + # Get the test fold indices from the test set indices of each fold + folds = np.zeros(n_samples) + random_state = None if not shuffle else global_random_seed + lkf = kfold(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + for i, (_, test) in enumerate(lkf.split(X, y, groups)): + folds[test] = i + + # Check that folds have approximately the same size + assert len(folds) == len(groups) + for i in np.unique(folds): + assert tolerance >= abs(sum(folds == i) - ideal_n_groups_per_fold) + + # Check that each group appears only in 1 fold + for group in np.unique(groups): + assert len(np.unique(folds[groups == group])) == 1 + + # Check that no group is on both sides of the split + groups = np.asarray(groups, dtype=object) + for train, test in lkf.split(X, y, groups): + assert len(np.intersect1d(groups[train], groups[test])) == 0 + + # Construct the test data + groups = np.array( + [ + "Albert", + "Jean", + "Bertrand", + "Michel", + "Jean", + "Francis", + "Robert", + "Michel", + "Rachel", + "Lois", + "Michelle", + "Bernard", + "Marion", + "Laura", + "Jean", + "Rachel", + "Franck", + "John", + "Gael", + "Anna", + "Alix", + "Robert", + "Marion", + "David", + "Tony", + "Abel", + "Becky", + "Madmood", + "Cary", + "Mary", + "Alexandre", + "David", + "Francis", + "Barack", + "Abdoul", + "Rasha", + "Xi", + "Silvia", + ] + ) + + n_groups = len(np.unique(groups)) + n_samples = len(groups) + n_splits = 5 + tolerance = 0.05 * n_samples # 5 percent error allowed + ideal_n_groups_per_fold = n_samples // n_splits + + X = y = np.ones(n_samples) + + # Get the test fold indices from the test set indices of each fold + folds = np.zeros(n_samples) + for i, (_, test) in enumerate(lkf.split(X, y, groups)): + folds[test] = i + + # Check that folds have approximately the same size + assert len(folds) == len(groups) + if not shuffle: + for i in np.unique(folds): + assert tolerance >= abs(sum(folds == i) - ideal_n_groups_per_fold) + + # Check that each group appears only in 1 fold + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + for group in np.unique(groups): + assert 
len(np.unique(folds[groups == group])) == 1 + + # Check that no group is on both sides of the split + groups = np.asarray(groups, dtype=object) + for train, test in lkf.split(X, y, groups): + assert len(np.intersect1d(groups[train], groups[test])) == 0 + + # groups can also be a list + # use a new instance for reproducibility when shuffle=True + lkf_copy = kfold(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + cv_iter = list(lkf.split(X, y, groups.tolist())) + for (train1, test1), (train2, test2) in zip(lkf_copy.split(X, y, groups), cv_iter): + assert_array_equal(train1, train2) + assert_array_equal(test1, test2) + + # Should fail if there are more folds than groups + groups = np.array([1, 1, 1, 2, 2]) + X = y = np.ones(len(groups)) + with pytest.raises(ValueError, match="Cannot have number of splits.*greater"): + next(GroupKFold(n_splits=3).split(X, y, groups)) + + +def test_time_series_cv(): + X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]] + + # Should fail if there are more folds than samples + with pytest.raises(ValueError, match="Cannot have number of folds.*greater"): + next(TimeSeriesSplit(n_splits=7).split(X)) + + tscv = TimeSeriesSplit(2) + + # Manually check that Time Series CV preserves the data + # ordering on toy datasets + splits = tscv.split(X[:-1]) + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [2, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3]) + assert_array_equal(test, [4, 5]) + + splits = TimeSeriesSplit(2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2]) + assert_array_equal(test, [3, 4]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [5, 6]) + + # Check get_n_splits returns the correct number of splits + splits = TimeSeriesSplit(2).split(X) + n_splits_actual = len(list(splits)) + assert n_splits_actual == tscv.get_n_splits() + assert n_splits_actual == 2 + + +def _check_time_series_max_train_size(splits, check_splits, max_train_size): + for (train, test), (check_train, check_test) in zip(splits, check_splits): + assert_array_equal(test, check_test) + assert len(check_train) <= max_train_size + suffix_start = max(len(train) - max_train_size, 0) + assert_array_equal(check_train, train[suffix_start:]) + + +def test_time_series_max_train_size(): + X = np.zeros((6, 1)) + splits = TimeSeriesSplit(n_splits=3).split(X) + check_splits = TimeSeriesSplit(n_splits=3, max_train_size=3).split(X) + _check_time_series_max_train_size(splits, check_splits, max_train_size=3) + + # Test for the case where the size of a fold is greater than max_train_size + check_splits = TimeSeriesSplit(n_splits=3, max_train_size=2).split(X) + _check_time_series_max_train_size(splits, check_splits, max_train_size=2) + + # Test for the case where the size of each fold is less than max_train_size + check_splits = TimeSeriesSplit(n_splits=3, max_train_size=5).split(X) + _check_time_series_max_train_size(splits, check_splits, max_train_size=2) + + +def test_time_series_test_size(): + X = np.zeros((10, 1)) + + # Test alone + splits = TimeSeriesSplit(n_splits=3, test_size=3).split(X) + + train, test = next(splits) + assert_array_equal(train, [0]) + assert_array_equal(test, [1, 2, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6]) + assert_array_equal(test, [7, 8, 9]) + + # 
Test with max_train_size + splits = TimeSeriesSplit(n_splits=2, test_size=2, max_train_size=4).split(X) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4, 5]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [4, 5, 6, 7]) + assert_array_equal(test, [8, 9]) + + # Should fail with not enough data points for configuration + with pytest.raises(ValueError, match="Too many splits.*with test_size"): + splits = TimeSeriesSplit(n_splits=5, test_size=2).split(X) + next(splits) + + +def test_time_series_gap(): + X = np.zeros((10, 1)) + + # Test alone + splits = TimeSeriesSplit(n_splits=2, gap=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [7, 8, 9]) + + # Test with max_train_size + splits = TimeSeriesSplit(n_splits=3, gap=2, max_train_size=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5]) + + train, test = next(splits) + assert_array_equal(train, [2, 3]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [4, 5]) + assert_array_equal(test, [8, 9]) + + # Test with test_size + splits = TimeSeriesSplit(n_splits=2, gap=2, max_train_size=4, test_size=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4, 5]) + assert_array_equal(test, [8, 9]) + + # Test with additional test_size + splits = TimeSeriesSplit(n_splits=2, gap=2, test_size=3).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [7, 8, 9]) + + # Verify proper error is thrown + with pytest.raises(ValueError, match="Too many splits.*and gap"): + splits = TimeSeriesSplit(n_splits=4, gap=2).split(X) + next(splits) + + +@ignore_warnings +def test_nested_cv(): + # Test if nested cross validation works with different combinations of cv + rng = np.random.RandomState(0) + + X, y = make_classification(n_samples=15, n_classes=2, random_state=0) + groups = rng.randint(0, 5, 15) + + cvs = [ + LeaveOneGroupOut(), + StratifiedKFold(n_splits=2), + LeaveOneOut(), + GroupKFold(n_splits=3), + StratifiedKFold(), + StratifiedGroupKFold(), + StratifiedShuffleSplit(n_splits=3, random_state=0), + ] + + for inner_cv, outer_cv in combinations_with_replacement(cvs, 2): + gs = GridSearchCV( + DummyClassifier(), + param_grid={"strategy": ["stratified", "most_frequent"]}, + cv=inner_cv, + error_score="raise", + ) + cross_val_score( + gs, X=X, y=y, groups=groups, cv=outer_cv, params={"groups": groups} + ) + + +def test_build_repr(): + class MockSplitter: + def __init__(self, a, b=0, c=None): + self.a = a + self.b = b + self.c = c + + def __repr__(self): + return _build_repr(self) + + assert repr(MockSplitter(5, 6)) == "MockSplitter(a=5, b=6, c=None)" + + +@pytest.mark.parametrize( + "CVSplitter", (ShuffleSplit, GroupShuffleSplit, StratifiedShuffleSplit) +) +def test_shuffle_split_empty_trainset(CVSplitter): + cv = CVSplitter(test_size=0.99) + X, y = [[1]], [0] # 1 sample + with pytest.raises( + ValueError, + match=( + "With n_samples=1, test_size=0.99 and train_size=None, " + "the resulting train set will be empty" + ), + ): + next(_split(cv, X, y, 
groups=[1])) + + +def test_train_test_split_empty_trainset(): + (X,) = [[1]] # 1 sample + with pytest.raises( + ValueError, + match=( + "With n_samples=1, test_size=0.99 and train_size=None, " + "the resulting train set will be empty" + ), + ): + train_test_split(X, test_size=0.99) + + X = [[1], [1], [1]] # 3 samples, ask for more than 2 thirds + with pytest.raises( + ValueError, + match=( + "With n_samples=3, test_size=0.67 and train_size=None, " + "the resulting train set will be empty" + ), + ): + train_test_split(X, test_size=0.67) + + +def test_leave_one_out_empty_trainset(): + # LeaveOneGroup out expect at least 2 groups so no need to check + cv = LeaveOneOut() + X, y = [[1]], [0] # 1 sample + with pytest.raises(ValueError, match="Cannot perform LeaveOneOut with n_samples=1"): + next(cv.split(X, y)) + + +def test_leave_p_out_empty_trainset(): + # No need to check LeavePGroupsOut + cv = LeavePOut(p=2) + X, y = [[1], [2]], [0, 3] # 2 samples + with pytest.raises( + ValueError, match="p=2 must be strictly less than the number of samples=2" + ): + next(cv.split(X, y)) + + +@pytest.mark.parametrize( + "Klass", (KFold, StratifiedKFold, StratifiedGroupKFold, GroupKFold) +) +def test_random_state_shuffle_false(Klass): + # passing a non-default random_state when shuffle=False makes no sense + with pytest.raises(ValueError, match="has no effect since shuffle is False"): + Klass(3, shuffle=False, random_state=0) + + +@pytest.mark.parametrize( + "cv, expected", + [ + (KFold(), True), + (KFold(shuffle=True, random_state=123), True), + (StratifiedKFold(), True), + (StratifiedKFold(shuffle=True, random_state=123), True), + (StratifiedGroupKFold(shuffle=True, random_state=123), True), + (StratifiedGroupKFold(), True), + (RepeatedKFold(random_state=123), True), + (RepeatedStratifiedKFold(random_state=123), True), + (ShuffleSplit(random_state=123), True), + (GroupShuffleSplit(random_state=123), True), + (StratifiedShuffleSplit(random_state=123), True), + (GroupKFold(), True), + (GroupKFold(shuffle=True, random_state=123), True), + (TimeSeriesSplit(), True), + (LeaveOneOut(), True), + (LeaveOneGroupOut(), True), + (LeavePGroupsOut(n_groups=2), True), + (LeavePOut(p=2), True), + (KFold(shuffle=True, random_state=None), False), + (KFold(shuffle=True, random_state=None), False), + (StratifiedKFold(shuffle=True, random_state=np.random.RandomState(0)), False), + (StratifiedKFold(shuffle=True, random_state=np.random.RandomState(0)), False), + (RepeatedKFold(random_state=None), False), + (RepeatedKFold(random_state=np.random.RandomState(0)), False), + (RepeatedStratifiedKFold(random_state=None), False), + (RepeatedStratifiedKFold(random_state=np.random.RandomState(0)), False), + (ShuffleSplit(random_state=None), False), + (ShuffleSplit(random_state=np.random.RandomState(0)), False), + (GroupShuffleSplit(random_state=None), False), + (GroupShuffleSplit(random_state=np.random.RandomState(0)), False), + (StratifiedShuffleSplit(random_state=None), False), + (StratifiedShuffleSplit(random_state=np.random.RandomState(0)), False), + ], +) +def test_yields_constant_splits(cv, expected): + assert _yields_constant_splits(cv) == expected + + +@pytest.mark.parametrize("cv", ALL_SPLITTERS, ids=[str(cv) for cv in ALL_SPLITTERS]) +def test_splitter_get_metadata_routing(cv): + """Check get_metadata_routing returns the correct MetadataRouter.""" + assert hasattr(cv, "get_metadata_routing") + metadata = cv.get_metadata_routing() + if cv in GROUP_SPLITTERS: + assert metadata.split.requests["groups"] is True + elif cv in 
NO_GROUP_SPLITTERS: + assert not metadata.split.requests + + assert_request_is_empty(metadata, exclude=["split"]) + + +@pytest.mark.parametrize("cv", ALL_SPLITTERS, ids=[str(cv) for cv in ALL_SPLITTERS]) +def test_splitter_set_split_request(cv): + """Check set_split_request is defined for group splitters and not for others.""" + if cv in GROUP_SPLITTERS: + assert hasattr(cv, "set_split_request") + elif cv in NO_GROUP_SPLITTERS: + assert not hasattr(cv, "set_split_request") + + +@pytest.mark.parametrize("cv", NO_GROUP_SPLITTERS, ids=str) +def test_no_group_splitters_warns_with_groups(cv): + msg = f"The groups parameter is ignored by {cv.__class__.__name__}" + + n_samples = 30 + rng = np.random.RandomState(1) + X = rng.randint(0, 3, size=(n_samples, 2)) + y = rng.randint(0, 3, size=(n_samples,)) + groups = rng.randint(0, 3, size=(n_samples,)) + + with pytest.warns(UserWarning, match=msg): + cv.split(X, y, groups=groups) + + +@pytest.mark.parametrize( + "cv", SPLITTERS_REQUIRING_TARGET, ids=[str(cv) for cv in SPLITTERS_REQUIRING_TARGET] +) +def test_stratified_splitter_without_y(cv): + msg = "missing 1 required positional argument: 'y'" + with pytest.raises(TypeError, match=msg): + cv.split(X) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/tests/test_successive_halving.py b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_successive_halving.py new file mode 100644 index 0000000000000000000000000000000000000000..c75798437bc862dc64fbb9a5a504a1a2a9522c84 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_successive_halving.py @@ -0,0 +1,856 @@ +from math import ceil + +import numpy as np +import pytest +from scipy.stats import expon, norm, randint + +from sklearn.datasets import make_classification +from sklearn.dummy import DummyClassifier +from sklearn.experimental import enable_halving_search_cv # noqa +from sklearn.model_selection import ( + GroupKFold, + GroupShuffleSplit, + HalvingGridSearchCV, + HalvingRandomSearchCV, + KFold, + LeaveOneGroupOut, + LeavePGroupsOut, + ShuffleSplit, + StratifiedKFold, + StratifiedShuffleSplit, +) +from sklearn.model_selection._search_successive_halving import ( + _SubsampleMetaSplitter, + _top_k, +) +from sklearn.model_selection.tests.test_search import ( + check_cv_results_array_types, + check_cv_results_keys, +) +from sklearn.svm import SVC, LinearSVC + + +class FastClassifier(DummyClassifier): + """Dummy classifier that accepts parameters a, b, ... z. 
+
+    These parameters don't affect the predictions and are useful for fast
+    grid searching."""
+
+    # update the constraints such that we accept all parameters from a to z
+    _parameter_constraints: dict = {
+        **DummyClassifier._parameter_constraints,
+        **{
+            chr(key): "no_validation"  # type: ignore
+            for key in range(ord("a"), ord("z") + 1)
+        },
+    }
+
+    def __init__(
+        self, strategy="stratified", random_state=None, constant=None, **kwargs
+    ):
+        super().__init__(
+            strategy=strategy, random_state=random_state, constant=constant
+        )
+
+    def get_params(self, deep=False):
+        params = super().get_params(deep=deep)
+        for char in range(ord("a"), ord("z") + 1):
+            params[chr(char)] = "whatever"
+        return params
+
+
+class SometimesFailClassifier(DummyClassifier):
+    def __init__(
+        self,
+        strategy="stratified",
+        random_state=None,
+        constant=None,
+        n_estimators=10,
+        fail_fit=False,
+        fail_predict=False,
+        a=0,
+    ):
+        self.fail_fit = fail_fit
+        self.fail_predict = fail_predict
+        self.n_estimators = n_estimators
+        self.a = a
+
+        super().__init__(
+            strategy=strategy, random_state=random_state, constant=constant
+        )
+
+    def fit(self, X, y):
+        if self.fail_fit:
+            raise Exception("fitting failed")
+        return super().fit(X, y)
+
+    def predict(self, X):
+        if self.fail_predict:
+            raise Exception("predict failed")
+        return super().predict(X)
+
+
+@pytest.mark.filterwarnings("ignore::sklearn.exceptions.FitFailedWarning")
+@pytest.mark.filterwarnings("ignore:Scoring failed:UserWarning")
+@pytest.mark.filterwarnings("ignore:One or more of the:UserWarning")
+@pytest.mark.parametrize("HalvingSearch", (HalvingGridSearchCV, HalvingRandomSearchCV))
+@pytest.mark.parametrize("fail_at", ("fit", "predict"))
+def test_nan_handling(HalvingSearch, fail_at):
+    """Check the selection of the best scores in the presence of failures
+    represented by NaN values."""
+    n_samples = 1_000
+    X, y = make_classification(n_samples=n_samples, random_state=0)
+
+    search = HalvingSearch(
+        SometimesFailClassifier(),
+        {f"fail_{fail_at}": [False, True], "a": range(3)},
+        resource="n_estimators",
+        max_resources=6,
+        min_resources=1,
+        factor=2,
+    )
+
+    search.fit(X, y)
+
+    # estimators that failed during fit/predict should always rank lower
+    # than ones where the fit/predict succeeded
+    assert not search.best_params_[f"fail_{fail_at}"]
+    scores = search.cv_results_["mean_test_score"]
+    ranks = search.cv_results_["rank_test_score"]
+
+    # some scores should be NaN
+    assert np.isnan(scores).any()
+
+    unique_nan_ranks = np.unique(ranks[np.isnan(scores)])
+    # all NaN scores should have the same rank
+    assert unique_nan_ranks.shape[0] == 1
+    # NaNs should have the lowest rank
+    assert (unique_nan_ranks[0] >= ranks).all()
+
+
+@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV))
+@pytest.mark.parametrize(
+    (
+        "aggressive_elimination,"
+        "max_resources,"
+        "expected_n_iterations,"
+        "expected_n_required_iterations,"
+        "expected_n_possible_iterations,"
+        "expected_n_remaining_candidates,"
+        "expected_n_candidates,"
+        "expected_n_resources,"
+    ),
+    [
+        # notice how it loops at the beginning
+        # also, the number of candidates evaluated at the last iteration is
+        # <= factor
+        (True, "limited", 4, 4, 3, 1, [60, 20, 7, 3], [20, 20, 60, 180]),
+        # no aggressive elimination: we end up with fewer iterations, and
+        # the number of candidates at the last iter is > factor, which isn't
+        # ideal
+        (False, "limited", 3, 4, 3, 3, [60, 20, 7], [20, 60, 180]),
+        # # When the amount of resource isn't limited, aggressive_elimination
+        # # has
no effect. Here the default min_resources='exhaust' will take + # # over. + (True, "unlimited", 4, 4, 4, 1, [60, 20, 7, 3], [37, 111, 333, 999]), + (False, "unlimited", 4, 4, 4, 1, [60, 20, 7, 3], [37, 111, 333, 999]), + ], +) +def test_aggressive_elimination( + Est, + aggressive_elimination, + max_resources, + expected_n_iterations, + expected_n_required_iterations, + expected_n_possible_iterations, + expected_n_remaining_candidates, + expected_n_candidates, + expected_n_resources, +): + # Test the aggressive_elimination parameter. + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": ("l1", "l2"), "b": list(range(30))} + base_estimator = FastClassifier() + + if max_resources == "limited": + max_resources = 180 + else: + max_resources = n_samples + + sh = Est( + base_estimator, + param_grid, + aggressive_elimination=aggressive_elimination, + max_resources=max_resources, + factor=3, + ) + sh.set_params(verbose=True) # just for test coverage + + if Est is HalvingRandomSearchCV: + # same number of candidates as with the grid + sh.set_params(n_candidates=2 * 30, min_resources="exhaust") + + sh.fit(X, y) + + assert sh.n_iterations_ == expected_n_iterations + assert sh.n_required_iterations_ == expected_n_required_iterations + assert sh.n_possible_iterations_ == expected_n_possible_iterations + assert sh.n_resources_ == expected_n_resources + assert sh.n_candidates_ == expected_n_candidates + assert sh.n_remaining_candidates_ == expected_n_remaining_candidates + assert ceil(sh.n_candidates_[-1] / sh.factor) == sh.n_remaining_candidates_ + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize( + ( + "min_resources," + "max_resources," + "expected_n_iterations," + "expected_n_possible_iterations," + "expected_n_resources," + ), + [ + # with enough resources + ("smallest", "auto", 2, 4, [20, 60]), + # with enough resources but min_resources set manually + (50, "auto", 2, 3, [50, 150]), + # without enough resources, only one iteration can be done + ("smallest", 30, 1, 1, [20]), + # with exhaust: use as much resources as possible at the last iter + ("exhaust", "auto", 2, 2, [333, 999]), + ("exhaust", 1000, 2, 2, [333, 999]), + ("exhaust", 999, 2, 2, [333, 999]), + ("exhaust", 600, 2, 2, [200, 600]), + ("exhaust", 599, 2, 2, [199, 597]), + ("exhaust", 300, 2, 2, [100, 300]), + ("exhaust", 60, 2, 2, [20, 60]), + ("exhaust", 50, 1, 1, [20]), + ("exhaust", 20, 1, 1, [20]), + ], +) +def test_min_max_resources( + Est, + min_resources, + max_resources, + expected_n_iterations, + expected_n_possible_iterations, + expected_n_resources, +): + # Test the min_resources and max_resources parameters, and how they affect + # the number of resources used at each iteration + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": [1, 2], "b": [1, 2, 3]} + base_estimator = FastClassifier() + + sh = Est( + base_estimator, + param_grid, + factor=3, + min_resources=min_resources, + max_resources=max_resources, + ) + if Est is HalvingRandomSearchCV: + sh.set_params(n_candidates=6) # same number as with the grid + + sh.fit(X, y) + + expected_n_required_iterations = 2 # given 6 combinations and factor = 3 + assert sh.n_iterations_ == expected_n_iterations + assert sh.n_required_iterations_ == expected_n_required_iterations + assert sh.n_possible_iterations_ == expected_n_possible_iterations + assert sh.n_resources_ == expected_n_resources + if min_resources == "exhaust": 
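+        # 'exhaust' sizes min_resources so that the last iteration can use as
+        # many resources as possible; the search should then run every
+        # possible iteration (see the expected_n_resources cases above).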
+ assert sh.n_possible_iterations_ == sh.n_iterations_ == len(sh.n_resources_) + + +@pytest.mark.parametrize("Est", (HalvingRandomSearchCV, HalvingGridSearchCV)) +@pytest.mark.parametrize( + "max_resources, n_iterations, n_possible_iterations", + [ + ("auto", 5, 9), # all resources are used + (1024, 5, 9), + (700, 5, 8), + (512, 5, 8), + (511, 5, 7), + (32, 4, 4), + (31, 3, 3), + (16, 3, 3), + (4, 1, 1), # max_resources == min_resources, only one iteration is + # possible + ], +) +def test_n_iterations(Est, max_resources, n_iterations, n_possible_iterations): + # test the number of actual iterations that were run depending on + # max_resources + + n_samples = 1024 + X, y = make_classification(n_samples=n_samples, random_state=1) + param_grid = {"a": [1, 2], "b": list(range(10))} + base_estimator = FastClassifier() + factor = 2 + + sh = Est( + base_estimator, + param_grid, + cv=2, + factor=factor, + max_resources=max_resources, + min_resources=4, + ) + if Est is HalvingRandomSearchCV: + sh.set_params(n_candidates=20) # same as for HalvingGridSearchCV + sh.fit(X, y) + assert sh.n_required_iterations_ == 5 + assert sh.n_iterations_ == n_iterations + assert sh.n_possible_iterations_ == n_possible_iterations + + +@pytest.mark.parametrize("Est", (HalvingRandomSearchCV, HalvingGridSearchCV)) +def test_resource_parameter(Est): + # Test the resource parameter + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": [1, 2], "b": list(range(10))} + base_estimator = FastClassifier() + sh = Est(base_estimator, param_grid, cv=2, resource="c", max_resources=10, factor=3) + sh.fit(X, y) + assert set(sh.n_resources_) == set([1, 3, 9]) + for r_i, params, param_c in zip( + sh.cv_results_["n_resources"], + sh.cv_results_["params"], + sh.cv_results_["param_c"], + ): + assert r_i == params["c"] == param_c + + with pytest.raises( + ValueError, match="Cannot use resource=1234 which is not supported " + ): + sh = HalvingGridSearchCV( + base_estimator, param_grid, cv=2, resource="1234", max_resources=10 + ) + sh.fit(X, y) + + with pytest.raises( + ValueError, + match=( + "Cannot use parameter c as the resource since it is part " + "of the searched parameters." 
+ ), + ): + param_grid = {"a": [1, 2], "b": [1, 2], "c": [1, 3]} + sh = HalvingGridSearchCV( + base_estimator, param_grid, cv=2, resource="c", max_resources=10 + ) + sh.fit(X, y) + + +@pytest.mark.parametrize( + "max_resources, n_candidates, expected_n_candidates", + [ + (512, "exhaust", 128), # generate exactly as much as needed + (32, "exhaust", 8), + (32, 8, 8), + (32, 7, 7), # ask for less than what we could + (32, 9, 9), # ask for more than 'reasonable' + ], +) +def test_random_search(max_resources, n_candidates, expected_n_candidates): + # Test random search and make sure the number of generated candidates is + # as expected + + n_samples = 1024 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": norm, "b": norm} + base_estimator = FastClassifier() + sh = HalvingRandomSearchCV( + base_estimator, + param_grid, + n_candidates=n_candidates, + cv=2, + max_resources=max_resources, + factor=2, + min_resources=4, + ) + sh.fit(X, y) + assert sh.n_candidates_[0] == expected_n_candidates + if n_candidates == "exhaust": + # Make sure 'exhaust' makes the last iteration use as much resources as + # we can + assert sh.n_resources_[-1] == max_resources + + +@pytest.mark.parametrize( + "param_distributions, expected_n_candidates", + [ + ({"a": [1, 2]}, 2), # all lists, sample less than n_candidates + ({"a": randint(1, 3)}, 10), # not all list, respect n_candidates + ], +) +def test_random_search_discrete_distributions( + param_distributions, expected_n_candidates +): + # Make sure random search samples the appropriate number of candidates when + # we ask for more than what's possible. How many parameters are sampled + # depends whether the distributions are 'all lists' or not (see + # ParameterSampler for details). This is somewhat redundant with the checks + # in ParameterSampler but interaction bugs were discovered during + # development of SH + + n_samples = 1024 + X, y = make_classification(n_samples=n_samples, random_state=0) + base_estimator = FastClassifier() + sh = HalvingRandomSearchCV(base_estimator, param_distributions, n_candidates=10) + sh.fit(X, y) + assert sh.n_candidates_[0] == expected_n_candidates + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize( + "params, expected_error_message", + [ + ( + {"resource": "not_a_parameter"}, + "Cannot use resource=not_a_parameter which is not supported", + ), + ( + {"resource": "a", "max_resources": 100}, + "Cannot use parameter a as the resource since it is part of", + ), + ( + {"max_resources": "auto", "resource": "b"}, + "resource can only be 'n_samples' when max_resources='auto'", + ), + ( + {"min_resources": 15, "max_resources": 14}, + "min_resources_=15 is greater than max_resources_=14", + ), + ({"cv": KFold(shuffle=True)}, "must yield consistent folds"), + ({"cv": ShuffleSplit()}, "must yield consistent folds"), + ], +) +def test_input_errors(Est, params, expected_error_message): + base_estimator = FastClassifier() + param_grid = {"a": [1]} + X, y = make_classification(100) + + sh = Est(base_estimator, param_grid, **params) + + with pytest.raises(ValueError, match=expected_error_message): + sh.fit(X, y) + + +@pytest.mark.parametrize( + "params, expected_error_message", + [ + ( + {"n_candidates": "exhaust", "min_resources": "exhaust"}, + "cannot be both set to 'exhaust'", + ), + ], +) +def test_input_errors_randomized(params, expected_error_message): + # tests specific to HalvingRandomSearchCV + + base_estimator = FastClassifier() + param_grid = 
{"a": [1]} + X, y = make_classification(100) + + sh = HalvingRandomSearchCV(base_estimator, param_grid, **params) + + with pytest.raises(ValueError, match=expected_error_message): + sh.fit(X, y) + + +@pytest.mark.parametrize( + "fraction, subsample_test, expected_train_size, expected_test_size", + [ + (0.5, True, 40, 10), + (0.5, False, 40, 20), + (0.2, True, 16, 4), + (0.2, False, 16, 20), + ], +) +def test_subsample_splitter_shapes( + fraction, subsample_test, expected_train_size, expected_test_size +): + # Make sure splits returned by SubsampleMetaSplitter are of appropriate + # size + + n_samples = 100 + X, y = make_classification(n_samples) + cv = _SubsampleMetaSplitter( + base_cv=KFold(5), + fraction=fraction, + subsample_test=subsample_test, + random_state=None, + ) + + for train, test in cv.split(X, y): + assert train.shape[0] == expected_train_size + assert test.shape[0] == expected_test_size + if subsample_test: + assert train.shape[0] + test.shape[0] == int(n_samples * fraction) + else: + assert test.shape[0] == n_samples // cv.base_cv.get_n_splits() + + +@pytest.mark.parametrize("subsample_test", (True, False)) +def test_subsample_splitter_determinism(subsample_test): + # Make sure _SubsampleMetaSplitter is consistent across calls to split(): + # - we're OK having training sets differ (they're always sampled with a + # different fraction anyway) + # - when we don't subsample the test set, we want it to be always the same. + # This check is the most important. This is ensured by the determinism + # of the base_cv. + + # Note: we could force both train and test splits to be always the same if + # we drew an int seed in _SubsampleMetaSplitter.__init__ + + n_samples = 100 + X, y = make_classification(n_samples) + cv = _SubsampleMetaSplitter( + base_cv=KFold(5), fraction=0.5, subsample_test=subsample_test, random_state=None + ) + + folds_a = list(cv.split(X, y, groups=None)) + folds_b = list(cv.split(X, y, groups=None)) + + for (train_a, test_a), (train_b, test_b) in zip(folds_a, folds_b): + assert not np.all(train_a == train_b) + + if subsample_test: + assert not np.all(test_a == test_b) + else: + assert np.all(test_a == test_b) + assert np.all(X[test_a] == X[test_b]) + + +@pytest.mark.parametrize( + "k, itr, expected", + [ + (1, 0, ["c"]), + (2, 0, ["a", "c"]), + (4, 0, ["d", "b", "a", "c"]), + (10, 0, ["d", "b", "a", "c"]), + (1, 1, ["e"]), + (2, 1, ["f", "e"]), + (10, 1, ["f", "e"]), + (1, 2, ["i"]), + (10, 2, ["g", "h", "i"]), + ], +) +def test_top_k(k, itr, expected): + results = { # this isn't a 'real world' result dict + "iter": [0, 0, 0, 0, 1, 1, 2, 2, 2], + "mean_test_score": [4, 3, 5, 1, 11, 10, 5, 6, 9], + "params": ["a", "b", "c", "d", "e", "f", "g", "h", "i"], + } + got = _top_k(results, k=k, itr=itr) + assert np.all(got == expected) + + +@pytest.mark.parametrize("Est", (HalvingRandomSearchCV, HalvingGridSearchCV)) +def test_cv_results(Est): + # test that the cv_results_ matches correctly the logic of the + # tournament: in particular that the candidates continued in each + # successive iteration are those that were best in the previous iteration + pd = pytest.importorskip("pandas") + + rng = np.random.RandomState(0) + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": ("l1", "l2"), "b": list(range(30))} + base_estimator = FastClassifier() + + # generate random scores: we want to avoid ties, which would otherwise + # mess with the ordering and make testing harder + def scorer(est, X, y): + return rng.rand() + + sh 
= Est(base_estimator, param_grid, factor=2, scoring=scorer) + if Est is HalvingRandomSearchCV: + # same number of candidates as with the grid + sh.set_params(n_candidates=2 * 30, min_resources="exhaust") + + sh.fit(X, y) + + # non-regression check for + # https://github.com/scikit-learn/scikit-learn/issues/19203 + assert isinstance(sh.cv_results_["iter"], np.ndarray) + assert isinstance(sh.cv_results_["n_resources"], np.ndarray) + + cv_results_df = pd.DataFrame(sh.cv_results_) + + # just make sure we don't have ties + assert len(cv_results_df["mean_test_score"].unique()) == len(cv_results_df) + + cv_results_df["params_str"] = cv_results_df["params"].apply(str) + table = cv_results_df.pivot( + index="params_str", columns="iter", values="mean_test_score" + ) + + # table looks like something like this: + # iter 0 1 2 3 4 5 + # params_str + # {'a': 'l2', 'b': 23} 0.75 NaN NaN NaN NaN NaN + # {'a': 'l1', 'b': 30} 0.90 0.875 NaN NaN NaN NaN + # {'a': 'l1', 'b': 0} 0.75 NaN NaN NaN NaN NaN + # {'a': 'l2', 'b': 3} 0.85 0.925 0.9125 0.90625 NaN NaN + # {'a': 'l1', 'b': 5} 0.80 NaN NaN NaN NaN NaN + # ... + + # where a NaN indicates that the candidate wasn't evaluated at a given + # iteration, because it wasn't part of the top-K at some previous + # iteration. We here make sure that candidates that aren't in the top-k at + # any given iteration are indeed not evaluated at the subsequent + # iterations. + nan_mask = pd.isna(table) + n_iter = sh.n_iterations_ + for it in range(n_iter - 1): + already_discarded_mask = nan_mask[it] + + # make sure that if a candidate is already discarded, we don't evaluate + # it later + assert ( + already_discarded_mask & nan_mask[it + 1] == already_discarded_mask + ).all() + + # make sure that the number of discarded candidate is correct + discarded_now_mask = ~already_discarded_mask & nan_mask[it + 1] + kept_mask = ~already_discarded_mask & ~discarded_now_mask + assert kept_mask.sum() == sh.n_candidates_[it + 1] + + # make sure that all discarded candidates have a lower score than the + # kept candidates + discarded_max_score = table[it].where(discarded_now_mask).max() + kept_min_score = table[it].where(kept_mask).min() + assert discarded_max_score < kept_min_score + + # We now make sure that the best candidate is chosen only from the last + # iteration. + # We also make sure this is true even if there were higher scores in + # earlier rounds (this isn't generally the case, but worth ensuring it's + # possible). + + last_iter = cv_results_df["iter"].max() + idx_best_last_iter = cv_results_df[cv_results_df["iter"] == last_iter][ + "mean_test_score" + ].idxmax() + idx_best_all_iters = cv_results_df["mean_test_score"].idxmax() + + assert sh.best_params_ == cv_results_df.iloc[idx_best_last_iter]["params"] + assert ( + cv_results_df.iloc[idx_best_last_iter]["mean_test_score"] + < cv_results_df.iloc[idx_best_all_iters]["mean_test_score"] + ) + assert ( + cv_results_df.iloc[idx_best_last_iter]["params"] + != cv_results_df.iloc[idx_best_all_iters]["params"] + ) + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +def test_base_estimator_inputs(Est): + # make sure that the base estimators are passed the correct parameters and + # number of samples at each iteration. 
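+    # Strategy: subclass FastClassifier to record the number of samples seen
+    # by fit/predict and every parameter combination passed through
+    # set_params, then compare the recordings against n_resources_,
+    # n_candidates_ and cv_results_ once the search has run.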
+ pd = pytest.importorskip("pandas") + + passed_n_samples_fit = [] + passed_n_samples_predict = [] + passed_params = [] + + class FastClassifierBookKeeping(FastClassifier): + def fit(self, X, y): + passed_n_samples_fit.append(X.shape[0]) + return super().fit(X, y) + + def predict(self, X): + passed_n_samples_predict.append(X.shape[0]) + return super().predict(X) + + def set_params(self, **params): + passed_params.append(params) + return super().set_params(**params) + + n_samples = 1024 + n_splits = 2 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": ("l1", "l2"), "b": list(range(30))} + base_estimator = FastClassifierBookKeeping() + + sh = Est( + base_estimator, + param_grid, + factor=2, + cv=n_splits, + return_train_score=False, + refit=False, + ) + if Est is HalvingRandomSearchCV: + # same number of candidates as with the grid + sh.set_params(n_candidates=2 * 30, min_resources="exhaust") + + sh.fit(X, y) + + assert len(passed_n_samples_fit) == len(passed_n_samples_predict) + passed_n_samples = [ + x + y for (x, y) in zip(passed_n_samples_fit, passed_n_samples_predict) + ] + + # Lists are of length n_splits * n_iter * n_candidates_at_i. + # Each chunk of size n_splits corresponds to the n_splits folds for the + # same candidate at the same iteration, so they contain equal values. We + # subsample such that the lists are of length n_iter * n_candidates_at_it + passed_n_samples = passed_n_samples[::n_splits] + passed_params = passed_params[::n_splits] + + cv_results_df = pd.DataFrame(sh.cv_results_) + + assert len(passed_params) == len(passed_n_samples) == len(cv_results_df) + + uniques, counts = np.unique(passed_n_samples, return_counts=True) + assert (sh.n_resources_ == uniques).all() + assert (sh.n_candidates_ == counts).all() + + assert (cv_results_df["params"] == passed_params).all() + assert (cv_results_df["n_resources"] == passed_n_samples).all() + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +def test_groups_support(Est): + # Check if ValueError (when groups is None) propagates to + # HalvingGridSearchCV and HalvingRandomSearchCV + # And also check if groups is correctly passed to the cv object + rng = np.random.RandomState(0) + + X, y = make_classification(n_samples=50, n_classes=2, random_state=0) + groups = rng.randint(0, 3, 50) + + clf = LinearSVC(random_state=0) + grid = {"C": [1]} + + group_cvs = [ + LeaveOneGroupOut(), + LeavePGroupsOut(2), + GroupKFold(n_splits=3), + GroupShuffleSplit(random_state=0), + ] + error_msg = "The 'groups' parameter should not be None." + for cv in group_cvs: + gs = Est(clf, grid, cv=cv, random_state=0) + with pytest.raises(ValueError, match=error_msg): + gs.fit(X, y) + gs.fit(X, y, groups=groups) + + non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit(random_state=0)] + for cv in non_group_cvs: + gs = Est(clf, grid, cv=cv) + # Should not raise an error + gs.fit(X, y) + + +@pytest.mark.parametrize("SearchCV", [HalvingRandomSearchCV, HalvingGridSearchCV]) +def test_min_resources_null(SearchCV): + """Check that we raise an error if the minimum resources is set to 0.""" + base_estimator = FastClassifier() + param_grid = {"a": [1]} + X = np.empty(0).reshape(0, 3) + + search = SearchCV(base_estimator, param_grid, min_resources="smallest") + + err_msg = "min_resources_=0: you might have passed an empty dataset X." 
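+    # an empty X makes the smallest admissible resource budget collapse to
+    # zero, which the search must reject at fit time with the error above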
+ with pytest.raises(ValueError, match=err_msg): + search.fit(X, []) + + +@pytest.mark.parametrize("SearchCV", [HalvingGridSearchCV, HalvingRandomSearchCV]) +def test_select_best_index(SearchCV): + """Check the selection strategy of the halving search.""" + results = { # this isn't a 'real world' result dict + "iter": np.array([0, 0, 0, 0, 1, 1, 2, 2, 2]), + "mean_test_score": np.array([4, 3, 5, 1, 11, 10, 5, 6, 9]), + "params": np.array(["a", "b", "c", "d", "e", "f", "g", "h", "i"]), + } + + # we expect the index of 'i' + best_index = SearchCV._select_best_index(None, None, results) + assert best_index == 8 + + +def test_halving_random_search_list_of_dicts(): + """Check the behaviour of the `HalvingRandomSearchCV` with `param_distribution` + being a list of dictionary. + """ + X, y = make_classification(n_samples=150, n_features=4, random_state=42) + + params = [ + {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)}, + {"kernel": ["poly"], "degree": [2, 3]}, + ] + param_keys = ( + "param_C", + "param_degree", + "param_gamma", + "param_kernel", + ) + score_keys = ( + "mean_test_score", + "mean_train_score", + "rank_test_score", + "split0_test_score", + "split1_test_score", + "split2_test_score", + "split0_train_score", + "split1_train_score", + "split2_train_score", + "std_test_score", + "std_train_score", + "mean_fit_time", + "std_fit_time", + "mean_score_time", + "std_score_time", + ) + extra_keys = ("n_resources", "iter") + + search = HalvingRandomSearchCV( + SVC(), cv=3, param_distributions=params, return_train_score=True, random_state=0 + ) + search.fit(X, y) + n_candidates = sum(search.n_candidates_) + cv_results = search.cv_results_ + # Check results structure + check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates, extra_keys) + expected_cv_results_kinds = { + "param_C": "f", + "param_degree": "i", + "param_gamma": "f", + "param_kernel": "O", + } + check_cv_results_array_types( + search, param_keys, score_keys, expected_cv_results_kinds + ) + + assert all( + ( + cv_results["param_C"].mask[i] + and cv_results["param_gamma"].mask[i] + and not cv_results["param_degree"].mask[i] + ) + for i in range(n_candidates) + if cv_results["param_kernel"][i] == "poly" + ) + assert all( + ( + not cv_results["param_C"].mask[i] + and not cv_results["param_gamma"].mask[i] + and cv_results["param_degree"].mask[i] + ) + for i in range(n_candidates) + if cv_results["param_kernel"][i] == "rbf" + ) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/tests/test_validation.py b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_validation.py new file mode 100644 index 0000000000000000000000000000000000000000..c2a96d9b1e97f9c973a8b53f072fe50ff9f5ae13 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_validation.py @@ -0,0 +1,2711 @@ +"""Test the validation module""" + +import os +import re +import sys +import tempfile +import warnings +from functools import partial +from io import StringIO +from time import sleep + +import numpy as np +import pytest +from scipy.sparse import issparse + +from sklearn import config_context +from sklearn.base import BaseEstimator, ClassifierMixin, clone +from sklearn.cluster import KMeans +from sklearn.datasets import ( + load_diabetes, + load_digits, + load_iris, + make_classification, + make_multilabel_classification, + make_regression, +) +from sklearn.ensemble import RandomForestClassifier +from sklearn.exceptions import FitFailedWarning +from sklearn.impute import SimpleImputer +from 
sklearn.linear_model import ( + LogisticRegression, + PassiveAggressiveClassifier, + Ridge, + RidgeClassifier, + SGDClassifier, +) +from sklearn.metrics import ( + accuracy_score, + check_scoring, + confusion_matrix, + explained_variance_score, + make_scorer, + mean_squared_error, + precision_recall_fscore_support, + precision_score, + r2_score, +) +from sklearn.metrics._scorer import _MultimetricScorer +from sklearn.model_selection import ( + GridSearchCV, + GroupKFold, + GroupShuffleSplit, + KFold, + LeaveOneGroupOut, + LeaveOneOut, + LeavePGroupsOut, + ShuffleSplit, + StratifiedKFold, + cross_val_predict, + cross_val_score, + cross_validate, + learning_curve, + permutation_test_score, + validation_curve, +) +from sklearn.model_selection._validation import ( + _check_is_permutation, + _fit_and_score, + _score, +) +from sklearn.model_selection.tests.common import OneTimeSplitter +from sklearn.model_selection.tests.test_search import FailingClassifier +from sklearn.multiclass import OneVsRestClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.neural_network import MLPRegressor +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import LabelEncoder, scale +from sklearn.svm import SVC, LinearSVC +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingScorer, + ConsumingSplitter, + _Registry, + check_recorded_metadata, +) +from sklearn.utils import shuffle +from sklearn.utils._mocking import CheckingClassifier, MockDataFrame +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import COO_CONTAINERS, CSR_CONTAINERS +from sklearn.utils.validation import _num_samples + + +class MockImprovingEstimator(BaseEstimator): + """Dummy classifier to test the learning curve""" + + def __init__(self, n_max_train_sizes): + self.n_max_train_sizes = n_max_train_sizes + self.train_sizes = 0 + self.X_subset = None + + def fit(self, X_subset, y_subset=None): + self.X_subset = X_subset + self.train_sizes = X_subset.shape[0] + return self + + def predict(self, X): + raise NotImplementedError + + def score(self, X=None, Y=None): + # training score becomes worse (2 -> 1), test error better (0 -> 1) + if self._is_training_data(X): + return 2.0 - float(self.train_sizes) / self.n_max_train_sizes + else: + return float(self.train_sizes) / self.n_max_train_sizes + + def _is_training_data(self, X): + return X is self.X_subset + + +class MockIncrementalImprovingEstimator(MockImprovingEstimator): + """Dummy classifier that provides partial_fit""" + + def __init__(self, n_max_train_sizes, expected_fit_params=None): + super().__init__(n_max_train_sizes) + self.x = None + self.expected_fit_params = expected_fit_params + + def _is_training_data(self, X): + return self.x in X + + def partial_fit(self, X, y=None, **params): + self.train_sizes += X.shape[0] + self.x = X[0] + if self.expected_fit_params: + missing = set(self.expected_fit_params) - set(params) + if missing: + raise AssertionError( + f"Expected fit parameter(s) {list(missing)} not seen." + ) + for key, value in params.items(): + if key in self.expected_fit_params and _num_samples( + value + ) != _num_samples(X): + raise AssertionError( + f"Fit parameter {key} has length {_num_samples(value)}" + f"; expected {_num_samples(X)}." 
+ ) + + +class MockEstimatorWithParameter(BaseEstimator): + """Dummy classifier to test the validation curve""" + + def __init__(self, param=0.5): + self.X_subset = None + self.param = param + + def fit(self, X_subset, y_subset): + self.X_subset = X_subset + self.train_sizes = X_subset.shape[0] + return self + + def predict(self, X): + raise NotImplementedError + + def score(self, X=None, y=None): + return self.param if self._is_training_data(X) else 1 - self.param + + def _is_training_data(self, X): + return X is self.X_subset + + +class MockEstimatorWithSingleFitCallAllowed(MockEstimatorWithParameter): + """Dummy classifier that disallows repeated calls of fit method""" + + def fit(self, X_subset, y_subset): + assert not hasattr(self, "fit_called_"), "fit is called the second time" + self.fit_called_ = True + return super().fit(X_subset, y_subset) + + def predict(self, X): + raise NotImplementedError + + +class MockClassifier(ClassifierMixin, BaseEstimator): + """Dummy classifier to test the cross-validation""" + + def __init__(self, a=0, allow_nd=False): + self.a = a + self.allow_nd = allow_nd + + def fit( + self, + X, + Y=None, + sample_weight=None, + class_prior=None, + sparse_sample_weight=None, + sparse_param=None, + dummy_int=None, + dummy_str=None, + dummy_obj=None, + callback=None, + ): + """The dummy arguments are to test that this fit function can + accept non-array arguments through cross-validation, such as: + - int + - str (this is actually array-like) + - object + - function + """ + self.dummy_int = dummy_int + self.dummy_str = dummy_str + self.dummy_obj = dummy_obj + if callback is not None: + callback(self) + + if self.allow_nd: + X = X.reshape(len(X), -1) + if X.ndim >= 3 and not self.allow_nd: + raise ValueError("X cannot be d") + if sample_weight is not None: + assert sample_weight.shape[0] == X.shape[0], ( + "MockClassifier extra fit_param " + "sample_weight.shape[0] is {0}, should be {1}".format( + sample_weight.shape[0], X.shape[0] + ) + ) + if class_prior is not None: + assert class_prior.shape[0] == len(np.unique(y)), ( + "MockClassifier extra fit_param class_prior.shape[0]" + " is {0}, should be {1}".format(class_prior.shape[0], len(np.unique(y))) + ) + if sparse_sample_weight is not None: + fmt = ( + "MockClassifier extra fit_param sparse_sample_weight" + ".shape[0] is {0}, should be {1}" + ) + assert sparse_sample_weight.shape[0] == X.shape[0], fmt.format( + sparse_sample_weight.shape[0], X.shape[0] + ) + if sparse_param is not None: + fmt = ( + "MockClassifier extra fit_param sparse_param.shape " + "is ({0}, {1}), should be ({2}, {3})" + ) + assert sparse_param.shape == P.shape, fmt.format( + sparse_param.shape[0], + sparse_param.shape[1], + P.shape[0], + P.shape[1], + ) + self.classes_ = np.unique(y) + return self + + def predict(self, T): + if self.allow_nd: + T = T.reshape(len(T), -1) + return T[:, 0] + + def predict_proba(self, T): + return T + + def score(self, X=None, Y=None): + return 1.0 / (1 + np.abs(self.a)) + + def get_params(self, deep=False): + return {"a": self.a, "allow_nd": self.allow_nd} + + +# XXX: use 2D array, since 1D X is being detected as a single sample in +# check_consistent_length +X = np.ones((15, 2)) +y = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6]) +# The number of samples per class needs to be > n_splits, +# for StratifiedKFold(n_splits=3) +y2 = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) +P = np.eye(5) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_score(coo_container): + 
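+    # MockClassifier.score returns the constant 1 / (1 + |a|) for any input
+    # (e.g. a=0 -> 1.0, a=3 -> 0.25), so every fold score must equal the
+    # score on the full data; this smoke-tests cross_val_score across many
+    # values of `a` and across dense, sparse, and list inputs.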
clf = MockClassifier() + X_sparse = coo_container(X) + + for a in range(-10, 10): + clf.a = a + # Smoke test + scores = cross_val_score(clf, X, y2) + assert_array_equal(scores, clf.score(X, y2)) + + # test with multioutput y + multioutput_y = np.column_stack([y2, y2[::-1]]) + scores = cross_val_score(clf, X_sparse, multioutput_y) + assert_array_equal(scores, clf.score(X_sparse, multioutput_y)) + + scores = cross_val_score(clf, X_sparse, y2) + assert_array_equal(scores, clf.score(X_sparse, y2)) + + # test with multioutput y + scores = cross_val_score(clf, X_sparse, multioutput_y) + assert_array_equal(scores, clf.score(X_sparse, multioutput_y)) + + # test with X and y as list + list_check = lambda x: isinstance(x, list) + clf = CheckingClassifier(check_X=list_check) + scores = cross_val_score(clf, X.tolist(), y2.tolist(), cv=3) + + clf = CheckingClassifier(check_y=list_check) + scores = cross_val_score(clf, X, y2.tolist(), cv=3) + + # test with 3d X and + X_3d = X[:, :, np.newaxis] + clf = MockClassifier(allow_nd=True) + scores = cross_val_score(clf, X_3d, y2) + + clf = MockClassifier(allow_nd=False) + with pytest.raises(ValueError): + cross_val_score(clf, X_3d, y2, error_score="raise") + + +def test_cross_validate_many_jobs(): + # regression test for #12154: cv='warn' with n_jobs>1 trigger a copy of + # the parameters leading to a failure in check_cv due to cv is 'warn' + # instead of cv == 'warn'. + X, y = load_iris(return_X_y=True) + clf = SVC(gamma="auto") + grid = GridSearchCV(clf, param_grid={"C": [1, 10]}) + cross_validate(grid, X, y, n_jobs=2) + + +def test_cross_validate_invalid_scoring_param(): + X, y = make_classification(random_state=0) + estimator = MockClassifier() + + # Test the errors + error_message_regexp = ".*must be unique strings.*" + + # List/tuple of callables should raise a message advising users to use + # dict of names to callables mapping + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate( + estimator, + X, + y, + scoring=(make_scorer(precision_score), make_scorer(accuracy_score)), + ) + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate(estimator, X, y, scoring=(make_scorer(precision_score),)) + + # So should empty lists/tuples + with pytest.raises(ValueError, match=error_message_regexp + "Empty list.*"): + cross_validate(estimator, X, y, scoring=()) + + # So should duplicated entries + with pytest.raises(ValueError, match=error_message_regexp + "Duplicate.*"): + cross_validate(estimator, X, y, scoring=("f1_micro", "f1_micro")) + + # Nested Lists should raise a generic error message + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate(estimator, X, y, scoring=[[make_scorer(precision_score)]]) + + # Empty dict should raise invalid scoring error + with pytest.raises(ValueError, match="An empty dict"): + cross_validate(estimator, X, y, scoring=(dict())) + + multiclass_scorer = make_scorer(precision_recall_fscore_support) + + # Multiclass Scorers that return multiple values are not supported yet + # the warning message we're expecting to see + warning_message = ( + "Scoring failed. The score on this train-test " + f"partition for these parameters will be set to {np.nan}. 
" + "Details: \n" + ) + + with pytest.warns(UserWarning, match=warning_message): + cross_validate(estimator, X, y, scoring=multiclass_scorer) + + with pytest.warns(UserWarning, match=warning_message): + cross_validate(estimator, X, y, scoring={"foo": multiclass_scorer}) + + +def test_cross_validate_nested_estimator(): + # Non-regression test to ensure that nested + # estimators are properly returned in a list + # https://github.com/scikit-learn/scikit-learn/pull/17745 + (X, y) = load_iris(return_X_y=True) + pipeline = Pipeline( + [ + ("imputer", SimpleImputer()), + ("classifier", MockClassifier()), + ] + ) + + results = cross_validate(pipeline, X, y, return_estimator=True) + estimators = results["estimator"] + + assert isinstance(estimators, list) + assert all(isinstance(estimator, Pipeline) for estimator in estimators) + + +@pytest.mark.parametrize("use_sparse", [False, True]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_cross_validate(use_sparse: bool, csr_container): + # Compute train and test mse/r2 scores + cv = KFold() + + # Regression + X_reg, y_reg = make_regression(n_samples=30, random_state=0) + reg = Ridge(random_state=0) + + # Classification + X_clf, y_clf = make_classification(n_samples=30, random_state=0) + clf = SVC(kernel="linear", random_state=0) + + if use_sparse: + X_reg = csr_container(X_reg) + X_clf = csr_container(X_clf) + + for X, y, est in ((X_reg, y_reg, reg), (X_clf, y_clf, clf)): + # It's okay to evaluate regression metrics on classification too + mse_scorer = check_scoring(est, scoring="neg_mean_squared_error") + r2_scorer = check_scoring(est, scoring="r2") + train_mse_scores = [] + test_mse_scores = [] + train_r2_scores = [] + test_r2_scores = [] + fitted_estimators = [] + + for train, test in cv.split(X, y): + est = clone(est).fit(X[train], y[train]) + train_mse_scores.append(mse_scorer(est, X[train], y[train])) + train_r2_scores.append(r2_scorer(est, X[train], y[train])) + test_mse_scores.append(mse_scorer(est, X[test], y[test])) + test_r2_scores.append(r2_scorer(est, X[test], y[test])) + fitted_estimators.append(est) + + train_mse_scores = np.array(train_mse_scores) + test_mse_scores = np.array(test_mse_scores) + train_r2_scores = np.array(train_r2_scores) + test_r2_scores = np.array(test_r2_scores) + fitted_estimators = np.array(fitted_estimators) + + scores = ( + train_mse_scores, + test_mse_scores, + train_r2_scores, + test_r2_scores, + fitted_estimators, + ) + + # To ensure that the test does not suffer from + # large statistical fluctuations due to slicing small datasets, + # we pass the cross-validation instance + check_cross_validate_single_metric(est, X, y, scores, cv) + check_cross_validate_multi_metric(est, X, y, scores, cv) + + +def check_cross_validate_single_metric(clf, X, y, scores, cv): + ( + train_mse_scores, + test_mse_scores, + train_r2_scores, + test_r2_scores, + fitted_estimators, + ) = scores + # Test single metric evaluation when scoring is string or singleton list + for return_train_score, dict_len in ((True, 4), (False, 3)): + # Single metric passed as a string + if return_train_score: + mse_scores_dict = cross_validate( + clf, + X, + y, + scoring="neg_mean_squared_error", + return_train_score=True, + cv=cv, + ) + assert_array_almost_equal(mse_scores_dict["train_score"], train_mse_scores) + else: + mse_scores_dict = cross_validate( + clf, + X, + y, + scoring="neg_mean_squared_error", + return_train_score=False, + cv=cv, + ) + assert isinstance(mse_scores_dict, dict) + assert len(mse_scores_dict) == dict_len + 
assert_array_almost_equal(mse_scores_dict["test_score"], test_mse_scores) + + # Single metric passed as a list + if return_train_score: + # return_train_score used to default to True (deprecated behavior) + r2_scores_dict = cross_validate( + clf, X, y, scoring=["r2"], return_train_score=True, cv=cv + ) + assert_array_almost_equal(r2_scores_dict["train_r2"], train_r2_scores) + else: + r2_scores_dict = cross_validate( + clf, X, y, scoring=["r2"], return_train_score=False, cv=cv + ) + assert isinstance(r2_scores_dict, dict) + assert len(r2_scores_dict) == dict_len + assert_array_almost_equal(r2_scores_dict["test_r2"], test_r2_scores) + + # Test return_estimator option + mse_scores_dict = cross_validate( + clf, X, y, scoring="neg_mean_squared_error", return_estimator=True, cv=cv + ) + for k, est in enumerate(mse_scores_dict["estimator"]): + est_coef = est.coef_.copy() + if issparse(est_coef): + est_coef = est_coef.toarray() + + fitted_est_coef = fitted_estimators[k].coef_.copy() + if issparse(fitted_est_coef): + fitted_est_coef = fitted_est_coef.toarray() + + assert_almost_equal(est_coef, fitted_est_coef) + assert_almost_equal(est.intercept_, fitted_estimators[k].intercept_) + + +def check_cross_validate_multi_metric(clf, X, y, scores, cv): + # Test multimetric evaluation when scoring is a list / dict + ( + train_mse_scores, + test_mse_scores, + train_r2_scores, + test_r2_scores, + fitted_estimators, + ) = scores + + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + return { + "r2": r2_score(y, y_pred), + "neg_mean_squared_error": -mean_squared_error(y, y_pred), + } + + all_scoring = ( + ("r2", "neg_mean_squared_error"), + { + "r2": make_scorer(r2_score), + "neg_mean_squared_error": "neg_mean_squared_error", + }, + custom_scorer, + ) + + keys_sans_train = { + "test_r2", + "test_neg_mean_squared_error", + "fit_time", + "score_time", + } + keys_with_train = keys_sans_train.union( + {"train_r2", "train_neg_mean_squared_error"} + ) + + for return_train_score in (True, False): + for scoring in all_scoring: + if return_train_score: + # return_train_score used to default to True (deprecated behavior) + cv_results = cross_validate( + clf, X, y, scoring=scoring, return_train_score=True, cv=cv + ) + assert_array_almost_equal(cv_results["train_r2"], train_r2_scores) + assert_array_almost_equal( + cv_results["train_neg_mean_squared_error"], train_mse_scores + ) + else: + cv_results = cross_validate( + clf, X, y, scoring=scoring, return_train_score=False, cv=cv + ) + assert isinstance(cv_results, dict) + assert set(cv_results.keys()) == ( + keys_with_train if return_train_score else keys_sans_train + ) + assert_array_almost_equal(cv_results["test_r2"], test_r2_scores) + assert_array_almost_equal( + cv_results["test_neg_mean_squared_error"], test_mse_scores + ) + + # Make sure all the arrays are of np.ndarray type + assert isinstance(cv_results["test_r2"], np.ndarray) + assert isinstance(cv_results["test_neg_mean_squared_error"], np.ndarray) + assert isinstance(cv_results["fit_time"], np.ndarray) + assert isinstance(cv_results["score_time"], np.ndarray) + + # Ensure all the times are within sane limits + assert np.all(cv_results["fit_time"] >= 0) + assert np.all(cv_results["fit_time"] < 10) + assert np.all(cv_results["score_time"] >= 0) + assert np.all(cv_results["score_time"] < 10) + + +def test_cross_val_score_predict_groups(): + # Check that the ValueError raised when groups is None propagates to + # cross_val_score and cross_val_predict, and that groups is correctly + # passed to the cv object + X, y = 
make_classification(n_samples=20, n_classes=2, random_state=0) + + clf = SVC(kernel="linear") + + group_cvs = [ + LeaveOneGroupOut(), + LeavePGroupsOut(2), + GroupKFold(), + GroupShuffleSplit(), + ] + error_message = "The 'groups' parameter should not be None." + for cv in group_cvs: + with pytest.raises(ValueError, match=error_message): + cross_val_score(estimator=clf, X=X, y=y, cv=cv) + with pytest.raises(ValueError, match=error_message): + cross_val_predict(estimator=clf, X=X, y=y, cv=cv) + + +def test_cross_val_score_pandas(): + # check cross_val_score doesn't destroy pandas dataframe + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((Series, DataFrame)) + except ImportError: + pass + for TargetType, InputFeatureType in types: + # X dataframe, y series + # 3 fold cross val is used so we need at least 3 samples per class + X_df, y_ser = InputFeatureType(X), TargetType(y2) + check_df = lambda x: isinstance(x, InputFeatureType) + check_series = lambda x: isinstance(x, TargetType) + clf = CheckingClassifier(check_X=check_df, check_y=check_series) + cross_val_score(clf, X_df, y_ser, cv=3) + + +def test_cross_val_score_mask(): + # test that cross_val_score works with boolean masks + svm = SVC(kernel="linear") + iris = load_iris() + X, y = iris.data, iris.target + kfold = KFold(5) + scores_indices = cross_val_score(svm, X, y, cv=kfold) + kfold = KFold(5) + cv_masks = [] + for train, test in kfold.split(X, y): + mask_train = np.zeros(len(y), dtype=bool) + mask_test = np.zeros(len(y), dtype=bool) + mask_train[train] = 1 + mask_test[test] = 1 + cv_masks.append((mask_train, mask_test)) + scores_masks = cross_val_score(svm, X, y, cv=cv_masks) + assert_array_equal(scores_indices, scores_masks) + + +def test_cross_val_score_precomputed(): + # test for svm with precomputed kernel + svm = SVC(kernel="precomputed") + iris = load_iris() + X, y = iris.data, iris.target + linear_kernel = np.dot(X, X.T) + score_precomputed = cross_val_score(svm, linear_kernel, y) + svm = SVC(kernel="linear") + score_linear = cross_val_score(svm, X, y) + assert_array_almost_equal(score_precomputed, score_linear) + + # test with callable + svm = SVC(kernel=lambda x, y: np.dot(x, y.T)) + score_callable = cross_val_score(svm, X, y) + assert_array_almost_equal(score_precomputed, score_callable) + + # Error raised for non-square X + svm = SVC(kernel="precomputed") + with pytest.raises(ValueError): + cross_val_score(svm, X, y) + + # test error is raised when the precomputed kernel is not array-like + # or sparse + with pytest.raises(ValueError): + cross_val_score(svm, linear_kernel.tolist(), y) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_score_fit_params(coo_container): + clf = MockClassifier() + n_samples = X.shape[0] + n_classes = len(np.unique(y)) + + W_sparse = coo_container( + (np.array([1]), (np.array([1]), np.array([0]))), shape=(15, 1) + ) + P_sparse = coo_container(np.eye(5)) + + DUMMY_INT = 42 + DUMMY_STR = "42" + DUMMY_OBJ = object() + + def assert_fit_params(clf): + # Check that the non-array fit param values are passed through to the + # classifier unchanged + + assert clf.dummy_int == DUMMY_INT + assert clf.dummy_str == DUMMY_STR + assert clf.dummy_obj == DUMMY_OBJ + + fit_params = { + "sample_weight": np.ones(n_samples), + "class_prior": np.full(n_classes, 1.0 / n_classes), + "sparse_sample_weight": W_sparse, + "sparse_param": P_sparse, + "dummy_int": DUMMY_INT, + "dummy_str": DUMMY_STR, + "dummy_obj": DUMMY_OBJ, + 
"callback": assert_fit_params, + } + cross_val_score(clf, X, y2, params=fit_params) + + +def test_cross_val_score_score_func(): + clf = MockClassifier() + _score_func_args = [] + + def score_func(y_test, y_predict): + _score_func_args.append((y_test, y_predict)) + return 1.0 + + with warnings.catch_warnings(record=True): + scoring = make_scorer(score_func) + score = cross_val_score(clf, X, y, scoring=scoring, cv=3) + assert_array_equal(score, [1.0, 1.0, 1.0]) + # Test that score function is called only 3 times (for cv=3) + assert len(_score_func_args) == 3 + + +def test_cross_val_score_with_score_func_classification(): + iris = load_iris() + clf = SVC(kernel="linear") + + # Default score (should be the accuracy score) + scores = cross_val_score(clf, iris.data, iris.target) + assert_array_almost_equal(scores, [0.97, 1.0, 0.97, 0.97, 1.0], 2) + + # Correct classification score (aka. zero / one score) - should be the + # same as the default estimator score + zo_scores = cross_val_score(clf, iris.data, iris.target, scoring="accuracy") + assert_array_almost_equal(zo_scores, [0.97, 1.0, 0.97, 0.97, 1.0], 2) + + # F1 score (class are balanced so f1_score should be equal to zero/one + # score + f1_scores = cross_val_score(clf, iris.data, iris.target, scoring="f1_weighted") + assert_array_almost_equal(f1_scores, [0.97, 1.0, 0.97, 0.97, 1.0], 2) + + +def test_cross_val_score_with_score_func_regression(): + X, y = make_regression(n_samples=30, n_features=20, n_informative=5, random_state=0) + reg = Ridge() + + # Default score of the Ridge regression estimator + scores = cross_val_score(reg, X, y) + assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) + + # R2 score (aka. determination coefficient) - should be the + # same as the default estimator score + r2_scores = cross_val_score(reg, X, y, scoring="r2") + assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) + + # Mean squared error; this is a loss function, so "scores" are negative + neg_mse_scores = cross_val_score(reg, X, y, scoring="neg_mean_squared_error") + expected_neg_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99]) + assert_array_almost_equal(neg_mse_scores, expected_neg_mse, 2) + + # Explained variance + scoring = make_scorer(explained_variance_score) + ev_scores = cross_val_score(reg, X, y, scoring=scoring) + assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_permutation_score(coo_container): + iris = load_iris() + X = iris.data + X_sparse = coo_container(X) + y = iris.target + svm = SVC(kernel="linear") + cv = StratifiedKFold(2) + + score, scores, pvalue = permutation_test_score( + svm, X, y, n_permutations=30, cv=cv, scoring="accuracy" + ) + assert score > 0.9 + assert_almost_equal(pvalue, 0.0, 1) + + score_group, _, pvalue_group = permutation_test_score( + svm, + X, + y, + n_permutations=30, + cv=cv, + scoring="accuracy", + groups=np.ones(y.size), + random_state=0, + ) + assert score_group == score + assert pvalue_group == pvalue + + # check that we obtain the same results with a sparse representation + svm_sparse = SVC(kernel="linear") + cv_sparse = StratifiedKFold(2) + score_group, _, pvalue_group = permutation_test_score( + svm_sparse, + X_sparse, + y, + n_permutations=30, + cv=cv_sparse, + scoring="accuracy", + groups=np.ones(y.size), + random_state=0, + ) + + assert score_group == score + assert pvalue_group == pvalue + + # test with custom scoring object + def custom_score(y_true, 
y_pred): + return ((y_true == y_pred).sum() - (y_true != y_pred).sum()) / y_true.shape[0] + + scorer = make_scorer(custom_score) + score, _, pvalue = permutation_test_score( + svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0 + ) + assert_almost_equal(score, 0.93, 2) + assert_almost_equal(pvalue, 0.01, 3) + + # set random y + y = np.mod(np.arange(len(y)), 3) + + score, scores, pvalue = permutation_test_score( + svm, X, y, n_permutations=30, cv=cv, scoring="accuracy" + ) + + assert score < 0.5 + assert pvalue > 0.2 + + +def test_permutation_test_score_allow_nans(): + # Check that permutation_test_score allows input data with NaNs + X = np.arange(200, dtype=np.float64).reshape(10, -1) + X[2, :] = np.nan + y = np.repeat([0, 1], X.shape[0] / 2) + p = Pipeline( + [ + ("imputer", SimpleImputer(strategy="mean", missing_values=np.nan)), + ("classifier", MockClassifier()), + ] + ) + permutation_test_score(p, X, y) + + +def test_permutation_test_score_params(): + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + clf = CheckingClassifier(expected_sample_weight=True) + + err_msg = r"Expected sample_weight to be passed" + with pytest.raises(AssertionError, match=err_msg): + permutation_test_score(clf, X, y) + + err_msg = r"sample_weight.shape == \(1,\), expected \(8,\)!" + with pytest.raises(ValueError, match=err_msg): + permutation_test_score(clf, X, y, params={"sample_weight": np.ones(1)}) + permutation_test_score(clf, X, y, params={"sample_weight": np.ones(10)}) + + +def test_cross_val_score_allow_nans(): + # Check that cross_val_score allows input data with NaNs + X = np.arange(200, dtype=np.float64).reshape(10, -1) + X[2, :] = np.nan + y = np.repeat([0, 1], X.shape[0] / 2) + p = Pipeline( + [ + ("imputer", SimpleImputer(strategy="mean", missing_values=np.nan)), + ("classifier", MockClassifier()), + ] + ) + cross_val_score(p, X, y) + + +def test_cross_val_score_multilabel(): + X = np.array( + [ + [-3, 4], + [2, 4], + [3, 3], + [0, 2], + [-3, 1], + [-2, 1], + [0, 0], + [-2, -1], + [-1, -2], + [1, -2], + ] + ) + y = np.array( + [[1, 1], [0, 1], [0, 1], [0, 1], [1, 1], [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]] + ) + clf = KNeighborsClassifier(n_neighbors=1) + scoring_micro = make_scorer(precision_score, average="micro") + scoring_macro = make_scorer(precision_score, average="macro") + scoring_samples = make_scorer(precision_score, average="samples") + score_micro = cross_val_score(clf, X, y, scoring=scoring_micro) + score_macro = cross_val_score(clf, X, y, scoring=scoring_macro) + score_samples = cross_val_score(clf, X, y, scoring=scoring_samples) + assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3]) + assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]) + assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_predict(coo_container): + X, y = load_diabetes(return_X_y=True) + cv = KFold() + + est = Ridge() + + # Naive loop (should be same as cross_val_predict): + preds2 = np.zeros_like(y) + for train, test in cv.split(X, y): + est.fit(X[train], y[train]) + preds2[test] = est.predict(X[test]) + + preds = cross_val_predict(est, X, y, cv=cv) + assert_array_almost_equal(preds, preds2) + + preds = cross_val_predict(est, X, y) + assert len(preds) == len(y) + + cv = LeaveOneOut() + preds = cross_val_predict(est, X, y, cv=cv) + assert len(preds) == len(y) + + Xsp = X.copy() + Xsp *= Xsp > np.median(Xsp) + Xsp = coo_container(Xsp) + preds = 
cross_val_predict(est, Xsp, y) + assert_array_almost_equal(len(preds), len(y)) + + preds = cross_val_predict(KMeans(n_init="auto"), X) + assert len(preds) == len(y) + + class BadCV: + def split(self, X, y=None, groups=None): + for i in range(4): + yield np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7, 8]) + + with pytest.raises(ValueError): + cross_val_predict(est, X, y, cv=BadCV()) + + X, y = load_iris(return_X_y=True) + + warning_message = ( + r"Number of classes in training fold \(2\) does " + r"not match total number of classes \(3\). " + "Results may not be appropriate for your use case." + ) + with pytest.warns(RuntimeWarning, match=warning_message): + cross_val_predict( + LogisticRegression(solver="liblinear"), + X, + y, + method="predict_proba", + cv=KFold(2), + ) + + +def test_cross_val_predict_decision_function_shape(): + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + + preds = cross_val_predict( + LogisticRegression(solver="liblinear"), X, y, method="decision_function" + ) + assert preds.shape == (50,) + + X, y = load_iris(return_X_y=True) + + preds = cross_val_predict( + LogisticRegression(solver="liblinear"), X, y, method="decision_function" + ) + assert preds.shape == (150, 3) + + # This specifically tests imbalanced splits for binary + # classification with decision_function. This is only + # applicable to classifiers that can be fit on a single + # class. + X = X[:100] + y = y[:100] + error_message = ( + "Only 1 class/es in training fold," + " but 2 in overall dataset. This" + " is not supported for decision_function" + " with imbalanced folds. To fix " + "this, use a cross-validation technique " + "resulting in properly stratified folds" + ) + with pytest.raises(ValueError, match=error_message): + cross_val_predict( + RidgeClassifier(), X, y, method="decision_function", cv=KFold(2) + ) + + X, y = load_digits(return_X_y=True) + est = SVC(kernel="linear", decision_function_shape="ovo") + + preds = cross_val_predict(est, X, y, method="decision_function") + assert preds.shape == (1797, 45) + + ind = np.argsort(y) + X, y = X[ind], y[ind] + error_message_regexp = ( + r"Output shape \(599L?, 21L?\) of " + "decision_function does not match number of " + r"classes \(7\) in fold. 
Irregular " + "decision_function .*" + ) + with pytest.raises(ValueError, match=error_message_regexp): + cross_val_predict(est, X, y, cv=KFold(n_splits=3), method="decision_function") + + +def test_cross_val_predict_predict_proba_shape(): + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + + preds = cross_val_predict( + LogisticRegression(solver="liblinear"), X, y, method="predict_proba" + ) + assert preds.shape == (50, 2) + + X, y = load_iris(return_X_y=True) + + preds = cross_val_predict( + LogisticRegression(solver="liblinear"), X, y, method="predict_proba" + ) + assert preds.shape == (150, 3) + + +def test_cross_val_predict_predict_log_proba_shape(): + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + + preds = cross_val_predict( + LogisticRegression(solver="liblinear"), X, y, method="predict_log_proba" + ) + assert preds.shape == (50, 2) + + X, y = load_iris(return_X_y=True) + + preds = cross_val_predict( + LogisticRegression(solver="liblinear"), X, y, method="predict_log_proba" + ) + assert preds.shape == (150, 3) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_predict_input_types(coo_container): + iris = load_iris() + X, y = iris.data, iris.target + X_sparse = coo_container(X) + multioutput_y = np.column_stack([y, y[::-1]]) + + clf = Ridge(fit_intercept=False, random_state=0) + # 3 fold cv is used --> at least 3 samples per class + # Smoke test + predictions = cross_val_predict(clf, X, y) + assert predictions.shape == (150,) + + # test with multioutput y + predictions = cross_val_predict(clf, X_sparse, multioutput_y) + assert predictions.shape == (150, 2) + + predictions = cross_val_predict(clf, X_sparse, y) + assert_array_equal(predictions.shape, (150,)) + + # test with multioutput y + predictions = cross_val_predict(clf, X_sparse, multioutput_y) + assert_array_equal(predictions.shape, (150, 2)) + + # test with X and y as list + list_check = lambda x: isinstance(x, list) + clf = CheckingClassifier(check_X=list_check) + predictions = cross_val_predict(clf, X.tolist(), y.tolist()) + + clf = CheckingClassifier(check_y=list_check) + predictions = cross_val_predict(clf, X, y.tolist()) + + # test with X and y as list and non empty method + predictions = cross_val_predict( + LogisticRegression(solver="liblinear"), + X.tolist(), + y.tolist(), + method="decision_function", + ) + predictions = cross_val_predict( + LogisticRegression(solver="liblinear"), + X, + y.tolist(), + method="decision_function", + ) + + # test with 3d X and + X_3d = X[:, :, np.newaxis] + check_3d = lambda x: x.ndim == 3 + clf = CheckingClassifier(check_X=check_3d) + predictions = cross_val_predict(clf, X_3d, y) + assert_array_equal(predictions.shape, (150,)) + + +def test_cross_val_predict_pandas(): + # check cross_val_score doesn't destroy pandas dataframe + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((Series, DataFrame)) + except ImportError: + pass + for TargetType, InputFeatureType in types: + # X dataframe, y series + X_df, y_ser = InputFeatureType(X), TargetType(y2) + check_df = lambda x: isinstance(x, InputFeatureType) + check_series = lambda x: isinstance(x, TargetType) + clf = CheckingClassifier(check_X=check_df, check_y=check_series) + cross_val_predict(clf, X_df, y_ser, cv=3) + + +def test_cross_val_predict_unbalanced(): + X, y = make_classification( + n_samples=100, + n_features=2, + n_redundant=0, + n_informative=2, + n_clusters_per_class=1, + random_state=1, + ) + # 
Change the first sample to a new class + y[0] = 2 + clf = LogisticRegression(random_state=1, solver="liblinear") + cv = StratifiedKFold(n_splits=2) + train, test = list(cv.split(X, y)) + yhat_proba = cross_val_predict(clf, X, y, cv=cv, method="predict_proba") + assert y[test[0]][0] == 2 # sanity check for further assertions + assert np.all(yhat_proba[test[0]][:, 2] == 0) + assert np.all(yhat_proba[test[0]][:, 0:1] > 0) + assert np.all(yhat_proba[test[1]] > 0) + assert_array_almost_equal(yhat_proba.sum(axis=1), np.ones(y.shape), decimal=12) + + +def test_cross_val_predict_y_none(): + # ensure that cross_val_predict works when y is None + mock_classifier = MockClassifier() + rng = np.random.RandomState(42) + X = rng.rand(100, 10) + y_hat = cross_val_predict(mock_classifier, X, y=None, cv=5, method="predict") + assert_allclose(X[:, 0], y_hat) + y_hat_proba = cross_val_predict( + mock_classifier, X, y=None, cv=5, method="predict_proba" + ) + assert_allclose(X, y_hat_proba) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_score_sparse_fit_params(coo_container): + iris = load_iris() + X, y = iris.data, iris.target + clf = MockClassifier() + fit_params = {"sparse_sample_weight": coo_container(np.eye(X.shape[0]))} + a = cross_val_score(clf, X, y, params=fit_params, cv=3) + assert_array_equal(a, np.ones(3)) + + +def test_learning_curve(): + n_samples = 30 + n_splits = 3 + X, y = make_classification( + n_samples=n_samples, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(n_samples * ((n_splits - 1) / n_splits)) + for shuffle_train in [False, True]: + with warnings.catch_warnings(record=True) as w: + ( + train_sizes, + train_scores, + test_scores, + fit_times, + score_times, + ) = learning_curve( + estimator, + X, + y, + cv=KFold(n_splits=n_splits), + train_sizes=np.linspace(0.1, 1.0, 10), + shuffle=shuffle_train, + return_times=True, + ) + if len(w) > 0: + raise RuntimeError("Unexpected warning: %r" % w[0].message) + assert train_scores.shape == (10, 3) + assert test_scores.shape == (10, 3) + assert fit_times.shape == (10, 3) + assert score_times.shape == (10, 3) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + # Cannot use assert_array_almost_equal for fit and score times because + # the values are hardware-dependant + assert fit_times.dtype == "float64" + assert score_times.dtype == "float64" + + # Test a custom cv splitter that can iterate only once + with warnings.catch_warnings(record=True) as w: + train_sizes2, train_scores2, test_scores2 = learning_curve( + estimator, + X, + y, + cv=OneTimeSplitter(n_splits=n_splits, n_samples=n_samples), + train_sizes=np.linspace(0.1, 1.0, 10), + shuffle=shuffle_train, + ) + if len(w) > 0: + raise RuntimeError("Unexpected warning: %r" % w[0].message) + assert_array_almost_equal(train_scores2, train_scores) + assert_array_almost_equal(test_scores2, test_scores) + + +def test_learning_curve_unsupervised(): + X, _ = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(20) + train_sizes, train_scores, test_scores = learning_curve( + estimator, X, y=None, cv=3, train_sizes=np.linspace(0.1, 1.0, 10) + ) + 
assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + +def test_learning_curve_verbose(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(20) + + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + train_sizes, train_scores, test_scores = learning_curve( + estimator, X, y, cv=3, verbose=1 + ) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + + assert "[learning_curve]" in out + + +def test_learning_curve_incremental_learning_not_possible(): + X, y = make_classification( + n_samples=2, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + # The mockup does not have partial_fit() + estimator = MockImprovingEstimator(1) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, exploit_incremental_learning=True) + + +def test_learning_curve_incremental_learning(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockIncrementalImprovingEstimator(20) + for shuffle_train in [False, True]: + train_sizes, train_scores, test_scores = learning_curve( + estimator, + X, + y, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + shuffle=shuffle_train, + ) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + +def test_learning_curve_incremental_learning_unsupervised(): + X, _ = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockIncrementalImprovingEstimator(20) + train_sizes, train_scores, test_scores = learning_curve( + estimator, + X, + y=None, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + ) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + +def test_learning_curve_batch_and_incremental_learning_are_equal(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + train_sizes = np.linspace(0.2, 1.0, 5) + estimator = PassiveAggressiveClassifier(max_iter=1, tol=None, shuffle=False) + + train_sizes_inc, train_scores_inc, test_scores_inc = learning_curve( + estimator, + X, + y, + train_sizes=train_sizes, + cv=3, + exploit_incremental_learning=True, + ) + train_sizes_batch, train_scores_batch, test_scores_batch = learning_curve( + estimator, + X, + y, + cv=3, + train_sizes=train_sizes, + exploit_incremental_learning=False, + ) + + assert_array_equal(train_sizes_inc, train_sizes_batch) + assert_array_almost_equal( + train_scores_inc.mean(axis=1), train_scores_batch.mean(axis=1) + ) + assert_array_almost_equal( + test_scores_inc.mean(axis=1), test_scores_batch.mean(axis=1) + ) + + +def 
test_learning_curve_n_sample_range_out_of_bounds(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(20) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0, 1]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0.0, 1.0]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0.1, 1.1]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0, 20]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[1, 21]) + + +def test_learning_curve_remove_duplicate_sample_sizes(): + X, y = make_classification( + n_samples=3, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(2) + warning_message = ( + "Removed duplicate entries from 'train_sizes'. Number of ticks " + "will be less than the size of 'train_sizes': 2 instead of 3." + ) + with pytest.warns(RuntimeWarning, match=warning_message): + train_sizes, _, _ = learning_curve( + estimator, X, y, cv=3, train_sizes=np.linspace(0.33, 1.0, 3) + ) + assert_array_equal(train_sizes, [1, 2]) + + +def test_learning_curve_with_boolean_indices(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(20) + cv = KFold(n_splits=3) + train_sizes, train_scores, test_scores = learning_curve( + estimator, X, y, cv=cv, train_sizes=np.linspace(0.1, 1.0, 10) + ) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + +def test_learning_curve_with_shuffle(): + # Following test case was designed this way to verify the code + # changes made in pull request: #7506. + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [11, 12], + [13, 14], + [15, 16], + [17, 18], + [19, 20], + [7, 8], + [9, 10], + [11, 12], + [13, 14], + [15, 16], + [17, 18], + ] + ) + y = np.array([1, 1, 1, 2, 3, 4, 1, 1, 2, 3, 4, 1, 2, 3, 4]) + groups = np.array([1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 4, 4, 4, 4]) + # Splits on these groups fail without shuffle as the first iteration + # of the learning curve doesn't contain label 4 in the training set. 
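+ # (Without shuffling, learning_curve takes a prefix of each training fold
+ # when subsampling to the smaller train_sizes, and the leading groups here
+ # never contain label 4; shuffle=True permutes the training fold before
+ # slicing, so every tick can see all classes.)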
+ estimator = PassiveAggressiveClassifier(max_iter=5, tol=None, shuffle=False) + + cv = GroupKFold(n_splits=2) + train_sizes_batch, train_scores_batch, test_scores_batch = learning_curve( + estimator, + X, + y, + cv=cv, + n_jobs=1, + train_sizes=np.linspace(0.3, 1.0, 3), + groups=groups, + shuffle=True, + random_state=2, + ) + assert_array_almost_equal( + train_scores_batch.mean(axis=1), np.array([0.75, 0.3, 0.36111111]) + ) + assert_array_almost_equal( + test_scores_batch.mean(axis=1), np.array([0.36111111, 0.25, 0.25]) + ) + with pytest.raises(ValueError): + learning_curve( + estimator, + X, + y, + cv=cv, + n_jobs=1, + train_sizes=np.linspace(0.3, 1.0, 3), + groups=groups, + error_score="raise", + ) + + train_sizes_inc, train_scores_inc, test_scores_inc = learning_curve( + estimator, + X, + y, + cv=cv, + n_jobs=1, + train_sizes=np.linspace(0.3, 1.0, 3), + groups=groups, + shuffle=True, + random_state=2, + exploit_incremental_learning=True, + ) + assert_array_almost_equal( + train_scores_inc.mean(axis=1), train_scores_batch.mean(axis=1) + ) + assert_array_almost_equal( + test_scores_inc.mean(axis=1), test_scores_batch.mean(axis=1) + ) + + +def test_learning_curve_params(): + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + clf = CheckingClassifier(expected_sample_weight=True) + + err_msg = r"Expected sample_weight to be passed" + with pytest.raises(AssertionError, match=err_msg): + learning_curve(clf, X, y, error_score="raise") + + err_msg = r"sample_weight.shape == \(1,\), expected \(2,\)!" + with pytest.raises(ValueError, match=err_msg): + learning_curve( + clf, X, y, error_score="raise", params={"sample_weight": np.ones(1)} + ) + learning_curve( + clf, X, y, error_score="raise", params={"sample_weight": np.ones(10)} + ) + + +def test_learning_curve_incremental_learning_params(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockIncrementalImprovingEstimator(20, ["sample_weight"]) + err_msg = r"Expected fit parameter\(s\) \['sample_weight'\] not seen." 
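+ # MockIncrementalImprovingEstimator asserts inside partial_fit that every
+ # expected fit parameter actually reached it; error_score="raise" lets that
+ # AssertionError propagate out of learning_curve instead of being recorded.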
+ with pytest.raises(AssertionError, match=err_msg): + learning_curve( + estimator, + X, + y, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + error_score="raise", + ) + + err_msg = "Fit parameter sample_weight has length 3; expected" + with pytest.raises(AssertionError, match=err_msg): + learning_curve( + estimator, + X, + y, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + error_score="raise", + params={"sample_weight": np.ones(3)}, + ) + + learning_curve( + estimator, + X, + y, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + error_score="raise", + params={"sample_weight": np.ones(2)}, + ) + + +def test_validation_curve(): + X, y = make_classification( + n_samples=2, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + param_range = np.linspace(0, 1, 10) + with warnings.catch_warnings(record=True) as w: + train_scores, test_scores = validation_curve( + MockEstimatorWithParameter(), + X, + y, + param_name="param", + param_range=param_range, + cv=2, + ) + if len(w) > 0: + raise RuntimeError("Unexpected warning: %r" % w[0].message) + + assert_array_almost_equal(train_scores.mean(axis=1), param_range) + assert_array_almost_equal(test_scores.mean(axis=1), 1 - param_range) + + +def test_validation_curve_clone_estimator(): + X, y = make_classification( + n_samples=2, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + + param_range = np.linspace(1, 0, 10) + _, _ = validation_curve( + MockEstimatorWithSingleFitCallAllowed(), + X, + y, + param_name="param", + param_range=param_range, + cv=2, + ) + + +def test_validation_curve_cv_splits_consistency(): + n_samples = 100 + n_splits = 5 + X, y = make_classification(n_samples=100, random_state=0) + + scores1 = validation_curve( + SVC(kernel="linear", random_state=0), + X, + y, + param_name="C", + param_range=[0.1, 0.1, 0.2, 0.2], + cv=OneTimeSplitter(n_splits=n_splits, n_samples=n_samples), + ) + # The OneTimeSplitter is a non-reentrant cv splitter. Since the splits are + # materialized once rather than `split` being called anew for each parameter + # setting, param settings 1 and 2 (which share the same C value) must + # produce identical results. + assert_array_almost_equal(*np.vsplit(np.hstack(scores1)[(0, 2, 1, 3), :], 2)) + + scores2 = validation_curve( + SVC(kernel="linear", random_state=0), + X, + y, + param_name="C", + param_range=[0.1, 0.1, 0.2, 0.2], + cv=KFold(n_splits=n_splits, shuffle=True), + ) + + # For scores2, compare the scores of the 1st and 2nd parameter settings + # (since the C value for the first two settings is 0.1, they must be + # consistent unless the train/test folds differ between the settings) + assert_array_almost_equal(*np.vsplit(np.hstack(scores2)[(0, 2, 1, 3), :], 2)) + + scores3 = validation_curve( + SVC(kernel="linear", random_state=0), + X, + y, + param_name="C", + param_range=[0.1, 0.1, 0.2, 0.2], + cv=KFold(n_splits=n_splits), + ) + + # OneTimeSplitter is basically unshuffled KFold(n_splits=5). Sanity check. 
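+ # (Unshuffled KFold produces exactly the same folds as OneTimeSplitter, so
+ # the train and test scores of scores3 must match those of scores1.)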
+ assert_array_almost_equal(np.array(scores3), np.array(scores1)) + + +def test_validation_curve_params(): + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + clf = CheckingClassifier(expected_sample_weight=True) + + err_msg = r"Expected sample_weight to be passed" + with pytest.raises(AssertionError, match=err_msg): + validation_curve( + clf, + X, + y, + param_name="foo_param", + param_range=[1, 2, 3], + error_score="raise", + ) + + err_msg = r"sample_weight.shape == \(1,\), expected \(8,\)!" + with pytest.raises(ValueError, match=err_msg): + validation_curve( + clf, + X, + y, + param_name="foo_param", + param_range=[1, 2, 3], + error_score="raise", + params={"sample_weight": np.ones(1)}, + ) + validation_curve( + clf, + X, + y, + param_name="foo_param", + param_range=[1, 2, 3], + error_score="raise", + params={"sample_weight": np.ones(10)}, + ) + + +def test_check_is_permutation(): + rng = np.random.RandomState(0) + p = np.arange(100) + rng.shuffle(p) + assert _check_is_permutation(p, 100) + assert not _check_is_permutation(np.delete(p, 23), 100) + + p[0] = 23 + assert not _check_is_permutation(p, 100) + + # Check if the additional duplicate indices are caught + assert not _check_is_permutation(np.hstack((p, 0)), 100) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_cross_val_predict_sparse_prediction(csr_container): + # check that cross_val_predict gives same result for sparse and dense input + X, y = make_multilabel_classification( + n_classes=2, + n_labels=1, + allow_unlabeled=False, + return_indicator=True, + random_state=1, + ) + X_sparse = csr_container(X) + y_sparse = csr_container(y) + classif = OneVsRestClassifier(SVC(kernel="linear")) + preds = cross_val_predict(classif, X, y, cv=10) + preds_sparse = cross_val_predict(classif, X_sparse, y_sparse, cv=10) + preds_sparse = preds_sparse.toarray() + assert_array_almost_equal(preds_sparse, preds) + + +def check_cross_val_predict_binary(est, X, y, method): + """Helper for tests of cross_val_predict with binary classification""" + cv = KFold(n_splits=3, shuffle=False) + + # Generate expected outputs + if y.ndim == 1: + exp_shape = (len(X),) if method == "decision_function" else (len(X), 2) + else: + exp_shape = y.shape + expected_predictions = np.zeros(exp_shape) + for train, test in cv.split(X, y): + est = clone(est).fit(X[train], y[train]) + expected_predictions[test] = getattr(est, method)(X[test]) + + # Check actual outputs for several representations of y + for tg in [y, y + 1, y - 2, y.astype("str")]: + assert_allclose( + cross_val_predict(est, X, tg, method=method, cv=cv), expected_predictions + ) + + +def check_cross_val_predict_multiclass(est, X, y, method): + """Helper for tests of cross_val_predict with multiclass classification""" + cv = KFold(n_splits=3, shuffle=False) + + # Generate expected outputs + float_min = np.finfo(np.float64).min + default_values = { + "decision_function": float_min, + "predict_log_proba": float_min, + "predict_proba": 0, + } + expected_predictions = np.full( + (len(X), len(set(y))), default_values[method], dtype=np.float64 + ) + _, y_enc = np.unique(y, return_inverse=True) + for train, test in cv.split(X, y_enc): + est = clone(est).fit(X[train], y_enc[train]) + fold_preds = getattr(est, method)(X[test]) + i_cols_fit = np.unique(y_enc[train]) + expected_predictions[np.ix_(test, i_cols_fit)] = fold_preds + + # Check actual outputs for several representations of y + for tg in [y, y + 1, y - 2, y.astype("str")]: + assert_allclose( + cross_val_predict(est, 
X, tg, method=method, cv=cv), expected_predictions + ) + + +def check_cross_val_predict_multilabel(est, X, y, method): + """Check the output of cross_val_predict for 2D targets using + estimators that provide predictions as a list with one + element per class. + """ + cv = KFold(n_splits=3, shuffle=False) + + # Create empty arrays of the correct size to hold outputs + float_min = np.finfo(np.float64).min + default_values = { + "decision_function": float_min, + "predict_log_proba": float_min, + "predict_proba": 0, + } + n_targets = y.shape[1] + expected_preds = [] + for i_col in range(n_targets): + n_classes_in_label = len(set(y[:, i_col])) + if n_classes_in_label == 2 and method == "decision_function": + exp_shape = (len(X),) + else: + exp_shape = (len(X), n_classes_in_label) + expected_preds.append( + np.full(exp_shape, default_values[method], dtype=np.float64) + ) + + # Generate expected outputs + y_enc_cols = [ + np.unique(y[:, i], return_inverse=True)[1][:, np.newaxis] + for i in range(y.shape[1]) + ] + y_enc = np.concatenate(y_enc_cols, axis=1) + for train, test in cv.split(X, y_enc): + est = clone(est).fit(X[train], y_enc[train]) + fold_preds = getattr(est, method)(X[test]) + for i_col in range(n_targets): + fold_cols = np.unique(y_enc[train][:, i_col]) + if expected_preds[i_col].ndim == 1: + # Decision function with <=2 classes + expected_preds[i_col][test] = fold_preds[i_col] + else: + idx = np.ix_(test, fold_cols) + expected_preds[i_col][idx] = fold_preds[i_col] + + # Check actual outputs for several representations of y + for tg in [y, y + 1, y - 2, y.astype("str")]: + cv_predict_output = cross_val_predict(est, X, tg, method=method, cv=cv) + assert len(cv_predict_output) == len(expected_preds) + for i in range(len(cv_predict_output)): + assert_allclose(cv_predict_output[i], expected_preds[i]) + + +def check_cross_val_predict_with_method_binary(est): + # This test includes the decision_function with two classes. + # This is a special case: it has only one column of output. + X, y = make_classification(n_classes=2, random_state=0) + for method in ["decision_function", "predict_proba", "predict_log_proba"]: + check_cross_val_predict_binary(est, X, y, method) + + +def check_cross_val_predict_with_method_multiclass(est): + iris = load_iris() + X, y = iris.data, iris.target + X, y = shuffle(X, y, random_state=0) + for method in ["decision_function", "predict_proba", "predict_log_proba"]: + check_cross_val_predict_multiclass(est, X, y, method) + + +def test_cross_val_predict_with_method(): + check_cross_val_predict_with_method_binary(LogisticRegression(solver="liblinear")) + check_cross_val_predict_with_method_multiclass( + LogisticRegression(solver="liblinear") + ) + + +def test_cross_val_predict_method_checking(): + # Regression test for issue #9639. Tests that cross_val_predict does not + # check estimator methods (e.g. 
predict_proba) before fitting + iris = load_iris() + X, y = iris.data, iris.target + X, y = shuffle(X, y, random_state=0) + for method in ["decision_function", "predict_proba", "predict_log_proba"]: + est = SGDClassifier(loss="log_loss", random_state=2) + check_cross_val_predict_multiclass(est, X, y, method) + + +def test_gridsearchcv_cross_val_predict_with_method(): + iris = load_iris() + X, y = iris.data, iris.target + X, y = shuffle(X, y, random_state=0) + est = GridSearchCV( + LogisticRegression(random_state=42, solver="liblinear"), {"C": [0.1, 1]}, cv=2 + ) + for method in ["decision_function", "predict_proba", "predict_log_proba"]: + check_cross_val_predict_multiclass(est, X, y, method) + + +def test_cross_val_predict_with_method_multilabel_ovr(): + # OVR does multilabel predictions, but only arrays of + # binary indicator columns. The output of predict_proba + # is a 2D array with shape (n_samples, n_classes). + n_samp = 100 + n_classes = 4 + X, y = make_multilabel_classification( + n_samples=n_samp, n_labels=3, n_classes=n_classes, n_features=5, random_state=42 + ) + est = OneVsRestClassifier(LogisticRegression(solver="liblinear", random_state=0)) + for method in ["predict_proba", "decision_function"]: + check_cross_val_predict_binary(est, X, y, method=method) + + +class RFWithDecisionFunction(RandomForestClassifier): + # None of the current multioutput-multiclass estimators have + # decision function methods. Create a mock decision function + # to test the cross_val_predict function's handling of this case. + def decision_function(self, X): + probs = self.predict_proba(X) + msg = "This helper should only be used on multioutput-multiclass tasks" + assert isinstance(probs, list), msg + probs = [p[:, -1] if p.shape[1] == 2 else p for p in probs] + return probs + + +def test_cross_val_predict_with_method_multilabel_rf(): + # The RandomForest allows multiple classes in each label. + # Output of predict_proba is a list of outputs of predict_proba + # for each individual label. + n_classes = 4 + X, y = make_multilabel_classification( + n_samples=100, n_labels=3, n_classes=n_classes, n_features=5, random_state=42 + ) + y[:, 0] += y[:, 1] # Put three classes in the first column + for method in ["predict_proba", "predict_log_proba", "decision_function"]: + est = RFWithDecisionFunction(n_estimators=5, random_state=0) + with warnings.catch_warnings(): + # Suppress "RuntimeWarning: divide by zero encountered in log" + warnings.simplefilter("ignore") + check_cross_val_predict_multilabel(est, X, y, method=method) + + +def test_cross_val_predict_with_method_rare_class(): + # Test a multiclass problem where one class will be missing from + # one of the CV training sets. + rng = np.random.RandomState(0) + X = rng.normal(0, 1, size=(14, 10)) + y = np.array([0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 3]) + est = LogisticRegression(solver="liblinear") + for method in ["predict_proba", "predict_log_proba", "decision_function"]: + with warnings.catch_warnings(): + # Suppress warning about too few examples of a class + warnings.simplefilter("ignore") + check_cross_val_predict_multiclass(est, X, y, method) + + +def test_cross_val_predict_with_method_multilabel_rf_rare_class(): + # The RandomForest allows anything for the contents of the labels. + # Output of predict_proba is a list of outputs of predict_proba + # for each individual label. + # In this test, the first label has a class with a single example. + # We'll have one CV fold where the training data don't include it. 
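+ # (For the fold that never saw the rare class, cross_val_predict pads that
+ # class's column with the method's default value: 0 for predict_proba and
+ # the float64 minimum for predict_log_proba, matching the defaults encoded
+ # in check_cross_val_predict_multilabel.)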
+ rng = np.random.RandomState(0) + X = rng.normal(0, 1, size=(5, 10)) + y = np.array([[0, 0], [1, 1], [2, 1], [0, 1], [1, 0]]) + for method in ["predict_proba", "predict_log_proba"]: + est = RFWithDecisionFunction(n_estimators=5, random_state=0) + with warnings.catch_warnings(): + # Suppress "RuntimeWarning: divide by zero encountered in log" + warnings.simplefilter("ignore") + check_cross_val_predict_multilabel(est, X, y, method=method) + + +def get_expected_predictions(X, y, cv, classes, est, method): + expected_predictions = np.zeros([len(y), classes]) + func = getattr(est, method) + + for train, test in cv.split(X, y): + est.fit(X[train], y[train]) + expected_predictions_ = func(X[test]) + # To avoid 2 dimensional indexing + if method == "predict_proba": + exp_pred_test = np.zeros((len(test), classes)) + else: + exp_pred_test = np.full( + (len(test), classes), np.finfo(expected_predictions.dtype).min + ) + exp_pred_test[:, est.classes_] = expected_predictions_ + expected_predictions[test] = exp_pred_test + + return expected_predictions + + +def test_cross_val_predict_class_subset(): + X = np.arange(200).reshape(100, 2) + y = np.array([x // 10 for x in range(100)]) + classes = 10 + + kfold3 = KFold(n_splits=3) + kfold4 = KFold(n_splits=4) + + le = LabelEncoder() + + methods = ["decision_function", "predict_proba", "predict_log_proba"] + for method in methods: + est = LogisticRegression(solver="liblinear") + + # Test with n_splits=3 + predictions = cross_val_predict(est, X, y, method=method, cv=kfold3) + + # Runs a naive loop (should be same as cross_val_predict): + expected_predictions = get_expected_predictions( + X, y, kfold3, classes, est, method + ) + assert_array_almost_equal(expected_predictions, predictions) + + # Test with n_splits=4 + predictions = cross_val_predict(est, X, y, method=method, cv=kfold4) + expected_predictions = get_expected_predictions( + X, y, kfold4, classes, est, method + ) + assert_array_almost_equal(expected_predictions, predictions) + + # Testing unordered labels + y = shuffle(np.repeat(range(10), 10), random_state=0) + predictions = cross_val_predict(est, X, y, method=method, cv=kfold3) + y = le.fit_transform(y) + expected_predictions = get_expected_predictions( + X, y, kfold3, classes, est, method + ) + assert_array_almost_equal(expected_predictions, predictions) + + +def test_score_memmap(): + # Ensure a scalar score of memmap type is accepted + iris = load_iris() + X, y = iris.data, iris.target + clf = MockClassifier() + tf = tempfile.NamedTemporaryFile(mode="wb", delete=False) + tf.write(b"Hello world!!!!!") + tf.close() + scores = np.memmap(tf.name, dtype=np.float64) + score = np.memmap(tf.name, shape=(), mode="r", dtype=np.float64) + try: + cross_val_score(clf, X, y, scoring=lambda est, X, y: score) + with pytest.raises(ValueError): + cross_val_score(clf, X, y, scoring=lambda est, X, y: scores) + finally: + # Best effort to release the mmap file handles before deleting the + # backing file under Windows + scores, score = None, None + for _ in range(3): + try: + os.unlink(tf.name) + break + except OSError: + sleep(1.0) + + +def test_permutation_test_score_pandas(): + # check permutation_test_score doesn't destroy pandas dataframe + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((Series, DataFrame)) + except ImportError: + pass + for TargetType, InputFeatureType in types: + # X dataframe, y series + iris = load_iris() + X, y = iris.data, iris.target + X_df, y_ser = InputFeatureType(X), 
TargetType(y) + check_df = lambda x: isinstance(x, InputFeatureType) + check_series = lambda x: isinstance(x, TargetType) + clf = CheckingClassifier(check_X=check_df, check_y=check_series) + permutation_test_score(clf, X_df, y_ser) + + +def test_fit_and_score_failing(): + # Create a failing classifier to deliberately fail + failing_clf = FailingClassifier(FailingClassifier.FAILING_PARAMETER) + # dummy X data + X = np.arange(1, 10) + train, test = np.arange(0, 5), np.arange(5, 9) + fit_and_score_args = dict( + estimator=failing_clf, + X=X, + y=None, + scorer=dict(), + train=train, + test=test, + verbose=0, + parameters=None, + fit_params=None, + score_params=None, + ) + # passing error score to trigger the warning message + fit_and_score_args["error_score"] = "raise" + # check if exception was raised, with default error_score='raise' + with pytest.raises(ValueError, match="Failing classifier failed as required"): + _fit_and_score(**fit_and_score_args) + + assert failing_clf.score() == 0.0 # FailingClassifier coverage + + +def test_fit_and_score_working(): + X, y = make_classification(n_samples=30, random_state=0) + clf = SVC(kernel="linear", random_state=0) + train, test = next(ShuffleSplit().split(X)) + # Test return_parameters option + fit_and_score_args = dict( + estimator=clf, + X=X, + y=y, + scorer=dict(), + train=train, + test=test, + verbose=0, + parameters={"max_iter": 100, "tol": 0.1}, + fit_params=None, + score_params=None, + return_parameters=True, + ) + result = _fit_and_score(**fit_and_score_args) + assert result["parameters"] == fit_and_score_args["parameters"] + + +class DataDependentFailingClassifier(BaseEstimator): + def __init__(self, max_x_value=None): + self.max_x_value = max_x_value + + def fit(self, X, y=None): + num_values_too_high = (X > self.max_x_value).sum() + if num_values_too_high: + raise ValueError( + f"Classifier fit failed with {num_values_too_high} values too high" + ) + + def score(self, X=None, Y=None): + return 0.0 + + +@pytest.mark.parametrize("error_score", [np.nan, 0]) +def test_cross_validate_some_failing_fits_warning(error_score): + # Create a failing classifier to deliberately fail + failing_clf = DataDependentFailingClassifier(max_x_value=8) + # dummy X data + X = np.arange(1, 10) + y = np.ones(9) + # passing error score to trigger the warning message + cross_validate_args = [failing_clf, X, y] + cross_validate_kwargs = {"cv": 3, "error_score": error_score} + # check if the warning message type is as expected + + individual_fit_error_message = ( + "ValueError: Classifier fit failed with 1 values too high" + ) + warning_message = re.compile( + ( + "2 fits failed.+total of 3.+The score on these" + " train-test partitions for these parameters will be set to" + f" {cross_validate_kwargs['error_score']}.+{individual_fit_error_message}" + ), + flags=re.DOTALL, + ) + + with pytest.warns(FitFailedWarning, match=warning_message): + cross_validate(*cross_validate_args, **cross_validate_kwargs) + + +@pytest.mark.parametrize("error_score", [np.nan, 0]) +def test_cross_validate_all_failing_fits_error(error_score): + # Create a failing classifier to deliberately fail + failing_clf = FailingClassifier(FailingClassifier.FAILING_PARAMETER) + # dummy X data + X = np.arange(1, 10) + y = np.ones(9) + + cross_validate_args = [failing_clf, X, y] + cross_validate_kwargs = {"cv": 7, "error_score": error_score} + + individual_fit_error_message = "ValueError: Failing classifier failed as required" + error_message = re.compile( + ( + "All the 7 fits failed.+your model is 
misconfigured.+" + f"{individual_fit_error_message}" + ), + flags=re.DOTALL, + ) + + with pytest.raises(ValueError, match=error_message): + cross_validate(*cross_validate_args, **cross_validate_kwargs) + + +def _failing_scorer(estimator, X, y, error_msg): + raise ValueError(error_msg) + + +@pytest.mark.filterwarnings("ignore:lbfgs failed to converge") +@pytest.mark.parametrize("error_score", [np.nan, 0, "raise"]) +def test_cross_val_score_failing_scorer(error_score): + # check that an estimator can fail during scoring in `cross_val_score` and + # that we can optionally replaced it with `error_score` + X, y = load_iris(return_X_y=True) + clf = LogisticRegression(max_iter=5).fit(X, y) + + error_msg = "This scorer is supposed to fail!!!" + failing_scorer = partial(_failing_scorer, error_msg=error_msg) + + if error_score == "raise": + with pytest.raises(ValueError, match=error_msg): + cross_val_score( + clf, X, y, cv=3, scoring=failing_scorer, error_score=error_score + ) + else: + warning_msg = ( + "Scoring failed. The score on this train-test partition for " + f"these parameters will be set to {error_score}" + ) + with pytest.warns(UserWarning, match=warning_msg): + scores = cross_val_score( + clf, X, y, cv=3, scoring=failing_scorer, error_score=error_score + ) + assert_allclose(scores, error_score) + + +@pytest.mark.filterwarnings("ignore:lbfgs failed to converge") +@pytest.mark.parametrize("error_score", [np.nan, 0, "raise"]) +@pytest.mark.parametrize("return_train_score", [True, False]) +@pytest.mark.parametrize("with_multimetric", [False, True]) +def test_cross_validate_failing_scorer( + error_score, return_train_score, with_multimetric +): + # Check that an estimator can fail during scoring in `cross_validate` and + # that we can optionally replace it with `error_score`. In the multimetric + # case also check the result of a non-failing scorer where the other scorers + # are failing. + X, y = load_iris(return_X_y=True) + clf = LogisticRegression(max_iter=5).fit(X, y) + + error_msg = "This scorer is supposed to fail!!!" + failing_scorer = partial(_failing_scorer, error_msg=error_msg) + if with_multimetric: + non_failing_scorer = make_scorer(mean_squared_error) + scoring = { + "score_1": failing_scorer, + "score_2": non_failing_scorer, + "score_3": failing_scorer, + } + else: + scoring = failing_scorer + + if error_score == "raise": + with pytest.raises(ValueError, match=error_msg): + cross_validate( + clf, + X, + y, + cv=3, + scoring=scoring, + return_train_score=return_train_score, + error_score=error_score, + ) + else: + warning_msg = ( + "Scoring failed. The score on this train-test partition for " + f"these parameters will be set to {error_score}" + ) + with pytest.warns(UserWarning, match=warning_msg): + results = cross_validate( + clf, + X, + y, + cv=3, + scoring=scoring, + return_train_score=return_train_score, + error_score=error_score, + ) + for key in results: + if "_score" in key: + if "_score_2" in key: + # check the test (and optionally train) score for the + # scorer that should be non-failing + for i in results[key]: + assert isinstance(i, float) + else: + # check the test (and optionally train) score for all + # scorers that should be assigned to `error_score`. 
+ assert_allclose(results[key], error_score) + + +def three_params_scorer(i, j, k): + return 3.4213 + + +@pytest.mark.parametrize( + "train_score, scorer, verbose, split_prg, cdt_prg, expected", + [ + ( + False, + three_params_scorer, + 2, + (1, 3), + (0, 1), + r"\[CV\] END ...................................................." + r" total time= 0.\ds", + ), + ( + True, + _MultimetricScorer( + scorers={"sc1": three_params_scorer, "sc2": three_params_scorer} + ), + 3, + (1, 3), + (0, 1), + r"\[CV 2/3\] END sc1: \(train=3.421, test=3.421\) sc2: " + r"\(train=3.421, test=3.421\) total time= 0.\ds", + ), + ( + False, + _MultimetricScorer( + scorers={"sc1": three_params_scorer, "sc2": three_params_scorer} + ), + 10, + (1, 3), + (0, 1), + r"\[CV 2/3; 1/1\] END ....... sc1: \(test=3.421\) sc2: \(test=3.421\)" + r" total time= 0.\ds", + ), + ], +) +def test_fit_and_score_verbosity( + capsys, train_score, scorer, verbose, split_prg, cdt_prg, expected +): + X, y = make_classification(n_samples=30, random_state=0) + clf = SVC(kernel="linear", random_state=0) + train, test = next(ShuffleSplit().split(X)) + + # test print without train score + fit_and_score_args = dict( + estimator=clf, + X=X, + y=y, + scorer=scorer, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=None, + score_params=None, + return_train_score=train_score, + split_progress=split_prg, + candidate_progress=cdt_prg, + ) + _fit_and_score(**fit_and_score_args) + out, _ = capsys.readouterr() + outlines = out.split("\n") + if len(outlines) > 2: + assert re.match(expected, outlines[1]) + else: + assert re.match(expected, outlines[0]) + + +def test_score(): + error_message = "scoring must return a number, got None" + + def two_params_scorer(estimator, X_test): + return None + + with pytest.raises(ValueError, match=error_message): + _score( + estimator=None, + X_test=None, + y_test=None, + scorer=two_params_scorer, + score_params=None, + error_score=np.nan, + ) + + +def test_callable_multimetric_confusion_matrix_cross_validate(): + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + cm = confusion_matrix(y, y_pred) + return {"tn": cm[0, 0], "fp": cm[0, 1], "fn": cm[1, 0], "tp": cm[1, 1]} + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + est = LinearSVC(random_state=42) + est.fit(X, y) + cv_results = cross_validate(est, X, y, cv=5, scoring=custom_scorer) + + score_names = ["tn", "fp", "fn", "tp"] + for name in score_names: + assert "test_{}".format(name) in cv_results + + +def test_learning_curve_partial_fit_regressors(): + """Check that regressors with partial_fit is supported. + + Non-regression test for #22981. 
+    """
+    X, y = make_regression(random_state=42)
+
+    # Does not error
+    learning_curve(MLPRegressor(), X, y, exploit_incremental_learning=True, cv=2)
+
+
+def test_learning_curve_some_failing_fits_warning(global_random_seed):
+    """Check that fit failures in `learning_curve` raise the required warning."""
+
+    X, y = make_classification(
+        n_samples=30,
+        n_classes=3,
+        n_informative=6,
+        shuffle=False,
+        random_state=global_random_seed,
+    )
+    # sort the target to trigger an SVC error on the first 2 splits, because a
+    # single class is present there
+    sorted_idx = np.argsort(y)
+    X, y = X[sorted_idx], y[sorted_idx]
+
+    svc = SVC()
+    warning_message = "10 fits failed out of a total of 25"
+
+    with pytest.warns(FitFailedWarning, match=warning_message):
+        _, train_score, test_score, *_ = learning_curve(
+            svc, X, y, cv=5, error_score=np.nan
+        )
+
+    # the first 2 splits should lead to warnings and thus np.nan scores
+    for idx in range(2):
+        assert np.isnan(train_score[idx]).all()
+        assert np.isnan(test_score[idx]).all()
+
+    for idx in range(2, train_score.shape[0]):
+        assert not np.isnan(train_score[idx]).any()
+        assert not np.isnan(test_score[idx]).any()
+
+
+def test_cross_validate_return_indices(global_random_seed):
+    """Check the behaviour of `return_indices` in `cross_validate`."""
+    X, y = load_iris(return_X_y=True)
+    X = scale(X)  # scale features for better convergence
+    estimator = LogisticRegression()
+
+    cv = KFold(n_splits=3, shuffle=True, random_state=global_random_seed)
+    cv_results = cross_validate(estimator, X, y, cv=cv, n_jobs=2, return_indices=False)
+    assert "indices" not in cv_results
+
+    cv_results = cross_validate(estimator, X, y, cv=cv, n_jobs=2, return_indices=True)
+    assert "indices" in cv_results
+    train_indices = cv_results["indices"]["train"]
+    test_indices = cv_results["indices"]["test"]
+    assert len(train_indices) == cv.n_splits
+    assert len(test_indices) == cv.n_splits
+
+    assert_array_equal([indices.size for indices in train_indices], 100)
+    assert_array_equal([indices.size for indices in test_indices], 50)
+
+    for split_idx, (expected_train_idx, expected_test_idx) in enumerate(cv.split(X, y)):
+        assert_array_equal(train_indices[split_idx], expected_train_idx)
+        assert_array_equal(test_indices[split_idx], expected_test_idx)
+
+
+# Tests for metadata routing in cross_val* and in *curve
+# ======================================================
+
+
+# TODO(1.8): remove `learning_curve`, `validation_curve` and `permutation_test_score`.
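+# Illustrative sketch (not part of the test suite; `est` and `sw` are
+# placeholder names): after this deprecation, metadata moves from `fit_params`
+# to the `params` argument:
+#
+#     learning_curve(est, X, y, fit_params={"sample_weight": sw})  # FutureWarning
+#     learning_curve(est, X, y, params={"sample_weight": sw})      # replacement
+#
+# Passing both arguments at once raises a ValueError, as checked below.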
+@pytest.mark.parametrize( + "func, extra_args", + [ + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +def test_fit_param_deprecation(func, extra_args): + """Check that we warn about deprecating `fit_params`.""" + with pytest.warns(FutureWarning, match="`fit_params` is deprecated"): + func( + estimator=ConsumingClassifier(), X=X, y=y, cv=2, fit_params={}, **extra_args + ) + + with pytest.raises( + ValueError, match="`params` and `fit_params` cannot both be provided" + ): + func( + estimator=ConsumingClassifier(), + X=X, + y=y, + fit_params={}, + params={}, + **extra_args, + ) + + +@pytest.mark.parametrize( + "func, extra_args", + [ + (cross_validate, {}), + (cross_val_score, {}), + (cross_val_predict, {}), + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +@config_context(enable_metadata_routing=True) +def test_groups_with_routing_validation(func, extra_args): + """Check that we raise an error if `groups` are passed to the cv method instead + of `params` when metadata routing is enabled. + """ + with pytest.raises(ValueError, match="`groups` can only be passed if"): + func( + estimator=ConsumingClassifier(), + X=X, + y=y, + groups=[], + **extra_args, + ) + + +@pytest.mark.parametrize( + "func, extra_args", + [ + (cross_validate, {}), + (cross_val_score, {}), + (cross_val_predict, {}), + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +@config_context(enable_metadata_routing=True) +def test_passed_unrequested_metadata(func, extra_args): + """Check that we raise an error when passing metadata that is not + requested.""" + err_msg = re.escape("but are not explicitly set as requested or not requested") + with pytest.raises(ValueError, match=err_msg): + func( + estimator=ConsumingClassifier(), + X=X, + y=y, + params=dict(metadata=[]), + **extra_args, + ) + + +@pytest.mark.parametrize( + "func, extra_args", + [ + (cross_validate, {}), + (cross_val_score, {}), + (cross_val_predict, {}), + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +@config_context(enable_metadata_routing=True) +def test_validation_functions_routing(func, extra_args): + """Check that the respective cv method is properly dispatching the metadata + to the consumer.""" + scorer_registry = _Registry() + scorer = ConsumingScorer(registry=scorer_registry).set_score_request( + sample_weight="score_weights", metadata="score_metadata" + ) + splitter_registry = _Registry() + splitter = ConsumingSplitter(registry=splitter_registry).set_split_request( + groups="split_groups", metadata="split_metadata" + ) + estimator_registry = _Registry() + estimator = ConsumingClassifier(registry=estimator_registry).set_fit_request( + sample_weight="fit_sample_weight", metadata="fit_metadata" + ) + + n_samples = _num_samples(X) + rng = np.random.RandomState(0) + score_weights = rng.rand(n_samples) + score_metadata = rng.rand(n_samples) + split_groups = rng.randint(0, 3, n_samples) + split_metadata = rng.rand(n_samples) + fit_sample_weight = rng.rand(n_samples) + fit_metadata = rng.rand(n_samples) + + scoring_args = { + cross_validate: dict(scoring=dict(my_scorer=scorer, accuracy="accuracy")), + cross_val_score: dict(scoring=scorer), + learning_curve: dict(scoring=scorer), + validation_curve: 
dict(scoring=scorer), + permutation_test_score: dict(scoring=scorer), + cross_val_predict: dict(), + } + + params = dict( + split_groups=split_groups, + split_metadata=split_metadata, + fit_sample_weight=fit_sample_weight, + fit_metadata=fit_metadata, + ) + + if func is not cross_val_predict: + params.update( + score_weights=score_weights, + score_metadata=score_metadata, + ) + + func( + estimator, + X=X, + y=y, + cv=splitter, + **scoring_args[func], + **extra_args, + params=params, + ) + + if func is not cross_val_predict: + # cross_val_predict doesn't need a scorer + assert len(scorer_registry) + for _scorer in scorer_registry: + check_recorded_metadata( + obj=_scorer, + method="score", + parent=func.__name__, + split_params=("sample_weight", "metadata"), + sample_weight=score_weights, + metadata=score_metadata, + ) + + assert len(splitter_registry) + for _splitter in splitter_registry: + check_recorded_metadata( + obj=_splitter, + method="split", + parent=func.__name__, + groups=split_groups, + metadata=split_metadata, + ) + + assert len(estimator_registry) + for _estimator in estimator_registry: + check_recorded_metadata( + obj=_estimator, + method="fit", + parent=func.__name__, + split_params=("sample_weight", "metadata"), + sample_weight=fit_sample_weight, + metadata=fit_metadata, + ) + + +@config_context(enable_metadata_routing=True) +def test_learning_curve_exploit_incremental_learning_routing(): + """Test that learning_curve routes metadata to the estimator correctly while + partial_fitting it with `exploit_incremental_learning=True`.""" + + n_samples = _num_samples(X) + rng = np.random.RandomState(0) + fit_sample_weight = rng.rand(n_samples) + fit_metadata = rng.rand(n_samples) + + estimator_registry = _Registry() + estimator = ConsumingClassifier( + registry=estimator_registry + ).set_partial_fit_request( + sample_weight="fit_sample_weight", metadata="fit_metadata" + ) + + learning_curve( + estimator, + X=X, + y=y, + cv=ConsumingSplitter(), + exploit_incremental_learning=True, + params=dict(fit_sample_weight=fit_sample_weight, fit_metadata=fit_metadata), + ) + + assert len(estimator_registry) + for _estimator in estimator_registry: + check_recorded_metadata( + obj=_estimator, + method="partial_fit", + parent="learning_curve", + split_params=("sample_weight", "metadata"), + sample_weight=fit_sample_weight, + metadata=fit_metadata, + ) + + +# End of metadata routing tests +# ============================= diff --git a/.venv/Lib/site-packages/sklearn/neighbors/__init__.py b/.venv/Lib/site-packages/sklearn/neighbors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2fd0b2ea9663bdd7905ed771de61fccb227bd62c --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/__init__.py @@ -0,0 +1,42 @@ +"""The k-nearest neighbors algorithms.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._ball_tree import BallTree +from ._base import VALID_METRICS, VALID_METRICS_SPARSE, sort_graph_by_row_values +from ._classification import KNeighborsClassifier, RadiusNeighborsClassifier +from ._graph import ( + KNeighborsTransformer, + RadiusNeighborsTransformer, + kneighbors_graph, + radius_neighbors_graph, +) +from ._kd_tree import KDTree +from ._kde import KernelDensity +from ._lof import LocalOutlierFactor +from ._nca import NeighborhoodComponentsAnalysis +from ._nearest_centroid import NearestCentroid +from ._regression import KNeighborsRegressor, RadiusNeighborsRegressor +from ._unsupervised import NearestNeighbors + 
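+# A minimal usage sketch of this module's public API (illustrative only;
+# the sample data and the query point are placeholders):
+#
+#     from sklearn.neighbors import NearestNeighbors
+#     nn = NearestNeighbors(n_neighbors=2).fit([[0.0], [1.0], [3.0]])
+#     dist, ind = nn.kneighbors([[1.1]])  # distances to / indices of neighbors
+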
+__all__ = [
+    "BallTree",
+    "KDTree",
+    "KNeighborsClassifier",
+    "KNeighborsRegressor",
+    "KNeighborsTransformer",
+    "NearestCentroid",
+    "NearestNeighbors",
+    "RadiusNeighborsClassifier",
+    "RadiusNeighborsRegressor",
+    "RadiusNeighborsTransformer",
+    "kneighbors_graph",
+    "radius_neighbors_graph",
+    "KernelDensity",
+    "LocalOutlierFactor",
+    "NeighborhoodComponentsAnalysis",
+    "sort_graph_by_row_values",
+    "VALID_METRICS",
+    "VALID_METRICS_SPARSE",
+]
diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.cp39-win_amd64.lib b/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.cp39-win_amd64.lib
new file mode 100644
index 0000000000000000000000000000000000000000..39689881f1436f44d1ecdfc104a676a25a40bcc4
Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.cp39-win_amd64.lib differ
diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.cp39-win_amd64.pyd b/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.cp39-win_amd64.pyd
new file mode 100644
index 0000000000000000000000000000000000000000..45d28aa5e3d717c6386baf90c580c2df711f963e
Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.cp39-win_amd64.pyd differ
diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.pyx.tp b/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.pyx.tp
new file mode 100644
index 0000000000000000000000000000000000000000..e0132c10facbc11ea1353d0d7848f9ff8fd0a17c
--- /dev/null
+++ b/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.pyx.tp
@@ -0,0 +1,284 @@
+{{py:
+
+# Generated file: _ball_tree.pyx
+
+implementation_specific_values = [
+    # The values are arranged as follows:
+    #
+    # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE
+    #
+    ('64', 'float64_t', 'np.float64'),
+    ('32', 'float32_t', 'np.float32')
+]
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+}}
+
+
+__all__ = ['BallTree', 'BallTree64', 'BallTree32']
+
+{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
+
+DOC_DICT{{name_suffix}} = {
+    'BinaryTree': 'BallTree{{name_suffix}}',
+    'binary_tree': 'ball_tree{{name_suffix}}',
+}
+
+VALID_METRICS{{name_suffix}} = [
+    'BrayCurtisDistance{{name_suffix}}',
+    'CanberraDistance{{name_suffix}}',
+    'ChebyshevDistance{{name_suffix}}',
+    'DiceDistance{{name_suffix}}',
+    'EuclideanDistance{{name_suffix}}',
+    'HammingDistance{{name_suffix}}',
+    'HaversineDistance{{name_suffix}}',
+    'JaccardDistance{{name_suffix}}',
+    'MahalanobisDistance{{name_suffix}}',
+    'ManhattanDistance{{name_suffix}}',
+    'MinkowskiDistance{{name_suffix}}',
+    'PyFuncDistance{{name_suffix}}',
+    'RogersTanimotoDistance{{name_suffix}}',
+    'RussellRaoDistance{{name_suffix}}',
+    'SEuclideanDistance{{name_suffix}}',
+    'SokalMichenerDistance{{name_suffix}}',
+    'SokalSneathDistance{{name_suffix}}',
+    'WMinkowskiDistance{{name_suffix}}',
+]
+
+{{endfor}}
+
+include "_binary_tree.pxi"
+
+{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
+
+# Inherit BallTree{{name_suffix}} from BinaryTree{{name_suffix}}
+cdef class BallTree{{name_suffix}}(BinaryTree{{name_suffix}}):
+    __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}})
+    pass
+
+{{endfor}}
+
+
+#----------------------------------------------------------------------
+# The functions below specialize the Binary Tree as a Ball Tree
+#
+# Note that these functions use the concept of "reduced distance".
+# The reduced distance, defined for some metrics, is a quantity which +# is more efficient to compute than the distance, but preserves the +# relative rankings of the true distance. For example, the reduced +# distance for the Euclidean metric is the squared-euclidean distance. +# For some metrics, the reduced distance is simply the distance. + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t n_nodes, + intp_t n_features, +) except -1: + """Allocate arrays needed for the KD Tree""" + tree.node_bounds = np.zeros((1, n_nodes, n_features), dtype={{INPUT_DTYPE}}) + return 0 + + +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + intp_t i_node, + intp_t idx_start, + intp_t idx_end, +) except -1: + """Initialize the node for the dataset stored in tree.data""" + cdef intp_t n_features = tree.data.shape[1] + cdef intp_t n_points = idx_end - idx_start + + cdef intp_t i, j + cdef float64_t radius + cdef const {{INPUT_DTYPE_t}} *this_pt + + cdef intp_t* idx_array = &tree.idx_array[0] + cdef const {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] + cdef {{INPUT_DTYPE_t}}* centroid = &tree.node_bounds[0, i_node, 0] + + cdef bint with_sample_weight = tree.sample_weight is not None + cdef const {{INPUT_DTYPE_t}}* sample_weight + cdef float64_t sum_weight_node + if with_sample_weight: + sample_weight = &tree.sample_weight[0] + + # determine Node centroid + for j in range(n_features): + centroid[j] = 0 + + if with_sample_weight: + sum_weight_node = 0 + for i in range(idx_start, idx_end): + sum_weight_node += sample_weight[idx_array[i]] + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] * sample_weight[idx_array[i]] + + for j in range(n_features): + centroid[j] /= sum_weight_node + else: + for i in range(idx_start, idx_end): + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] + + for j in range(n_features): + centroid[j] /= n_points + + # determine Node radius + radius = 0 + for i in range(idx_start, idx_end): + radius = fmax(radius, + tree.rdist(centroid, + data + n_features * idx_array[i], + n_features)) + + node_data[i_node].radius = tree.dist_metric._rdist_to_dist(radius) + node_data[i_node].idx_start = idx_start + node_data[i_node].idx_end = idx_end + return 0 + + +cdef inline float64_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return fmax(0, dist_pt - tree.node_data[i_node].radius) + + +cdef inline float64_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return dist_pt + tree.node_data[i_node].radius + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + float64_t* min_dist, + float64_t* max_dist, +) except -1 nogil: + """Compute the minimum and maximum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + 
tree.data.shape[1]) + cdef float64_t rad = tree.node_data[i_node].radius + min_dist[0] = fmax(0, dist_pt - rad) + max_dist[0] = dist_pt + rad + return 0 + + +cdef inline float64_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum reduced-distance between a point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline float64_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum reduced-distance between a point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline float64_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the minimum distance between two nodes""" + cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return fmax(0, (dist_pt - tree1.node_data[i_node1].radius + - tree2.node_data[i_node2].radius)) + + +cdef inline float64_t max_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the maximum distance between two nodes""" + cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return (dist_pt + tree1.node_data[i_node1].radius + + tree2.node_data[i_node2].radius) + + +cdef inline float64_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the minimum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + + +cdef inline float64_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the maximum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} + + +class BallTree(BallTree64): + __doc__ = CLASS_DOC.format(BinaryTree="BallTree") + pass diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_base.py b/.venv/Lib/site-packages/sklearn/neighbors/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..419f84ebacda869f3f62f93432c8f9e9a3d039dc --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_base.py @@ -0,0 +1,1403 @@ +"""Base and mixin classes for nearest neighbors.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import numbers +import 
warnings +from abc import ABCMeta, abstractmethod +from functools import partial +from numbers import Integral, Real + +import numpy as np +from joblib import effective_n_jobs +from scipy.sparse import csr_matrix, issparse + +from ..base import BaseEstimator, MultiOutputMixin, is_classifier +from ..exceptions import DataConversionWarning, EfficiencyWarning +from ..metrics import DistanceMetric, pairwise_distances_chunked +from ..metrics._pairwise_distances_reduction import ( + ArgKmin, + RadiusNeighbors, +) +from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS +from ..utils import ( + check_array, + gen_even_slices, + get_tags, +) +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.fixes import parse_version, sp_base_version +from ..utils.multiclass import check_classification_targets +from ..utils.parallel import Parallel, delayed +from ..utils.validation import _to_object_array, check_is_fitted, validate_data +from ._ball_tree import BallTree +from ._kd_tree import KDTree + +SCIPY_METRICS = [ + "braycurtis", + "canberra", + "chebyshev", + "correlation", + "cosine", + "dice", + "hamming", + "jaccard", + "mahalanobis", + "minkowski", + "rogerstanimoto", + "russellrao", + "seuclidean", + "sokalsneath", + "sqeuclidean", + "yule", +] +if sp_base_version < parse_version("1.17"): + # Deprecated in SciPy 1.15 and removed in SciPy 1.17 + SCIPY_METRICS += ["sokalmichener"] +if sp_base_version < parse_version("1.11"): + # Deprecated in SciPy 1.9 and removed in SciPy 1.11 + SCIPY_METRICS += ["kulsinski"] +if sp_base_version < parse_version("1.9"): + # Deprecated in SciPy 1.0 and removed in SciPy 1.9 + SCIPY_METRICS += ["matching"] + +VALID_METRICS = dict( + ball_tree=BallTree.valid_metrics, + kd_tree=KDTree.valid_metrics, + # The following list comes from the + # sklearn.metrics.pairwise doc string + brute=sorted(set(PAIRWISE_DISTANCE_FUNCTIONS).union(SCIPY_METRICS)), +) + +VALID_METRICS_SPARSE = dict( + ball_tree=[], + kd_tree=[], + brute=(PAIRWISE_DISTANCE_FUNCTIONS.keys() - {"haversine", "nan_euclidean"}), +) + + +def _get_weights(dist, weights): + """Get the weights from an array of distances and a parameter ``weights``. + + Assume weights have already been validated. + + Parameters + ---------- + dist : ndarray + The input distances. + + weights : {'uniform', 'distance'}, callable or None + The kind of weighting used. + + Returns + ------- + weights_arr : array of the same shape as ``dist`` + If ``weights == 'uniform'``, then returns None. + """ + if weights in (None, "uniform"): + return None + + if weights == "distance": + # if user attempts to classify a point that was zero distance from one + # or more training points, those training points are weighted as 1.0 + # and the other points as 0.0 + if dist.dtype is np.dtype(object): + for point_dist_i, point_dist in enumerate(dist): + # check if point_dist is iterable + # (ex: RadiusNeighborClassifier.predict may set an element of + # dist to 1e-6 to represent an 'outlier') + if hasattr(point_dist, "__contains__") and 0.0 in point_dist: + dist[point_dist_i] = point_dist == 0.0 + else: + dist[point_dist_i] = 1.0 / point_dist + else: + with np.errstate(divide="ignore"): + dist = 1.0 / dist + inf_mask = np.isinf(dist) + inf_row = np.any(inf_mask, axis=1) + dist[inf_row] = inf_mask[inf_row] + return dist + + if callable(weights): + return weights(dist) + + +def _is_sorted_by_data(graph): + """Return whether the graph's non-zero entries are sorted by data. 
+ + The non-zero entries are stored in graph.data and graph.indices. + For each row (or sample), the non-zero entries can be either: + - sorted by indices, as after graph.sort_indices(); + - sorted by data, as after _check_precomputed(graph); + - not sorted. + + Parameters + ---------- + graph : sparse matrix of shape (n_samples, n_samples) + Neighbors graph as given by `kneighbors_graph` or + `radius_neighbors_graph`. Matrix should be of format CSR format. + + Returns + ------- + res : bool + Whether input graph is sorted by data. + """ + assert graph.format == "csr" + out_of_order = graph.data[:-1] > graph.data[1:] + line_change = np.unique(graph.indptr[1:-1] - 1) + line_change = line_change[line_change < out_of_order.shape[0]] + return out_of_order.sum() == out_of_order[line_change].sum() + + +def _check_precomputed(X): + """Check precomputed distance matrix. + + If the precomputed distance matrix is sparse, it checks that the non-zero + entries are sorted by distances. If not, the matrix is copied and sorted. + + Parameters + ---------- + X : {sparse matrix, array-like}, (n_samples, n_samples) + Distance matrix to other samples. X may be a sparse matrix, in which + case only non-zero elements may be considered neighbors. + + Returns + ------- + X : {sparse matrix, array-like}, (n_samples, n_samples) + Distance matrix to other samples. X may be a sparse matrix, in which + case only non-zero elements may be considered neighbors. + """ + if not issparse(X): + X = check_array(X, ensure_non_negative=True, input_name="X") + return X + else: + graph = X + + if graph.format not in ("csr", "csc", "coo", "lil"): + raise TypeError( + "Sparse matrix in {!r} format is not supported due to " + "its handling of explicit zeros".format(graph.format) + ) + copied = graph.format != "csr" + graph = check_array( + graph, + accept_sparse="csr", + ensure_non_negative=True, + input_name="precomputed distance matrix", + ) + graph = sort_graph_by_row_values(graph, copy=not copied, warn_when_not_sorted=True) + + return graph + + +@validate_params( + { + "graph": ["sparse matrix"], + "copy": ["boolean"], + "warn_when_not_sorted": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def sort_graph_by_row_values(graph, copy=False, warn_when_not_sorted=True): + """Sort a sparse graph such that each row is stored with increasing values. + + .. versionadded:: 1.2 + + Parameters + ---------- + graph : sparse matrix of shape (n_samples, n_samples) + Distance matrix to other samples, where only non-zero elements are + considered neighbors. Matrix is converted to CSR format if not already. + + copy : bool, default=False + If True, the graph is copied before sorting. If False, the sorting is + performed inplace. If the graph is not of CSR format, `copy` must be + True to allow the conversion to CSR format, otherwise an error is + raised. + + warn_when_not_sorted : bool, default=True + If True, a :class:`~sklearn.exceptions.EfficiencyWarning` is raised + when the input graph is not sorted by row values. + + Returns + ------- + graph : sparse matrix of shape (n_samples, n_samples) + Distance matrix to other samples, where only non-zero elements are + considered neighbors. Matrix is in CSR format. + + Examples + -------- + >>> from scipy.sparse import csr_matrix + >>> from sklearn.neighbors import sort_graph_by_row_values + >>> X = csr_matrix( + ... [[0., 3., 1.], + ... [3., 0., 2.], + ... 
[1., 2., 0.]]) + >>> X.data + array([3., 1., 3., 2., 1., 2.]) + >>> X_ = sort_graph_by_row_values(X) + >>> X_.data + array([1., 3., 2., 3., 1., 2.]) + """ + if graph.format == "csr" and _is_sorted_by_data(graph): + return graph + + if warn_when_not_sorted: + warnings.warn( + ( + "Precomputed sparse input was not sorted by row values. Use the" + " function sklearn.neighbors.sort_graph_by_row_values to sort the input" + " by row values, with warn_when_not_sorted=False to remove this" + " warning." + ), + EfficiencyWarning, + ) + + if graph.format not in ("csr", "csc", "coo", "lil"): + raise TypeError( + f"Sparse matrix in {graph.format!r} format is not supported due to " + "its handling of explicit zeros" + ) + elif graph.format != "csr": + if not copy: + raise ValueError( + "The input graph is not in CSR format. Use copy=True to allow " + "the conversion to CSR format." + ) + graph = graph.asformat("csr") + elif copy: # csr format with copy=True + graph = graph.copy() + + row_nnz = np.diff(graph.indptr) + if row_nnz.max() == row_nnz.min(): + # if each sample has the same number of provided neighbors + n_samples = graph.shape[0] + distances = graph.data.reshape(n_samples, -1) + + order = np.argsort(distances, kind="mergesort") + order += np.arange(n_samples)[:, None] * row_nnz[0] + order = order.ravel() + graph.data = graph.data[order] + graph.indices = graph.indices[order] + + else: + for start, stop in zip(graph.indptr, graph.indptr[1:]): + order = np.argsort(graph.data[start:stop], kind="mergesort") + graph.data[start:stop] = graph.data[start:stop][order] + graph.indices[start:stop] = graph.indices[start:stop][order] + + return graph + + +def _kneighbors_from_graph(graph, n_neighbors, return_distance): + """Decompose a nearest neighbors sparse graph into distances and indices. + + Parameters + ---------- + graph : sparse matrix of shape (n_samples, n_samples) + Neighbors graph as given by `kneighbors_graph` or + `radius_neighbors_graph`. Matrix should be of format CSR format. + + n_neighbors : int + Number of neighbors required for each sample. + + return_distance : bool + Whether or not to return the distances. + + Returns + ------- + neigh_dist : ndarray of shape (n_samples, n_neighbors) + Distances to nearest neighbors. Only present if `return_distance=True`. + + neigh_ind : ndarray of shape (n_samples, n_neighbors) + Indices of nearest neighbors. + """ + n_samples = graph.shape[0] + assert graph.format == "csr" + + # number of neighbors by samples + row_nnz = np.diff(graph.indptr) + row_nnz_min = row_nnz.min() + if n_neighbors is not None and row_nnz_min < n_neighbors: + raise ValueError( + "%d neighbors per samples are required, but some samples have only" + " %d neighbors in precomputed graph matrix. Decrease number of " + "neighbors used or recompute the graph with more neighbors." + % (n_neighbors, row_nnz_min) + ) + + def extract(a): + # if each sample has the same number of provided neighbors + if row_nnz.max() == row_nnz_min: + return a.reshape(n_samples, -1)[:, :n_neighbors] + else: + idx = np.tile(np.arange(n_neighbors), (n_samples, 1)) + idx += graph.indptr[:-1, None] + return a.take(idx, mode="clip").reshape(n_samples, n_neighbors) + + if return_distance: + return extract(graph.data), extract(graph.indices) + else: + return extract(graph.indices) + + +def _radius_neighbors_from_graph(graph, radius, return_distance): + """Decompose a nearest neighbors sparse graph into distances and indices. 
+ + Parameters + ---------- + graph : sparse matrix of shape (n_samples, n_samples) + Neighbors graph as given by `kneighbors_graph` or + `radius_neighbors_graph`. Matrix should be of format CSR format. + + radius : float + Radius of neighborhoods which should be strictly positive. + + return_distance : bool + Whether or not to return the distances. + + Returns + ------- + neigh_dist : ndarray of shape (n_samples,) of arrays + Distances to nearest neighbors. Only present if `return_distance=True`. + + neigh_ind : ndarray of shape (n_samples,) of arrays + Indices of nearest neighbors. + """ + assert graph.format == "csr" + + no_filter_needed = bool(graph.data.max() <= radius) + + if no_filter_needed: + data, indices, indptr = graph.data, graph.indices, graph.indptr + else: + mask = graph.data <= radius + if return_distance: + data = np.compress(mask, graph.data) + indices = np.compress(mask, graph.indices) + indptr = np.concatenate(([0], np.cumsum(mask)))[graph.indptr] + + indices = indices.astype(np.intp, copy=no_filter_needed) + + if return_distance: + neigh_dist = _to_object_array(np.split(data, indptr[1:-1])) + neigh_ind = _to_object_array(np.split(indices, indptr[1:-1])) + + if return_distance: + return neigh_dist, neigh_ind + else: + return neigh_ind + + +class NeighborsBase(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for nearest neighbors estimators.""" + + _parameter_constraints: dict = { + "n_neighbors": [Interval(Integral, 1, None, closed="left"), None], + "radius": [Interval(Real, 0, None, closed="both"), None], + "algorithm": [StrOptions({"auto", "ball_tree", "kd_tree", "brute"})], + "leaf_size": [Interval(Integral, 1, None, closed="left")], + "p": [Interval(Real, 0, None, closed="right"), None], + "metric": [StrOptions(set(itertools.chain(*VALID_METRICS.values()))), callable], + "metric_params": [dict, None], + "n_jobs": [Integral, None], + } + + @abstractmethod + def __init__( + self, + n_neighbors=None, + radius=None, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + self.n_neighbors = n_neighbors + self.radius = radius + self.algorithm = algorithm + self.leaf_size = leaf_size + self.metric = metric + self.metric_params = metric_params + self.p = p + self.n_jobs = n_jobs + + def _check_algorithm_metric(self): + if self.algorithm == "auto": + if self.metric == "precomputed": + alg_check = "brute" + elif ( + callable(self.metric) + or self.metric in VALID_METRICS["ball_tree"] + or isinstance(self.metric, DistanceMetric) + ): + alg_check = "ball_tree" + else: + alg_check = "brute" + else: + alg_check = self.algorithm + + if callable(self.metric): + if self.algorithm == "kd_tree": + # callable metric is only valid for brute force and ball_tree + raise ValueError( + "kd_tree does not support callable metric '%s'" + "Function call overhead will result" + "in very poor performance." % self.metric + ) + elif self.metric not in VALID_METRICS[alg_check] and not isinstance( + self.metric, DistanceMetric + ): + raise ValueError( + "Metric '%s' not valid. Use " + "sorted(sklearn.neighbors.VALID_METRICS['%s']) " + "to get valid options. " + "Metric can also be a callable function." % (self.metric, alg_check) + ) + + if self.metric_params is not None and "p" in self.metric_params: + if self.p is not None: + warnings.warn( + ( + "Parameter p is found in metric_params. " + "The corresponding parameter from __init__ " + "is ignored." 
+ ), + SyntaxWarning, + stacklevel=3, + ) + + def _fit(self, X, y=None): + ensure_all_finite = "allow-nan" if get_tags(self).input_tags.allow_nan else True + if self.__sklearn_tags__().target_tags.required: + if not isinstance(X, (KDTree, BallTree, NeighborsBase)): + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + multi_output=True, + order="C", + ensure_all_finite=ensure_all_finite, + ) + + if is_classifier(self): + # Classification targets require a specific format + if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1: + if y.ndim != 1: + warnings.warn( + ( + "A column-vector y was passed when a " + "1d array was expected. Please change " + "the shape of y to (n_samples,), for " + "example using ravel()." + ), + DataConversionWarning, + stacklevel=2, + ) + + self.outputs_2d_ = False + y = y.reshape((-1, 1)) + else: + self.outputs_2d_ = True + + check_classification_targets(y) + self.classes_ = [] + # Using `dtype=np.intp` is necessary since `np.bincount` + # (called in _classification.py) fails when dealing + # with a float64 array on 32bit systems. + self._y = np.empty(y.shape, dtype=np.intp) + for k in range(self._y.shape[1]): + classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True) + self.classes_.append(classes) + + if not self.outputs_2d_: + self.classes_ = self.classes_[0] + self._y = self._y.ravel() + else: + self._y = y + + else: + if not isinstance(X, (KDTree, BallTree, NeighborsBase)): + X = validate_data( + self, + X, + ensure_all_finite=ensure_all_finite, + accept_sparse="csr", + order="C", + ) + + self._check_algorithm_metric() + if self.metric_params is None: + self.effective_metric_params_ = {} + else: + self.effective_metric_params_ = self.metric_params.copy() + + effective_p = self.effective_metric_params_.get("p", self.p) + if self.metric == "minkowski": + self.effective_metric_params_["p"] = effective_p + + self.effective_metric_ = self.metric + # For minkowski distance, use more efficient methods where available + if self.metric == "minkowski": + p = self.effective_metric_params_.pop("p", 2) + w = self.effective_metric_params_.pop("w", None) + + if p == 1 and w is None: + self.effective_metric_ = "manhattan" + elif p == 2 and w is None: + self.effective_metric_ = "euclidean" + elif p == np.inf and w is None: + self.effective_metric_ = "chebyshev" + else: + # Use the generic minkowski metric, possibly weighted. + self.effective_metric_params_["p"] = p + self.effective_metric_params_["w"] = w + + if isinstance(X, NeighborsBase): + self._fit_X = X._fit_X + self._tree = X._tree + self._fit_method = X._fit_method + self.n_samples_fit_ = X.n_samples_fit_ + return self + + elif isinstance(X, BallTree): + self._fit_X = X.data + self._tree = X + self._fit_method = "ball_tree" + self.n_samples_fit_ = X.data.shape[0] + return self + + elif isinstance(X, KDTree): + self._fit_X = X.data + self._tree = X + self._fit_method = "kd_tree" + self.n_samples_fit_ = X.data.shape[0] + return self + + if self.metric == "precomputed": + X = _check_precomputed(X) + # Precomputed matrix X must be squared + if X.shape[0] != X.shape[1]: + raise ValueError( + "Precomputed matrix must be square." 
+ " Input is a {}x{} matrix.".format(X.shape[0], X.shape[1]) + ) + self.n_features_in_ = X.shape[1] + + n_samples = X.shape[0] + if n_samples == 0: + raise ValueError("n_samples must be greater than 0") + + if issparse(X): + if self.algorithm not in ("auto", "brute"): + warnings.warn("cannot use tree with sparse input: using brute force") + + if ( + self.effective_metric_ not in VALID_METRICS_SPARSE["brute"] + and not callable(self.effective_metric_) + and not isinstance(self.effective_metric_, DistanceMetric) + ): + raise ValueError( + "Metric '%s' not valid for sparse input. " + "Use sorted(sklearn.neighbors." + "VALID_METRICS_SPARSE['brute']) " + "to get valid options. " + "Metric can also be a callable function." % (self.effective_metric_) + ) + self._fit_X = X.copy() + self._tree = None + self._fit_method = "brute" + self.n_samples_fit_ = X.shape[0] + return self + + self._fit_method = self.algorithm + self._fit_X = X + self.n_samples_fit_ = X.shape[0] + + if self._fit_method == "auto": + # A tree approach is better for small number of neighbors or small + # number of features, with KDTree generally faster when available + if ( + self.metric == "precomputed" + or self._fit_X.shape[1] > 15 + or ( + self.n_neighbors is not None + and self.n_neighbors >= self._fit_X.shape[0] // 2 + ) + ): + self._fit_method = "brute" + else: + if ( + self.effective_metric_ == "minkowski" + and self.effective_metric_params_["p"] < 1 + ): + self._fit_method = "brute" + elif ( + self.effective_metric_ == "minkowski" + and self.effective_metric_params_.get("w") is not None + ): + # 'minkowski' with weights is not supported by KDTree but is + # supported byBallTree. + self._fit_method = "ball_tree" + elif self.effective_metric_ in VALID_METRICS["kd_tree"]: + self._fit_method = "kd_tree" + elif ( + callable(self.effective_metric_) + or self.effective_metric_ in VALID_METRICS["ball_tree"] + ): + self._fit_method = "ball_tree" + else: + self._fit_method = "brute" + + if ( + self.effective_metric_ == "minkowski" + and self.effective_metric_params_["p"] < 1 + ): + # For 0 < p < 1 Minkowski distances aren't valid distance + # metric as they do not satisfy triangular inequality: + # they are semi-metrics. + # algorithm="kd_tree" and algorithm="ball_tree" can't be used because + # KDTree and BallTree require a proper distance metric to work properly. + # However, the brute-force algorithm supports semi-metrics. + if self._fit_method == "brute": + warnings.warn( + "Mind that for 0 < p < 1, Minkowski metrics are not distance" + " metrics. Continuing the execution with `algorithm='brute'`." + ) + else: # self._fit_method in ("kd_tree", "ball_tree") + raise ValueError( + f'algorithm="{self._fit_method}" does not support 0 < p < 1 for ' + "the Minkowski metric. To resolve this problem either " + 'set p >= 1 or algorithm="brute".' + ) + + if self._fit_method == "ball_tree": + self._tree = BallTree( + X, + self.leaf_size, + metric=self.effective_metric_, + **self.effective_metric_params_, + ) + elif self._fit_method == "kd_tree": + if ( + self.effective_metric_ == "minkowski" + and self.effective_metric_params_.get("w") is not None + ): + raise ValueError( + "algorithm='kd_tree' is not valid for " + "metric='minkowski' with a weight parameter 'w': " + "try algorithm='ball_tree' " + "or algorithm='brute' instead." 
+ ) + self._tree = KDTree( + X, + self.leaf_size, + metric=self.effective_metric_, + **self.effective_metric_params_, + ) + elif self._fit_method == "brute": + self._tree = None + + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # For cross-validation routines to split data correctly + tags.input_tags.pairwise = self.metric == "precomputed" + # when input is precomputed metric values, all those values need to be positive + tags.input_tags.positive_only = tags.input_tags.pairwise + tags.input_tags.allow_nan = self.metric == "nan_euclidean" + return tags + + +class KNeighborsMixin: + """Mixin for k-neighbors searches.""" + + def _kneighbors_reduce_func(self, dist, start, n_neighbors, return_distance): + """Reduce a chunk of distances to the nearest neighbors. + + Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked` + + Parameters + ---------- + dist : ndarray of shape (n_samples_chunk, n_samples) + The distance matrix. + + start : int + The index in X which the first row of dist corresponds to. + + n_neighbors : int + Number of neighbors required for each sample. + + return_distance : bool + Whether or not to return the distances. + + Returns + ------- + dist : array of shape (n_samples_chunk, n_neighbors) + Returned only if `return_distance=True`. + + neigh : array of shape (n_samples_chunk, n_neighbors) + The neighbors indices. + """ + sample_range = np.arange(dist.shape[0])[:, None] + neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1) + neigh_ind = neigh_ind[:, :n_neighbors] + # argpartition doesn't guarantee sorted order, so we sort again + neigh_ind = neigh_ind[sample_range, np.argsort(dist[sample_range, neigh_ind])] + if return_distance: + if self.effective_metric_ == "euclidean": + result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind + else: + result = dist[sample_range, neigh_ind], neigh_ind + else: + result = neigh_ind + return result + + def kneighbors(self, X=None, n_neighbors=None, return_distance=True): + """Find the K-neighbors of a point. + + Returns indices of and distances to the neighbors of each point. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', default=None + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + n_neighbors : int, default=None + Number of neighbors required for each sample. The default is the + value passed to the constructor. + + return_distance : bool, default=True + Whether or not to return the distances. + + Returns + ------- + neigh_dist : ndarray of shape (n_queries, n_neighbors) + Array representing the lengths to points, only present if + return_distance=True. + + neigh_ind : ndarray of shape (n_queries, n_neighbors) + Indices of the nearest points in the population matrix. 
+ + Examples + -------- + In the following example, we construct a NearestNeighbors + class from an array representing our data set and ask who's + the closest point to [1,1,1] + + >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(n_neighbors=1) + >>> neigh.fit(samples) + NearestNeighbors(n_neighbors=1) + >>> print(neigh.kneighbors([[1., 1., 1.]])) + (array([[0.5]]), array([[2]])) + + As you can see, it returns [[0.5]], and [[2]], which means that the + element is at distance 0.5 and is the third element of samples + (indexes start at 0). You can also query for multiple points: + + >>> X = [[0., 1., 0.], [1., 0., 1.]] + >>> neigh.kneighbors(X, return_distance=False) + array([[1], + [2]]...) + """ + check_is_fitted(self) + + if n_neighbors is None: + n_neighbors = self.n_neighbors + elif n_neighbors <= 0: + raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors) + elif not isinstance(n_neighbors, numbers.Integral): + raise TypeError( + "n_neighbors does not take %s value, enter integer value" + % type(n_neighbors) + ) + + ensure_all_finite = "allow-nan" if get_tags(self).input_tags.allow_nan else True + query_is_train = X is None + if query_is_train: + X = self._fit_X + # Include an extra neighbor to account for the sample itself being + # returned, which is removed later + n_neighbors += 1 + else: + if self.metric == "precomputed": + X = _check_precomputed(X) + else: + X = validate_data( + self, + X, + ensure_all_finite=ensure_all_finite, + accept_sparse="csr", + reset=False, + order="C", + ) + + n_samples_fit = self.n_samples_fit_ + if n_neighbors > n_samples_fit: + if query_is_train: + n_neighbors -= 1 # ok to modify inplace because an error is raised + inequality_str = "n_neighbors < n_samples_fit" + else: + inequality_str = "n_neighbors <= n_samples_fit" + raise ValueError( + f"Expected {inequality_str}, but " + f"n_neighbors = {n_neighbors}, n_samples_fit = {n_samples_fit}, " + f"n_samples = {X.shape[0]}" # include n_samples for common tests + ) + + n_jobs = effective_n_jobs(self.n_jobs) + chunked_results = None + use_pairwise_distances_reductions = ( + self._fit_method == "brute" + and ArgKmin.is_usable_for( + X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ + ) + ) + if use_pairwise_distances_reductions: + results = ArgKmin.compute( + X=X, + Y=self._fit_X, + k=n_neighbors, + metric=self.effective_metric_, + metric_kwargs=self.effective_metric_params_, + strategy="auto", + return_distance=return_distance, + ) + + elif ( + self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) + ): + results = _kneighbors_from_graph( + X, n_neighbors=n_neighbors, return_distance=return_distance + ) + + elif self._fit_method == "brute": + # Joblib-based backend, which is used when user-defined callable + # are passed for metric. + + # This won't be used in the future once PairwiseDistancesReductions + # support: + # - DistanceMetrics which work on supposedly binary data + # - CSR-dense and dense-CSR case if 'euclidean' in metric. 
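+            # Each chunk of the pairwise distance matrix produced by
+            # `pairwise_distances_chunked` is reduced on the fly to its
+            # `n_neighbors` smallest entries, keeping peak memory bounded.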
+ reduce_func = partial( + self._kneighbors_reduce_func, + n_neighbors=n_neighbors, + return_distance=return_distance, + ) + + # for efficiency, use squared euclidean distances + if self.effective_metric_ == "euclidean": + kwds = {"squared": True} + else: + kwds = self.effective_metric_params_ + + chunked_results = list( + pairwise_distances_chunked( + X, + self._fit_X, + reduce_func=reduce_func, + metric=self.effective_metric_, + n_jobs=n_jobs, + **kwds, + ) + ) + + elif self._fit_method in ["ball_tree", "kd_tree"]: + if issparse(X): + raise ValueError( + "%s does not work with sparse matrices. Densify the data, " + "or set algorithm='brute'" % self._fit_method + ) + chunked_results = Parallel(n_jobs, prefer="threads")( + delayed(self._tree.query)(X[s], n_neighbors, return_distance) + for s in gen_even_slices(X.shape[0], n_jobs) + ) + else: + raise ValueError("internal: _fit_method not recognized") + + if chunked_results is not None: + if return_distance: + neigh_dist, neigh_ind = zip(*chunked_results) + results = np.vstack(neigh_dist), np.vstack(neigh_ind) + else: + results = np.vstack(chunked_results) + + if not query_is_train: + return results + else: + # If the query data is the same as the indexed data, we would like + # to ignore the first nearest neighbor of every sample, i.e + # the sample itself. + if return_distance: + neigh_dist, neigh_ind = results + else: + neigh_ind = results + + n_queries, _ = X.shape + sample_range = np.arange(n_queries)[:, None] + sample_mask = neigh_ind != sample_range + + # Corner case: When the number of duplicates are more + # than the number of neighbors, the first NN will not + # be the sample, but a duplicate. + # In that case mask the first duplicate. + dup_gr_nbrs = np.all(sample_mask, axis=1) + sample_mask[:, 0][dup_gr_nbrs] = False + neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) + + if return_distance: + neigh_dist = np.reshape( + neigh_dist[sample_mask], (n_queries, n_neighbors - 1) + ) + return neigh_dist, neigh_ind + return neigh_ind + + def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): + """Compute the (weighted) graph of k-Neighbors for points in X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', default=None + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + For ``metric='precomputed'`` the shape should be + (n_queries, n_indexed). Otherwise the shape should be + (n_queries, n_features). + + n_neighbors : int, default=None + Number of neighbors for each sample. The default is the value + passed to the constructor. + + mode : {'connectivity', 'distance'}, default='connectivity' + Type of returned matrix: 'connectivity' will return the + connectivity matrix with ones and zeros, in 'distance' the + edges are distances between points, type of distance + depends on the selected metric parameter in + NearestNeighbors class. + + Returns + ------- + A : sparse-matrix of shape (n_queries, n_samples_fit) + `n_samples_fit` is the number of samples in the fitted data. + `A[i, j]` gives the weight of the edge connecting `i` to `j`. + The matrix is of CSR format. + + See Also + -------- + NearestNeighbors.radius_neighbors_graph : Compute the (weighted) graph + of Neighbors for points in X. 
+ + Examples + -------- + >>> X = [[0], [3], [1]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(n_neighbors=2) + >>> neigh.fit(X) + NearestNeighbors(n_neighbors=2) + >>> A = neigh.kneighbors_graph(X) + >>> A.toarray() + array([[1., 0., 1.], + [0., 1., 1.], + [1., 0., 1.]]) + """ + check_is_fitted(self) + if n_neighbors is None: + n_neighbors = self.n_neighbors + + # check the input only in self.kneighbors + + # construct CSR matrix representation of the k-NN graph + if mode == "connectivity": + A_ind = self.kneighbors(X, n_neighbors, return_distance=False) + n_queries = A_ind.shape[0] + A_data = np.ones(n_queries * n_neighbors) + + elif mode == "distance": + A_data, A_ind = self.kneighbors(X, n_neighbors, return_distance=True) + A_data = np.ravel(A_data) + + else: + raise ValueError( + 'Unsupported mode, must be one of "connectivity", ' + f'or "distance" but got "{mode}" instead' + ) + + n_queries = A_ind.shape[0] + n_samples_fit = self.n_samples_fit_ + n_nonzero = n_queries * n_neighbors + A_indptr = np.arange(0, n_nonzero + 1, n_neighbors) + + kneighbors_graph = csr_matrix( + (A_data, A_ind.ravel(), A_indptr), shape=(n_queries, n_samples_fit) + ) + + return kneighbors_graph + + +class RadiusNeighborsMixin: + """Mixin for radius-based neighbors searches.""" + + def _radius_neighbors_reduce_func(self, dist, start, radius, return_distance): + """Reduce a chunk of distances to the nearest neighbors. + + Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked` + + Parameters + ---------- + dist : ndarray of shape (n_samples_chunk, n_samples) + The distance matrix. + + start : int + The index in X which the first row of dist corresponds to. + + radius : float + The radius considered when making the nearest neighbors search. + + return_distance : bool + Whether or not to return the distances. + + Returns + ------- + dist : list of ndarray of shape (n_samples_chunk,) + Returned only if `return_distance=True`. + + neigh : list of ndarray of shape (n_samples_chunk,) + The neighbors indices. + """ + neigh_ind = [np.where(d <= radius)[0] for d in dist] + + if return_distance: + if self.effective_metric_ == "euclidean": + dist = [np.sqrt(d[neigh_ind[i]]) for i, d in enumerate(dist)] + else: + dist = [d[neigh_ind[i]] for i, d in enumerate(dist)] + results = dist, neigh_ind + else: + results = neigh_ind + return results + + def radius_neighbors( + self, X=None, radius=None, return_distance=True, sort_results=False + ): + """Find the neighbors within a given radius of a point or points. + + Return the indices and distances of each point from the dataset + lying in a ball with size ``radius`` around the points of the query + array. Points lying on the boundary are included in the results. + + The result points are *not* necessarily sorted by distance to their + query point. + + Parameters + ---------- + X : {array-like, sparse matrix} of (n_samples, n_features), default=None + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + radius : float, default=None + Limiting distance of neighbors to return. The default is the value + passed to the constructor. + + return_distance : bool, default=True + Whether or not to return the distances. + + sort_results : bool, default=False + If True, the distances and indices will be sorted by increasing + distances before being returned. If False, the results may not + be sorted. 
If `return_distance=False`, setting `sort_results=True` + will result in an error. + + .. versionadded:: 0.22 + + Returns + ------- + neigh_dist : ndarray of shape (n_samples,) of arrays + Array representing the distances to each point, only present if + `return_distance=True`. The distance values are computed according + to the ``metric`` constructor parameter. + + neigh_ind : ndarray of shape (n_samples,) of arrays + An array of arrays of indices of the approximate nearest points + from the population matrix that lie within a ball of size + ``radius`` around the query points. + + Notes + ----- + Because the number of neighbors of each point is not necessarily + equal, the results for multiple query points cannot be fit in a + standard data array. + For efficiency, `radius_neighbors` returns arrays of objects, where + each object is a 1D array of indices or distances. + + Examples + -------- + In the following example, we construct a NeighborsClassifier + class from an array representing our data set and ask who's + the closest point to [1, 1, 1]: + + >>> import numpy as np + >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(radius=1.6) + >>> neigh.fit(samples) + NearestNeighbors(radius=1.6) + >>> rng = neigh.radius_neighbors([[1., 1., 1.]]) + >>> print(np.asarray(rng[0][0])) + [1.5 0.5] + >>> print(np.asarray(rng[1][0])) + [1 2] + + The first array returned contains the distances to all points which + are closer than 1.6, while the second array returned contains their + indices. In general, multiple points can be queried at the same time. + """ + check_is_fitted(self) + + if sort_results and not return_distance: + raise ValueError("return_distance must be True if sort_results is True.") + + ensure_all_finite = "allow-nan" if get_tags(self).input_tags.allow_nan else True + query_is_train = X is None + if query_is_train: + X = self._fit_X + else: + if self.metric == "precomputed": + X = _check_precomputed(X) + else: + X = validate_data( + self, + X, + ensure_all_finite=ensure_all_finite, + accept_sparse="csr", + reset=False, + order="C", + ) + + if radius is None: + radius = self.radius + + use_pairwise_distances_reductions = ( + self._fit_method == "brute" + and RadiusNeighbors.is_usable_for( + X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ + ) + ) + + if use_pairwise_distances_reductions: + results = RadiusNeighbors.compute( + X=X, + Y=self._fit_X, + radius=radius, + metric=self.effective_metric_, + metric_kwargs=self.effective_metric_params_, + strategy="auto", + return_distance=return_distance, + sort_results=sort_results, + ) + + elif ( + self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) + ): + results = _radius_neighbors_from_graph( + X, radius=radius, return_distance=return_distance + ) + + elif self._fit_method == "brute": + # Joblib-based backend, which is used when user-defined callable + # are passed for metric. + + # This won't be used in the future once PairwiseDistancesReductions + # support: + # - DistanceMetrics which work on supposedly binary data + # - CSR-dense and dense-CSR case if 'euclidean' in metric. 
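+            # As in `kneighbors`, chunks of the distance matrix are reduced on
+            # the fly, here to the set of entries falling within `radius`.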
+ + # for efficiency, use squared euclidean distances + if self.effective_metric_ == "euclidean": + radius *= radius + kwds = {"squared": True} + else: + kwds = self.effective_metric_params_ + + reduce_func = partial( + self._radius_neighbors_reduce_func, + radius=radius, + return_distance=return_distance, + ) + + chunked_results = pairwise_distances_chunked( + X, + self._fit_X, + reduce_func=reduce_func, + metric=self.effective_metric_, + n_jobs=self.n_jobs, + **kwds, + ) + if return_distance: + neigh_dist_chunks, neigh_ind_chunks = zip(*chunked_results) + neigh_dist_list = sum(neigh_dist_chunks, []) + neigh_ind_list = sum(neigh_ind_chunks, []) + neigh_dist = _to_object_array(neigh_dist_list) + neigh_ind = _to_object_array(neigh_ind_list) + results = neigh_dist, neigh_ind + else: + neigh_ind_list = sum(chunked_results, []) + results = _to_object_array(neigh_ind_list) + + if sort_results: + for ii in range(len(neigh_dist)): + order = np.argsort(neigh_dist[ii], kind="mergesort") + neigh_ind[ii] = neigh_ind[ii][order] + neigh_dist[ii] = neigh_dist[ii][order] + results = neigh_dist, neigh_ind + + elif self._fit_method in ["ball_tree", "kd_tree"]: + if issparse(X): + raise ValueError( + "%s does not work with sparse matrices. Densify the data, " + "or set algorithm='brute'" % self._fit_method + ) + + n_jobs = effective_n_jobs(self.n_jobs) + delayed_query = delayed(self._tree.query_radius) + chunked_results = Parallel(n_jobs, prefer="threads")( + delayed_query(X[s], radius, return_distance, sort_results=sort_results) + for s in gen_even_slices(X.shape[0], n_jobs) + ) + if return_distance: + neigh_ind, neigh_dist = tuple(zip(*chunked_results)) + results = np.hstack(neigh_dist), np.hstack(neigh_ind) + else: + results = np.hstack(chunked_results) + else: + raise ValueError("internal: _fit_method not recognized") + + if not query_is_train: + return results + else: + # If the query data is the same as the indexed data, we would like + # to ignore the first nearest neighbor of every sample, i.e + # the sample itself. + if return_distance: + neigh_dist, neigh_ind = results + else: + neigh_ind = results + + for ind, ind_neighbor in enumerate(neigh_ind): + mask = ind_neighbor != ind + + neigh_ind[ind] = ind_neighbor[mask] + if return_distance: + neigh_dist[ind] = neigh_dist[ind][mask] + + if return_distance: + return neigh_dist, neigh_ind + return neigh_ind + + def radius_neighbors_graph( + self, X=None, radius=None, mode="connectivity", sort_results=False + ): + """Compute the (weighted) graph of Neighbors for points in X. + + Neighborhoods are restricted the points at a distance lower than + radius. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + radius : float, default=None + Radius of neighborhoods. The default is the value passed to the + constructor. + + mode : {'connectivity', 'distance'}, default='connectivity' + Type of returned matrix: 'connectivity' will return the + connectivity matrix with ones and zeros, in 'distance' the + edges are distances between points, type of distance + depends on the selected metric parameter in + NearestNeighbors class. + + sort_results : bool, default=False + If True, in each row of the result, the non-zero entries will be + sorted by increasing distances. If False, the non-zero entries may + not be sorted. Only used with mode='distance'. 
+ + .. versionadded:: 0.22 + + Returns + ------- + A : sparse-matrix of shape (n_queries, n_samples_fit) + `n_samples_fit` is the number of samples in the fitted data. + `A[i, j]` gives the weight of the edge connecting `i` to `j`. + The matrix is of CSR format. + + See Also + -------- + kneighbors_graph : Compute the (weighted) graph of k-Neighbors for + points in X. + + Examples + -------- + >>> X = [[0], [3], [1]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(radius=1.5) + >>> neigh.fit(X) + NearestNeighbors(radius=1.5) + >>> A = neigh.radius_neighbors_graph(X) + >>> A.toarray() + array([[1., 0., 1.], + [0., 1., 0.], + [1., 0., 1.]]) + """ + check_is_fitted(self) + + # check the input only in self.radius_neighbors + + if radius is None: + radius = self.radius + + # construct CSR matrix representation of the NN graph + if mode == "connectivity": + A_ind = self.radius_neighbors(X, radius, return_distance=False) + A_data = None + elif mode == "distance": + dist, A_ind = self.radius_neighbors( + X, radius, return_distance=True, sort_results=sort_results + ) + A_data = np.concatenate(list(dist)) + else: + raise ValueError( + 'Unsupported mode, must be one of "connectivity", ' + f'or "distance" but got "{mode}" instead' + ) + + n_queries = A_ind.shape[0] + n_samples_fit = self.n_samples_fit_ + n_neighbors = np.array([len(a) for a in A_ind]) + A_ind = np.concatenate(list(A_ind)) + if A_data is None: + A_data = np.ones(len(A_ind)) + A_indptr = np.concatenate((np.zeros(1, dtype=int), np.cumsum(n_neighbors))) + + return csr_matrix((A_data, A_ind, A_indptr), shape=(n_queries, n_samples_fit)) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = self.metric == "nan_euclidean" + return tags diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_binary_tree.pxi.tp b/.venv/Lib/site-packages/sklearn/neighbors/_binary_tree.pxi.tp new file mode 100644 index 0000000000000000000000000000000000000000..a7fe50f0aa94e19203cc16612fc7c83dc7c113ef --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_binary_tree.pxi.tp @@ -0,0 +1,2481 @@ +{{py: + +# Generated file: _binary_tree.pxi + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE + # + ('64', 'float64_t', 'np.float64', 'cnp.NPY_DOUBLE'), + ('32', 'float32_t', 'np.float32', 'cnp.NPY_FLOAT') +] + +# KD Tree and Ball Tree +# ===================== +# +# Author: Jake Vanderplas , 2012-2013 +# Omar Salman +# +# License: BSD +# +# _binary_tree.pxi is generated and is then literally Cython included in +# ball_tree.pyx and kd_tree.pyx. See ball_tree.pyx.tp and kd_tree.pyx.tp. + +}} + + +# KD Tree and Ball Tree +# ===================== +# +# The routines here are the core algorithms of the KDTree and BallTree +# structures. If Cython supported polymorphism, we would be able to +# create a subclass and derive KDTree and BallTree from it. Because +# polymorphism is not an option, we use this single BinaryTree class +# as a literal include to avoid duplicating the entire file. +# +# A series of functions are implemented in kd_tree.pyx and ball_tree.pyx +# which use the information here to calculate the lower and upper bounds +# between a node and a point, and between two nodes. These functions are +# used here, and are all that are needed to differentiate between the two +# tree types. 
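+#
+# As a rough sketch (not the verbatim contents of those files), a
+# specialization is produced by defining the tree-specific helpers and then
+# literally including the generated pxi; the names below are illustrative
+# only:
+#
+#     # in ball_tree.pyx (generated from ball_tree.pyx.tp)
+#     VALID_METRICS64 = [...]            # metrics this tree supports
+#     include "_binary_tree.pxi"         # pulls in BinaryTree64/BinaryTree32
+#
+#     cdef class BallTree64(BinaryTree64):
+#         __doc__ = CLASS_DOC.format(**DOC_DICT64)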
+#
+# Description of Binary Tree Algorithms
+# -------------------------------------
+# A binary tree can be thought of as a collection of nodes. The top node
+# contains all the points. The next level consists of two nodes with half
+# the points in each, and this continues recursively. Each node contains
+# metadata which allows fast computation of distance bounds: in the case of
+# a ball tree, the metadata is a center and a radius. In the case of a
+# KD tree, the metadata is the minimum and maximum bound along each dimension.
+#
+# In a typical KD Tree or Ball Tree implementation, the nodes are implemented
+# as dynamically allocated structures with pointers linking them. Here we
+# take a different approach, storing all relevant data in a set of arrays
+# so that the entire tree object can be saved in a pickle file. For efficiency,
+# the data can be stored in such a way that explicit pointers are not
+# necessary: for node data stored at index i, the two child nodes are at
+# index (2 * i + 1) and (2 * i + 2); the parent node is (i - 1) // 2
+# (where // indicates integer division).
+#
+# The data arrays used here are as follows:
+#   data : the [n_samples x n_features] array of data from which the tree
+#          is built
+#   idx_array : the length n_samples array used to keep track of the indices
+#          of data within each node.  Each node has values idx_start and
+#          idx_end: the points within the node are given by (using numpy
+#          syntax) data[idx_array[idx_start:idx_end]].
+#   node_data : the length n_nodes array of structures which store the node
+#          indices, node radii, and leaf information for each node.
+#   node_bounds : the [* x n_nodes x n_features] array containing the node
+#          bound information.  For ball tree, the first dimension is 1, and
+#          each row contains the centroid of the node.  For kd tree, the first
+#          dimension is 2 and the rows for each point contain the arrays of
+#          lower bounds and upper bounds in each direction.
+#
+# The lack of dynamic allocation means the number of nodes must be computed
+# before the tree is built.  This can be done assuming the points are
+# divided equally between child nodes at each step; although this removes
+# some flexibility in tree creation, it ensures a balanced tree and ensures
+# that the number of nodes required can be computed beforehand.  Given a
+# specified leaf_size (the minimum number of points in any node), it is
+# possible to show that a balanced tree will have
+#
+#     n_levels = 1 + max(0, floor(log2((n_samples - 1) / leaf_size)))
+#
+# levels in order to satisfy
+#
+#     leaf_size <= min(n_points) <= 2 * leaf_size
+#
+# with the exception of the special case where n_samples < leaf_size.
+# For a given number of levels, the number of nodes in the tree is given by
+#
+#     n_nodes = 2 ** n_levels - 1
+#
+# Both of these results can be shown straightforwardly by induction.  The
+# following code uses these values in the construction of the tree.
+#
+# Distance Metrics
+# ----------------
+# For flexibility, the trees can be built using a variety of distance metrics.
+# The metrics are described in the DistanceMetric class: the standard
+# Euclidean distance is the default, and is inlined to be faster than other
+# metrics.  In addition, each metric defines both a distance and a
+# "reduced distance", which is often faster to compute, and is therefore
+# used in the query architecture whenever possible.  (For example, in the
+# case of the standard Euclidean distance, the reduced distance is the
+# squared-distance.)
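+#
+# As a small illustrative sketch of the implicit layout and the two formulas
+# above (pure Python, not part of this module; the names are hypothetical):
+#
+#     import numpy as np
+#
+#     def tree_shape(n_samples, leaf_size):
+#         # number of levels so that leaves hold at most 2 * leaf_size points
+#         n_levels = 1 + max(0, int(np.log2(max(1, (n_samples - 1) / leaf_size))))
+#         n_nodes = 2 ** n_levels - 1
+#         return n_levels, n_nodes
+#
+#     def children(i):
+#         return 2 * i + 1, 2 * i + 2   # implicit "pointers" into the node arrays
+#
+#     def parent(i):
+#         return (i - 1) // 2
+#
+#     assert tree_shape(n_samples=1000, leaf_size=40) == (5, 31)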
+# +# Implementation Notes +# -------------------- +# This implementation uses the common object-oriented approach of having an +# abstract base class which is extended by the KDTree and BallTree +# specializations. +# +# The BinaryTree "base class" is defined here and then subclassed in the BallTree +# and KDTree pyx files. These files include implementations of the +# "abstract" methods. + +# Necessary Helper Functions +# -------------------------- +# These are the names and descriptions of the "abstract" functions which are +# defined in kd_tree.pyx and ball_tree.pyx: + +# cdef int allocate_data(BinaryTree tree, intp_t n_nodes, intp_t n_features): +# """Allocate arrays needed for the KD Tree""" + +# cdef int init_node(BinaryTree tree, intp_t i_node, +# intp_t idx_start, intp_t idx_end): +# """Initialize the node for the dataset stored in tree.data""" + +# cdef float64_t min_rdist(BinaryTree tree, intp_t i_node, float64_t* pt): +# """Compute the minimum reduced-distance between a point and a node""" + +# cdef float64_t min_dist(BinaryTree tree, intp_t i_node, float64_t* pt): +# """Compute the minimum distance between a point and a node""" + +# cdef float64_t max_rdist(BinaryTree tree, intp_t i_node, float64_t* pt): +# """Compute the maximum reduced-distance between a point and a node""" + +# cdef float64_t max_dist(BinaryTree tree, intp_t i_node, float64_t* pt): +# """Compute the maximum distance between a point and a node""" + +# cdef inline int min_max_dist(BinaryTree tree, intp_t i_node, float64_t* pt, +# float64_t* min_dist, float64_t* max_dist): +# """Compute the minimum and maximum distance between a point and a node""" + +# cdef inline float64_t min_rdist_dual(BinaryTree tree1, intp_t i_node1, +# BinaryTree tree2, intp_t i_node2): +# """Compute the minimum reduced distance between two nodes""" + +# cdef inline float64_t min_dist_dual(BinaryTree tree1, intp_t i_node1, +# BinaryTree tree2, intp_t i_node2): +# """Compute the minimum distance between two nodes""" + +# cdef inline float64_t max_rdist_dual(BinaryTree tree1, intp_t i_node1, +# BinaryTree tree2, intp_t i_node2): +# """Compute the maximum reduced distance between two nodes""" + +# cdef inline float64_t max_dist_dual(BinaryTree tree1, intp_t i_node1, +# BinaryTree tree2, intp_t i_node2): +# """Compute the maximum distance between two nodes""" + +cimport numpy as cnp +from cython cimport floating +from libc.math cimport fabs, sqrt, exp, cos, pow, log, lgamma +from libc.math cimport fmin, fmax +from libc.stdlib cimport calloc, malloc, free +from libc.string cimport memcpy + +import numpy as np +import warnings + +from ..metrics._dist_metrics cimport ( + DistanceMetric, + DistanceMetric64, + DistanceMetric32, + euclidean_dist64, + euclidean_dist32, + euclidean_rdist64, + euclidean_rdist32, + euclidean_dist_to_rdist64, + euclidean_dist_to_rdist32, +) + +from ._partition_nodes cimport partition_node_indices + +from ..utils import check_array +from ..utils._typedefs cimport float32_t, float64_t, intp_t +from ..utils._heap cimport heap_push +from ..utils._sorting cimport simultaneous_sort as _simultaneous_sort + +cnp.import_array() + + +# TODO: use cnp.PyArray_ENABLEFLAGS when Cython>=3.0 is used. 
+cdef extern from "numpy/arrayobject.h":
+    void PyArray_ENABLEFLAGS(cnp.ndarray arr, int flags)
+
+
+# some handy constants
+cdef float64_t INF = np.inf
+cdef float64_t NEG_INF = -np.inf
+cdef float64_t PI = np.pi
+cdef float64_t ROOT_2PI = sqrt(2 * PI)
+cdef float64_t LOG_PI = log(PI)
+cdef float64_t LOG_2PI = log(2 * PI)
+
+
+# Some compound datatypes used below:
+cdef struct NodeHeapData_t:
+    float64_t val
+    intp_t i1
+    intp_t i2
+
+# build the corresponding numpy dtype for NodeHeapData
+cdef NodeHeapData_t nhd_tmp
+NodeHeapData = np.asarray(<NodeHeapData_t[:1]>(&nhd_tmp)).dtype
+
+cdef struct NodeData_t:
+    intp_t idx_start
+    intp_t idx_end
+    intp_t is_leaf
+    float64_t radius
+
+# build the corresponding numpy dtype for NodeData
+cdef NodeData_t nd_tmp
+NodeData = np.asarray(<NodeData_t[:1]>(&nd_tmp)).dtype
+
+
+######################################################################
+# Define doc strings, substituting the appropriate class name using
+# the DOC_DICT variable defined in the pyx files.
+CLASS_DOC = """{BinaryTree} for fast generalized N-point problems
+
+Read more in the :ref:`User Guide <unsupervised_neighbors>`.
+
+Parameters
+----------
+X : array-like of shape (n_samples, n_features)
+    n_samples is the number of points in the data set, and
+    n_features is the dimension of the parameter space.
+    Note: if X is a C-contiguous array of doubles then data will
+    not be copied. Otherwise, an internal copy will be made.
+
+leaf_size : positive int, default=40
+    Number of points at which to switch to brute-force. Changing
+    leaf_size will not affect the results of a query, but can
+    significantly impact the speed of a query and the memory required
+    to store the constructed tree.  The amount of memory needed to
+    store the tree scales as approximately n_samples / leaf_size.
+    For a specified ``leaf_size``, a leaf node is guaranteed to
+    satisfy ``leaf_size <= n_points <= 2 * leaf_size``, except in
+    the case that ``n_samples < leaf_size``.
+
+metric : str or DistanceMetric64 object, default='minkowski'
+    Metric to use for distance computation. Default is "minkowski", which
+    results in the standard Euclidean distance when p = 2.
+    A list of valid metrics for {BinaryTree} is given by the attribute
+    `valid_metrics`.
+    See the documentation of `scipy.spatial.distance
+    <https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
+    the metrics listed in :class:`~sklearn.metrics.pairwise.distance_metrics` for
+    more information on any distance metric.
+
+Additional keywords are passed to the distance metric class.
+Note: Callable functions in the metric parameter are NOT supported for KDTree
+and Ball Tree. Function call overhead will result in very poor performance.
+
+Attributes
+----------
+data : memory view
+    The training data
+valid_metrics : list of str
+    List of valid distance metrics.
+
+Examples
+--------
+Query for k-nearest neighbors
+
+    >>> import numpy as np
+    >>> rng = np.random.RandomState(0)
+    >>> from sklearn.neighbors import {BinaryTree}
+    >>> X = rng.random_sample((10, 3))  # 10 points in 3 dimensions
+    >>> tree = {BinaryTree}(X, leaf_size=2)  # doctest: +SKIP
+    >>> dist, ind = tree.query(X[:1], k=3)  # doctest: +SKIP
+    >>> print(ind)  # indices of 3 closest neighbors
+    [0 3 1]
+    >>> print(dist)  # distances to 3 closest neighbors
+    [ 0.          0.19662693  0.29473397]
+
+Pickle and Unpickle a tree. Note that the state of the tree is saved in the
+pickle operation: the tree need not be rebuilt upon unpickling.
+ + >>> import numpy as np + >>> import pickle + >>> rng = np.random.RandomState(0) + >>> X = rng.random_sample((10, 3)) # 10 points in 3 dimensions + >>> tree = {BinaryTree}(X, leaf_size=2) # doctest: +SKIP + >>> s = pickle.dumps(tree) # doctest: +SKIP + >>> tree_copy = pickle.loads(s) # doctest: +SKIP + >>> dist, ind = tree_copy.query(X[:1], k=3) # doctest: +SKIP + >>> print(ind) # indices of 3 closest neighbors + [0 3 1] + >>> print(dist) # distances to 3 closest neighbors + [ 0. 0.19662693 0.29473397] + +Query for neighbors within a given radius + + >>> import numpy as np + >>> rng = np.random.RandomState(0) + >>> X = rng.random_sample((10, 3)) # 10 points in 3 dimensions + >>> tree = {BinaryTree}(X, leaf_size=2) # doctest: +SKIP + >>> print(tree.query_radius(X[:1], r=0.3, count_only=True)) + 3 + >>> ind = tree.query_radius(X[:1], r=0.3) # doctest: +SKIP + >>> print(ind) # indices of neighbors within distance 0.3 + [3 0 1] + + +Compute a gaussian kernel density estimate: + + >>> import numpy as np + >>> rng = np.random.RandomState(42) + >>> X = rng.random_sample((100, 3)) + >>> tree = {BinaryTree}(X) # doctest: +SKIP + >>> tree.kernel_density(X[:3], h=0.1, kernel='gaussian') + array([ 6.94114649, 7.83281226, 7.2071716 ]) + +Compute a two-point auto-correlation function + + >>> import numpy as np + >>> rng = np.random.RandomState(0) + >>> X = rng.random_sample((30, 3)) + >>> r = np.linspace(0, 1, 5) + >>> tree = {BinaryTree}(X) # doctest: +SKIP + >>> tree.two_point_correlation(X, r) + array([ 30, 62, 278, 580, 820]) + +""" + + +###################################################################### +# Utility functions +cdef float64_t logaddexp(float64_t x1, float64_t x2): + """logaddexp(x1, x2) -> log(exp(x1) + exp(x2))""" + cdef float64_t a = fmax(x1, x2) + if a == NEG_INF: + return NEG_INF + else: + return a + log(exp(x1 - a) + exp(x2 - a)) + +cdef float64_t logsubexp(float64_t x1, float64_t x2): + """logsubexp(x1, x2) -> log(exp(x1) - exp(x2))""" + if x1 <= x2: + return NEG_INF + else: + return x1 + log(1 - exp(x2 - x1)) + + +###################################################################### +# Kernel functions +# +# Note: Kernels assume dist is non-negative and h is positive +# All kernel functions are normalized such that K(0, h) = 1. +# The fully normalized kernel is: +# K = exp[kernel_norm(h, d, kernel) + compute_kernel(dist, h, kernel)] +# The code only works with non-negative kernels: i.e. K(d, h) >= 0 +# for all valid d and h. Note that for precision, the log of both +# the kernel and kernel norm is returned. 
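+#
+# For example, the fully normalized log-contribution of a single training
+# point at distance dist, for a gaussian kernel of bandwidth h in d
+# dimensions, would be assembled from the helpers defined below as
+# (illustrative pseudocode, not executed here):
+#
+#     log_k = (_log_kernel_norm(h, d, GAUSSIAN_KERNEL)
+#              + log_gaussian_kernel(dist, h))
+#     k = exp(log_k)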
+cdef enum KernelType:
+    GAUSSIAN_KERNEL = 1
+    TOPHAT_KERNEL = 2
+    EPANECHNIKOV_KERNEL = 3
+    EXPONENTIAL_KERNEL = 4
+    LINEAR_KERNEL = 5
+    COSINE_KERNEL = 6
+
+
+cdef inline float64_t log_gaussian_kernel(float64_t dist, float64_t h):
+    """log of the gaussian kernel for bandwidth h (unnormalized)"""
+    return -0.5 * (dist * dist) / (h * h)
+
+
+cdef inline float64_t log_tophat_kernel(float64_t dist, float64_t h):
+    """log of the tophat kernel for bandwidth h (unnormalized)"""
+    if dist < h:
+        return 0.0
+    else:
+        return NEG_INF
+
+
+cdef inline float64_t log_epanechnikov_kernel(float64_t dist, float64_t h):
+    """log of the epanechnikov kernel for bandwidth h (unnormalized)"""
+    if dist < h:
+        return log(1.0 - (dist * dist) / (h * h))
+    else:
+        return NEG_INF
+
+
+cdef inline float64_t log_exponential_kernel(float64_t dist, float64_t h):
+    """log of the exponential kernel for bandwidth h (unnormalized)"""
+    return -dist / h
+
+
+cdef inline float64_t log_linear_kernel(float64_t dist, float64_t h):
+    """log of the linear kernel for bandwidth h (unnormalized)"""
+    if dist < h:
+        return log(1 - dist / h)
+    else:
+        return NEG_INF
+
+
+cdef inline float64_t log_cosine_kernel(float64_t dist, float64_t h):
+    """log of the cosine kernel for bandwidth h (unnormalized)"""
+    if dist < h:
+        return log(cos(0.5 * PI * dist / h))
+    else:
+        return NEG_INF
+
+
+cdef inline float64_t compute_log_kernel(float64_t dist, float64_t h,
+                                         KernelType kernel):
+    """Given a KernelType enumeration, compute the appropriate log-kernel"""
+    if kernel == GAUSSIAN_KERNEL:
+        return log_gaussian_kernel(dist, h)
+    elif kernel == TOPHAT_KERNEL:
+        return log_tophat_kernel(dist, h)
+    elif kernel == EPANECHNIKOV_KERNEL:
+        return log_epanechnikov_kernel(dist, h)
+    elif kernel == EXPONENTIAL_KERNEL:
+        return log_exponential_kernel(dist, h)
+    elif kernel == LINEAR_KERNEL:
+        return log_linear_kernel(dist, h)
+    elif kernel == COSINE_KERNEL:
+        return log_cosine_kernel(dist, h)
+
+
+# ------------------------------------------------------------
+# Kernel norms are defined via the volume element V_n
+# and surface element S_(n-1) of an n-sphere.
+cdef float64_t logVn(intp_t n):
+    """V_n = pi^(n/2) / gamma(n/2 + 1)"""
+    return 0.5 * n * LOG_PI - lgamma(0.5 * n + 1)
+
+
+cdef float64_t logSn(intp_t n):
+    """V_(n+1) = int_0^1 S_n r^n dr"""
+    return LOG_2PI + logVn(n - 1)
+
+
+cdef float64_t _log_kernel_norm(float64_t h, intp_t d,
+                                KernelType kernel) except -1:
+    """Given a KernelType enumeration, compute the kernel normalization.
+
+    h is the bandwidth, d is the dimension.
+    """
+    cdef float64_t tmp, factor = 0
+    cdef intp_t k
+    if kernel == GAUSSIAN_KERNEL:
+        factor = 0.5 * d * LOG_2PI
+    elif kernel == TOPHAT_KERNEL:
+        factor = logVn(d)
+    elif kernel == EPANECHNIKOV_KERNEL:
+        factor = logVn(d) + log(2. / (d + 2.))
+    elif kernel == EXPONENTIAL_KERNEL:
+        factor = logSn(d - 1) + lgamma(d)
+    elif kernel == LINEAR_KERNEL:
+        factor = logVn(d) - log(d + 1.)
+    elif kernel == COSINE_KERNEL:
+        # this is derived from a chain rule integration
+        factor = 0
+        tmp = 2. / PI
+        for k in range(1, d + 1, 2):
+            factor += tmp
+            tmp *= -(d - k) * (d - k - 1) * (2. / PI) ** 2
+        factor = log(factor) + logSn(d - 1)
+    else:
+        raise ValueError("Kernel code not recognized")
+    return -factor - d * log(h)
+
+
+def kernel_norm(h, d, kernel, return_log=False):
+    """Given a string specification of a kernel, compute the normalization.
+
+    Parameters
+    ----------
+    h : float
+        The bandwidth of the kernel.
+ d : int + The dimension of the space in which the kernel norm is computed. + kernel : str + The kernel identifier. Must be one of + ['gaussian'|'tophat'|'epanechnikov'| + 'exponential'|'linear'|'cosine'] + return_log : bool, default=False + If True, return the log of the kernel norm. Otherwise, return the + kernel norm. + Returns + ------- + knorm or log_knorm : float + the kernel norm or logarithm of the kernel norm. + """ + if kernel == 'gaussian': + result = _log_kernel_norm(h, d, GAUSSIAN_KERNEL) + elif kernel == 'tophat': + result = _log_kernel_norm(h, d, TOPHAT_KERNEL) + elif kernel == 'epanechnikov': + result = _log_kernel_norm(h, d, EPANECHNIKOV_KERNEL) + elif kernel == 'exponential': + result = _log_kernel_norm(h, d, EXPONENTIAL_KERNEL) + elif kernel == 'linear': + result = _log_kernel_norm(h, d, LINEAR_KERNEL) + elif kernel == 'cosine': + result = _log_kernel_norm(h, d, COSINE_KERNEL) + else: + raise ValueError('kernel not recognized') + + if return_log: + return result + else: + return np.exp(result) + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} + +cdef class NeighborsHeap{{name_suffix}}: + """A max-heap structure to keep track of distances/indices of neighbors + + This implements an efficient pre-allocated set of fixed-size heaps + for chasing neighbors, holding both an index and a distance. + When any row of the heap is full, adding an additional point will push + the furthest point off the heap. + + Parameters + ---------- + n_pts : int + the number of heaps to use + n_nbrs : int + the size of each heap. + """ + cdef {{INPUT_DTYPE_t}}[:, ::1] distances + cdef intp_t[:, ::1] indices + + def __cinit__(self): + # One-element arrays are used as placeholders to prevent + # any problem due to potential access to those attributes + # (e.g. assigning to NULL or a to value in another segment). + self.distances = np.zeros((1, 1), dtype={{INPUT_DTYPE}}, order='C') + self.indices = np.zeros((1, 1), dtype=np.intp, order='C') + + def __init__(self, n_pts, n_nbrs): + self.distances = np.full( + (n_pts, n_nbrs), np.inf, dtype={{INPUT_DTYPE}}, order='C' + ) + self.indices = np.zeros((n_pts, n_nbrs), dtype=np.intp, order='C') + + def get_arrays(self, sort=True): + """Get the arrays of distances and indices within the heap. + + If sort=True, then simultaneously sort the indices and distances, + so the closer points are listed first. 
+ """ + if sort: + self._sort() + return self.distances.base, self.indices.base + + cdef inline float64_t largest(self, intp_t row) except -1 nogil: + """Return the largest distance in the given row""" + return self.distances[row, 0] + + def push(self, intp_t row, float64_t val, intp_t i_val): + return self._push(row, val, i_val) + + cdef int _push(self, intp_t row, float64_t val, + intp_t i_val) except -1 nogil: + """push (val, i_val) into the given row""" + return heap_push( + values=&self.distances[row, 0], + indices=&self.indices[row, 0], + size=self.distances.shape[1], + val=val, + val_idx=i_val, + ) + + cdef int _sort(self) except -1: + """simultaneously sort the distances and indices""" + cdef intp_t row + for row in range(self.distances.shape[0]): + _simultaneous_sort( + dist=&self.distances[row, 0], + idx=&self.indices[row, 0], + size=self.distances.shape[1], + ) + return 0 + +{{endfor}} + +#------------------------------------------------------------ +# find_node_split_dim: +# this computes the equivalent of +# j_max = np.argmax(np.max(data, 0) - np.min(data, 0)) +cdef intp_t find_node_split_dim(const floating* data, + const intp_t* node_indices, + intp_t n_features, + intp_t n_points) except -1: + """Find the dimension with the largest spread. + + Parameters + ---------- + data : double pointer + Pointer to a 2D array of the training data, of shape [N, n_features]. + N must be greater than any of the values in node_indices. + node_indices : int pointer + Pointer to a 1D array of length n_points. This lists the indices of + each of the points within the current node. + + Returns + ------- + i_max : int + The index of the feature (dimension) within the node that has the + largest spread. + + Notes + ----- + In numpy, this operation is equivalent to + + def find_node_split_dim(data, node_indices): + return np.argmax(data[node_indices].max(0) - data[node_indices].min(0)) + + The cython version is much more efficient in both computation and memory. + """ + cdef float64_t min_val, max_val, val, spread, max_spread + cdef intp_t i, j, j_max + + j_max = 0 + max_spread = 0 + + for j in range(n_features): + max_val = data[node_indices[0] * n_features + j] + min_val = max_val + for i in range(1, n_points): + val = data[node_indices[i] * n_features + j] + max_val = fmax(max_val, val) + min_val = fmin(min_val, val) + spread = max_val - min_val + if spread > max_spread: + max_spread = spread + j_max = j + return j_max + + +###################################################################### +# NodeHeap : min-heap used to keep track of nodes during +# breadth-first query +cdef inline void swap_nodes(NodeHeapData_t* arr, intp_t i1, intp_t i2): + cdef NodeHeapData_t tmp = arr[i1] + arr[i1] = arr[i2] + arr[i2] = tmp + + +cdef class NodeHeap: + """NodeHeap + + This is a min-heap implementation for keeping track of nodes + during a breadth-first search. Unlike the NeighborsHeap above, + the NodeHeap does not have a fixed size and must be able to grow + as elements are added. + + Internally, the data is stored in a simple binary heap which meets + the min heap condition: + + heap[i].val < min(heap[2 * i + 1].val, heap[2 * i + 2].val) + """ + cdef NodeHeapData_t[:] data + cdef intp_t n + + def __cinit__(self): + # A one-elements array is used as a placeholder to prevent + # any problem due to potential access to this attribute + # (e.g. assigning to NULL or a to value in another segment). 
+ self.data = np.zeros(1, dtype=NodeHeapData, order='C') + + def __init__(self, size_guess=100): + size_guess = max(size_guess, 1) # need space for at least one item + self.data = np.zeros(size_guess, dtype=NodeHeapData, order='C') + self.n = size_guess + self.clear() + + cdef int resize(self, intp_t new_size) except -1: + """Resize the heap to be either larger or smaller""" + cdef: + NodeHeapData_t *data_ptr + NodeHeapData_t *new_data_ptr + intp_t i + intp_t size = self.data.shape[0] + NodeHeapData_t[:] new_data = np.zeros( + new_size, + dtype=NodeHeapData, + ) + + if size > 0 and new_size > 0: + data_ptr = &self.data[0] + new_data_ptr = &new_data[0] + for i in range(min(size, new_size)): + new_data_ptr[i] = data_ptr[i] + + if new_size < size: + self.n = new_size + + self.data = new_data + return 0 + + cdef int push(self, NodeHeapData_t data) except -1: + """Push a new item onto the heap""" + cdef intp_t i, i_parent + cdef NodeHeapData_t* data_arr + self.n += 1 + if self.n > self.data.shape[0]: + self.resize(2 * self.n) + + # put the new element at the end, + # and then perform swaps until the heap is in order + data_arr = &self.data[0] + i = self.n - 1 + data_arr[i] = data + + while i > 0: + i_parent = (i - 1) // 2 + if data_arr[i_parent].val <= data_arr[i].val: + break + else: + swap_nodes(data_arr, i, i_parent) + i = i_parent + return 0 + + cdef NodeHeapData_t peek(self): + """Peek at the root of the heap, without removing it""" + return self.data[0] + + cdef NodeHeapData_t pop(self): + """Remove the root of the heap, and update the remaining nodes""" + if self.n == 0: + raise ValueError('cannot pop on empty heap') + + cdef intp_t i, i_child1, i_child2, i_swap + cdef NodeHeapData_t* data_arr = &self.data[0] + cdef NodeHeapData_t popped_element = data_arr[0] + + # pop off the first element, move the last element to the front, + # and then perform swaps until the heap is back in order + data_arr[0] = data_arr[self.n - 1] + self.n -= 1 + + i = 0 + + while (i < self.n): + i_child1 = 2 * i + 1 + i_child2 = 2 * i + 2 + i_swap = 0 + + if i_child2 < self.n: + if data_arr[i_child1].val <= data_arr[i_child2].val: + i_swap = i_child1 + else: + i_swap = i_child2 + elif i_child1 < self.n: + i_swap = i_child1 + else: + break + + if (i_swap > 0) and (data_arr[i_swap].val <= data_arr[i].val): + swap_nodes(data_arr, i, i_swap) + i = i_swap + else: + break + + return popped_element + + cdef void clear(self): + """Clear the heap""" + self.n = 0 + + +###################################################################### +# newObj function +# this is a helper function for pickling +def newObj(obj): + return obj.__new__(obj) + + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} + +###################################################################### +# define the reverse mapping of VALID_METRICS{{name_suffix}} +from sklearn.metrics._dist_metrics import get_valid_metric_ids +VALID_METRIC_IDS{{name_suffix}} = get_valid_metric_ids(VALID_METRICS{{name_suffix}}) + + +###################################################################### +# Binary Tree class +cdef class BinaryTree{{name_suffix}}: + + cdef readonly const {{INPUT_DTYPE_t}}[:, ::1] data + cdef readonly const {{INPUT_DTYPE_t}}[::1] sample_weight + cdef public float64_t sum_weight + + # TODO: idx_array and node_bounds must not be const, but this change needs + # to happen in a way which preserves pickling + # See also: https://github.com/cython/cython/issues/5639 + cdef public const intp_t[::1] idx_array + 
cdef public const NodeData_t[::1] node_data
+    cdef public const {{INPUT_DTYPE_t}}[:, :, ::1] node_bounds
+
+    cdef intp_t leaf_size
+    cdef intp_t n_levels
+    cdef intp_t n_nodes
+
+    cdef DistanceMetric{{name_suffix}} dist_metric
+    cdef int euclidean
+
+    # variables to keep track of building & querying stats
+    cdef int n_trims
+    cdef int n_leaves
+    cdef int n_splits
+    cdef int n_calls
+
+    valid_metrics = VALID_METRIC_IDS{{name_suffix}}
+
+    # Use cinit to initialize all arrays to empty: this will prevent memory
+    # errors and seg-faults in rare cases where __init__ is not called.
+    # A one-element array is used as a placeholder to prevent
+    # any problem due to potential access to this attribute
+    # (e.g. assigning to NULL or to a value in another segment).
+    def __cinit__(self):
+        self.data = np.empty((1, 1), dtype={{INPUT_DTYPE}}, order='C')
+        self.sample_weight = np.empty(1, dtype={{INPUT_DTYPE}}, order='C')
+        self.idx_array = np.empty(1, dtype=np.intp, order='C')
+        self.node_data = np.empty(1, dtype=NodeData, order='C')
+        self.node_bounds = np.empty((1, 1, 1), dtype={{INPUT_DTYPE}})
+
+        self.leaf_size = 0
+        self.n_levels = 0
+        self.n_nodes = 0
+
+        self.euclidean = False
+
+        self.n_trims = 0
+        self.n_leaves = 0
+        self.n_splits = 0
+        self.n_calls = 0
+
+    def __init__(self, data,
+                 leaf_size=40, metric='minkowski', sample_weight=None, **kwargs):
+        # validate data
+        self.data = check_array(data, dtype={{INPUT_DTYPE}}, order='C')
+        if self.data.size == 0:
+            raise ValueError("X is an empty array")
+
+        n_samples = self.data.shape[0]
+        n_features = self.data.shape[1]
+
+        if leaf_size < 1:
+            raise ValueError("leaf_size must be greater than or equal to 1")
+        self.leaf_size = leaf_size
+
+        self.dist_metric = DistanceMetric.get_metric(metric, dtype={{INPUT_DTYPE}}, **kwargs)
+        self.euclidean = (self.dist_metric.__class__.__name__
+                          == 'EuclideanDistance{{name_suffix}}')
+
+        metric = self.dist_metric.__class__.__name__
+        if metric not in VALID_METRICS{{name_suffix}}:
+            raise ValueError('metric {metric} is not valid for '
+                             '{BinaryTree}'.format(metric=metric,
+                                                   **DOC_DICT{{name_suffix}}))
+        self.dist_metric._validate_data(self.data)
+
+        # determine number of levels in the tree, and from this
+        # the number of nodes in the tree.
This results in leaf nodes + # with numbers of points between leaf_size and 2 * leaf_size + self.n_levels = int( + np.log2(fmax(1, (n_samples - 1) / self.leaf_size)) + 1) + self.n_nodes = (2 ** self.n_levels) - 1 + + # allocate arrays for storage + self.idx_array = np.arange(n_samples, dtype=np.intp) + self.node_data = np.zeros(self.n_nodes, dtype=NodeData) + + self._update_sample_weight(n_samples, sample_weight) + + # Allocate tree-specific data + allocate_data{{name_suffix}}(self, self.n_nodes, n_features) + self._recursive_build( + node_data=self.node_data.base, + i_node=0, + idx_start=0, + idx_end=n_samples + ) + + def _update_sample_weight(self, n_samples, sample_weight): + if sample_weight is not None: + self.sample_weight = np.asarray( + sample_weight, dtype={{INPUT_DTYPE}}, order='C') + self.sum_weight = np.sum(self.sample_weight) + else: + self.sample_weight = None + self.sum_weight = n_samples + + def __reduce__(self): + """ + reduce method used for pickling + """ + return (newObj, (type(self),), self.__getstate__()) + + def __getstate__(self): + """ + get state for pickling + """ + if self.sample_weight is not None: + # pass the numpy array + sample_weight = self.sample_weight.base + else: + # pass None to avoid confusion with the empty place holder + # of size 1 from __cinit__ + sample_weight = None + return (self.data.base, + self.idx_array.base, + self.node_data.base, + self.node_bounds.base, + int(self.leaf_size), + int(self.n_levels), + int(self.n_nodes), + int(self.n_trims), + int(self.n_leaves), + int(self.n_splits), + int(self.n_calls), + self.dist_metric, + sample_weight) + + def __setstate__(self, state): + """ + set state for pickling + """ + self.data = state[0] + self.idx_array = state[1] + self.node_data = state[2] + self.node_bounds = state[3] + self.leaf_size = state[4] + self.n_levels = state[5] + self.n_nodes = state[6] + self.n_trims = state[7] + self.n_leaves = state[8] + self.n_splits = state[9] + self.n_calls = state[10] + self.dist_metric = state[11] + sample_weight = state[12] + + self.euclidean = (self.dist_metric.__class__.__name__ + == 'EuclideanDistance64') + n_samples = self.data.shape[0] + self._update_sample_weight(n_samples, sample_weight) + + def get_tree_stats(self): + """ + get_tree_stats() + + Get tree status. + + Returns + ------- + tree_stats: tuple of int + (number of trims, number of leaves, number of splits) + """ + return (self.n_trims, self.n_leaves, self.n_splits) + + def reset_n_calls(self): + """ + reset_n_calls() + + Reset number of calls to 0. + """ + self.n_calls = 0 + + def get_n_calls(self): + """ + get_n_calls() + + Get number of calls. + + Returns + ------- + n_calls: int + number of distance computation calls + """ + return self.n_calls + + def get_arrays(self): + """ + get_arrays() + + Get data and node arrays. + + Returns + ------- + arrays: tuple of array + Arrays for storing tree data, index, node data and node bounds. 
+
+        """
+        return (
+            self.data.base,
+            self.idx_array.base,
+            self.node_data.base,
+            self.node_bounds.base,
+        )
+
+    cdef inline float64_t dist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2,
+                               intp_t size) except -1 nogil:
+        """Compute the distance between arrays x1 and x2"""
+        self.n_calls += 1
+        if self.euclidean:
+            return euclidean_dist{{name_suffix}}(x1, x2, size)
+        else:
+            return self.dist_metric.dist(x1, x2, size)
+
+    cdef inline float64_t rdist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2,
+                                intp_t size) except -1 nogil:
+        """Compute the reduced distance between arrays x1 and x2.
+
+        The reduced distance, defined for some metrics, is a quantity which
+        is more efficient to compute than the distance, but preserves the
+        relative rankings of the true distance. For example, the reduced
+        distance for the Euclidean metric is the squared-euclidean distance.
+        """
+        self.n_calls += 1
+        if self.euclidean:
+            return euclidean_rdist{{name_suffix}}(x1, x2, size)
+        else:
+            return self.dist_metric.rdist(x1, x2, size)
+
+    cdef int _recursive_build(self, NodeData_t[::1] node_data, intp_t i_node, intp_t idx_start,
+                              intp_t idx_end) except -1:
+        """Recursively build the tree.
+
+        Parameters
+        ----------
+        node_data : array of NodeData_t
+            the array of node metadata to fill in during the build
+        i_node : int
+            the node for the current step
+        idx_start, idx_end : int
+            the bounding indices in the idx_array which define the points that
+            belong to this node.
+        """
+        cdef intp_t i_max
+        cdef intp_t n_features = self.data.shape[1]
+        cdef intp_t n_points = idx_end - idx_start
+        cdef intp_t n_mid = n_points / 2
+        cdef intp_t* idx_array = &self.idx_array[idx_start]
+        cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0]
+
+        # initialize node data
+        init_node{{name_suffix}}(self, node_data, i_node, idx_start, idx_end)
+
+        if 2 * i_node + 1 >= self.n_nodes:
+            node_data[i_node].is_leaf = True
+            if idx_end - idx_start > 2 * self.leaf_size:
+                # this shouldn't happen if our memory allocation is correct
+                # we'll proactively prevent memory errors, but raise a
+                # warning saying we're doing so.
+                import warnings
+                warnings.warn("Internal: memory layout is flawed: "
+                              "not enough nodes allocated")
+
+        elif idx_end - idx_start < 2:
+            # again, this shouldn't happen if our memory allocation
+            # is correct.  Raise a warning.
+            import warnings
+            warnings.warn("Internal: memory layout is flawed: "
+                          "too many nodes allocated")
+            node_data[i_node].is_leaf = True
+
+        else:
+            # split node and recursively construct child nodes.
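+            # (the split dimension is chosen as the one with the largest
+            # spread via find_node_split_dim, and partition_node_indices
+            # moves the n_mid smallest entries along that dimension to the
+            # front, so each child receives roughly half of the points)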
+ node_data[i_node].is_leaf = False + i_max = find_node_split_dim(data, idx_array, + n_features, n_points) + partition_node_indices(data, idx_array, i_max, n_mid, + n_features, n_points) + self._recursive_build(node_data, 2 * i_node + 1, + idx_start, idx_start + n_mid) + self._recursive_build(node_data, 2 * i_node + 2, + idx_start + n_mid, idx_end) + + def query(self, X, k=1, return_distance=True, + dualtree=False, breadth_first=False, + sort_results=True): + """ + query(X, k=1, return_distance=True, + dualtree=False, breadth_first=False) + + query the tree for the k nearest neighbors + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query + k : int, default=1 + The number of nearest neighbors to return + return_distance : bool, default=True + if True, return a tuple (d, i) of distances and indices + if False, return array i + dualtree : bool, default=False + if True, use the dual tree formalism for the query: a tree is + built for the query points, and the pair of trees is used to + efficiently search this space. This can lead to better + performance as the number of points grows large. + breadth_first : bool, default=False + if True, then query the nodes in a breadth-first manner. + Otherwise, query the nodes in a depth-first manner. + sort_results : bool, default=True + if True, then distances and indices of each point are sorted + on return, so that the first column contains the closest points. + Otherwise, neighbors are returned in an arbitrary order. + + Returns + ------- + i : if return_distance == False + (d,i) : if return_distance == True + + d : ndarray of shape X.shape[:-1] + (k,), dtype=double + Each entry gives the list of distances to the neighbors of the + corresponding point. + + i : ndarray of shape X.shape[:-1] + (k,), dtype=int + Each entry gives the list of indices of neighbors of the + corresponding point. + """ + # XXX: we should allow X to be a pre-built tree. 
+ X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') + + if X.shape[X.ndim - 1] != self.data.shape[1]: + raise ValueError("query data dimension must " + "match training data dimension") + + if self.data.shape[0] < k: + raise ValueError("k must be less than or equal " + "to the number of training points") + + # flatten X, and save original shape information + np_Xarr = X.reshape((-1, self.data.shape[1])) + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr + cdef float64_t reduced_dist_LB + cdef intp_t i + cdef const {{INPUT_DTYPE_t}}* pt + + # initialize heap for neighbors + cdef NeighborsHeap{{name_suffix}} heap = NeighborsHeap{{name_suffix}}(Xarr.shape[0], k) + + # node heap for breadth-first queries + cdef NodeHeap nodeheap + if breadth_first: + nodeheap = NodeHeap(self.data.shape[0] // self.leaf_size) + + # bounds is needed for the dual tree algorithm + cdef float64_t[::1] bounds + + self.n_trims = 0 + self.n_leaves = 0 + self.n_splits = 0 + + if dualtree: + other = self.__class__(np_Xarr, metric=self.dist_metric, + leaf_size=self.leaf_size) + if breadth_first: + self._query_dual_breadthfirst(other, heap, nodeheap) + else: + reduced_dist_LB = min_rdist_dual{{name_suffix}}(self, 0, other, 0) + bounds = np.full(other.node_data.shape[0], np.inf) + self._query_dual_depthfirst(0, other, 0, bounds, + heap, reduced_dist_LB) + + else: + pt = &Xarr[0, 0] + if breadth_first: + for i in range(Xarr.shape[0]): + self._query_single_breadthfirst(pt, i, heap, nodeheap) + pt += Xarr.shape[1] + else: + with nogil: + for i in range(Xarr.shape[0]): + reduced_dist_LB = min_rdist{{name_suffix}}(self, 0, pt) + self._query_single_depthfirst(0, pt, i, heap, + reduced_dist_LB) + pt += Xarr.shape[1] + + distances, indices = heap.get_arrays(sort=sort_results) + distances = self.dist_metric.rdist_to_dist(distances) + + # deflatten results + if return_distance: + return (distances.reshape(X.shape[:X.ndim - 1] + (k,)), + indices.reshape(X.shape[:X.ndim - 1] + (k,))) + else: + return indices.reshape(X.shape[:X.ndim - 1] + (k,)) + + def query_radius(self, X, r, int return_distance=False, + int count_only=False, int sort_results=False): + """ + query_radius(X, r, return_distance=False, + count_only=False, sort_results=False) + + query the tree for neighbors within a radius r + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query + r : distance within which neighbors are returned + r can be a single value, or an array of values of shape + x.shape[:-1] if different radii are desired for each point. + return_distance : bool, default=False + if True, return distances to neighbors of each point + if False, return only neighbors + Note that unlike the query() method, setting return_distance=True + here adds to the computation time. Not all distances need to be + calculated explicitly for return_distance=False. Results are + not sorted by default: see ``sort_results`` keyword. + count_only : bool, default=False + if True, return only the count of points within distance r + if False, return the indices of all points within distance r + If return_distance==True, setting count_only=True will + result in an error. + sort_results : bool, default=False + if True, the distances and indices will be sorted before being + returned. If False, the results will not be sorted. If + return_distance == False, setting sort_results = True will + result in an error. 
+
+        Returns
+        -------
+        count       : if count_only == True
+        ind         : if count_only == False and return_distance == False
+        (ind, dist) : if count_only == False and return_distance == True
+
+        count : ndarray of shape X.shape[:-1], dtype=int
+            Each entry gives the number of neighbors within a distance r of the
+            corresponding point.
+
+        ind : ndarray of shape X.shape[:-1], dtype=object
+            Each element is a numpy integer array listing the indices of
+            neighbors of the corresponding point. Note that unlike
+            the results of a k-neighbors query, the returned neighbors
+            are not sorted by distance by default.
+
+        dist : ndarray of shape X.shape[:-1], dtype=object
+            Each element is a numpy double array listing the distances
+            corresponding to indices in ind.
+        """
+        if count_only and return_distance:
+            raise ValueError("count_only and return_distance "
+                             "cannot both be true")
+
+        if sort_results and not return_distance:
+            raise ValueError("return_distance must be True "
+                             "if sort_results is True")
+
+        cdef intp_t i, count_i = 0
+        cdef intp_t n_features = self.data.shape[1]
+        cdef {{INPUT_DTYPE_t}}[::1] dist_arr_i
+        cdef intp_t[::1] idx_arr_i, counts
+        cdef const {{INPUT_DTYPE_t}}* pt
+        cdef intp_t** indices = NULL
+        cdef {{INPUT_DTYPE_t}}** distances = NULL
+
+        # validate X and prepare for query
+        X = check_array(X, dtype={{INPUT_DTYPE}}, order='C')
+
+        if X.shape[X.ndim - 1] != self.data.shape[1]:
+            raise ValueError("query data dimension must "
+                             "match training data dimension")
+
+        cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = X.reshape((-1, self.data.shape[1]))
+
+        # prepare r for query
+        r = np.asarray(r, dtype=np.float64, order='C')
+        r = np.atleast_1d(r)
+        if r.shape == (1,):
+            r = np.full(X.shape[:X.ndim - 1], r[0], dtype=np.float64)
+        else:
+            if r.shape != X.shape[:X.ndim - 1]:
+                raise ValueError("r must be broadcastable to X.shape")
+
+        rarr_np = r.reshape(-1)  # store explicitly to keep in scope
+        cdef float64_t[::1] rarr = rarr_np
+
+        if not count_only:
+            indices = <intp_t**>calloc(Xarr.shape[0], sizeof(intp_t*))
+            if indices == NULL:
+                raise MemoryError()
+            if return_distance:
+                distances = <{{INPUT_DTYPE_t}}**>calloc(Xarr.shape[0], sizeof({{INPUT_DTYPE_t}}*))
+                if distances == NULL:
+                    free(indices)
+                    raise MemoryError()
+
+        np_idx_arr = np.zeros(self.data.shape[0], dtype=np.intp)
+        idx_arr_i = np_idx_arr
+
+        np_dist_arr = np.zeros(self.data.shape[0], dtype={{INPUT_DTYPE}})
+        dist_arr_i = np_dist_arr
+
+        counts_arr = np.zeros(Xarr.shape[0], dtype=np.intp)
+        counts = counts_arr
+
+        pt = &Xarr[0, 0]
+        memory_error = False
+        with nogil:
+            for i in range(Xarr.shape[0]):
+                counts[i] = self._query_radius_single(0, pt, rarr[i],
+                                                      &idx_arr_i[0],
+                                                      &dist_arr_i[0],
+                                                      0, count_only,
+                                                      return_distance)
+                pt += n_features
+
+                if count_only:
+                    continue
+
+                if sort_results:
+                    _simultaneous_sort(&dist_arr_i[0], &idx_arr_i[0],
+                                       counts[i])
+
+                # equivalent to: indices[i] = np_idx_arr[:counts[i]].copy()
+                indices[i] = <intp_t*>malloc(counts[i] * sizeof(intp_t))
+                if indices[i] == NULL:
+                    memory_error = True
+                    break
+                memcpy(indices[i], &idx_arr_i[0], counts[i] * sizeof(intp_t))
+
+                if return_distance:
+                    # equivalent to: distances[i] = np_dist_arr[:counts[i]].copy()
+                    distances[i] = <{{INPUT_DTYPE_t}}*>malloc(counts[i] * sizeof({{INPUT_DTYPE_t}}))
+                    if distances[i] == NULL:
+                        memory_error = True
+                        break
+                    memcpy(distances[i], &dist_arr_i[0], counts[i] * sizeof({{INPUT_DTYPE_t}}))
+
+        try:
+            if memory_error:
+                raise MemoryError()
+
+            if count_only:
+                # deflatten results
+                return counts_arr.reshape(X.shape[:X.ndim - 1])
+            elif
return_distance:
+                indices_npy = np.zeros(Xarr.shape[0], dtype='object')
+                distances_npy = np.zeros(Xarr.shape[0], dtype='object')
+                for i in range(Xarr.shape[0]):
+                    # make a new numpy array that wraps the existing data
+                    # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0
+                    indices_npy[i] = cnp.PyArray_SimpleNewFromData(1, <cnp.intp_t*>&counts[i], cnp.NPY_INTP, indices[i])
+                    # make sure the data will be freed when the numpy array is garbage collected
+                    PyArray_ENABLEFLAGS(indices_npy[i], cnp.NPY_ARRAY_OWNDATA)
+                    # make sure the data is not freed twice
+                    indices[i] = NULL
+
+                    # make a new numpy array that wraps the existing data
+                    # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0
+                    distances_npy[i] = cnp.PyArray_SimpleNewFromData(1, <cnp.intp_t*>&counts[i], {{NPY_TYPE}}, distances[i])
+                    # make sure the data will be freed when the numpy array is garbage collected
+                    PyArray_ENABLEFLAGS(distances_npy[i], cnp.NPY_ARRAY_OWNDATA)
+                    # make sure the data is not freed twice
+                    distances[i] = NULL
+
+                # deflatten results
+                return (indices_npy.reshape(X.shape[:X.ndim - 1]),
+                        distances_npy.reshape(X.shape[:X.ndim - 1]))
+            else:
+                indices_npy = np.zeros(Xarr.shape[0], dtype='object')
+                for i in range(Xarr.shape[0]):
+                    # make a new numpy array that wraps the existing data
+                    # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0
+                    indices_npy[i] = cnp.PyArray_SimpleNewFromData(1, <cnp.intp_t*>&counts[i], cnp.NPY_INTP, indices[i])
+                    # make sure the data will be freed when the numpy array is garbage collected
+                    PyArray_ENABLEFLAGS(indices_npy[i], cnp.NPY_ARRAY_OWNDATA)
+                    # make sure the data is not freed twice
+                    indices[i] = NULL
+
+                # deflatten results
+                return indices_npy.reshape(X.shape[:X.ndim - 1])
+        except MemoryError:
+            # free any buffer that is not owned by a numpy array
+            for i in range(Xarr.shape[0]):
+                free(indices[i])
+                if return_distance:
+                    free(distances[i])
+            raise
+        finally:
+            free(indices)
+            free(distances)
+
+    def kernel_density(self, X, h, kernel='gaussian',
+                       atol=0, rtol=1E-8,
+                       breadth_first=True, return_log=False):
+        """
+        kernel_density(X, h, kernel='gaussian', atol=0, rtol=1E-8,
+                       breadth_first=True, return_log=False)
+
+        Compute the kernel density estimate at points X with the given kernel,
+        using the distance metric specified at tree creation.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            An array of points to query.  Last dimension should match dimension
+            of training data.
+        h : float
+            The bandwidth of the kernel.
+        kernel : str, default="gaussian"
+            Specify the kernel to use.  Options are
+            - 'gaussian'
+            - 'tophat'
+            - 'epanechnikov'
+            - 'exponential'
+            - 'linear'
+            - 'cosine'
+            Default is kernel = 'gaussian'
+        atol : float, default=0
+            Specify the desired absolute tolerance of the result.
+            If the true result is `K_true`, then the returned result `K_ret`
+            satisfies ``abs(K_true - K_ret) < atol + rtol * K_ret``
+            The default is 0.
+        rtol : float, default=1e-8
+            Specify the desired relative tolerance of the result.
+            If the true result is `K_true`, then the returned result `K_ret`
+            satisfies ``abs(K_true - K_ret) < atol + rtol * K_ret``
+            The default is `1e-8`.
+        breadth_first : bool, default=True
+            If True (default), use a breadth-first search.  If False, use a
+            depth-first search.  Breadth-first is generally faster for
+            compact kernels and/or high tolerances.
+        return_log : bool, default=False
+            Return the logarithm of the result.
This can be more accurate + than returning the result itself for narrow kernels. + + Returns + ------- + density : ndarray of shape X.shape[:-1] + The array of (log)-density evaluations + """ + cdef float64_t h_c = h + cdef float64_t log_atol = log(atol) + cdef float64_t log_rtol = log(rtol) + cdef float64_t log_min_bound, log_max_bound, log_bound_spread + cdef float64_t dist_LB = 0, dist_UB = 0 + + cdef intp_t n_samples = self.data.shape[0] + cdef intp_t n_features = self.data.shape[1] + cdef intp_t i + cdef KernelType kernel_c + + # validate kernel + if kernel == 'gaussian': + kernel_c = GAUSSIAN_KERNEL + elif kernel == 'tophat': + kernel_c = TOPHAT_KERNEL + elif kernel == 'epanechnikov': + kernel_c = EPANECHNIKOV_KERNEL + elif kernel == 'exponential': + kernel_c = EXPONENTIAL_KERNEL + elif kernel == 'linear': + kernel_c = LINEAR_KERNEL + elif kernel == 'cosine': + kernel_c = COSINE_KERNEL + else: + raise ValueError("kernel = '%s' not recognized" % kernel) + + cdef float64_t log_knorm = _log_kernel_norm(h_c, n_features, kernel_c) + + # validate X and prepare for query + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') + + if X.shape[X.ndim - 1] != n_features: + raise ValueError("query data dimension must " + "match training data dimension") + Xarr_np = X.reshape((-1, n_features)) + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = Xarr_np + + log_density_arr = np.zeros(Xarr.shape[0], dtype={{INPUT_DTYPE}}) + cdef {{INPUT_DTYPE_t}}[::1] log_density = log_density_arr + + cdef const {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] + + cdef NodeHeap nodeheap + if breadth_first: + nodeheap = NodeHeap(self.data.shape[0] // self.leaf_size) + cdef float64_t[::1] node_log_min_bounds + cdef float64_t[::1] node_bound_widths + # TODO: implement dual tree approach. + # this is difficult because of the need to cache values + # computed between node pairs. + if breadth_first: + node_log_min_bounds_arr = np.full(self.n_nodes, -np.inf) + node_log_min_bounds = node_log_min_bounds_arr + node_bound_widths_arr = np.zeros(self.n_nodes) + node_bound_widths = node_bound_widths_arr + for i in range(Xarr.shape[0]): + log_density[i] = self._kde_single_breadthfirst( + pt, kernel_c, h_c, + log_knorm, log_atol, log_rtol, + nodeheap, + &node_log_min_bounds[0], + &node_bound_widths[0]) + pt += n_features + else: + for i in range(Xarr.shape[0]): + min_max_dist{{name_suffix}}(self, 0, pt, &dist_LB, &dist_UB) + # compute max & min bounds on density within top node + log_min_bound = (log(self.sum_weight) + + compute_log_kernel(dist_UB, + h_c, kernel_c)) + log_max_bound = (log(self.sum_weight) + + compute_log_kernel(dist_LB, + h_c, kernel_c)) + log_bound_spread = logsubexp(log_max_bound, log_min_bound) + self._kde_single_depthfirst(0, pt, kernel_c, h_c, + log_knorm, log_atol, log_rtol, + log_min_bound, + log_bound_spread, + &log_min_bound, + &log_bound_spread) + log_density[i] = logaddexp(log_min_bound, + log_bound_spread - log(2)) + pt += n_features + + # normalize the results + for i in range(log_density.shape[0]): + log_density[i] += log_knorm + + log_density_arr = log_density_arr.reshape(X.shape[:X.ndim - 1]) + + if return_log: + return log_density_arr + else: + return np.exp(log_density_arr) + + def two_point_correlation(self, X, r, dualtree=False): + """ + two_point_correlation(X, r, dualtree=False) + + Compute the two-point correlation function + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query. Last dimension should match dimension + of training data. 
+ r : array-like + A one-dimensional array of distances + dualtree : bool, default=False + If True, use a dualtree algorithm. Otherwise, use a single-tree + algorithm. Dual tree algorithms can have better scaling for + large N. + + Returns + ------- + counts : ndarray + counts[i] contains the number of pairs of points with distance + less than or equal to r[i] + """ + cdef intp_t n_features = self.data.shape[1] + cdef intp_t i + + # validate X and prepare for query + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') + + if X.shape[X.ndim - 1] != self.data.shape[1]: + raise ValueError("query data dimension must " + "match training data dimension") + + np_Xarr = X.reshape((-1, self.data.shape[1])) + cdef {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr + + # prepare r for query + r = np.asarray(r, dtype=np.float64, order='C') + r = np.atleast_1d(r) + if r.ndim != 1: + raise ValueError("r must be a 1-dimensional array") + i_rsort = np.argsort(r) + rarr_np = r[i_rsort] # needed to keep memory in scope + cdef float64_t[::1] rarr = rarr_np + + # create array to hold counts + count = np.zeros(r.shape[0], dtype=np.intp) + cdef intp_t[::1] carr = count + + cdef const {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] + + if dualtree: + other = self.__class__(Xarr, metric=self.dist_metric, + leaf_size=self.leaf_size) + self._two_point_dual(0, other, 0, &rarr[0], &carr[0], + 0, rarr.shape[0]) + else: + for i in range(Xarr.shape[0]): + self._two_point_single(0, pt, &rarr[0], &carr[0], + 0, rarr.shape[0]) + pt += n_features + + return count + + cdef int _query_single_depthfirst( + self, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1 nogil: + """Recursive Single-tree k-neighbors query, depth-first approach""" + cdef NodeData_t node_info = self.node_data[i_node] + + cdef float64_t dist_pt, reduced_dist_LB_1, reduced_dist_LB_2 + cdef intp_t i, i1, i2 + + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + + # ------------------------------------------------------------ + # Case 1: query point is outside node radius: + # trim it from the query + if reduced_dist_LB > heap.largest(i_pt): + self.n_trims += 1 + + # ------------------------------------------------------------ + # Case 2: this is a leaf node. Update set of nearby points + elif node_info.is_leaf: + self.n_leaves += 1 + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.rdist(pt, + &self.data[self.idx_array[i], 0], + self.data.shape[1]) + heap._push(i_pt, dist_pt, self.idx_array[i]) + + # ------------------------------------------------------------ + # Case 3: Node is not a leaf. 
Recursively query subnodes + # starting with the closest + else: + self.n_splits += 1 + i1 = 2 * i_node + 1 + i2 = i1 + 1 + reduced_dist_LB_1 = min_rdist{{name_suffix}}(self, i1, pt) + reduced_dist_LB_2 = min_rdist{{name_suffix}}(self, i2, pt) + + # recursively query subnodes + if reduced_dist_LB_1 <= reduced_dist_LB_2: + self._query_single_depthfirst(i1, pt, i_pt, heap, + reduced_dist_LB_1) + self._query_single_depthfirst(i2, pt, i_pt, heap, + reduced_dist_LB_2) + else: + self._query_single_depthfirst(i2, pt, i_pt, heap, + reduced_dist_LB_2) + self._query_single_depthfirst(i1, pt, i_pt, heap, + reduced_dist_LB_1) + return 0 + + cdef int _query_single_breadthfirst( + self, + const {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: + """Non-recursive single-tree k-neighbors query, breadth-first search""" + cdef intp_t i, i_node + cdef float64_t dist_pt, reduced_dist_LB + cdef const NodeData_t* node_data = &self.node_data[0] + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + + # Set up the node heap and push the head node onto it + cdef NodeHeapData_t nodeheap_item + nodeheap_item.val = min_rdist{{name_suffix}}(self, 0, pt) + nodeheap_item.i1 = 0 + nodeheap.push(nodeheap_item) + + while nodeheap.n > 0: + nodeheap_item = nodeheap.pop() + reduced_dist_LB = nodeheap_item.val + i_node = nodeheap_item.i1 + node_info = node_data[i_node] + + # ------------------------------------------------------------ + # Case 1: query point is outside node radius: + # trim it from the query + if reduced_dist_LB > heap.largest(i_pt): + self.n_trims += 1 + + # ------------------------------------------------------------ + # Case 2: this is a leaf node. Update set of nearby points + elif node_data[i_node].is_leaf: + self.n_leaves += 1 + for i in range(node_data[i_node].idx_start, + node_data[i_node].idx_end): + dist_pt = self.rdist(pt, + &self.data[self.idx_array[i], 0], + self.data.shape[1]) + heap._push(i_pt, dist_pt, self.idx_array[i]) + + # ------------------------------------------------------------ + # Case 3: Node is not a leaf. Add subnodes to the node heap + else: + self.n_splits += 1 + for i in range(2 * i_node + 1, 2 * i_node + 3): + nodeheap_item.i1 = i + nodeheap_item.val = min_rdist{{name_suffix}}(self, i, pt) + nodeheap.push(nodeheap_item) + return 0 + + cdef int _query_dual_depthfirst( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t i_node2, + float64_t[::1] bounds, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1: + """Recursive dual-tree k-neighbors query, depth-first""" + # note that the array `bounds` is maintained such that + # bounds[i] is the largest distance among any of the + # current neighbors in node i of the other tree. 
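+        # (so if the smallest possible reduced distance between the two
+        # nodes already exceeds bounds[i_node2], no point in node i_node2
+        # can gain a closer neighbor from node i_node1, and the pair is
+        # pruned in Case 1 below)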
+ cdef NodeData_t node_info1 = self.node_data[i_node1] + cdef NodeData_t node_info2 = other.node_data[i_node2] + + cdef const {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef const {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] + cdef intp_t n_features = self.data.shape[1] + + cdef float64_t bound_max, dist_pt, reduced_dist_LB1, reduced_dist_LB2 + cdef intp_t i1, i2, i_pt, i_parent + + # ------------------------------------------------------------ + # Case 1: nodes are further apart than the current bound: + # trim both from the query + if reduced_dist_LB > bounds[i_node2]: + pass + + # ------------------------------------------------------------ + # Case 2: both nodes are leaves: + # do a brute-force search comparing all pairs + elif node_info1.is_leaf and node_info2.is_leaf: + bounds[i_node2] = 0 + + for i2 in range(node_info2.idx_start, node_info2.idx_end): + i_pt = other.idx_array[i2] + + if heap.largest(i_pt) <= reduced_dist_LB: + continue + + for i1 in range(node_info1.idx_start, node_info1.idx_end): + dist_pt = self.rdist( + data1 + n_features * self.idx_array[i1], + data2 + n_features * i_pt, + n_features) + heap._push(i_pt, dist_pt, self.idx_array[i1]) + + # keep track of node bound + bounds[i_node2] = fmax(bounds[i_node2], + heap.largest(i_pt)) + + # update bounds up the tree + while i_node2 > 0: + i_parent = (i_node2 - 1) // 2 + bound_max = fmax(bounds[2 * i_parent + 1], + bounds[2 * i_parent + 2]) + if bound_max < bounds[i_parent]: + bounds[i_parent] = bound_max + i_node2 = i_parent + else: + break + + # ------------------------------------------------------------ + # Case 3a: node 1 is a leaf or is smaller: split node 2 and + # recursively query, starting with the nearest subnode + elif node_info1.is_leaf or (not node_info2.is_leaf + and node_info2.radius > node_info1.radius): + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, i_node1, + other, 2 * i_node2 + 1) + reduced_dist_LB2 = min_rdist_dual{{name_suffix}}(self, i_node1, + other, 2 * i_node2 + 2) + + if reduced_dist_LB1 < reduced_dist_LB2: + self._query_dual_depthfirst(i_node1, other, 2 * i_node2 + 1, + bounds, heap, reduced_dist_LB1) + self._query_dual_depthfirst(i_node1, other, 2 * i_node2 + 2, + bounds, heap, reduced_dist_LB2) + else: + self._query_dual_depthfirst(i_node1, other, 2 * i_node2 + 2, + bounds, heap, reduced_dist_LB2) + self._query_dual_depthfirst(i_node1, other, 2 * i_node2 + 1, + bounds, heap, reduced_dist_LB1) + + # ------------------------------------------------------------ + # Case 3b: node 2 is a leaf or is smaller: split node 1 and + # recursively query, starting with the nearest subnode + else: + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 1, + other, i_node2) + reduced_dist_LB2 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 2, + other, i_node2) + + if reduced_dist_LB1 < reduced_dist_LB2: + self._query_dual_depthfirst(2 * i_node1 + 1, other, i_node2, + bounds, heap, reduced_dist_LB1) + self._query_dual_depthfirst(2 * i_node1 + 2, other, i_node2, + bounds, heap, reduced_dist_LB2) + else: + self._query_dual_depthfirst(2 * i_node1 + 2, other, i_node2, + bounds, heap, reduced_dist_LB2) + self._query_dual_depthfirst(2 * i_node1 + 1, other, i_node2, + bounds, heap, reduced_dist_LB1) + return 0 + + cdef int _query_dual_breadthfirst( + self, + BinaryTree{{name_suffix}} other, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: + """Non-recursive dual-tree k-neighbors query, breadth-first""" + cdef intp_t i, i1, i2, i_node1, i_node2, i_pt + cdef 
float64_t dist_pt, reduced_dist_LB + cdef float64_t[::1] bounds = np.full(other.node_data.shape[0], np.inf) + cdef const NodeData_t* node_data1 = &self.node_data[0] + cdef const NodeData_t* node_data2 = &other.node_data[0] + cdef NodeData_t node_info1, node_info2 + cdef const {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef const {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] + cdef intp_t n_features = self.data.shape[1] + + # Set up the node heap and push the head nodes onto it + cdef NodeHeapData_t nodeheap_item + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, 0, other, 0) + nodeheap_item.i1 = 0 + nodeheap_item.i2 = 0 + nodeheap.push(nodeheap_item) + + while nodeheap.n > 0: + nodeheap_item = nodeheap.pop() + reduced_dist_LB = nodeheap_item.val + i_node1 = nodeheap_item.i1 + i_node2 = nodeheap_item.i2 + + node_info1 = node_data1[i_node1] + node_info2 = node_data2[i_node2] + + # ------------------------------------------------------------ + # Case 1: nodes are further apart than the current bound: + # trim both from the query + if reduced_dist_LB > bounds[i_node2]: + pass + + # ------------------------------------------------------------ + # Case 2: both nodes are leaves: + # do a brute-force search comparing all pairs + elif node_info1.is_leaf and node_info2.is_leaf: + bounds[i_node2] = -1 + + for i2 in range(node_info2.idx_start, node_info2.idx_end): + i_pt = other.idx_array[i2] + + if heap.largest(i_pt) <= reduced_dist_LB: + continue + + for i1 in range(node_info1.idx_start, node_info1.idx_end): + dist_pt = self.rdist( + data1 + n_features * self.idx_array[i1], + data2 + n_features * i_pt, + n_features) + heap._push(i_pt, dist_pt, self.idx_array[i1]) + + # keep track of node bound + bounds[i_node2] = fmax(bounds[i_node2], + heap.largest(i_pt)) + + # ------------------------------------------------------------ + # Case 3a: node 1 is a leaf or is smaller: split node 2 and + # recursively query, starting with the nearest subnode + elif node_info1.is_leaf or (not node_info2.is_leaf + and (node_info2.radius + > node_info1.radius)): + nodeheap_item.i1 = i_node1 + for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): + nodeheap_item.i2 = i2 + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i_node1, + other, i2) + nodeheap.push(nodeheap_item) + + # ------------------------------------------------------------ + # Case 3b: node 2 is a leaf or is smaller: split node 1 and + # recursively query, starting with the nearest subnode + else: + nodeheap_item.i2 = i_node2 + for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): + nodeheap_item.i1 = i1 + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i1, + other, i_node2) + nodeheap.push(nodeheap_item) + return 0 + + cdef intp_t _query_radius_single( + self, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + float64_t r, + intp_t* indices, + {{INPUT_DTYPE_t}}* distances, + intp_t count, + int count_only, + int return_distance, + ) noexcept nogil: + """recursive single-tree radius query, depth-first""" + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + cdef intp_t* idx_array = &self.idx_array[0] + cdef intp_t n_features = self.data.shape[1] + cdef NodeData_t node_info = self.node_data[i_node] + + cdef intp_t i + cdef float64_t reduced_r + + cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) + + # ------------------------------------------------------------ + # Case 1: all node points are outside distance r. + # prune this branch. 
+ if dist_LB > r: + pass + + # ------------------------------------------------------------ + # Case 2: all node points are within distance r + # add all points to neighbors + elif dist_UB <= r: + if count_only: + count += (node_info.idx_end - node_info.idx_start) + else: + for i in range(node_info.idx_start, node_info.idx_end): + if (count < 0) or (count >= self.data.shape[0]): + return -1 + indices[count] = idx_array[i] + if return_distance: + distances[count] = self.dist(pt, (data + n_features + * idx_array[i]), + n_features) + count += 1 + + # ------------------------------------------------------------ + # Case 3: this is a leaf node. Go through all points to + # determine if they fall within radius + elif node_info.is_leaf: + reduced_r = self.dist_metric._dist_to_rdist(r) + + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.rdist(pt, (data + n_features * idx_array[i]), + n_features) + if dist_pt <= reduced_r: + if (count < 0) or (count >= self.data.shape[0]): + return -1 + if count_only: + pass + else: + indices[count] = idx_array[i] + if return_distance: + distances[count] =\ + self.dist_metric._rdist_to_dist(dist_pt) + count += 1 + + # ------------------------------------------------------------ + # Case 4: Node is not a leaf. Recursively query subnodes + else: + count = self._query_radius_single(2 * i_node + 1, pt, r, + indices, distances, count, + count_only, return_distance) + count = self._query_radius_single(2 * i_node + 2, pt, r, + indices, distances, count, + count_only, return_distance) + + return count + + cdef float64_t _kde_single_breadthfirst( + self, const {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + NodeHeap nodeheap, + float64_t* node_log_min_bounds, + float64_t* node_log_bound_spreads, + ): + """non-recursive single-tree kernel density estimation""" + # For the given point, node_log_min_bounds and node_log_bound_spreads + # will encode the current bounds on the density between the point + # and the associated node. + # The variables global_log_min_bound and global_log_bound_spread + # keep track of the global bounds on density. The procedure here is + # to split nodes, updating these bounds, until the bounds are within + # atol & rtol. 
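+        # In linear space, the global stopping test below reads
+        # (illustrative paraphrase of the log-space comparison):
+        #     knorm * bound_spread <= atol + rtol * knorm * min_bound
+        # i.e. we stop once the remaining uncertainty on the density is
+        # small relative to the requested tolerances.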
+ cdef intp_t i, i1, i2, i_node + cdef float64_t N1, N2 + cdef float64_t global_log_min_bound, global_log_bound_spread + cdef float64_t global_log_max_bound + + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + cdef bint with_sample_weight = self.sample_weight is not None + cdef const {{INPUT_DTYPE_t}}* sample_weight + if with_sample_weight: + sample_weight = &self.sample_weight[0] + cdef intp_t* idx_array = &self.idx_array[0] + cdef const NodeData_t* node_data = &self.node_data[0] + cdef float64_t N + cdef float64_t log_weight + if with_sample_weight: + N = self.sum_weight + else: + N = self.data.shape[0] + cdef intp_t n_features = self.data.shape[1] + + cdef NodeData_t node_info + cdef float64_t dist_pt, log_density + cdef float64_t dist_LB_1 = 0, dist_LB_2 = 0 + cdef float64_t dist_UB_1 = 0, dist_UB_2 = 0 + + cdef float64_t dist_UB, dist_LB + + # push the top node to the heap + cdef NodeHeapData_t nodeheap_item + nodeheap_item.val = min_dist{{name_suffix}}(self, 0, pt) + nodeheap_item.i1 = 0 + nodeheap.push(nodeheap_item) + + global_log_min_bound = log(N) + compute_log_kernel( + max_dist{{name_suffix}}(self, 0, pt), h, kernel + ) + global_log_max_bound = log(N) + compute_log_kernel(nodeheap_item.val, + h, kernel) + global_log_bound_spread = logsubexp(global_log_max_bound, + global_log_min_bound) + + node_log_min_bounds[0] = global_log_min_bound + node_log_bound_spreads[0] = global_log_bound_spread + + while nodeheap.n > 0: + nodeheap_item = nodeheap.pop() + i_node = nodeheap_item.i1 + + node_info = node_data[i_node] + if with_sample_weight: + N1 = _total_node_weight(node_data, sample_weight, + idx_array, i_node) + else: + N1 = node_info.idx_end - node_info.idx_start + + # ------------------------------------------------------------ + # Case 1: local bounds are equal to within per-point tolerance. + if (log_knorm + node_log_bound_spreads[i_node] - log(N1) + log(N) + <= logaddexp(log_atol, (log_rtol + log_knorm + + node_log_min_bounds[i_node]))): + pass + + # ------------------------------------------------------------ + # Case 2: global bounds are within rtol & atol. + elif (log_knorm + global_log_bound_spread + <= logaddexp(log_atol, + log_rtol + log_knorm + global_log_min_bound)): + break + + # ------------------------------------------------------------ + # Case 3: node is a leaf. Count contributions from all points + elif node_info.is_leaf: + global_log_min_bound =\ + logsubexp(global_log_min_bound, + node_log_min_bounds[i_node]) + global_log_bound_spread =\ + logsubexp(global_log_bound_spread, + node_log_bound_spreads[i_node]) + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.dist(pt, data + n_features * idx_array[i], + n_features) + log_density = compute_log_kernel(dist_pt, h, kernel) + if with_sample_weight: + log_weight = np.log(sample_weight[idx_array[i]]) + else: + log_weight = 0. 
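+                    # Fold this point's exact (weighted) kernel
+                    # contribution into the running lower bound.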
+ global_log_min_bound = logaddexp(global_log_min_bound, + log_density + log_weight) + + # ------------------------------------------------------------ + # Case 4: split node and query subnodes + else: + i1 = 2 * i_node + 1 + i2 = 2 * i_node + 2 + + if with_sample_weight: + N1 = _total_node_weight(node_data, sample_weight, + idx_array, i1) + N2 = _total_node_weight(node_data, sample_weight, + idx_array, i2) + else: + N1 = node_data[i1].idx_end - node_data[i1].idx_start + N2 = node_data[i2].idx_end - node_data[i2].idx_start + + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB_1, &dist_UB_1) + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB_2, &dist_UB_2) + + node_log_min_bounds[i1] = (log(N1) + + compute_log_kernel(dist_UB_1, + h, kernel)) + node_log_bound_spreads[i1] = (log(N1) + + compute_log_kernel(dist_LB_1, + h, kernel)) + + node_log_min_bounds[i2] = (log(N2) + + compute_log_kernel(dist_UB_2, + h, kernel)) + node_log_bound_spreads[i2] = (log(N2) + + compute_log_kernel(dist_LB_2, + h, kernel)) + + global_log_min_bound = logsubexp(global_log_min_bound, + node_log_min_bounds[i_node]) + global_log_min_bound = logaddexp(global_log_min_bound, + node_log_min_bounds[i1]) + global_log_min_bound = logaddexp(global_log_min_bound, + node_log_min_bounds[i2]) + + global_log_bound_spread =\ + logsubexp(global_log_bound_spread, + node_log_bound_spreads[i_node]) + global_log_bound_spread = logaddexp(global_log_bound_spread, + node_log_bound_spreads[i1]) + global_log_bound_spread = logaddexp(global_log_bound_spread, + node_log_bound_spreads[i2]) + + # TODO: rank by the spread rather than the distance? + nodeheap_item.val = dist_LB_1 + nodeheap_item.i1 = i1 + nodeheap.push(nodeheap_item) + + nodeheap_item.val = dist_LB_2 + nodeheap_item.i1 = i2 + nodeheap.push(nodeheap_item) + + nodeheap.clear() + return logaddexp(global_log_min_bound, + global_log_bound_spread - log(2)) + + cdef int _kde_single_depthfirst( + self, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + float64_t local_log_min_bound, + float64_t local_log_bound_spread, + float64_t* global_log_min_bound, + float64_t* global_log_bound_spread, + ) except -1: + """recursive single-tree kernel density estimate, depth-first""" + # For the given point, local_min_bound and local_max_bound give the + # minimum and maximum density for the current node, while + # global_min_bound and global_max_bound give the minimum and maximum + # density over the entire tree. We recurse down until global_min_bound + # and global_max_bound are within rtol and atol. 
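+        # These bounds back the tree's public kernel_density method and,
+        # through it, the sklearn.neighbors.KernelDensity estimator; an
+        # illustrative use of that public API would be
+        #     KernelDensity(bandwidth=0.5).fit(X).score_samples(X)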
+ cdef intp_t i, i1, i2, iw, start, end + cdef float64_t N1, N2 + + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + cdef const NodeData_t* node_data = &self.node_data[0] + cdef bint with_sample_weight = self.sample_weight is not None + cdef const {{INPUT_DTYPE_t}}* sample_weight + cdef float64_t log_weight + if with_sample_weight: + sample_weight = &self.sample_weight[0] + cdef intp_t* idx_array = &self.idx_array[0] + cdef intp_t n_features = self.data.shape[1] + + cdef NodeData_t node_info = self.node_data[i_node] + cdef float64_t dist_pt, log_dens_contribution + + cdef float64_t child1_log_min_bound, child2_log_min_bound + cdef float64_t child1_log_bound_spread, child2_log_bound_spread + cdef float64_t dist_UB = 0, dist_LB = 0 + + if with_sample_weight: + N1 = _total_node_weight(node_data, sample_weight, + idx_array, i_node) + N2 = self.sum_weight + else: + N1 = (node_info.idx_end - node_info.idx_start) + N2 = self.data.shape[0] + + # ------------------------------------------------------------ + # Case 1: local bounds are equal to within errors. Return + if ( + log_knorm + local_log_bound_spread - log(N1) + log(N2) + <= logaddexp(log_atol, (log_rtol + log_knorm + local_log_min_bound)) + ): + pass + + # ------------------------------------------------------------ + # Case 2: global bounds are within rtol & atol. Return + elif ( + log_knorm + global_log_bound_spread[0] + <= logaddexp(log_atol, (log_rtol + log_knorm + global_log_min_bound[0])) + ): + pass + + # ------------------------------------------------------------ + # Case 3: node is a leaf. Count contributions from all points + elif node_info.is_leaf: + global_log_min_bound[0] = logsubexp(global_log_min_bound[0], + local_log_min_bound) + global_log_bound_spread[0] = logsubexp(global_log_bound_spread[0], + local_log_bound_spread) + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.dist(pt, (data + n_features * idx_array[i]), + n_features) + log_dens_contribution = compute_log_kernel(dist_pt, h, kernel) + if with_sample_weight: + log_weight = np.log(sample_weight[idx_array[i]]) + else: + log_weight = 0. 
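+            # Fold this point's exact (weighted) kernel contribution
+            # into the running lower bound held by the caller.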
+ global_log_min_bound[0] = logaddexp(global_log_min_bound[0], + (log_dens_contribution + + log_weight)) + + # ------------------------------------------------------------ + # Case 4: split node and query subnodes + else: + i1 = 2 * i_node + 1 + i2 = 2 * i_node + 2 + + if with_sample_weight: + N1 = _total_node_weight(node_data, sample_weight, + idx_array, i1) + N2 = _total_node_weight(node_data, sample_weight, + idx_array, i2) + else: + N1 = (self.node_data[i1].idx_end - self.node_data[i1].idx_start) + N2 = (self.node_data[i2].idx_end - self.node_data[i2].idx_start) + + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB, &dist_UB) + child1_log_min_bound = log(N1) + compute_log_kernel(dist_UB, h, + kernel) + child1_log_bound_spread = logsubexp(log(N1) + + compute_log_kernel(dist_LB, h, + kernel), + child1_log_min_bound) + + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB, &dist_UB) + child2_log_min_bound = log(N2) + compute_log_kernel(dist_UB, h, + kernel) + child2_log_bound_spread = logsubexp(log(N2) + + compute_log_kernel(dist_LB, h, + kernel), + child2_log_min_bound) + + global_log_min_bound[0] = logsubexp(global_log_min_bound[0], + local_log_min_bound) + global_log_min_bound[0] = logaddexp(global_log_min_bound[0], + child1_log_min_bound) + global_log_min_bound[0] = logaddexp(global_log_min_bound[0], + child2_log_min_bound) + + global_log_bound_spread[0] = logsubexp(global_log_bound_spread[0], + local_log_bound_spread) + global_log_bound_spread[0] = logaddexp(global_log_bound_spread[0], + child1_log_bound_spread) + global_log_bound_spread[0] = logaddexp(global_log_bound_spread[0], + child2_log_bound_spread) + + self._kde_single_depthfirst(i1, pt, kernel, h, log_knorm, + log_atol, log_rtol, + child1_log_min_bound, + child1_log_bound_spread, + global_log_min_bound, + global_log_bound_spread) + self._kde_single_depthfirst(i2, pt, kernel, h, log_knorm, + log_atol, log_rtol, + child2_log_min_bound, + child2_log_bound_spread, + global_log_min_bound, + global_log_bound_spread) + return 0 + + cdef int _two_point_single( + self, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: + """recursive single-tree two-point correlation function query""" + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + cdef intp_t* idx_array = &self.idx_array[0] + cdef intp_t n_features = self.data.shape[1] + cdef NodeData_t node_info = self.node_data[i_node] + + cdef intp_t i, j, Npts + cdef float64_t reduced_r + + cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) + + # ------------------------------------------------------------ + # Go through bounds and check for cuts + while i_min < i_max: + if dist_LB > r[i_min]: + i_min += 1 + else: + break + + while i_max > i_min: + Npts = (node_info.idx_end - node_info.idx_start) + if dist_UB <= r[i_max - 1]: + count[i_max - 1] += Npts + i_max -= 1 + else: + break + + if i_min < i_max: + # If node is a leaf, go through all points + if node_info.is_leaf: + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.dist(pt, (data + n_features * idx_array[i]), + n_features) + j = i_max - 1 + while (j >= i_min) and (dist_pt <= r[j]): + count[j] += 1 + j -= 1 + + else: + self._two_point_single(2 * i_node + 1, pt, r, + count, i_min, i_max) + self._two_point_single(2 * i_node + 2, pt, r, + count, i_min, i_max) + return 0 + + cdef int _two_point_dual( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t 
i_node2, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: + """recursive dual-tree two-point correlation function query""" + cdef const {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef const {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] + cdef intp_t* idx_array1 = &self.idx_array[0] + cdef intp_t* idx_array2 = &other.idx_array[0] + cdef NodeData_t node_info1 = self.node_data[i_node1] + cdef NodeData_t node_info2 = other.node_data[i_node2] + + cdef intp_t n_features = self.data.shape[1] + + cdef intp_t i1, i2, j, Npts + cdef float64_t reduced_r + + cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 + dist_LB = min_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) + dist_UB = max_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) + + # ------------------------------------------------------------ + # Go through bounds and check for cuts + while i_min < i_max: + if dist_LB > r[i_min]: + i_min += 1 + else: + break + + while i_max > i_min: + Npts = ((node_info1.idx_end - node_info1.idx_start) + * (node_info2.idx_end - node_info2.idx_start)) + if dist_UB <= r[i_max - 1]: + count[i_max - 1] += Npts + i_max -= 1 + else: + break + + if i_min < i_max: + if node_info1.is_leaf and node_info2.is_leaf: + # If both nodes are leaves, go through all points + for i1 in range(node_info1.idx_start, node_info1.idx_end): + for i2 in range(node_info2.idx_start, node_info2.idx_end): + dist_pt = self.dist((data1 + n_features + * idx_array1[i1]), + (data2 + n_features + * idx_array2[i2]), + n_features) + j = i_max - 1 + while (j >= i_min) and (dist_pt <= r[j]): + count[j] += 1 + j -= 1 + + elif node_info1.is_leaf: + # If only one is a leaf, split the other + for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): + self._two_point_dual(i_node1, other, i2, + r, count, i_min, i_max) + + elif node_info2.is_leaf: + for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): + self._two_point_dual(i1, other, i_node2, + r, count, i_min, i_max) + + else: + # neither is a leaf: split & query both + for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): + for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): + self._two_point_dual(i1, other, i2, + r, count, i_min, i_max) + return 0 + +{{endfor}} + +###################################################################### +# Python functions for benchmarking and testing C implementations + +def simultaneous_sort(float64_t[:, ::1] distances, intp_t[:, ::1] indices): + """In-place simultaneous sort the given row of the arrays + + This python wrapper exists primarily to enable unit testing + of the _simultaneous_sort C routine. 
+ """ + assert distances.shape[0] == indices.shape[0] + assert distances.shape[1] == indices.shape[1] + cdef intp_t row + for row in range(distances.shape[0]): + _simultaneous_sort(&distances[row, 0], + &indices[row, 0], + distances.shape[1]) + + +def nodeheap_sort(float64_t[::1] vals): + """In-place reverse sort of vals using NodeHeap""" + cdef intp_t[::1] indices = np.zeros(vals.shape[0], dtype=np.intp) + cdef float64_t[::1] vals_sorted = np.zeros_like(vals) + + # use initial size 0 to check corner case + cdef NodeHeap heap = NodeHeap(0) + cdef NodeHeapData_t data + cdef intp_t i + for i in range(vals.shape[0]): + data.val = vals[i] + data.i1 = i + data.i2 = i + 1 + heap.push(data) + + for i in range(vals.shape[0]): + data = heap.pop() + vals_sorted[i] = data.val + indices[i] = data.i1 + + return np.asarray(vals_sorted), np.asarray(indices) + + +cdef inline float64_t _total_node_weight( + const NodeData_t* node_data, + const floating* sample_weight, + const intp_t* idx_array, + intp_t i_node, +): + cdef intp_t i + cdef float64_t N = 0.0 + for i in range(node_data[i_node].idx_start, node_data[i_node].idx_end): + N += sample_weight[idx_array[i]] + return N diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_classification.py b/.venv/Lib/site-packages/sklearn/neighbors/_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..ae514004f1fd2cd6986796c29e258f4293fcaed7 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_classification.py @@ -0,0 +1,919 @@ +"""Nearest Neighbor Classification""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral + +import numpy as np + +from sklearn.neighbors._base import _check_precomputed + +from ..base import ClassifierMixin, _fit_context +from ..metrics._pairwise_distances_reduction import ( + ArgKminClassMode, + RadiusNeighborsClassMode, +) +from ..utils._param_validation import StrOptions +from ..utils.arrayfuncs import _all_with_any_reduction_axis_1 +from ..utils.extmath import weighted_mode +from ..utils.fixes import _mode +from ..utils.validation import ( + _is_arraylike, + _num_samples, + check_is_fitted, + validate_data, +) +from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin, _get_weights + + +def _adjusted_metric(metric, metric_kwargs, p=None): + metric_kwargs = metric_kwargs or {} + if metric == "minkowski": + metric_kwargs["p"] = p + if p == 2: + metric = "euclidean" + return metric, metric_kwargs + + +class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase): + """Classifier implementing the k-nearest neighbors vote. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_neighbors : int, default=5 + Number of neighbors to use by default for :meth:`kneighbors` queries. + + weights : {'uniform', 'distance'}, callable or None, default='uniform' + Weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood + are weighted equally. + - 'distance' : weight points by the inverse of their distance. + in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable] : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. 
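+
+        For example, a callable such as ``lambda dist: 1.0 / (1.0 + dist)``
+        (an illustrative choice, not a library default) would weight
+        neighbors by a smoothed inverse of their distance.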
+
+        Refer to the example entitled
+        :ref:`sphx_glr_auto_examples_neighbors_plot_classification.py`
+        showing the impact of the `weights` parameter on the decision
+        boundary.
+
+    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
+        Algorithm used to compute the nearest neighbors:
+
+        - 'ball_tree' will use :class:`BallTree`
+        - 'kd_tree' will use :class:`KDTree`
+        - 'brute' will use a brute-force search.
+        - 'auto' will attempt to decide the most appropriate algorithm
+          based on the values passed to :meth:`fit` method.
+
+        Note: fitting on sparse input will override the setting of
+        this parameter, using brute force.
+
+    leaf_size : int, default=30
+        Leaf size passed to BallTree or KDTree. This can affect the
+        speed of the construction and query, as well as the memory
+        required to store the tree. The optimal value depends on the
+        nature of the problem.
+
+    p : float, default=2
+        Power parameter for the Minkowski metric. When p = 1, this is equivalent
+        to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2.
+        For arbitrary p, minkowski_distance (l_p) is used. This parameter is expected
+        to be positive.
+
+    metric : str or callable, default='minkowski'
+        Metric to use for distance computation. Default is "minkowski", which
+        results in the standard Euclidean distance when p = 2. See the
+        documentation of `scipy.spatial.distance `_ and
+        the metrics listed in
+        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
+        values.
+
+        If metric is "precomputed", X is assumed to be a distance matrix and
+        must be square during fit. X may be a :term:`sparse graph`, in which
+        case only "nonzero" elements may be considered neighbors.
+
+        If metric is a callable function, it takes two arrays representing 1D
+        vectors as inputs and must return one value indicating the distance
+        between those vectors. This works for Scipy's metrics, but is less
+        efficient than passing the metric name as a string.
+
+    metric_params : dict, default=None
+        Additional keyword arguments for the metric function.
+
+    n_jobs : int, default=None
+        The number of parallel jobs to run for neighbors search.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary `
+        for more details.
+        Doesn't affect :meth:`fit` method.
+
+    Attributes
+    ----------
+    classes_ : array of shape (n_classes,)
+        Class labels known to the classifier.
+
+    effective_metric_ : str or callable
+        The distance metric used. It will be the same as the `metric` parameter
+        or a synonym of it, e.g. 'euclidean' if the `metric` parameter is set to
+        'minkowski' and the `p` parameter to 2.
+
+    effective_metric_params_ : dict
+        Additional keyword arguments for the metric function. For most metrics
+        it will be the same as the `metric_params` parameter, but it may also
+        contain the `p` parameter value if the `effective_metric_` attribute is
+        set to 'minkowski'.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    n_samples_fit_ : int
+        Number of samples in the fitted data.
+
+    outputs_2d_ : bool
+        False when `y`'s shape is (n_samples,) or (n_samples, 1) during fit,
+        otherwise True.
+
+    See Also
+    --------
+    RadiusNeighborsClassifier: Classifier based on neighbors within a fixed radius.
+ KNeighborsRegressor: Regression based on k-nearest neighbors. + RadiusNeighborsRegressor: Regression based on neighbors within a fixed radius. + NearestNeighbors: Unsupervised learner for implementing neighbor searches. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + .. warning:: + + Regarding the Nearest Neighbors algorithms, if it is found that two + neighbors, neighbor `k+1` and `k`, have identical distances + but different labels, the results will depend on the ordering of the + training data. + + https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm + + Examples + -------- + >>> X = [[0], [1], [2], [3]] + >>> y = [0, 0, 1, 1] + >>> from sklearn.neighbors import KNeighborsClassifier + >>> neigh = KNeighborsClassifier(n_neighbors=3) + >>> neigh.fit(X, y) + KNeighborsClassifier(...) + >>> print(neigh.predict([[1.1]])) + [0] + >>> print(neigh.predict_proba([[0.9]])) + [[0.666... 0.333...]] + """ + + _parameter_constraints: dict = {**NeighborsBase._parameter_constraints} + _parameter_constraints.pop("radius") + _parameter_constraints.update( + {"weights": [StrOptions({"uniform", "distance"}), callable, None]} + ) + + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.weights = weights + + @_fit_context( + # KNeighborsClassifier.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y): + """Fit the k-nearest neighbors classifier from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_outputs) + Target values. + + Returns + ------- + self : KNeighborsClassifier + The fitted k-nearest neighbors classifier. + """ + return self._fit(X, y) + + def predict(self, X): + """Predict the class labels for the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + y : ndarray of shape (n_queries,) or (n_queries, n_outputs) + Class labels for each data sample. + """ + check_is_fitted(self, "_fit_method") + if self.weights == "uniform": + if self._fit_method == "brute" and ArgKminClassMode.is_usable_for( + X, self._fit_X, self.metric + ): + probabilities = self.predict_proba(X) + if self.outputs_2d_: + return np.stack( + [ + self.classes_[idx][np.argmax(probas, axis=1)] + for idx, probas in enumerate(probabilities) + ], + axis=1, + ) + return self.classes_[np.argmax(probabilities, axis=1)] + # In that case, we do not need the distances to perform + # the weighting so we do not compute them. 
+            neigh_ind = self.kneighbors(X, return_distance=False)
+            neigh_dist = None
+        else:
+            neigh_dist, neigh_ind = self.kneighbors(X)
+
+        classes_ = self.classes_
+        _y = self._y
+        if not self.outputs_2d_:
+            _y = self._y.reshape((-1, 1))
+            classes_ = [self.classes_]
+
+        n_outputs = len(classes_)
+        n_queries = _num_samples(self._fit_X if X is None else X)
+        weights = _get_weights(neigh_dist, self.weights)
+        if weights is not None and _all_with_any_reduction_axis_1(weights, value=0):
+            raise ValueError(
+                "All neighbors of some sample are getting zero weights. "
+                "Please modify 'weights' to avoid this case if you are "
+                "using a user-defined function."
+            )
+
+        y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)
+        for k, classes_k in enumerate(classes_):
+            if weights is None:
+                mode, _ = _mode(_y[neigh_ind, k], axis=1)
+            else:
+                mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)
+
+            mode = np.asarray(mode.ravel(), dtype=np.intp)
+            y_pred[:, k] = classes_k.take(mode)
+
+        if not self.outputs_2d_:
+            y_pred = y_pred.ravel()
+
+        return y_pred
+
+    def predict_proba(self, X):
+        """Return probability estimates for the test data X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_queries, n_features), \
+                or (n_queries, n_indexed) if metric == 'precomputed', or None
+            Test samples. If `None`, predictions for all indexed points are
+            returned; in this case, points are not considered their own
+            neighbors.
+
+        Returns
+        -------
+        p : ndarray of shape (n_queries, n_classes), or a list of n_outputs \
+                of such arrays if n_outputs > 1.
+            The class probabilities of the input samples. Classes are ordered
+            by lexicographic order.
+        """
+        check_is_fitted(self, "_fit_method")
+        if self.weights == "uniform":
+            # TODO: systematize this mapping of metric for
+            # PairwiseDistancesReductions.
+            metric, metric_kwargs = _adjusted_metric(
+                metric=self.metric, metric_kwargs=self.metric_params, p=self.p
+            )
+            if (
+                self._fit_method == "brute"
+                and ArgKminClassMode.is_usable_for(X, self._fit_X, metric)
+                # TODO: Implement efficient multi-output solution
+                and not self.outputs_2d_
+            ):
+                if self.metric == "precomputed":
+                    X = _check_precomputed(X)
+                else:
+                    X = validate_data(
+                        self, X, accept_sparse="csr", reset=False, order="C"
+                    )
+
+                probabilities = ArgKminClassMode.compute(
+                    X,
+                    self._fit_X,
+                    k=self.n_neighbors,
+                    weights=self.weights,
+                    Y_labels=self._y,
+                    unique_Y_labels=self.classes_,
+                    metric=metric,
+                    metric_kwargs=metric_kwargs,
+                    # `strategy="parallel_on_X"` has in practice been shown
+                    # to be more efficient than `strategy="parallel_on_Y"`
+                    # on many combinations of datasets.
+                    # Hence, we choose to enforce it here.
+                    # For more information, see:
+                    # https://github.com/scikit-learn/scikit-learn/pull/24076#issuecomment-1445258342 # noqa
+                    # TODO: adapt the heuristic for `strategy="auto"` for
+                    # `ArgKminClassMode` and use `strategy="auto"`.
+                    strategy="parallel_on_X",
+                )
+                return probabilities
+
+            # In that case, we do not need the distances to perform
+            # the weighting so we do not compute them.
+            neigh_ind = self.kneighbors(X, return_distance=False)
+            neigh_dist = None
+        else:
+            neigh_dist, neigh_ind = self.kneighbors(X)
+
+        classes_ = self.classes_
+        _y = self._y
+        if not self.outputs_2d_:
+            _y = self._y.reshape((-1, 1))
+            classes_ = [self.classes_]
+
+        n_queries = _num_samples(self._fit_X if X is None else X)
+
+        weights = _get_weights(neigh_dist, self.weights)
+        if weights is None:
+            weights = np.ones_like(neigh_ind)
+        elif _all_with_any_reduction_axis_1(weights, value=0):
+            raise ValueError(
+                "All neighbors of some sample are getting zero weights. "
+                "Please modify 'weights' to avoid this case if you are "
+                "using a user-defined function."
+            )
+
+        all_rows = np.arange(n_queries)
+        probabilities = []
+        for k, classes_k in enumerate(classes_):
+            pred_labels = _y[:, k][neigh_ind]
+            proba_k = np.zeros((n_queries, classes_k.size))
+
+            # a simple ':' index doesn't work right
+            for i, idx in enumerate(pred_labels.T):  # loop is O(n_neighbors)
+                proba_k[all_rows, idx] += weights[:, i]
+
+            # normalize 'votes' into real [0,1] probabilities
+            normalizer = proba_k.sum(axis=1)[:, np.newaxis]
+            proba_k /= normalizer
+
+            probabilities.append(proba_k)
+
+        if not self.outputs_2d_:
+            probabilities = probabilities[0]
+
+        return probabilities
+
+    # This function is defined here only to modify the parent docstring
+    # and add information about X=None
+    def score(self, X, y, sample_weight=None):
+        """
+        Return the mean accuracy on the given test data and labels.
+
+        In multi-label classification, this is the subset accuracy
+        which is a harsh metric since you require for each sample that
+        each label set be correctly predicted.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features), or None
+            Test samples. If `None`, predictions for all indexed points are
+            used; in this case, points are not considered their own
+            neighbors. This means that `knn.fit(X, y).score(None, y)`
+            implicitly performs a leave-one-out cross-validation procedure
+            and is equivalent to `cross_val_score(knn, X, y, cv=LeaveOneOut())`
+            but typically much faster.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
+            True labels for `X`.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights.
+
+        Returns
+        -------
+        score : float
+            Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
+        """
+        return super().score(X, y, sample_weight)
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.classifier_tags.multi_label = True
+        tags.input_tags.pairwise = self.metric == "precomputed"
+        return tags
+
+
+class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, NeighborsBase):
+    """Classifier implementing a vote among neighbors within a given radius.
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    radius : float, default=1.0
+        Range of parameter space to use by default for :meth:`radius_neighbors`
+        queries.
+
+    weights : {'uniform', 'distance'}, callable or None, default='uniform'
+        Weight function used in prediction. Possible values:
+
+        - 'uniform' : uniform weights. All points in each neighborhood
+          are weighted equally.
+        - 'distance' : weight points by the inverse of their distance.
+          In this case, closer neighbors of a query point will have a
+          greater influence than neighbors which are further away.
+        - [callable] : a user-defined function which accepts an
+          array of distances, and returns an array of the same shape
+          containing the weights.
+
+        Uniform weights are used by default.
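+        An illustrative (non-default) callable would be
+        ``lambda dist: 1.0 / (1.0 + dist)``, which weights neighbors by a
+        smoothed inverse of their distance.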
+
+    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
+        Algorithm used to compute the nearest neighbors:
+
+        - 'ball_tree' will use :class:`BallTree`
+        - 'kd_tree' will use :class:`KDTree`
+        - 'brute' will use a brute-force search.
+        - 'auto' will attempt to decide the most appropriate algorithm
+          based on the values passed to :meth:`fit` method.
+
+        Note: fitting on sparse input will override the setting of
+        this parameter, using brute force.
+
+    leaf_size : int, default=30
+        Leaf size passed to BallTree or KDTree. This can affect the
+        speed of the construction and query, as well as the memory
+        required to store the tree. The optimal value depends on the
+        nature of the problem.
+
+    p : float, default=2
+        Power parameter for the Minkowski metric. When p = 1, this is
+        equivalent to using manhattan_distance (l1), and euclidean_distance
+        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
+        This parameter is expected to be positive.
+
+    metric : str or callable, default='minkowski'
+        Metric to use for distance computation. Default is "minkowski", which
+        results in the standard Euclidean distance when p = 2. See the
+        documentation of `scipy.spatial.distance `_ and
+        the metrics listed in
+        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
+        values.
+
+        If metric is "precomputed", X is assumed to be a distance matrix and
+        must be square during fit. X may be a :term:`sparse graph`, in which
+        case only "nonzero" elements may be considered neighbors.
+
+        If metric is a callable function, it takes two arrays representing 1D
+        vectors as inputs and must return one value indicating the distance
+        between those vectors. This works for Scipy's metrics, but is less
+        efficient than passing the metric name as a string.
+
+    outlier_label : {manual label, 'most_frequent'}, default=None
+        Label for outlier samples (samples with no neighbors in given radius).
+
+        - manual label: str or int label (should be the same type as y)
+          or list of manual labels if multi-output is used.
+        - 'most_frequent' : assign the most frequent label of y to outliers.
+        - None : when any outlier is detected, ValueError will be raised.
+
+        The outlier label should be selected from among the unique 'Y' labels.
+        If it is specified with a different value, a warning will be raised
+        and all class probabilities of outliers will be set to 0.
+
+    metric_params : dict, default=None
+        Additional keyword arguments for the metric function.
+
+    n_jobs : int, default=None
+        The number of parallel jobs to run for neighbors search.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary `
+        for more details.
+
+    Attributes
+    ----------
+    classes_ : ndarray of shape (n_classes,)
+        Class labels known to the classifier.
+
+    effective_metric_ : str or callable
+        The distance metric used. It will be the same as the `metric` parameter
+        or a synonym of it, e.g. 'euclidean' if the `metric` parameter is set to
+        'minkowski' and the `p` parameter to 2.
+
+    effective_metric_params_ : dict
+        Additional keyword arguments for the metric function. For most metrics
+        it will be the same as the `metric_params` parameter, but it may also
+        contain the `p` parameter value if the `effective_metric_` attribute is
+        set to 'minkowski'.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`.
Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + outlier_label_ : int or array-like of shape (n_class,) + Label which is given for outlier samples (samples with no neighbors + on given radius). + + outputs_2d_ : bool + False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit + otherwise True. + + See Also + -------- + KNeighborsClassifier : Classifier implementing the k-nearest neighbors + vote. + RadiusNeighborsRegressor : Regression based on neighbors within a + fixed radius. + KNeighborsRegressor : Regression based on k-nearest neighbors. + NearestNeighbors : Unsupervised learner for implementing neighbor + searches. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm + + Examples + -------- + >>> X = [[0], [1], [2], [3]] + >>> y = [0, 0, 1, 1] + >>> from sklearn.neighbors import RadiusNeighborsClassifier + >>> neigh = RadiusNeighborsClassifier(radius=1.0) + >>> neigh.fit(X, y) + RadiusNeighborsClassifier(...) + >>> print(neigh.predict([[1.5]])) + [0] + >>> print(neigh.predict_proba([[1.0]])) + [[0.66666667 0.33333333]] + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "weights": [StrOptions({"uniform", "distance"}), callable, None], + "outlier_label": [Integral, str, "array-like", None], + } + _parameter_constraints.pop("n_neighbors") + + def __init__( + self, + radius=1.0, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + outlier_label=None, + metric_params=None, + n_jobs=None, + ): + super().__init__( + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.weights = weights + self.outlier_label = outlier_label + + @_fit_context( + # RadiusNeighborsClassifier.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y): + """Fit the radius neighbors classifier from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_outputs) + Target values. + + Returns + ------- + self : RadiusNeighborsClassifier + The fitted radius neighbors classifier. + """ + self._fit(X, y) + + classes_ = self.classes_ + _y = self._y + if not self.outputs_2d_: + _y = self._y.reshape((-1, 1)) + classes_ = [self.classes_] + + if self.outlier_label is None: + outlier_label_ = None + + elif self.outlier_label == "most_frequent": + outlier_label_ = [] + # iterate over multi-output, get the most frequent label for each + # output. 
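+            # (np.bincount counts occurrences of each encoded class label;
+            # argmax then selects the modal class for each output.)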
+ for k, classes_k in enumerate(classes_): + label_count = np.bincount(_y[:, k]) + outlier_label_.append(classes_k[label_count.argmax()]) + + else: + if _is_arraylike(self.outlier_label) and not isinstance( + self.outlier_label, str + ): + if len(self.outlier_label) != len(classes_): + raise ValueError( + "The length of outlier_label: {} is " + "inconsistent with the output " + "length: {}".format(self.outlier_label, len(classes_)) + ) + outlier_label_ = self.outlier_label + else: + outlier_label_ = [self.outlier_label] * len(classes_) + + for classes, label in zip(classes_, outlier_label_): + if _is_arraylike(label) and not isinstance(label, str): + # ensure the outlier label for each output is a scalar. + raise TypeError( + "The outlier_label of classes {} is " + "supposed to be a scalar, got " + "{}.".format(classes, label) + ) + if np.append(classes, label).dtype != classes.dtype: + # ensure the dtype of outlier label is consistent with y. + raise TypeError( + "The dtype of outlier_label {} is " + "inconsistent with classes {} in " + "y.".format(label, classes) + ) + + self.outlier_label_ = outlier_label_ + + return self + + def predict(self, X): + """Predict the class labels for the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + y : ndarray of shape (n_queries,) or (n_queries, n_outputs) + Class labels for each data sample. + """ + + probs = self.predict_proba(X) + classes_ = self.classes_ + + if not self.outputs_2d_: + probs = [probs] + classes_ = [self.classes_] + + n_outputs = len(classes_) + n_queries = probs[0].shape[0] + y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype) + + for k, prob in enumerate(probs): + # iterate over multi-output, assign labels based on probabilities + # of each output. + max_prob_index = prob.argmax(axis=1) + y_pred[:, k] = classes_[k].take(max_prob_index) + + outlier_zero_probs = (prob == 0).all(axis=1) + if outlier_zero_probs.any(): + zero_prob_index = np.flatnonzero(outlier_zero_probs) + y_pred[zero_prob_index, k] = self.outlier_label_[k] + + if not self.outputs_2d_: + y_pred = y_pred.ravel() + + return y_pred + + def predict_proba(self, X): + """Return probability estimates for the test data X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + p : ndarray of shape (n_queries, n_classes), or a list of \ + n_outputs of such arrays if n_outputs > 1. + The class probabilities of the input samples. Classes are ordered + by lexicographic order. 
+        """
+        check_is_fitted(self, "_fit_method")
+        n_queries = _num_samples(self._fit_X if X is None else X)
+
+        metric, metric_kwargs = _adjusted_metric(
+            metric=self.metric, metric_kwargs=self.metric_params, p=self.p
+        )
+
+        if (
+            self.weights == "uniform"
+            and self._fit_method == "brute"
+            and not self.outputs_2d_
+            and RadiusNeighborsClassMode.is_usable_for(X, self._fit_X, metric)
+        ):
+            probabilities = RadiusNeighborsClassMode.compute(
+                X=X,
+                Y=self._fit_X,
+                radius=self.radius,
+                weights=self.weights,
+                Y_labels=self._y,
+                unique_Y_labels=self.classes_,
+                outlier_label=self.outlier_label,
+                metric=metric,
+                metric_kwargs=metric_kwargs,
+                strategy="parallel_on_X",
+                # `strategy="parallel_on_X"` has in practice been shown
+                # to be more efficient than `strategy="parallel_on_Y"`
+                # on many combinations of datasets.
+                # Hence, we choose to enforce it here.
+                # For more information, see:
+                # https://github.com/scikit-learn/scikit-learn/pull/26828/files#r1282398471 # noqa
+            )
+            return probabilities
+
+        neigh_dist, neigh_ind = self.radius_neighbors(X)
+        outlier_mask = np.zeros(n_queries, dtype=bool)
+        outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind]
+        outliers = np.flatnonzero(outlier_mask)
+        inliers = np.flatnonzero(~outlier_mask)
+
+        classes_ = self.classes_
+        _y = self._y
+        if not self.outputs_2d_:
+            _y = self._y.reshape((-1, 1))
+            classes_ = [self.classes_]
+
+        if self.outlier_label_ is None and outliers.size > 0:
+            raise ValueError(
+                "No neighbors found for test samples %r, "
+                "you can try using a larger radius, "
+                "giving a label for outliers, "
+                "or considering removing them from your dataset." % outliers
+            )
+
+        weights = _get_weights(neigh_dist, self.weights)
+        if weights is not None:
+            weights = weights[inliers]
+
+        probabilities = []
+        # iterate over multi-output, measure probabilities of the k-th output.
+        for k, classes_k in enumerate(classes_):
+            pred_labels = np.zeros(len(neigh_ind), dtype=object)
+            pred_labels[:] = [_y[ind, k] for ind in neigh_ind]
+
+            proba_k = np.zeros((n_queries, classes_k.size))
+            proba_inl = np.zeros((len(inliers), classes_k.size))
+
+            # samples may have different numbers of neighbors within the
+            # same radius
+            if weights is None:
+                for i, idx in enumerate(pred_labels[inliers]):
+                    proba_inl[i, :] = np.bincount(idx, minlength=classes_k.size)
+            else:
+                for i, idx in enumerate(pred_labels[inliers]):
+                    proba_inl[i, :] = np.bincount(
+                        idx, weights[i], minlength=classes_k.size
+                    )
+            proba_k[inliers, :] = proba_inl
+
+            if outliers.size > 0:
+                _outlier_label = self.outlier_label_[k]
+                label_index = np.flatnonzero(classes_k == _outlier_label)
+                if label_index.size == 1:
+                    proba_k[outliers, label_index[0]] = 1.0
+                else:
+                    warnings.warn(
+                        "Outlier label {} is not in training "
+                        "classes. All class probabilities of "
+                        "outliers will be set to 0."
+                        "".format(self.outlier_label_[k])
+                    )
+
+            # normalize 'votes' into real [0,1] probabilities
+            normalizer = proba_k.sum(axis=1)[:, np.newaxis]
+            normalizer[normalizer == 0.0] = 1.0
+            proba_k /= normalizer
+
+            probabilities.append(proba_k)
+
+        if not self.outputs_2d_:
+            probabilities = probabilities[0]
+
+        return probabilities
+
+    # This function is defined here only to modify the parent docstring
+    # and add information about X=None
+    def score(self, X, y, sample_weight=None):
+        """
+        Return the mean accuracy on the given test data and labels.
+
+        In multi-label classification, this is the subset accuracy
+        which is a harsh metric since you require for each sample that
+        each label set be correctly predicted.
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features), or None + Test samples. If `None`, predictions for all indexed points are + used; in this case, points are not considered their own + neighbors. This means that `knn.fit(X, y).score(None, y)` + implicitly performs a leave-one-out cross-validation procedure + and is equivalent to `cross_val_score(knn, X, y, cv=LeaveOneOut())` + but typically much faster. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + True labels for `X`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float + Mean accuracy of ``self.predict(X)`` w.r.t. `y`. + """ + return super().score(X, y, sample_weight) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + return tags diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_graph.py b/.venv/Lib/site-packages/sklearn/neighbors/_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..e3b3325f06e37bacfbf8804be3f8b5bd5e9c52e7 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_graph.py @@ -0,0 +1,704 @@ +"""Nearest Neighbors graph functions""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools + +from ..base import ClassNamePrefixFeaturesOutMixin, TransformerMixin, _fit_context +from ..utils._param_validation import ( + Integral, + Interval, + Real, + StrOptions, + validate_params, +) +from ..utils.validation import check_is_fitted +from ._base import VALID_METRICS, KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin +from ._unsupervised import NearestNeighbors + + +def _check_params(X, metric, p, metric_params): + """Check the validity of the input parameters""" + params = zip(["metric", "p", "metric_params"], [metric, p, metric_params]) + est_params = X.get_params() + for param_name, func_param in params: + if func_param != est_params[param_name]: + raise ValueError( + "Got %s for %s, while the estimator has %s for the same parameter." + % (func_param, param_name, est_params[param_name]) + ) + + +def _query_include_self(X, include_self, mode): + """Return the query based on include_self param""" + if include_self == "auto": + include_self = mode == "connectivity" + + # it does not include each sample as its own neighbors + if not include_self: + X = None + + return X + + +@validate_params( + { + "X": ["array-like", "sparse matrix", KNeighborsMixin], + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "mode": [StrOptions({"connectivity", "distance"})], + "metric": [StrOptions(set(itertools.chain(*VALID_METRICS.values()))), callable], + "p": [Interval(Real, 0, None, closed="right"), None], + "metric_params": [dict, None], + "include_self": ["boolean", StrOptions({"auto"})], + "n_jobs": [Integral, None], + }, + prefer_skip_nested_validation=False, # metric is not validated yet +) +def kneighbors_graph( + X, + n_neighbors, + *, + mode="connectivity", + metric="minkowski", + p=2, + metric_params=None, + include_self=False, + n_jobs=None, +): + """Compute the (weighted) graph of k-Neighbors for points in X. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Sample data. + + n_neighbors : int + Number of neighbors for each sample. 
+
+    mode : {'connectivity', 'distance'}, default='connectivity'
+        Type of returned matrix: 'connectivity' will return the connectivity
+        matrix with ones and zeros, and 'distance' will return the distances
+        between neighbors according to the given metric.
+
+    metric : str, default='minkowski'
+        Metric to use for distance computation. Default is "minkowski", which
+        results in the standard Euclidean distance when p = 2. See the
+        documentation of `scipy.spatial.distance `_ and
+        the metrics listed in
+        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
+        values.
+
+    p : float, default=2
+        Power parameter for the Minkowski metric. When p = 1, this is equivalent
+        to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2.
+        For arbitrary p, minkowski_distance (l_p) is used. This parameter is expected
+        to be positive.
+
+    metric_params : dict, default=None
+        Additional keyword arguments for the metric function.
+
+    include_self : bool or 'auto', default=False
+        Whether or not to mark each sample as the first nearest neighbor to
+        itself. If 'auto', then True is used for mode='connectivity' and False
+        for mode='distance'.
+
+    n_jobs : int, default=None
+        The number of parallel jobs to run for neighbors search.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary `
+        for more details.
+
+    Returns
+    -------
+    A : sparse matrix of shape (n_samples, n_samples)
+        Graph where A[i, j] is assigned the weight of edge that
+        connects i to j. The matrix is of CSR format.
+
+    See Also
+    --------
+    radius_neighbors_graph: Compute the (weighted) graph of Neighbors for points in X.
+
+    Examples
+    --------
+    >>> X = [[0], [3], [1]]
+    >>> from sklearn.neighbors import kneighbors_graph
+    >>> A = kneighbors_graph(X, 2, mode='connectivity', include_self=True)
+    >>> A.toarray()
+    array([[1., 0., 1.],
+           [0., 1., 1.],
+           [1., 0., 1.]])
+    """
+    if not isinstance(X, KNeighborsMixin):
+        X = NearestNeighbors(
+            n_neighbors=n_neighbors,
+            metric=metric,
+            p=p,
+            metric_params=metric_params,
+            n_jobs=n_jobs,
+        ).fit(X)
+    else:
+        _check_params(X, metric, p, metric_params)
+
+    query = _query_include_self(X._fit_X, include_self, mode)
+    return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode)
+
+
+@validate_params(
+    {
+        "X": ["array-like", "sparse matrix", RadiusNeighborsMixin],
+        "radius": [Interval(Real, 0, None, closed="both")],
+        "mode": [StrOptions({"connectivity", "distance"})],
+        "metric": [StrOptions(set(itertools.chain(*VALID_METRICS.values()))), callable],
+        "p": [Interval(Real, 0, None, closed="right"), None],
+        "metric_params": [dict, None],
+        "include_self": ["boolean", StrOptions({"auto"})],
+        "n_jobs": [Integral, None],
+    },
+    prefer_skip_nested_validation=False,  # metric is not validated yet
+)
+def radius_neighbors_graph(
+    X,
+    radius,
+    *,
+    mode="connectivity",
+    metric="minkowski",
+    p=2,
+    metric_params=None,
+    include_self=False,
+    n_jobs=None,
+):
+    """Compute the (weighted) graph of Neighbors for points in X.
+
+    Neighborhoods are restricted to the points at a distance lower than
+    radius.
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        Sample data.
+
+    radius : float
+        Radius of neighborhoods.
+ + mode : {'connectivity', 'distance'}, default='connectivity' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. + + metric : str, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + p : float, default=2 + Power parameter for the Minkowski metric. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + include_self : bool or 'auto', default=False + Whether or not to mark each sample as the first nearest neighbor to + itself. If 'auto', then True is used for mode='connectivity' and False + for mode='distance'. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Returns + ------- + A : sparse matrix of shape (n_samples, n_samples) + Graph where A[i, j] is assigned the weight of edge that connects + i to j. The matrix is of CSR format. + + See Also + -------- + kneighbors_graph: Compute the weighted graph of k-neighbors for points in X. + + Examples + -------- + >>> X = [[0], [3], [1]] + >>> from sklearn.neighbors import radius_neighbors_graph + >>> A = radius_neighbors_graph(X, 1.5, mode='connectivity', + ... include_self=True) + >>> A.toarray() + array([[1., 0., 1.], + [0., 1., 0.], + [1., 0., 1.]]) + """ + if not isinstance(X, RadiusNeighborsMixin): + X = NearestNeighbors( + radius=radius, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ).fit(X) + else: + _check_params(X, metric, p, metric_params) + + query = _query_include_self(X._fit_X, include_self, mode) + return X.radius_neighbors_graph(query, radius, mode) + + +class KNeighborsTransformer( + ClassNamePrefixFeaturesOutMixin, KNeighborsMixin, TransformerMixin, NeighborsBase +): + """Transform X into a (weighted) graph of k nearest neighbors. + + The transformed data is a sparse graph as returned by kneighbors_graph. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + mode : {'distance', 'connectivity'}, default='distance' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. + + n_neighbors : int, default=5 + Number of neighbors for each sample in the transformed sparse graph. + For compatibility reasons, as each sample is considered as its own + neighbor, one extra neighbor will be computed when mode == 'distance'. + In this case, the sparse graph contains (n_neighbors + 1) neighbors. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. 
+        - 'auto' will attempt to decide the most appropriate algorithm
+          based on the values passed to the :meth:`fit` method.
+
+        Note: fitting on sparse input will override the setting of
+        this parameter, using brute force.
+
+    leaf_size : int, default=30
+        Leaf size passed to BallTree or KDTree. This can affect the
+        speed of the construction and query, as well as the memory
+        required to store the tree. The optimal value depends on the
+        nature of the problem.
+
+    metric : str or callable, default='minkowski'
+        Metric to use for distance computation. Default is "minkowski", which
+        results in the standard Euclidean distance when p = 2. See the
+        documentation of `scipy.spatial.distance
+        `_ and
+        the metrics listed in
+        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
+        values.
+
+        If metric is a callable function, it takes two arrays representing 1D
+        vectors as inputs and must return one value indicating the distance
+        between those vectors. This works for Scipy's metrics, but is less
+        efficient than passing the metric name as a string.
+
+        Distance matrices are not supported.
+
+    p : float, default=2
+        Parameter for the Minkowski metric from
+        sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
+        equivalent to using manhattan_distance (l1), and euclidean_distance
+        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
+        This parameter is expected to be positive.
+
+    metric_params : dict, default=None
+        Additional keyword arguments for the metric function.
+
+    n_jobs : int, default=None
+        The number of parallel jobs to run for neighbors search.
+        If ``-1``, then the number of jobs is set to the number of CPU cores.
+
+    Attributes
+    ----------
+    effective_metric_ : str or callable
+        The distance metric used. It will be the same as the `metric`
+        parameter or a synonym of it, e.g. 'euclidean' if the `metric`
+        parameter is set to 'minkowski' and the `p` parameter to 2.
+
+    effective_metric_params_ : dict
+        Additional keyword arguments for the metric function. For most
+        metrics it will be the same as the `metric_params` parameter, but it
+        may also contain the `p` parameter value if the `effective_metric_`
+        attribute is set to 'minkowski'.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    n_samples_fit_ : int
+        Number of samples in the fitted data.
+
+    See Also
+    --------
+    kneighbors_graph : Compute the weighted graph of k-neighbors for
+        points in X.
+    RadiusNeighborsTransformer : Transform X into a weighted graph of
+        neighbors nearer than a radius.
+
+    Notes
+    -----
+    For an example of using :class:`~sklearn.neighbors.KNeighborsTransformer`
+    in combination with :class:`~sklearn.manifold.TSNE` see
+    :ref:`sphx_glr_auto_examples_neighbors_approximate_nearest_neighbors.py`.
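+
+    A sketch of a typical composition (assuming only the standard
+    scikit-learn pipeline API; the classifier settings are illustrative):
+    the precomputed sparse graph is reused by a downstream estimator that
+    accepts ``metric='precomputed'``, and with ``mode='distance'`` one extra
+    neighbor is computed, as explained above.
+
+    >>> from sklearn.neighbors import KNeighborsClassifier
+    >>> from sklearn.pipeline import make_pipeline
+    >>> pipe = make_pipeline(
+    ...     KNeighborsTransformer(n_neighbors=5, mode='distance'),
+    ...     KNeighborsClassifier(metric='precomputed', n_neighbors=5),
+    ... )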
+ + Examples + -------- + >>> from sklearn.datasets import load_wine + >>> from sklearn.neighbors import KNeighborsTransformer + >>> X, _ = load_wine(return_X_y=True) + >>> X.shape + (178, 13) + >>> transformer = KNeighborsTransformer(n_neighbors=5, mode='distance') + >>> X_dist_graph = transformer.fit_transform(X) + >>> X_dist_graph.shape + (178, 178) + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "mode": [StrOptions({"distance", "connectivity"})], + } + _parameter_constraints.pop("radius") + + def __init__( + self, + *, + mode="distance", + n_neighbors=5, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + super(KNeighborsTransformer, self).__init__( + n_neighbors=n_neighbors, + radius=None, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.mode = mode + + @_fit_context( + # KNeighborsTransformer.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit the k-nearest neighbors transformer from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : KNeighborsTransformer + The fitted k-nearest neighbors transformer. + """ + self._fit(X) + self._n_features_out = self.n_samples_fit_ + return self + + def transform(self, X): + """Compute the (weighted) graph of Neighbors for points in X. + + Parameters + ---------- + X : array-like of shape (n_samples_transform, n_features) + Sample data. + + Returns + ------- + Xt : sparse matrix of shape (n_samples_transform, n_samples_fit) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + The matrix is of CSR format. + """ + check_is_fitted(self) + add_one = self.mode == "distance" + return self.kneighbors_graph( + X, mode=self.mode, n_neighbors=self.n_neighbors + add_one + ) + + def fit_transform(self, X, y=None): + """Fit to data, then transform it. + + Fits transformer to X and y with optional parameters fit_params + and returns a transformed version of X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + Xt : sparse matrix of shape (n_samples, n_samples) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + The matrix is of CSR format. + """ + return self.fit(X).transform(X) + + +class RadiusNeighborsTransformer( + ClassNamePrefixFeaturesOutMixin, + RadiusNeighborsMixin, + TransformerMixin, + NeighborsBase, +): + """Transform X into a (weighted) graph of neighbors nearer than a radius. + + The transformed data is a sparse graph as returned by + `radius_neighbors_graph`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + mode : {'distance', 'connectivity'}, default='distance' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. 
+
+    radius : float, default=1.0
+        Radius of neighborhood in the transformed sparse graph.
+
+    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
+        Algorithm used to compute the nearest neighbors:
+
+        - 'ball_tree' will use :class:`BallTree`
+        - 'kd_tree' will use :class:`KDTree`
+        - 'brute' will use a brute-force search.
+        - 'auto' will attempt to decide the most appropriate algorithm
+          based on the values passed to the :meth:`fit` method.
+
+        Note: fitting on sparse input will override the setting of
+        this parameter, using brute force.
+
+    leaf_size : int, default=30
+        Leaf size passed to BallTree or KDTree. This can affect the
+        speed of the construction and query, as well as the memory
+        required to store the tree. The optimal value depends on the
+        nature of the problem.
+
+    metric : str or callable, default='minkowski'
+        Metric to use for distance computation. Default is "minkowski", which
+        results in the standard Euclidean distance when p = 2. See the
+        documentation of `scipy.spatial.distance
+        `_ and
+        the metrics listed in
+        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
+        values.
+
+        If metric is a callable function, it takes two arrays representing 1D
+        vectors as inputs and must return one value indicating the distance
+        between those vectors. This works for Scipy's metrics, but is less
+        efficient than passing the metric name as a string.
+
+        Distance matrices are not supported.
+
+    p : float, default=2
+        Parameter for the Minkowski metric from
+        sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
+        equivalent to using manhattan_distance (l1), and euclidean_distance
+        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
+        This parameter is expected to be positive.
+
+    metric_params : dict, default=None
+        Additional keyword arguments for the metric function.
+
+    n_jobs : int, default=None
+        The number of parallel jobs to run for neighbors search.
+        If ``-1``, then the number of jobs is set to the number of CPU cores.
+
+    Attributes
+    ----------
+    effective_metric_ : str or callable
+        The distance metric used. It will be the same as the `metric`
+        parameter or a synonym of it, e.g. 'euclidean' if the `metric`
+        parameter is set to 'minkowski' and the `p` parameter to 2.
+
+    effective_metric_params_ : dict
+        Additional keyword arguments for the metric function. For most
+        metrics it will be the same as the `metric_params` parameter, but it
+        may also contain the `p` parameter value if the `effective_metric_`
+        attribute is set to 'minkowski'.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    n_samples_fit_ : int
+        Number of samples in the fitted data.
+
+    See Also
+    --------
+    kneighbors_graph : Compute the weighted graph of k-neighbors for
+        points in X.
+    KNeighborsTransformer : Transform X into a weighted graph of k
+        nearest neighbors.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.datasets import load_wine
+    >>> from sklearn.cluster import DBSCAN
+    >>> from sklearn.neighbors import RadiusNeighborsTransformer
+    >>> from sklearn.pipeline import make_pipeline
+    >>> X, _ = load_wine(return_X_y=True)
+    >>> estimator = make_pipeline(
+    ...     RadiusNeighborsTransformer(radius=42.0, mode='distance'),
+    ...
DBSCAN(eps=25.0, metric='precomputed')) + >>> X_clustered = estimator.fit_predict(X) + >>> clusters, counts = np.unique(X_clustered, return_counts=True) + >>> print(counts) + [ 29 15 111 11 12] + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "mode": [StrOptions({"distance", "connectivity"})], + } + _parameter_constraints.pop("n_neighbors") + + def __init__( + self, + *, + mode="distance", + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + super(RadiusNeighborsTransformer, self).__init__( + n_neighbors=None, + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.mode = mode + + @_fit_context( + # RadiusNeighborsTransformer.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit the radius neighbors transformer from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : RadiusNeighborsTransformer + The fitted radius neighbors transformer. + """ + self._fit(X) + self._n_features_out = self.n_samples_fit_ + return self + + def transform(self, X): + """Compute the (weighted) graph of Neighbors for points in X. + + Parameters + ---------- + X : array-like of shape (n_samples_transform, n_features) + Sample data. + + Returns + ------- + Xt : sparse matrix of shape (n_samples_transform, n_samples_fit) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + The matrix is of CSR format. + """ + check_is_fitted(self) + return self.radius_neighbors_graph(X, mode=self.mode, sort_results=True) + + def fit_transform(self, X, y=None): + """Fit to data, then transform it. + + Fits transformer to X and y with optional parameters fit_params + and returns a transformed version of X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + Xt : sparse matrix of shape (n_samples, n_samples) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + The matrix is of CSR format. 
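+
+        Examples
+        --------
+        A minimal sketch with synthetic one-dimensional data (the output
+        shape follows the Returns description above):
+
+        >>> import numpy as np
+        >>> from sklearn.neighbors import RadiusNeighborsTransformer
+        >>> X = np.array([[0.0], [1.0], [4.0]])
+        >>> RadiusNeighborsTransformer(radius=1.5).fit_transform(X).shape
+        (3, 3)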
+ """ + return self.fit(X).transform(X) diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.cp39-win_amd64.lib b/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.cp39-win_amd64.lib new file mode 100644 index 0000000000000000000000000000000000000000..6eaa79ab27018028c9090f4c78f747306184eddb Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.cp39-win_amd64.lib differ diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.cp39-win_amd64.pyd b/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.cp39-win_amd64.pyd new file mode 100644 index 0000000000000000000000000000000000000000..fe0c5cfc0a30ec22b0b3d915cf359b5aaa2464ac Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.cp39-win_amd64.pyd differ diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.pyx.tp b/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..4d182a3644e4bb73f344e0e15e4bc0299e4ea785 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.pyx.tp @@ -0,0 +1,336 @@ +{{py: + +# Generated file: _kd_tree.pyx + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # + ('64', 'float64_t', 'np.float64'), + ('32', 'float32_t', 'np.float32') +] + +# By Jake Vanderplas (2013) +# written for the scikit-learn project +# SPDX-License-Identifier: BSD-3-Clause + +}} + + +__all__ = ['KDTree', 'KDTree64', 'KDTree32'] + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +DOC_DICT{{name_suffix}} = { + 'BinaryTree': 'KDTree{{name_suffix}}', + 'binary_tree': 'kd_tree{{name_suffix}}', +} + +VALID_METRICS{{name_suffix}} = [ + 'EuclideanDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'ChebyshevDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}' +] + +{{endfor}} + +include "_binary_tree.pxi" + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +# Inherit KDTree{{name_suffix}} from BinaryTree{{name_suffix}} +cdef class KDTree{{name_suffix}}(BinaryTree{{name_suffix}}): + __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}}) + pass + +{{endfor}} + + +# ---------------------------------------------------------------------- +# The functions below specialized the Binary Tree as a KD Tree +# +# Note that these functions use the concept of "reduced distance". +# The reduced distance, defined for some metrics, is a quantity which +# is more efficient to compute than the distance, but preserves the +# relative rankings of the true distance. For example, the reduced +# distance for the Euclidean metric is the squared-euclidean distance. +# For some metrics, the reduced distance is simply the distance. 
+ +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t n_nodes, + intp_t n_features, +) except -1: + """Allocate arrays needed for the KD Tree""" + tree.node_bounds = np.zeros((2, n_nodes, n_features), dtype={{INPUT_DTYPE}}) + return 0 + + +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + intp_t i_node, + intp_t idx_start, + intp_t idx_end, +) except -1: + """Initialize the node for the dataset stored in tree.data""" + cdef intp_t n_features = tree.data.shape[1] + cdef intp_t i, j + cdef float64_t rad = 0 + + cdef {{INPUT_DTYPE_t}}* lower_bounds = &tree.node_bounds[0, i_node, 0] + cdef {{INPUT_DTYPE_t}}* upper_bounds = &tree.node_bounds[1, i_node, 0] + cdef const {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] + cdef const intp_t* idx_array = &tree.idx_array[0] + + cdef const {{INPUT_DTYPE_t}}* data_row + + # determine Node bounds + for j in range(n_features): + lower_bounds[j] = INF + upper_bounds[j] = -INF + + # Compute the actual data range. At build time, this is slightly + # slower than using the previously-computed bounds of the parent node, + # but leads to more compact trees and thus faster queries. + for i in range(idx_start, idx_end): + data_row = data + idx_array[i] * n_features + for j in range(n_features): + lower_bounds[j] = fmin(lower_bounds[j], data_row[j]) + upper_bounds[j] = fmax(upper_bounds[j], data_row[j]) + + for j in range(n_features): + if tree.dist_metric.p == INF: + rad = fmax(rad, 0.5 * (upper_bounds[j] - lower_bounds[j])) + else: + rad += pow(0.5 * abs(upper_bounds[j] - lower_bounds[j]), + tree.dist_metric.p) + + node_data[i_node].idx_start = idx_start + node_data[i_node].idx_end = idx_end + + # The radius will hold the size of the circumscribed hypersphere measured + # with the specified metric: in querying, this is used as a measure of the + # size of each node when deciding which nodes to split. + node_data[i_node].radius = pow(rad, 1. / tree.dist_metric.p) + return 0 + + +cdef float64_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum reduced-distance between a point and a node""" + cdef intp_t n_features = tree.data.shape[1] + cdef float64_t d, d_lo, d_hi, rdist=0.0 + cdef intp_t j + + if tree.dist_metric.p == INF: + for j in range(n_features): + d_lo = tree.node_bounds[0, i_node, j] - pt[j] + d_hi = pt[j] - tree.node_bounds[1, i_node, j] + d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi)) + rdist = fmax(rdist, 0.5 * d) + else: + # here we'll use the fact that x + abs(x) = 2 * max(x, 0) + for j in range(n_features): + d_lo = tree.node_bounds[0, i_node, j] - pt[j] + d_hi = pt[j] - tree.node_bounds[1, i_node, j] + d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi)) + rdist += pow(0.5 * d, tree.dist_metric.p) + + return rdist + + +cdef float64_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the minimum distance between a point and a node""" + if tree.dist_metric.p == INF: + return min_rdist{{name_suffix}}(tree, i_node, pt) + else: + return pow( + min_rdist{{name_suffix}}(tree, i_node, pt), + 1. 
/ tree.dist_metric.p + ) + + +cdef float64_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum reduced-distance between a point and a node""" + cdef intp_t n_features = tree.data.shape[1] + + cdef float64_t d_lo, d_hi, rdist=0.0 + cdef intp_t j + + if tree.dist_metric.p == INF: + for j in range(n_features): + rdist = fmax(rdist, fabs(pt[j] - tree.node_bounds[0, i_node, j])) + rdist = fmax(rdist, fabs(pt[j] - tree.node_bounds[1, i_node, j])) + else: + for j in range(n_features): + d_lo = fabs(pt[j] - tree.node_bounds[0, i_node, j]) + d_hi = fabs(pt[j] - tree.node_bounds[1, i_node, j]) + rdist += pow(fmax(d_lo, d_hi), tree.dist_metric.p) + + return rdist + + +cdef float64_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum distance between a point and a node""" + if tree.dist_metric.p == INF: + return max_rdist{{name_suffix}}(tree, i_node, pt) + else: + return pow( + max_rdist{{name_suffix}}(tree, i_node, pt), + 1. / tree.dist_metric.p + ) + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + float64_t* min_dist, + float64_t* max_dist, +) except -1 nogil: + """Compute the minimum and maximum distance between a point and a node""" + cdef intp_t n_features = tree.data.shape[1] + + cdef float64_t d, d_lo, d_hi + cdef intp_t j + + min_dist[0] = 0.0 + max_dist[0] = 0.0 + + if tree.dist_metric.p == INF: + for j in range(n_features): + d_lo = tree.node_bounds[0, i_node, j] - pt[j] + d_hi = pt[j] - tree.node_bounds[1, i_node, j] + d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi)) + min_dist[0] = fmax(min_dist[0], 0.5 * d) + max_dist[0] = fmax(max_dist[0], fabs(d_lo)) + max_dist[0] = fmax(max_dist[0], fabs(d_hi)) + else: + # as above, use the fact that x + abs(x) = 2 * max(x, 0) + for j in range(n_features): + d_lo = tree.node_bounds[0, i_node, j] - pt[j] + d_hi = pt[j] - tree.node_bounds[1, i_node, j] + d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi)) + min_dist[0] += pow(0.5 * d, tree.dist_metric.p) + max_dist[0] += pow(fmax(fabs(d_lo), fabs(d_hi)), + tree.dist_metric.p) + + min_dist[0] = pow(min_dist[0], 1. / tree.dist_metric.p) + max_dist[0] = pow(max_dist[0], 1. 
/ tree.dist_metric.p) + + return 0 + + +cdef inline float64_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """Compute the minimum reduced distance between two nodes""" + cdef intp_t n_features = tree1.data.shape[1] + + cdef float64_t d, d1, d2, rdist=0.0 + cdef intp_t j + + if tree1.dist_metric.p == INF: + for j in range(n_features): + d1 = (tree1.node_bounds[0, i_node1, j] + - tree2.node_bounds[1, i_node2, j]) + d2 = (tree2.node_bounds[0, i_node2, j] + - tree1.node_bounds[1, i_node1, j]) + d = (d1 + fabs(d1)) + (d2 + fabs(d2)) + + rdist = fmax(rdist, 0.5 * d) + else: + # here we'll use the fact that x + abs(x) = 2 * max(x, 0) + for j in range(n_features): + d1 = (tree1.node_bounds[0, i_node1, j] + - tree2.node_bounds[1, i_node2, j]) + d2 = (tree2.node_bounds[0, i_node2, j] + - tree1.node_bounds[1, i_node1, j]) + d = (d1 + fabs(d1)) + (d2 + fabs(d2)) + + rdist += pow(0.5 * d, tree1.dist_metric.p) + + return rdist + + +cdef inline float64_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """Compute the minimum distance between two nodes""" + return tree1.dist_metric._rdist_to_dist( + min_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + + +cdef inline float64_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """Compute the maximum reduced distance between two nodes""" + cdef intp_t n_features = tree1.data.shape[1] + + cdef float64_t d1, d2, rdist=0.0 + cdef intp_t j + + if tree1.dist_metric.p == INF: + for j in range(n_features): + rdist = fmax(rdist, fabs(tree1.node_bounds[0, i_node1, j] + - tree2.node_bounds[1, i_node2, j])) + rdist = fmax(rdist, fabs(tree1.node_bounds[1, i_node1, j] + - tree2.node_bounds[0, i_node2, j])) + else: + for j in range(n_features): + d1 = fabs(tree1.node_bounds[0, i_node1, j] + - tree2.node_bounds[1, i_node2, j]) + d2 = fabs(tree1.node_bounds[1, i_node1, j] + - tree2.node_bounds[0, i_node2, j]) + rdist += pow(fmax(d1, d2), tree1.dist_metric.p) + + return rdist + + +cdef inline float64_t max_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """Compute the maximum distance between two nodes""" + return tree1.dist_metric._rdist_to_dist( + max_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} + + +class KDTree(KDTree64): + __doc__ = CLASS_DOC.format(BinaryTree="KDTree") + pass diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_kde.py b/.venv/Lib/site-packages/sklearn/neighbors/_kde.py new file mode 100644 index 0000000000000000000000000000000000000000..9ee37a639f31e2fb4753119fece1a9de837e650b --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_kde.py @@ -0,0 +1,359 @@ +""" +Kernel Density Estimation +------------------------- +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +from numbers import Integral, Real + +import numpy as np +from scipy.special import gammainc + +from ..base import BaseEstimator, _fit_context +from ..neighbors._base import VALID_METRICS +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import row_norms +from ..utils.validation import 
_check_sample_weight, check_is_fitted, validate_data +from ._ball_tree import BallTree +from ._kd_tree import KDTree + +VALID_KERNELS = [ + "gaussian", + "tophat", + "epanechnikov", + "exponential", + "linear", + "cosine", +] + +TREE_DICT = {"ball_tree": BallTree, "kd_tree": KDTree} + + +# TODO: implement a brute force version for testing purposes +# TODO: create a density estimation base class? +class KernelDensity(BaseEstimator): + """Kernel Density Estimation. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + bandwidth : float or {"scott", "silverman"}, default=1.0 + The bandwidth of the kernel. If bandwidth is a float, it defines the + bandwidth of the kernel. If bandwidth is a string, one of the estimation + methods is implemented. + + algorithm : {'kd_tree', 'ball_tree', 'auto'}, default='auto' + The tree algorithm to use. + + kernel : {'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', \ + 'cosine'}, default='gaussian' + The kernel to use. + + metric : str, default='euclidean' + Metric to use for distance computation. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + Not all metrics are valid with all algorithms: refer to the + documentation of :class:`BallTree` and :class:`KDTree`. Note that the + normalization of the density output is correct only for the Euclidean + distance metric. + + atol : float, default=0 + The desired absolute tolerance of the result. A larger tolerance will + generally lead to faster execution. + + rtol : float, default=0 + The desired relative tolerance of the result. A larger tolerance will + generally lead to faster execution. + + breadth_first : bool, default=True + If true (default), use a breadth-first approach to the problem. + Otherwise use a depth-first approach. + + leaf_size : int, default=40 + Specify the leaf size of the underlying tree. See :class:`BallTree` + or :class:`KDTree` for details. + + metric_params : dict, default=None + Additional parameters to be passed to the tree for use with the + metric. For more information, see the documentation of + :class:`BallTree` or :class:`KDTree`. + + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + tree_ : ``BinaryTree`` instance + The tree algorithm for fast generalized N-point problems. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + bandwidth_ : float + Value of the bandwidth, given directly by the bandwidth parameter or + estimated using the 'scott' or 'silverman' method. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.neighbors.KDTree : K-dimensional tree for fast generalized N-point + problems. + sklearn.neighbors.BallTree : Ball tree for fast generalized N-point + problems. + + Examples + -------- + Compute a gaussian kernel density estimate with a fixed bandwidth. 
+ + >>> from sklearn.neighbors import KernelDensity + >>> import numpy as np + >>> rng = np.random.RandomState(42) + >>> X = rng.random_sample((100, 3)) + >>> kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(X) + >>> log_density = kde.score_samples(X[:3]) + >>> log_density + array([-1.52955942, -1.51462041, -1.60244657]) + """ + + _parameter_constraints: dict = { + "bandwidth": [ + Interval(Real, 0, None, closed="neither"), + StrOptions({"scott", "silverman"}), + ], + "algorithm": [StrOptions(set(TREE_DICT.keys()) | {"auto"})], + "kernel": [StrOptions(set(VALID_KERNELS))], + "metric": [ + StrOptions( + set(itertools.chain(*[VALID_METRICS[alg] for alg in TREE_DICT.keys()])) + ) + ], + "atol": [Interval(Real, 0, None, closed="left")], + "rtol": [Interval(Real, 0, None, closed="left")], + "breadth_first": ["boolean"], + "leaf_size": [Interval(Integral, 1, None, closed="left")], + "metric_params": [None, dict], + } + + def __init__( + self, + *, + bandwidth=1.0, + algorithm="auto", + kernel="gaussian", + metric="euclidean", + atol=0, + rtol=0, + breadth_first=True, + leaf_size=40, + metric_params=None, + ): + self.algorithm = algorithm + self.bandwidth = bandwidth + self.kernel = kernel + self.metric = metric + self.atol = atol + self.rtol = rtol + self.breadth_first = breadth_first + self.leaf_size = leaf_size + self.metric_params = metric_params + + def _choose_algorithm(self, algorithm, metric): + # given the algorithm string + metric string, choose the optimal + # algorithm to compute the result. + if algorithm == "auto": + # use KD Tree if possible + if metric in KDTree.valid_metrics: + return "kd_tree" + elif metric in BallTree.valid_metrics: + return "ball_tree" + else: # kd_tree or ball_tree + if metric not in TREE_DICT[algorithm].valid_metrics: + raise ValueError( + "invalid metric for {0}: '{1}'".format(TREE_DICT[algorithm], metric) + ) + return algorithm + + @_fit_context( + # KernelDensity.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None, sample_weight=None): + """Fit the Kernel Density model on the data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + sample_weight : array-like of shape (n_samples,), default=None + List of sample weights attached to the data X. + + .. versionadded:: 0.20 + + Returns + ------- + self : object + Returns the instance itself. + """ + algorithm = self._choose_algorithm(self.algorithm, self.metric) + + if isinstance(self.bandwidth, str): + if self.bandwidth == "scott": + self.bandwidth_ = X.shape[0] ** (-1 / (X.shape[1] + 4)) + elif self.bandwidth == "silverman": + self.bandwidth_ = (X.shape[0] * (X.shape[1] + 2) / 4) ** ( + -1 / (X.shape[1] + 4) + ) + else: + self.bandwidth_ = self.bandwidth + + X = validate_data(self, X, order="C", dtype=np.float64) + + if sample_weight is not None: + sample_weight = _check_sample_weight( + sample_weight, X, dtype=np.float64, ensure_non_negative=True + ) + + kwargs = self.metric_params + if kwargs is None: + kwargs = {} + self.tree_ = TREE_DICT[algorithm]( + X, + metric=self.metric, + leaf_size=self.leaf_size, + sample_weight=sample_weight, + **kwargs, + ) + return self + + def score_samples(self, X): + """Compute the log-likelihood of each sample under the model. 
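+
+        In sketch form (the standard kernel density estimate, up to each
+        kernel's own normalization constant), the value returned for a query
+        point ``x``, given training points ``x_1, ..., x_N`` and bandwidth
+        ``h``, is ``log((1 / N) * sum_i K_h(x - x_i))``; with sample weights
+        the sum is weighted and ``N`` is replaced by the total weight.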
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query. Last dimension should match dimension + of training data (n_features). + + Returns + ------- + density : ndarray of shape (n_samples,) + Log-likelihood of each sample in `X`. These are normalized to be + probability densities, so values will be low for high-dimensional + data. + """ + check_is_fitted(self) + # The returned density is normalized to the number of points. + # For it to be a probability, we must scale it. For this reason + # we'll also scale atol. + X = validate_data(self, X, order="C", dtype=np.float64, reset=False) + if self.tree_.sample_weight is None: + N = self.tree_.data.shape[0] + else: + N = self.tree_.sum_weight + atol_N = self.atol * N + log_density = self.tree_.kernel_density( + X, + h=self.bandwidth_, + kernel=self.kernel, + atol=atol_N, + rtol=self.rtol, + breadth_first=self.breadth_first, + return_log=True, + ) + log_density -= np.log(N) + return log_density + + def score(self, X, y=None): + """Compute the total log-likelihood under the model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + Returns + ------- + logprob : float + Total log-likelihood of the data in X. This is normalized to be a + probability density, so the value will be low for high-dimensional + data. + """ + return np.sum(self.score_samples(X)) + + def sample(self, n_samples=1, random_state=None): + """Generate random samples from the model. + + Currently, this is implemented only for gaussian and tophat kernels. + + Parameters + ---------- + n_samples : int, default=1 + Number of samples to generate. + + random_state : int, RandomState instance or None, default=None + Determines random number generation used to generate + random samples. Pass an int for reproducible results + across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : array-like of shape (n_samples, n_features) + List of samples. + """ + check_is_fitted(self) + # TODO: implement sampling for other valid kernel shapes + if self.kernel not in ["gaussian", "tophat"]: + raise NotImplementedError() + + data = np.asarray(self.tree_.data) + + rng = check_random_state(random_state) + u = rng.uniform(0, 1, size=n_samples) + if self.tree_.sample_weight is None: + i = (u * data.shape[0]).astype(np.int64) + else: + cumsum_weight = np.cumsum(np.asarray(self.tree_.sample_weight)) + sum_weight = cumsum_weight[-1] + i = np.searchsorted(cumsum_weight, u * sum_weight) + if self.kernel == "gaussian": + return np.atleast_2d(rng.normal(data[i], self.bandwidth_)) + + elif self.kernel == "tophat": + # we first draw points from a d-dimensional normal distribution, + # then use an incomplete gamma function to map them to a uniform + # d-dimensional tophat distribution. 
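+            # A sketch of why this mapping works: X / ||X|| is uniform on the
+            # unit sphere because the standard normal is rotation-invariant;
+            # s_sq = ||X||^2 follows a chi-squared distribution with `dim`
+            # degrees of freedom, whose CDF at s_sq is gammainc(dim / 2,
+            # s_sq / 2). That CDF value is uniform on (0, 1), and raising it
+            # to the power 1 / dim gives a radius (as a fraction of the
+            # bandwidth) distributed like the radius of a uniform draw from
+            # a d-dimensional ball.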
+
+            dim = data.shape[1]
+            X = rng.normal(size=(n_samples, dim))
+            s_sq = row_norms(X, squared=True)
+            correction = (
+                gammainc(0.5 * dim, 0.5 * s_sq) ** (1.0 / dim)
+                * self.bandwidth_
+                / np.sqrt(s_sq)
+            )
+            return data[i] + X * correction[:, np.newaxis]
diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_lof.py b/.venv/Lib/site-packages/sklearn/neighbors/_lof.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6cf21587a1f37ebc996682806ce8ded64e97ea9
--- /dev/null
+++ b/.venv/Lib/site-packages/sklearn/neighbors/_lof.py
@@ -0,0 +1,518 @@
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+import warnings
+from numbers import Real
+
+import numpy as np
+
+from ..base import OutlierMixin, _fit_context
+from ..utils import check_array
+from ..utils._param_validation import Interval, StrOptions
+from ..utils.metaestimators import available_if
+from ..utils.validation import check_is_fitted
+from ._base import KNeighborsMixin, NeighborsBase
+
+__all__ = ["LocalOutlierFactor"]
+
+
+class LocalOutlierFactor(KNeighborsMixin, OutlierMixin, NeighborsBase):
+    """Unsupervised Outlier Detection using the Local Outlier Factor (LOF).
+
+    The anomaly score of each sample is called the Local Outlier Factor.
+    It measures the local deviation of the density of a given sample with
+    respect to its neighbors.
+    It is local in that the anomaly score depends on how isolated the object
+    is with respect to the surrounding neighborhood.
+    More precisely, locality is given by k-nearest neighbors, whose distance
+    is used to estimate the local density.
+    By comparing the local density of a sample to the local densities of its
+    neighbors, one can identify samples that have a substantially lower
+    density than their neighbors. These are considered outliers.
+
+    .. versionadded:: 0.19
+
+    Parameters
+    ----------
+    n_neighbors : int, default=20
+        Number of neighbors to use by default for :meth:`kneighbors` queries.
+        If n_neighbors is larger than the number of samples provided,
+        all samples will be used.
+
+    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
+        Algorithm used to compute the nearest neighbors:
+
+        - 'ball_tree' will use :class:`BallTree`
+        - 'kd_tree' will use :class:`KDTree`
+        - 'brute' will use a brute-force search.
+        - 'auto' will attempt to decide the most appropriate algorithm
+          based on the values passed to the :meth:`fit` method.
+
+        Note: fitting on sparse input will override the setting of
+        this parameter, using brute force.
+
+    leaf_size : int, default=30
+        Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can
+        affect the speed of the construction and query, as well as the memory
+        required to store the tree. The optimal value depends on the
+        nature of the problem.
+
+    metric : str or callable, default='minkowski'
+        Metric to use for distance computation. Default is "minkowski", which
+        results in the standard Euclidean distance when p = 2. See the
+        documentation of `scipy.spatial.distance
+        `_ and
+        the metrics listed in
+        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
+        values.
+
+        If metric is "precomputed", X is assumed to be a distance matrix and
+        must be square during fit. X may be a :term:`sparse graph`, in which
+        case only "nonzero" elements may be considered neighbors.
+
+        If metric is a callable function, it takes two arrays representing 1D
+        vectors as inputs and must return one value indicating the distance
+        between those vectors.
This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + p : float, default=2 + Parameter for the Minkowski metric from + :func:`sklearn.metrics.pairwise_distances`. When p = 1, this + is equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + contamination : 'auto' or float, default='auto' + The amount of contamination of the data set, i.e. the proportion + of outliers in the data set. When fitting this is used to define the + threshold on the scores of the samples. + + - if 'auto', the threshold is determined as in the + original paper, + - if a float, the contamination should be in the range (0, 0.5]. + + .. versionchanged:: 0.22 + The default value of ``contamination`` changed from 0.1 + to ``'auto'``. + + novelty : bool, default=False + By default, LocalOutlierFactor is only meant to be used for outlier + detection (novelty=False). Set novelty to True if you want to use + LocalOutlierFactor for novelty detection. In this case be aware that + you should only use predict, decision_function and score_samples + on new unseen data and not on the training set; and note that the + results obtained this way may differ from the standard LOF results. + + .. versionadded:: 0.20 + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + negative_outlier_factor_ : ndarray of shape (n_samples,) + The opposite LOF of the training samples. The higher, the more normal. + Inliers tend to have a LOF score close to 1 + (``negative_outlier_factor_`` close to -1), while outliers tend to have + a larger LOF score. + + The local outlier factor (LOF) of a sample captures its + supposed 'degree of abnormality'. + It is the average of the ratio of the local reachability density of + a sample and those of its k-nearest neighbors. + + n_neighbors_ : int + The actual number of neighbors used for :meth:`kneighbors` queries. + + offset_ : float + Offset used to obtain binary labels from the raw scores. + Observations having a negative_outlier_factor smaller than `offset_` + are detected as abnormal. + The offset is set to -1.5 (inliers score around -1), except when a + contamination parameter different than "auto" is provided. In that + case, the offset is defined in such a way we obtain the expected + number of outliers in training. + + .. versionadded:: 0.20 + + effective_metric_ : str + The effective metric used for the distance computation. + + effective_metric_params_ : dict + The effective additional keyword arguments for the metric function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + It is the number of samples in the fitted data. + + See Also + -------- + sklearn.svm.OneClassSVM: Unsupervised Outlier Detection using + Support Vector Machine. + + References + ---------- + .. [1] Breunig, M. M., Kriegel, H. P., Ng, R. T., & Sander, J. (2000, May). + LOF: identifying density-based local outliers. 
In ACM sigmod record. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.neighbors import LocalOutlierFactor + >>> X = [[-1.1], [0.2], [101.1], [0.3]] + >>> clf = LocalOutlierFactor(n_neighbors=2) + >>> clf.fit_predict(X) + array([ 1, 1, -1, 1]) + >>> clf.negative_outlier_factor_ + array([ -0.9821..., -1.0370..., -73.3697..., -0.9821...]) + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "contamination": [ + StrOptions({"auto"}), + Interval(Real, 0, 0.5, closed="right"), + ], + "novelty": ["boolean"], + } + _parameter_constraints.pop("radius") + + def __init__( + self, + n_neighbors=20, + *, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + contamination="auto", + novelty=False, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.contamination = contamination + self.novelty = novelty + + def _check_novelty_fit_predict(self): + if self.novelty: + msg = ( + "fit_predict is not available when novelty=True. Use " + "novelty=False if you want to predict on the training set." + ) + raise AttributeError(msg) + return True + + @available_if(_check_novelty_fit_predict) + def fit_predict(self, X, y=None): + """Fit the model to the training set X and return the labels. + + **Not available for novelty detection (when novelty is set to True).** + Label is 1 for an inlier and -1 for an outlier according to the LOF + score and the contamination parameter. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None + The query sample or samples to compute the Local Outlier Factor + w.r.t. the training samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + is_inlier : ndarray of shape (n_samples,) + Returns -1 for anomalies/outliers and 1 for inliers. + """ + + # As fit_predict would be different from fit.predict, fit_predict is + # only available for outlier detection (novelty=False) + + return self.fit(X)._predict() + + @_fit_context( + # LocalOutlierFactor.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit the local outlier factor detector from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : LocalOutlierFactor + The fitted local outlier factor detector. + """ + self._fit(X) + + n_samples = self.n_samples_fit_ + if self.n_neighbors > n_samples: + warnings.warn( + "n_neighbors (%s) is greater than the " + "total number of samples (%s). n_neighbors " + "will be set to (n_samples - 1) for estimation." 
+                % (self.n_neighbors, n_samples)
+            )
+        self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1))
+
+        self._distances_fit_X_, _neighbors_indices_fit_X_ = self.kneighbors(
+            n_neighbors=self.n_neighbors_
+        )
+
+        if self._fit_X.dtype == np.float32:
+            self._distances_fit_X_ = self._distances_fit_X_.astype(
+                self._fit_X.dtype,
+                copy=False,
+            )
+
+        self._lrd = self._local_reachability_density(
+            self._distances_fit_X_, _neighbors_indices_fit_X_
+        )
+
+        # Compute lof score over training samples to define offset_:
+        lrd_ratios_array = (
+            self._lrd[_neighbors_indices_fit_X_] / self._lrd[:, np.newaxis]
+        )
+
+        self.negative_outlier_factor_ = -np.mean(lrd_ratios_array, axis=1)
+
+        if self.contamination == "auto":
+            # inliers score around -1 (the higher, the less abnormal).
+            self.offset_ = -1.5
+        else:
+            self.offset_ = np.percentile(
+                self.negative_outlier_factor_, 100.0 * self.contamination
+            )
+
+        # Verify if negative_outlier_factor_ values are within acceptable range.
+        # Novelty must also be false to detect outliers
+        if np.min(self.negative_outlier_factor_) < -1e7 and not self.novelty:
+            warnings.warn(
+                "Duplicate values are leading to incorrect results. "
+                "Increase the number of neighbors for more accurate results."
+            )
+
+        return self
+
+    def _check_novelty_predict(self):
+        if not self.novelty:
+            msg = (
+                "predict is not available when novelty=False, use "
+                "fit_predict if you want to predict on training data. Use "
+                "novelty=True if you want to use LOF for novelty detection "
+                "and predict on new unseen data."
+            )
+            raise AttributeError(msg)
+        return True
+
+    @available_if(_check_novelty_predict)
+    def predict(self, X=None):
+        """Predict the labels (1 inlier, -1 outlier) of X according to LOF.
+
+        **Only available for novelty detection (when novelty is set to True).**
+        This method makes it possible to generalize prediction to *new
+        observations* (not in the training set). Note that the result of
+        ``clf.fit(X)`` then ``clf.predict(X)`` with ``novelty=True`` may
+        differ from the result obtained by ``clf.fit_predict(X)`` with
+        ``novelty=False``.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The query sample or samples to compute the Local Outlier Factor
+            w.r.t. the training samples.
+
+        Returns
+        -------
+        is_inlier : ndarray of shape (n_samples,)
+            Returns -1 for anomalies/outliers and +1 for inliers.
+        """
+        return self._predict(X)
+
+    def _predict(self, X=None):
+        """Predict the labels (1 inlier, -1 outlier) of X according to LOF.
+
+        If X is None, returns the same as fit_predict(X_train).
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None
+            The query sample or samples to compute the Local Outlier Factor
+            w.r.t. the training samples. If None, makes prediction on the
+            training data without considering them as their own neighbors.
+
+        Returns
+        -------
+        is_inlier : ndarray of shape (n_samples,)
+            Returns -1 for anomalies/outliers and +1 for inliers.
+        """
+        check_is_fitted(self)
+
+        if X is not None:
+            shifted_opposite_lof_scores = self.decision_function(X)
+            is_inlier = np.ones(shifted_opposite_lof_scores.shape[0], dtype=int)
+            is_inlier[shifted_opposite_lof_scores < 0] = -1
+        else:
+            is_inlier = np.ones(self.n_samples_fit_, dtype=int)
+            is_inlier[self.negative_outlier_factor_ < self.offset_] = -1
+
+        return is_inlier
+
+    def _check_novelty_decision_function(self):
+        if not self.novelty:
+            msg = (
+                "decision_function is not available when novelty=False. "
+                "Use novelty=True if you want to use LOF for novelty "
+                "detection and compute decision_function for new unseen "
+                "data. Note that the opposite LOF of the training samples "
+                "is always available by considering the "
+                "negative_outlier_factor_ attribute."
+            )
+            raise AttributeError(msg)
+        return True
+
+    @available_if(_check_novelty_decision_function)
+    def decision_function(self, X):
+        """Shifted opposite of the Local Outlier Factor of X.
+
+        Bigger is better, i.e. large values correspond to inliers.
+
+        **Only available for novelty detection (when novelty is set to True).**
+        The shift offset allows a zero threshold for being an outlier.
+        The argument X is supposed to contain *new data*: if X contains a
+        point from training, it considers the latter in its own neighborhood.
+        Also, the samples in X are not considered in the neighborhood of any
+        point.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The query sample or samples to compute the Local Outlier Factor
+            w.r.t. the training samples.
+
+        Returns
+        -------
+        shifted_opposite_lof_scores : ndarray of shape (n_samples,)
+            The shifted opposite of the Local Outlier Factor of each input
+            sample. The lower, the more abnormal. Negative scores represent
+            outliers, positive scores represent inliers.
+        """
+        return self.score_samples(X) - self.offset_
+
+    def _check_novelty_score_samples(self):
+        if not self.novelty:
+            msg = (
+                "score_samples is not available when novelty=False. The "
+                "scores of the training samples are always available "
+                "through the negative_outlier_factor_ attribute. Use "
+                "novelty=True if you want to use LOF for novelty detection "
+                "and compute score_samples for new unseen data."
+            )
+            raise AttributeError(msg)
+        return True
+
+    @available_if(_check_novelty_score_samples)
+    def score_samples(self, X):
+        """Opposite of the Local Outlier Factor of X.
+
+        The opposite is returned so that bigger is better, i.e. large values
+        correspond to inliers.
+
+        **Only available for novelty detection (when novelty is set to True).**
+        The argument X is supposed to contain *new data*: if X contains a
+        point from training, it considers the latter in its own neighborhood.
+        Also, the samples in X are not considered in the neighborhood of any
+        point. Because of this, the scores obtained via ``score_samples`` may
+        differ from the standard LOF scores.
+        The standard LOF scores for the training data are available via the
+        ``negative_outlier_factor_`` attribute.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The query sample or samples to compute the Local Outlier Factor
+            w.r.t. the training samples.
+
+        Returns
+        -------
+        opposite_lof_scores : ndarray of shape (n_samples,)
+            The opposite of the Local Outlier Factor of each input sample.
+            The lower, the more abnormal.
+ """ + check_is_fitted(self) + X = check_array(X, accept_sparse="csr") + + distances_X, neighbors_indices_X = self.kneighbors( + X, n_neighbors=self.n_neighbors_ + ) + + if X.dtype == np.float32: + distances_X = distances_X.astype(X.dtype, copy=False) + + X_lrd = self._local_reachability_density( + distances_X, + neighbors_indices_X, + ) + + lrd_ratios_array = self._lrd[neighbors_indices_X] / X_lrd[:, np.newaxis] + + # as bigger is better: + return -np.mean(lrd_ratios_array, axis=1) + + def _local_reachability_density(self, distances_X, neighbors_indices): + """The local reachability density (LRD) + + The LRD of a sample is the inverse of the average reachability + distance of its k-nearest neighbors. + + Parameters + ---------- + distances_X : ndarray of shape (n_queries, self.n_neighbors) + Distances to the neighbors (in the training samples `self._fit_X`) + of each query point to compute the LRD. + + neighbors_indices : ndarray of shape (n_queries, self.n_neighbors) + Neighbors indices (of each query point) among training samples + self._fit_X. + + Returns + ------- + local_reachability_density : ndarray of shape (n_queries,) + The local reachability density of each sample. + """ + dist_k = self._distances_fit_X_[neighbors_indices, self.n_neighbors_ - 1] + reach_dist_array = np.maximum(distances_X, dist_k) + + # 1e-10 to avoid `nan' when nb of duplicates > n_neighbors_: + return 1.0 / (np.mean(reach_dist_array, axis=1) + 1e-10) diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_nca.py b/.venv/Lib/site-packages/sklearn/neighbors/_nca.py new file mode 100644 index 0000000000000000000000000000000000000000..a8153f388b7718bc9f2ec6ae775bc67a50c717ae --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_nca.py @@ -0,0 +1,530 @@ +""" +Neighborhood Component Analysis +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import sys +import time +from numbers import Integral, Real +from warnings import warn + +import numpy as np +from scipy.optimize import minimize + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..decomposition import PCA +from ..exceptions import ConvergenceWarning +from ..metrics import pairwise_distances +from ..preprocessing import LabelEncoder +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import softmax +from ..utils.multiclass import check_classification_targets +from ..utils.random import check_random_state +from ..utils.validation import check_array, check_is_fitted, validate_data + + +class NeighborhoodComponentsAnalysis( + ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator +): + """Neighborhood Components Analysis. + + Neighborhood Component Analysis (NCA) is a machine learning algorithm for + metric learning. It learns a linear transformation in a supervised fashion + to improve the classification accuracy of a stochastic nearest neighbors + rule in the transformed space. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=None + Preferred dimensionality of the projected space. + If None it will be set to `n_features`. + + init : {'auto', 'pca', 'lda', 'identity', 'random'} or ndarray of shape \ + (n_features_a, n_features_b), default='auto' + Initialization of the linear transformation. Possible options are + `'auto'`, `'pca'`, `'lda'`, `'identity'`, `'random'`, and a numpy + array of shape `(n_features_a, n_features_b)`. 
+ + - `'auto'` + Depending on `n_components`, the most reasonable initialization + is chosen. If `n_components <= min(n_features, n_classes - 1)` + we use `'lda'`, as it uses labels information. If not, but + `n_components < min(n_features, n_samples)`, we use `'pca'`, as + it projects data in meaningful directions (those of higher + variance). Otherwise, we just use `'identity'`. + + - `'pca'` + `n_components` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See :class:`~sklearn.decomposition.PCA`) + + - `'lda'` + `min(n_components, n_classes)` most discriminative + components of the inputs passed to :meth:`fit` will be used to + initialize the transformation. (If `n_components > n_classes`, + the rest of the components will be zero.) (See + :class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) + + - `'identity'` + If `n_components` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first `n_components` rows. + + - `'random'` + The initial transformation will be a random array of shape + `(n_components, n_features)`. Each value is sampled from the + standard normal distribution. + + - numpy array + `n_features_b` must match the dimensionality of the inputs passed + to :meth:`fit` and n_features_a must be less than or equal to that. + If `n_components` is not `None`, `n_features_a` must match it. + + warm_start : bool, default=False + If `True` and :meth:`fit` has been called before, the solution of the + previous call to :meth:`fit` is used as the initial linear + transformation (`n_components` and `init` will be ignored). + + max_iter : int, default=50 + Maximum number of iterations in the optimization. + + tol : float, default=1e-5 + Convergence tolerance for the optimization. + + callback : callable, default=None + If not `None`, this function is called after every iteration of the + optimizer, taking as arguments the current solution (flattened + transformation matrix) and the number of iterations. This might be + useful in case one wants to examine or store the transformation + found after each iteration. + + verbose : int, default=0 + If 0, no progress messages will be printed. + If 1, progress messages will be printed to stdout. + If > 1, progress messages will be printed and the `disp` + parameter of :func:`scipy.optimize.minimize` will be set to + `verbose - 2`. + + random_state : int or numpy.RandomState, default=None + A pseudo random number generator object or a seed for it if int. If + `init='random'`, `random_state` is used to initialize the random + transformation. If `init='pca'`, `random_state` is passed as an + argument to PCA when initializing the transformation. Pass an int + for reproducible results across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + The linear transformation learned during fitting. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + n_iter_ : int + Counts the number of iterations performed by the optimizer. + + random_state_ : numpy.RandomState + Pseudo random number generator object used during initialization. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. 
versionadded:: 1.0 + + See Also + -------- + sklearn.discriminant_analysis.LinearDiscriminantAnalysis : Linear + Discriminant Analysis. + sklearn.decomposition.PCA : Principal component analysis (PCA). + + References + ---------- + .. [1] J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov. + "Neighbourhood Components Analysis". Advances in Neural Information + Processing Systems. 17, 513-520, 2005. + http://www.cs.nyu.edu/~roweis/papers/ncanips.pdf + + .. [2] Wikipedia entry on Neighborhood Components Analysis + https://en.wikipedia.org/wiki/Neighbourhood_components_analysis + + Examples + -------- + >>> from sklearn.neighbors import NeighborhoodComponentsAnalysis + >>> from sklearn.neighbors import KNeighborsClassifier + >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import train_test_split + >>> X, y = load_iris(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... stratify=y, test_size=0.7, random_state=42) + >>> nca = NeighborhoodComponentsAnalysis(random_state=42) + >>> nca.fit(X_train, y_train) + NeighborhoodComponentsAnalysis(...) + >>> knn = KNeighborsClassifier(n_neighbors=3) + >>> knn.fit(X_train, y_train) + KNeighborsClassifier(...) + >>> print(knn.score(X_test, y_test)) + 0.933333... + >>> knn.fit(nca.transform(X_train), y_train) + KNeighborsClassifier(...) + >>> print(knn.score(nca.transform(X_test), y_test)) + 0.961904... + """ + + _parameter_constraints: dict = { + "n_components": [ + Interval(Integral, 1, None, closed="left"), + None, + ], + "init": [ + StrOptions({"auto", "pca", "lda", "identity", "random"}), + np.ndarray, + ], + "warm_start": ["boolean"], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left")], + "callback": [callable, None], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + def __init__( + self, + n_components=None, + *, + init="auto", + warm_start=False, + max_iter=50, + tol=1e-5, + callback=None, + verbose=0, + random_state=None, + ): + self.n_components = n_components + self.init = init + self.warm_start = warm_start + self.max_iter = max_iter + self.tol = tol + self.callback = callback + self.verbose = verbose + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit the model according to the given training data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The training samples. + + y : array-like of shape (n_samples,) + The corresponding training labels. + + Returns + ------- + self : object + Fitted estimator. + """ + # Validate the inputs X and y, and converts y to numerical classes. + X, y = validate_data(self, X, y, ensure_min_samples=2) + check_classification_targets(y) + y = LabelEncoder().fit_transform(y) + + # Check the preferred dimensionality of the projected space + if self.n_components is not None and self.n_components > X.shape[1]: + raise ValueError( + "The preferred dimensionality of the " + f"projected space `n_components` ({self.n_components}) cannot " + "be greater than the given data " + f"dimensionality ({X.shape[1]})!" + ) + # If warm_start is enabled, check that the inputs are consistent + if ( + self.warm_start + and hasattr(self, "components_") + and self.components_.shape[1] != X.shape[1] + ): + raise ValueError( + f"The new inputs dimensionality ({X.shape[1]}) does not " + "match the input dimensionality of the " + f"previously learned transformation ({self.components_.shape[1]})." 
+ ) + # Check how the linear transformation should be initialized + init = self.init + if isinstance(init, np.ndarray): + init = check_array(init) + # Assert that init.shape[1] = X.shape[1] + if init.shape[1] != X.shape[1]: + raise ValueError( + f"The input dimensionality ({init.shape[1]}) of the given " + "linear transformation `init` must match the " + f"dimensionality of the given inputs `X` ({X.shape[1]})." + ) + # Assert that init.shape[0] <= init.shape[1] + if init.shape[0] > init.shape[1]: + raise ValueError( + f"The output dimensionality ({init.shape[0]}) of the given " + "linear transformation `init` cannot be " + f"greater than its input dimensionality ({init.shape[1]})." + ) + # Assert that self.n_components = init.shape[0] + if self.n_components is not None and self.n_components != init.shape[0]: + raise ValueError( + "The preferred dimensionality of the " + f"projected space `n_components` ({self.n_components}) does" + " not match the output dimensionality of " + "the given linear transformation " + f"`init` ({init.shape[0]})!" + ) + + # Initialize the random generator + self.random_state_ = check_random_state(self.random_state) + + # Measure the total training time + t_train = time.time() + + # Compute a mask that stays fixed during optimization: + same_class_mask = y[:, np.newaxis] == y[np.newaxis, :] + # (n_samples, n_samples) + + # Initialize the transformation + transformation = np.ravel(self._initialize(X, y, init)) + + # Create a dictionary of parameters to be passed to the optimizer + disp = self.verbose - 2 if self.verbose > 1 else -1 + optimizer_params = { + "method": "L-BFGS-B", + "fun": self._loss_grad_lbfgs, + "args": (X, same_class_mask, -1.0), + "jac": True, + "x0": transformation, + "tol": self.tol, + "options": dict(maxiter=self.max_iter, disp=disp), + "callback": self._callback, + } + + # Call the optimizer + self.n_iter_ = 0 + opt_result = minimize(**optimizer_params) + + # Reshape the solution found by the optimizer + self.components_ = opt_result.x.reshape(-1, X.shape[1]) + + # Stop timer + t_train = time.time() - t_train + if self.verbose: + cls_name = self.__class__.__name__ + + # Warn the user if the algorithm did not converge + if not opt_result.success: + warn( + "[{}] NCA did not converge: {}".format( + cls_name, opt_result.message + ), + ConvergenceWarning, + ) + + print("[{}] Training took {:8.2f}s.".format(cls_name, t_train)) + + return self + + def transform(self, X): + """Apply the learned transformation to the given data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data samples. + + Returns + ------- + X_embedded: ndarray of shape (n_samples, n_components) + The data samples transformed. + + Raises + ------ + NotFittedError + If :meth:`fit` has not been called before. + """ + + check_is_fitted(self) + X = validate_data(self, X, reset=False) + + return np.dot(X, self.components_.T) + + def _initialize(self, X, y, init): + """Initialize the transformation. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The training samples. + + y : array-like of shape (n_samples,) + The training labels. + + init : str or ndarray of shape (n_features_a, n_features_b) + The validated initialization of the linear transformation. + + Returns + ------- + transformation : ndarray of shape (n_components, n_features) + The initialized linear transformation. 
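+
+        For illustration (editor's note, arbitrary values): with
+        ``n_features=4``, ``n_classes=3`` and ``n_components=2``,
+        ``init='auto'`` resolves to ``'lda'`` because ``2 <= min(4, 3 - 1)``;
+        with ``n_components=4`` it falls through to ``'identity'``
+        (assuming ``n_samples >= 4``).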
+ + """ + + transformation = init + if self.warm_start and hasattr(self, "components_"): + transformation = self.components_ + elif isinstance(init, np.ndarray): + pass + else: + n_samples, n_features = X.shape + n_components = self.n_components or n_features + if init == "auto": + n_classes = len(np.unique(y)) + if n_components <= min(n_features, n_classes - 1): + init = "lda" + elif n_components < min(n_features, n_samples): + init = "pca" + else: + init = "identity" + if init == "identity": + transformation = np.eye(n_components, X.shape[1]) + elif init == "random": + transformation = self.random_state_.standard_normal( + size=(n_components, X.shape[1]) + ) + elif init in {"pca", "lda"}: + init_time = time.time() + if init == "pca": + pca = PCA( + n_components=n_components, random_state=self.random_state_ + ) + if self.verbose: + print("Finding principal components... ", end="") + sys.stdout.flush() + pca.fit(X) + transformation = pca.components_ + elif init == "lda": + from ..discriminant_analysis import LinearDiscriminantAnalysis + + lda = LinearDiscriminantAnalysis(n_components=n_components) + if self.verbose: + print("Finding most discriminative components... ", end="") + sys.stdout.flush() + lda.fit(X, y) + transformation = lda.scalings_.T[:n_components] + if self.verbose: + print("done in {:5.2f}s".format(time.time() - init_time)) + return transformation + + def _callback(self, transformation): + """Called after each iteration of the optimizer. + + Parameters + ---------- + transformation : ndarray of shape (n_components * n_features,) + The solution computed by the optimizer in this iteration. + """ + if self.callback is not None: + self.callback(transformation, self.n_iter_) + + self.n_iter_ += 1 + + def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): + """Compute the loss and the loss gradient w.r.t. `transformation`. + + Parameters + ---------- + transformation : ndarray of shape (n_components * n_features,) + The raveled linear transformation on which to compute loss and + evaluate gradient. + + X : ndarray of shape (n_samples, n_features) + The training samples. + + same_class_mask : ndarray of shape (n_samples, n_samples) + A mask where `mask[i, j] == 1` if `X[i]` and `X[j]` belong + to the same class, and `0` otherwise. + + Returns + ------- + loss : float + The loss computed for the given transformation. + + gradient : ndarray of shape (n_components * n_features,) + The new (flattened) gradient of the loss. + """ + + if self.n_iter_ == 0: + self.n_iter_ += 1 + if self.verbose: + header_fields = ["Iteration", "Objective Value", "Time(s)"] + header_fmt = "{:>10} {:>20} {:>10}" + header = header_fmt.format(*header_fields) + cls_name = self.__class__.__name__ + print("[{}]".format(cls_name)) + print( + "[{}] {}\n[{}] {}".format( + cls_name, header, cls_name, "-" * len(header) + ) + ) + + t_funcall = time.time() + + transformation = transformation.reshape(-1, X.shape[1]) + X_embedded = np.dot(X, transformation.T) # (n_samples, n_components) + + # Compute softmax distances + p_ij = pairwise_distances(X_embedded, squared=True) + np.fill_diagonal(p_ij, np.inf) + p_ij = softmax(-p_ij) # (n_samples, n_samples) + + # Compute loss + masked_p_ij = p_ij * same_class_mask + p = np.sum(masked_p_ij, axis=1, keepdims=True) # (n_samples, 1) + loss = np.sum(p) + + # Compute gradient of loss w.r.t. 
`transform`
+        weighted_p_ij = masked_p_ij - p_ij * p
+        weighted_p_ij_sym = weighted_p_ij + weighted_p_ij.T
+        np.fill_diagonal(weighted_p_ij_sym, -weighted_p_ij.sum(axis=0))
+        gradient = 2 * X_embedded.T.dot(weighted_p_ij_sym).dot(X)
+        # time complexity of the gradient: O(n_components x n_samples x (
+        # n_samples + n_features))
+
+        if self.verbose:
+            t_funcall = time.time() - t_funcall
+            values_fmt = "[{}] {:>10} {:>20.6e} {:>10.2f}"
+            print(
+                values_fmt.format(
+                    self.__class__.__name__, self.n_iter_, loss, t_funcall
+                )
+            )
+            sys.stdout.flush()
+
+        return sign * loss, sign * gradient.ravel()
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.target_tags.required = True
+        return tags
+
+    @property
+    def _n_features_out(self):
+        """Number of transformed output features."""
+        return self.components_.shape[0]
diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_nearest_centroid.py b/.venv/Lib/site-packages/sklearn/neighbors/_nearest_centroid.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9556cea19ad326f4c3a77b9ee19e488e9fcac93
--- /dev/null
+++ b/.venv/Lib/site-packages/sklearn/neighbors/_nearest_centroid.py
@@ -0,0 +1,358 @@
+"""
+Nearest Centroid Classification
+"""
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+import warnings
+from numbers import Real
+
+import numpy as np
+from scipy import sparse as sp
+
+from ..base import BaseEstimator, ClassifierMixin, _fit_context
+from ..discriminant_analysis import DiscriminantAnalysisPredictionMixin
+from ..metrics.pairwise import (
+    pairwise_distances,
+    pairwise_distances_argmin,
+)
+from ..preprocessing import LabelEncoder
+from ..utils import get_tags
+from ..utils._available_if import available_if
+from ..utils._param_validation import Interval, StrOptions
+from ..utils.multiclass import check_classification_targets
+from ..utils.sparsefuncs import csc_median_axis_0
+from ..utils.validation import check_is_fitted, validate_data
+
+
+class NearestCentroid(
+    DiscriminantAnalysisPredictionMixin, ClassifierMixin, BaseEstimator
+):
+    """Nearest centroid classifier.
+
+    Each class is represented by its centroid, with test samples classified to
+    the class with the nearest centroid.
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    metric : {"euclidean", "manhattan"}, default="euclidean"
+        Metric to use for distance computation.
+
+        If `metric="euclidean"`, the centroid for the samples corresponding to
+        each class is the arithmetic mean, which minimizes the sum of squared
+        L2 distances. If `metric="manhattan"`, the centroid is the
+        feature-wise median, which minimizes the sum of L1 distances.
+
+        .. versionchanged:: 1.5
+            All metrics but `"euclidean"` and `"manhattan"` were deprecated and
+            now raise an error.
+
+        .. versionchanged:: 0.19
+            `metric='precomputed'` was deprecated and now raises an error.
+
+    shrink_threshold : float, default=None
+        Threshold for shrinking centroids to remove features.
+
+    priors : {"uniform", "empirical"} or array-like of shape (n_classes,), \
+            default="uniform"
+        The class prior probabilities. With the default `"uniform"`, every
+        class is assumed equally probable; with `"empirical"`, the class
+        proportions are inferred from the training data.
+
+        .. versionadded:: 1.6
+
+    Attributes
+    ----------
+    centroids_ : array-like of shape (n_classes, n_features)
+        Centroid of each class.
+
+    classes_ : array of shape (n_classes,)
+        The unique class labels.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    deviations_ : ndarray of shape (n_classes, n_features)
+        Deviations (or shrinkages) of the centroids of each class from the
+        overall centroid. Equal to eq. (18.4) if `shrink_threshold=None`,
+        else (18.5) p. 653 of [2]. Can be used to identify features used
+        for classification.
+
+        .. versionadded:: 1.6
+
+    within_class_std_dev_ : ndarray of shape (n_features,)
+        Pooled or within-class standard deviation of input data.
+
+        .. versionadded:: 1.6
+
+    class_prior_ : ndarray of shape (n_classes,)
+        The class prior probabilities.
+
+        .. versionadded:: 1.6
+
+    See Also
+    --------
+    KNeighborsClassifier : Nearest neighbors classifier.
+
+    Notes
+    -----
+    When used for text classification with tf-idf vectors, this classifier is
+    also known as the Rocchio classifier.
+
+    References
+    ----------
+    [1] Tibshirani, R., Hastie, T., Narasimhan, B., & Chu, G. (2002). Diagnosis of
+    multiple cancer types by shrunken centroids of gene expression. Proceedings
+    of the National Academy of Sciences of the United States of America,
+    99(10), 6567-6572. The National Academy of Sciences.
+
+    [2] Hastie, T., Tibshirani, R., Friedman, J. (2009). The Elements of Statistical
+    Learning: Data Mining, Inference, and Prediction. 2nd Edition. New York, Springer.
+
+    Examples
+    --------
+    >>> from sklearn.neighbors import NearestCentroid
+    >>> import numpy as np
+    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
+    >>> y = np.array([1, 1, 1, 2, 2, 2])
+    >>> clf = NearestCentroid()
+    >>> clf.fit(X, y)
+    NearestCentroid()
+    >>> print(clf.predict([[-0.8, -1]]))
+    [1]
+    """
+
+    _parameter_constraints: dict = {
+        "metric": [StrOptions({"manhattan", "euclidean"})],
+        "shrink_threshold": [Interval(Real, 0, None, closed="neither"), None],
+        "priors": ["array-like", StrOptions({"empirical", "uniform"})],
+    }
+
+    def __init__(
+        self,
+        metric="euclidean",
+        *,
+        shrink_threshold=None,
+        priors="uniform",
+    ):
+        self.metric = metric
+        self.shrink_threshold = shrink_threshold
+        self.priors = priors
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y):
+        """
+        Fit the NearestCentroid model according to the given training data.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training vector, where `n_samples` is the number of samples and
+            `n_features` is the number of features.
+            Note that centroid shrinking cannot be used with sparse matrices.
+        y : array-like of shape (n_samples,)
+            Target values.
+
+        Returns
+        -------
+        self : object
+            Fitted estimator.
+        """
+        # If X is sparse and the metric is "manhattan", store it in CSC
+        # format, which makes it easier to calculate the median.
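+        # Editor's illustration for the centroid computation further down
+        # (arbitrary toy values, not from the original source): for a class
+        # with samples [[0], [1], [10]], the "euclidean" centroid is the
+        # mean 11/3 ~= 3.67, while the "manhattan" centroid is the median
+        # 1.0, which is far less sensitive to the outlying 10.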
+        if self.metric == "manhattan":
+            X, y = validate_data(self, X, y, accept_sparse=["csc"])
+        else:
+            ensure_all_finite = (
+                "allow-nan" if get_tags(self).input_tags.allow_nan else True
+            )
+            X, y = validate_data(
+                self,
+                X,
+                y,
+                ensure_all_finite=ensure_all_finite,
+                accept_sparse=["csr", "csc"],
+            )
+        is_X_sparse = sp.issparse(X)
+        check_classification_targets(y)
+
+        n_samples, n_features = X.shape
+        le = LabelEncoder()
+        y_ind = le.fit_transform(y)
+        self.classes_ = classes = le.classes_
+        n_classes = classes.size
+        if n_classes < 2:
+            raise ValueError(
+                "The number of classes has to be greater than one; got %d class"
+                % (n_classes)
+            )
+
+        if self.priors == "empirical":  # estimate priors from sample
+            _, class_counts = np.unique(y, return_inverse=True)  # class index per sample
+            self.class_prior_ = np.bincount(class_counts) / float(len(y))
+        elif self.priors == "uniform":
+            self.class_prior_ = np.asarray([1 / n_classes] * n_classes)
+        else:
+            self.class_prior_ = np.asarray(self.priors)
+
+        if (self.class_prior_ < 0).any():
+            raise ValueError("priors must be non-negative")
+        if not np.isclose(self.class_prior_.sum(), 1.0):
+            warnings.warn(
+                "The priors do not sum to 1. Normalizing such that it sums to one.",
+                UserWarning,
+            )
+            self.class_prior_ = self.class_prior_ / self.class_prior_.sum()
+
+        # Centroid of each class, filled in below.
+        self.centroids_ = np.empty((n_classes, n_features), dtype=np.float64)
+
+        # Number of samples in each class.
+        nk = np.zeros(n_classes)
+
+        for cur_class in range(n_classes):
+            center_mask = y_ind == cur_class
+            nk[cur_class] = np.sum(center_mask)
+            if is_X_sparse:
+                center_mask = np.where(center_mask)[0]
+
+            if self.metric == "manhattan":
+                # NumPy does not calculate median of sparse matrices.
+                if not is_X_sparse:
+                    self.centroids_[cur_class] = np.median(X[center_mask], axis=0)
+                else:
+                    self.centroids_[cur_class] = csc_median_axis_0(X[center_mask])
+            else:  # metric == "euclidean"
+                self.centroids_[cur_class] = X[center_mask].mean(axis=0)
+
+        # Compute within-class std_dev with unshrunken centroids
+        variance = np.array(X - self.centroids_[y_ind], copy=False) ** 2
+        self.within_class_std_dev_ = np.array(
+            np.sqrt(variance.sum(axis=0) / (n_samples - n_classes)), copy=False
+        )
+        if any(self.within_class_std_dev_ == 0):
+            warnings.warn(
+                "self.within_class_std_dev_ has at least 1 zero standard deviation. "
+                "Inputs within the same classes for at least 1 feature are identical."
+            )
+
+        err_msg = "All features have zero variance. Division by zero."
+        if is_X_sparse and np.all((X.max(axis=0) - X.min(axis=0)).toarray() == 0):
+            raise ValueError(err_msg)
+        elif not is_X_sparse and np.all(np.ptp(X, axis=0) == 0):
+            raise ValueError(err_msg)
+
+        dataset_centroid_ = X.mean(axis=0)
+        # m parameter for determining deviation
+        m = np.sqrt((1.0 / nk) - (1.0 / n_samples))
+        # Calculate deviation using the standard deviation of centroids,
+        # regularized with the median to deter outliers from affecting the
+        # results.
+        s = self.within_class_std_dev_ + np.median(self.within_class_std_dev_)
+        mm = m.reshape(len(m), 1)  # Reshape to allow broadcasting.
+        ms = mm * s
+        self.deviations_ = np.array(
+            (self.centroids_ - dataset_centroid_) / ms, copy=False
+        )
+        # Soft thresholding: if the deviation crosses 0 during shrinking,
+        # it becomes zero.
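+        # Editor's worked example (arbitrary numbers, not from the original
+        # source): with shrink_threshold=0.5, a deviation of -1.4 becomes
+        # -(1.4 - 0.5) = -0.9, while a deviation of 0.3 shrinks through
+        # zero (0.3 - 0.5 < 0 is clipped to 0) and drops out entirely.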
+ if self.shrink_threshold: + signs = np.sign(self.deviations_) + self.deviations_ = np.abs(self.deviations_) - self.shrink_threshold + np.clip(self.deviations_, 0, None, out=self.deviations_) + self.deviations_ *= signs + # Now adjust the centroids using the deviation + msd = ms * self.deviations_ + self.centroids_ = np.array(dataset_centroid_ + msd, copy=False) + return self + + def predict(self, X): + """Perform classification on an array of test vectors `X`. + + The predicted class `C` for each sample in `X` is returned. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + The predicted classes. + """ + check_is_fitted(self) + if np.isclose(self.class_prior_, 1 / len(self.classes_)).all(): + # `validate_data` is called here since we are not calling `super()` + ensure_all_finite = ( + "allow-nan" if get_tags(self).input_tags.allow_nan else True + ) + X = validate_data( + self, + X, + ensure_all_finite=ensure_all_finite, + accept_sparse="csr", + reset=False, + ) + return self.classes_[ + pairwise_distances_argmin(X, self.centroids_, metric=self.metric) + ] + else: + return super().predict(X) + + def _decision_function(self, X): + # return discriminant scores, see eq. (18.2) p. 652 of the ESL. + check_is_fitted(self, "centroids_") + + X_normalized = validate_data( + self, X, copy=True, reset=False, accept_sparse="csr", dtype=np.float64 + ) + + discriminant_score = np.empty( + (X_normalized.shape[0], self.classes_.size), dtype=np.float64 + ) + + mask = self.within_class_std_dev_ != 0 + X_normalized[:, mask] /= self.within_class_std_dev_[mask] + centroids_normalized = self.centroids_.copy() + centroids_normalized[:, mask] /= self.within_class_std_dev_[mask] + + for class_idx in range(self.classes_.size): + distances = pairwise_distances( + X_normalized, centroids_normalized[[class_idx]], metric=self.metric + ).ravel() + distances **= 2 + discriminant_score[:, class_idx] = np.squeeze( + -distances + 2.0 * np.log(self.class_prior_[class_idx]) + ) + + return discriminant_score + + def _check_euclidean_metric(self): + return self.metric == "euclidean" + + decision_function = available_if(_check_euclidean_metric)( + DiscriminantAnalysisPredictionMixin.decision_function + ) + + predict_proba = available_if(_check_euclidean_metric)( + DiscriminantAnalysisPredictionMixin.predict_proba + ) + + predict_log_proba = available_if(_check_euclidean_metric)( + DiscriminantAnalysisPredictionMixin.predict_log_proba + ) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = self.metric == "nan_euclidean" + return tags diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.cp39-win_amd64.lib b/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.cp39-win_amd64.lib new file mode 100644 index 0000000000000000000000000000000000000000..57b9d941d022044094ea821d04eb6f15f4c22bc8 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.cp39-win_amd64.lib differ diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.cp39-win_amd64.pyd b/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.cp39-win_amd64.pyd new file mode 100644 index 0000000000000000000000000000000000000000..81893531c59578de69d0ac23f901b229efe2c7ec Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.cp39-win_amd64.pyd differ diff --git 
a/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.pxd b/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.pxd new file mode 100644 index 0000000000000000000000000000000000000000..8a749db257a114322447b705ab557af2765a80cd --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.pxd @@ -0,0 +1,10 @@ +from cython cimport floating +from ..utils._typedefs cimport float64_t, intp_t + +cdef int partition_node_indices( + const floating *data, + intp_t *node_indices, + intp_t split_dim, + intp_t split_index, + intp_t n_features, + intp_t n_points) except -1 diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.pyx b/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.pyx new file mode 100644 index 0000000000000000000000000000000000000000..8c35d57dfc49550ac10dd98324bd177982f6568b --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.pyx @@ -0,0 +1,122 @@ +# BinaryTrees rely on partial sorts to partition their nodes during their +# initialisation. +# +# The C++ std library exposes nth_element, an efficient partial sort for this +# situation which has a linear time complexity as well as the best performances. +# +# To use std::algorithm::nth_element, a few fixture are defined using Cython: +# - partition_node_indices, a Cython function used in BinaryTrees, that calls +# - partition_node_indices_inner, a C++ function that wraps nth_element and uses +# - an IndexComparator to state how to compare KDTrees' indices +# +# IndexComparator has been defined so that partial sorts are stable with +# respect to the nodes initial indices. +# +# See for reference: +# - https://en.cppreference.com/w/cpp/algorithm/nth_element. +# - https://github.com/scikit-learn/scikit-learn/pull/11103 +# - https://github.com/scikit-learn/scikit-learn/pull/19473 +from cython cimport floating + + +cdef extern from *: + """ + #include + + template + class IndexComparator { + private: + const D *data; + I split_dim, n_features; + public: + IndexComparator(const D *data, const I &split_dim, const I &n_features): + data(data), split_dim(split_dim), n_features(n_features) {} + + bool operator()(const I &a, const I &b) const { + D a_value = data[a * n_features + split_dim]; + D b_value = data[b * n_features + split_dim]; + return a_value == b_value ? a < b : a_value < b_value; + } + }; + + template + void partition_node_indices_inner( + const D *data, + I *node_indices, + const I &split_dim, + const I &split_index, + const I &n_features, + const I &n_points) { + IndexComparator index_comparator(data, split_dim, n_features); + std::nth_element( + node_indices, + node_indices + split_index, + node_indices + n_points, + index_comparator); + } + """ + void partition_node_indices_inner[D, I]( + const D *data, + I *node_indices, + I split_dim, + I split_index, + I n_features, + I n_points) except + + + +cdef int partition_node_indices( + const floating *data, + intp_t *node_indices, + intp_t split_dim, + intp_t split_index, + intp_t n_features, + intp_t n_points) except -1: + """Partition points in the node into two equal-sized groups. + + Upon return, the values in node_indices will be rearranged such that + (assuming numpy-style indexing): + + data[node_indices[0:split_index], split_dim] + <= data[node_indices[split_index], split_dim] + + and + + data[node_indices[split_index], split_dim] + <= data[node_indices[split_index:n_points], split_dim] + + The algorithm is essentially a partial in-place quicksort around a + set pivot. 
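+
+    A small illustrative example (editor's addition, arbitrary values):
+    with values [5, 1, 4, 2, 3] along split_dim and split_index=2, one
+    valid rearrangement orders the values as [2, 1, 3, 5, 4]: everything
+    left of position 2 is <= 3 and everything right of it is >= 3.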
+ + Parameters + ---------- + data : double pointer + Pointer to a 2D array of the training data, of shape [N, n_features]. + N must be greater than any of the values in node_indices. + node_indices : int pointer + Pointer to a 1D array of length n_points. This lists the indices of + each of the points within the current node. This will be modified + in-place. + split_dim : int + the dimension on which to split. This will usually be computed via + the routine ``find_node_split_dim``. + split_index : int + the index within node_indices around which to split the points. + n_features: int + the number of features (i.e columns) in the 2D array pointed by data. + n_points : int + the length of node_indices. This is also the number of points in + the original dataset. + Returns + ------- + status : int + integer exit status. On return, the contents of node_indices are + modified as noted above. + """ + partition_node_indices_inner( + data, + node_indices, + split_dim, + split_index, + n_features, + n_points) + return 0 diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.cp39-win_amd64.lib b/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.cp39-win_amd64.lib new file mode 100644 index 0000000000000000000000000000000000000000..de5779031bbaebb1d07da8d740618503b476def6 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.cp39-win_amd64.lib differ diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.cp39-win_amd64.pyd b/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.cp39-win_amd64.pyd new file mode 100644 index 0000000000000000000000000000000000000000..8289cc1cb71fbe3ed3a7d816797740509e8a66de Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.cp39-win_amd64.pyd differ diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.pxd b/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.pxd new file mode 100644 index 0000000000000000000000000000000000000000..a8ca8d7d4f684241d6c6617d1a692419f2eb8731 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.pxd @@ -0,0 +1,92 @@ +# Author: Thomas Moreau +# Author: Olivier Grisel + +# See quad_tree.pyx for details. + +cimport numpy as cnp +from ..utils._typedefs cimport float32_t, intp_t + +# This is effectively an ifdef statement in Cython +# It allows us to write printf debugging lines +# and remove them at compile time +cdef enum: + DEBUGFLAG = 0 + +cdef float EPSILON = 1e-6 + +# XXX: Careful to not change the order of the arguments. It is important to +# have is_leaf and max_width consecutive as it permits to avoid padding by +# the compiler and keep the size coherent for both C and numpy data structures. +cdef struct Cell: + # Base storage structure for cells in a QuadTree object + + # Tree structure + intp_t parent # Parent cell of this cell + intp_t[8] children # Array pointing to children of this cell + + # Cell description + intp_t cell_id # Id of the cell in the cells array in the Tree + intp_t point_index # Index of the point at this cell (only defined + # # in non empty leaf) + bint is_leaf # Does this cell have children? + float32_t squared_max_width # Squared value of the maximum width w + intp_t depth # Depth of the cell in the tree + intp_t cumulative_size # Number of points included in the subtree with + # # this cell as a root. 
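+
+    # Editor's note (inferred from the insertion logic in _quad_tree.pyx;
+    # not part of the original header): a leaf holding k duplicates of a
+    # single point has cumulative_size == k, while point_index keeps the
+    # index of the first inserted duplicate.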
+ + # Internal constants + float32_t[3] center # Store the center for quick split of cells + float32_t[3] barycenter # Keep track of the center of mass of the cell + + # Cell boundaries + float32_t[3] min_bounds # Inferior boundaries of this cell (inclusive) + float32_t[3] max_bounds # Superior boundaries of this cell (exclusive) + + +cdef class _QuadTree: + # The QuadTree object is a quad tree structure constructed by inserting + # recursively points in the tree and splitting cells in 4 so that each + # leaf cell contains at most one point. + # This structure also handle 3D data, inserted in trees with 8 children + # for each node. + + # Parameters of the tree + cdef public int n_dimensions # Number of dimensions in X + cdef public int verbose # Verbosity of the output + cdef intp_t n_cells_per_cell # Number of children per node. (2 ** n_dimension) + + # Tree inner structure + cdef public intp_t max_depth # Max depth of the tree + cdef public intp_t cell_count # Counter for node IDs + cdef public intp_t capacity # Capacity of tree, in terms of nodes + cdef public intp_t n_points # Total number of points + cdef Cell* cells # Array of nodes + + # Point insertion methods + cdef int insert_point(self, float32_t[3] point, intp_t point_index, + intp_t cell_id=*) except -1 nogil + cdef intp_t _insert_point_in_new_child(self, float32_t[3] point, Cell* cell, + intp_t point_index, intp_t size=* + ) noexcept nogil + cdef intp_t _select_child(self, float32_t[3] point, Cell* cell) noexcept nogil + cdef bint _is_duplicate(self, float32_t[3] point1, float32_t[3] point2) noexcept nogil + + # Create a summary of the Tree compare to a query point + cdef long summarize(self, float32_t[3] point, float32_t* results, + float squared_theta=*, intp_t cell_id=*, long idx=* + ) noexcept nogil + + # Internal cell initialization methods + cdef void _init_cell(self, Cell* cell, intp_t parent, intp_t depth) noexcept nogil + cdef void _init_root(self, float32_t[3] min_bounds, float32_t[3] max_bounds + ) noexcept nogil + + # Private methods + cdef int _check_point_in_cell(self, float32_t[3] point, Cell* cell + ) except -1 nogil + + # Private array manipulation to manage the ``cells`` array + cdef int _resize(self, intp_t capacity) except -1 nogil + cdef int _resize_c(self, intp_t capacity=*) except -1 nogil + cdef int _get_cell(self, float32_t[3] point, intp_t cell_id=*) except -1 nogil + cdef Cell[:] _get_cell_ndarray(self) diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.pyx b/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.pyx new file mode 100644 index 0000000000000000000000000000000000000000..ddc79b695685287c455561a7ba6d6c4e861fc1e5 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.pyx @@ -0,0 +1,609 @@ +# Author: Thomas Moreau +# Author: Olivier Grisel + + +from cpython cimport Py_INCREF, PyObject, PyTypeObject + +from libc.math cimport fabsf +from libc.stdlib cimport free +from libc.string cimport memcpy +from libc.stdio cimport printf +from libc.stdint cimport SIZE_MAX + +from ..tree._utils cimport safe_realloc + +import numpy as np +cimport numpy as cnp +cnp.import_array() + +cdef extern from "numpy/arrayobject.h": + object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr, + int nd, cnp.npy_intp* dims, + cnp.npy_intp* strides, + void* data, int flags, object obj) + int PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj) + +# Build the corresponding numpy dtype for Cell. 
+# This works by casting `dummy` to an array of Cell of length 1, which numpy +# can construct a `dtype`-object for. See https://stackoverflow.com/q/62448946 +# for a more detailed explanation. +cdef Cell dummy +CELL_DTYPE = np.asarray((&dummy)).dtype + +assert CELL_DTYPE.itemsize == sizeof(Cell) + + +cdef class _QuadTree: + """Array-based representation of a QuadTree. + + This class is currently working for indexing 2D data (regular QuadTree) and + for indexing 3D data (OcTree). It is planned to split the 2 implementations + using `Cython.Tempita` to save some memory for QuadTree. + + Note that this code is currently internally used only by the Barnes-Hut + method in `sklearn.manifold.TSNE`. It is planned to be refactored and + generalized in the future to be compatible with nearest neighbors API of + `sklearn.neighbors` with 2D and 3D data. + """ + def __cinit__(self, int n_dimensions, int verbose): + """Constructor.""" + # Parameters of the tree + self.n_dimensions = n_dimensions + self.verbose = verbose + self.n_cells_per_cell = (2 ** self.n_dimensions) + + # Inner structures + self.max_depth = 0 + self.cell_count = 0 + self.capacity = 0 + self.n_points = 0 + self.cells = NULL + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.cells) + + @property + def cumulative_size(self): + cdef Cell[:] cell_mem_view = self._get_cell_ndarray() + return cell_mem_view.base['cumulative_size'][:self.cell_count] + + @property + def leafs(self): + cdef Cell[:] cell_mem_view = self._get_cell_ndarray() + return cell_mem_view.base['is_leaf'][:self.cell_count] + + def build_tree(self, X): + """Build a tree from an array of points X.""" + cdef: + int i + float32_t[3] pt + float32_t[3] min_bounds, max_bounds + + # validate X and prepare for query + # X = check_array(X, dtype=float32_t, order='C') + n_samples = X.shape[0] + + capacity = 100 + self._resize(capacity) + m = np.min(X, axis=0) + M = np.max(X, axis=0) + # Scale the maximum to get all points strictly in the tree bounding box + # The 3 bounds are for positive, negative and small values + M = np.maximum(M * (1. 
+ 1e-3 * np.sign(M)), M + 1e-3) + for i in range(self.n_dimensions): + min_bounds[i] = m[i] + max_bounds[i] = M[i] + + if self.verbose > 10: + printf("[QuadTree] bounding box axis %i : [%f, %f]\n", + i, min_bounds[i], max_bounds[i]) + + # Create the initial node with boundaries from the dataset + self._init_root(min_bounds, max_bounds) + + for i in range(n_samples): + for j in range(self.n_dimensions): + pt[j] = X[i, j] + self.insert_point(pt, i) + + # Shrink the cells array to reduce memory usage + self._resize(capacity=self.cell_count) + + cdef int insert_point(self, float32_t[3] point, intp_t point_index, + intp_t cell_id=0) except -1 nogil: + """Insert a point in the QuadTree.""" + cdef int ax + cdef intp_t selected_child + cdef Cell* cell = &self.cells[cell_id] + cdef intp_t n_point = cell.cumulative_size + + if self.verbose > 10: + printf("[QuadTree] Inserting depth %li\n", cell.depth) + + # Assert that the point is in the right range + if DEBUGFLAG: + self._check_point_in_cell(point, cell) + + # If the cell is an empty leaf, insert the point in it + if cell.cumulative_size == 0: + cell.cumulative_size = 1 + self.n_points += 1 + for i in range(self.n_dimensions): + cell.barycenter[i] = point[i] + cell.point_index = point_index + if self.verbose > 10: + printf("[QuadTree] inserted point %li in cell %li\n", + point_index, cell_id) + return cell_id + + # If the cell is not a leaf, update cell internals and + # recurse in selected child + if not cell.is_leaf: + for ax in range(self.n_dimensions): + # barycenter update using a weighted mean + cell.barycenter[ax] = ( + n_point * cell.barycenter[ax] + point[ax]) / (n_point + 1) + + # Increase the size of the subtree starting from this cell + cell.cumulative_size += 1 + + # Insert child in the correct subtree + selected_child = self._select_child(point, cell) + if self.verbose > 49: + printf("[QuadTree] selected child %li\n", selected_child) + if selected_child == -1: + self.n_points += 1 + return self._insert_point_in_new_child(point, cell, point_index) + return self.insert_point(point, point_index, selected_child) + + # Finally, if the cell is a leaf with a point already inserted, + # split the cell in n_cells_per_cell if the point is not a duplicate. + # If it is a duplicate, increase the size of the leaf and return. + if self._is_duplicate(point, cell.barycenter): + if self.verbose > 10: + printf("[QuadTree] found a duplicate!\n") + cell.cumulative_size += 1 + self.n_points += 1 + return cell_id + + # In a leaf, the barycenter correspond to the only point included + # in it. 
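+        # Editor's note (describing the two calls below; not in the original
+        # source): the existing point is pushed down into a newly created
+        # child, turning this leaf into an internal cell, and the new point
+        # is then re-inserted from the same cell.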
+        self._insert_point_in_new_child(cell.barycenter, cell, cell.point_index,
+                                        cell.cumulative_size)
+        return self.insert_point(point, point_index, cell_id)
+
+    # XXX: This operation is not thread safe
+    cdef intp_t _insert_point_in_new_child(
+        self, float32_t[3] point, Cell* cell, intp_t point_index, intp_t size=1
+    ) noexcept nogil:
+        """Create a child of cell which will contain point."""
+
+        # Local variable definition
+        cdef:
+            intp_t cell_id, cell_child_id, parent_id
+            float32_t[3] save_point
+            float32_t width
+            Cell* child
+            int i
+
+        # If the maximal capacity of the Tree has been reached, double the
+        # capacity. We need to save the current cell id and the current point
+        # to retrieve them in case of reallocation.
+        if self.cell_count + 1 > self.capacity:
+            parent_id = cell.cell_id
+            for i in range(self.n_dimensions):
+                save_point[i] = point[i]
+            self._resize(SIZE_MAX)
+            cell = &self.cells[parent_id]
+            point = save_point
+
+        # Get an empty cell and initialize it
+        cell_id = self.cell_count
+        self.cell_count += 1
+        child = &self.cells[cell_id]
+
+        self._init_cell(child, cell.cell_id, cell.depth + 1)
+        child.cell_id = cell_id
+
+        # Set the cell as an inner cell of the Tree
+        cell.is_leaf = False
+        cell.point_index = -1
+
+        # Set the correct boundary for the cell, store the point in the cell
+        # and compute its index in the children array.
+        cell_child_id = 0
+        for i in range(self.n_dimensions):
+            cell_child_id *= 2
+            if point[i] >= cell.center[i]:
+                cell_child_id += 1
+                child.min_bounds[i] = cell.center[i]
+                child.max_bounds[i] = cell.max_bounds[i]
+            else:
+                child.min_bounds[i] = cell.min_bounds[i]
+                child.max_bounds[i] = cell.center[i]
+            child.center[i] = (child.min_bounds[i] + child.max_bounds[i]) / 2.
+            width = child.max_bounds[i] - child.min_bounds[i]
+
+            child.barycenter[i] = point[i]
+            child.squared_max_width = max(child.squared_max_width, width*width)
+
+        # Store the point info and the size to account for duplicated points
+        child.point_index = point_index
+        child.cumulative_size = size
+
+        # Store the child cell in the correct place in children
+        cell.children[cell_child_id] = child.cell_id
+
+        if DEBUGFLAG:
+            # Assert that the point is in the right range
+            self._check_point_in_cell(point, child)
+        if self.verbose > 10:
+            printf("[QuadTree] inserted point %li in new child %li\n",
+                   point_index, cell_id)
+
+        return cell_id
+
+    cdef bint _is_duplicate(self, float32_t[3] point1, float32_t[3] point2) noexcept nogil:
+        """Check if the two given points are equal."""
+        cdef int i
+        cdef bint res = True
+        for i in range(self.n_dimensions):
+            # Use EPSILON to avoid numerical errors that would overgrow the tree
+            res &= fabsf(point1[i] - point2[i]) <= EPSILON
+        return res
+
+    cdef intp_t _select_child(self, float32_t[3] point, Cell* cell) noexcept nogil:
+        """Select the child of cell which contains the given query point."""
+        cdef:
+            int i
+            intp_t selected_child = 0
+
+        for i in range(self.n_dimensions):
+            # Select the correct child cell to insert the point by comparing
+            # it to the borders of the cells using the precomputed center.
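+            # Editor's worked example (2D, arbitrary values; not in the
+            # original source): x >= center.x contributes bit 1 and
+            # y < center.y bit 0, so (0 * 2 + 1) * 2 + 0 = 2, i.e.
+            # children[2] is selected.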
+            selected_child *= 2
+            if point[i] >= cell.center[i]:
+                selected_child += 1
+        return cell.children[selected_child]
+
+    cdef void _init_cell(self, Cell* cell, intp_t parent, intp_t depth) noexcept nogil:
+        """Initialize a cell structure with some constants."""
+        cell.parent = parent
+        cell.is_leaf = True
+        cell.depth = depth
+        cell.squared_max_width = 0
+        cell.cumulative_size = 0
+        for i in range(self.n_cells_per_cell):
+            cell.children[i] = SIZE_MAX
+
+    cdef void _init_root(self, float32_t[3] min_bounds, float32_t[3] max_bounds
+                         ) noexcept nogil:
+        """Initialize the root node with the given space boundaries."""
+        cdef:
+            int i
+            float32_t width
+            Cell* root = &self.cells[0]
+
+        self._init_cell(root, -1, 0)
+        for i in range(self.n_dimensions):
+            root.min_bounds[i] = min_bounds[i]
+            root.max_bounds[i] = max_bounds[i]
+            root.center[i] = (max_bounds[i] + min_bounds[i]) / 2.
+            width = max_bounds[i] - min_bounds[i]
+            root.squared_max_width = max(root.squared_max_width, width*width)
+        root.cell_id = 0
+
+        self.cell_count += 1
+
+    cdef int _check_point_in_cell(self, float32_t[3] point, Cell* cell
+                                  ) except -1 nogil:
+        """Check that the given point is in the cell boundaries."""
+
+        if self.verbose >= 50:
+            if self.n_dimensions == 3:
+                printf("[QuadTree] Checking point (%f, %f, %f) in cell %li "
+                       "([%f/%f, %f/%f, %f/%f], size %li)\n",
+                       point[0], point[1], point[2], cell.cell_id,
+                       cell.min_bounds[0], cell.max_bounds[0], cell.min_bounds[1],
+                       cell.max_bounds[1], cell.min_bounds[2], cell.max_bounds[2],
+                       cell.cumulative_size)
+            else:
+                printf("[QuadTree] Checking point (%f, %f) in cell %li "
+                       "([%f/%f, %f/%f], size %li)\n",
+                       point[0], point[1], cell.cell_id, cell.min_bounds[0],
+                       cell.max_bounds[0], cell.min_bounds[1],
+                       cell.max_bounds[1], cell.cumulative_size)
+
+        for i in range(self.n_dimensions):
+            if (cell.min_bounds[i] > point[i] or
+                    cell.max_bounds[i] <= point[i]):
+                with gil:
+                    msg = "[QuadTree] InsertionError: point out of cell "
+                    msg += "boundary.\nAxis %li: cell [%f, %f]; point %f\n"
+
+                    msg %= i, cell.min_bounds[i], cell.max_bounds[i], point[i]
+                    raise ValueError(msg)
+
+    def _check_coherence(self):
+        """Check the coherence of the cells of the tree.
+
+        Check that the info stored in each cell is compatible with the info
+        stored in descendant and sibling cells. Raise a ValueError if this
+        fails.
+        """
+        for cell in self.cells[:self.cell_count]:
+            # Check that the barycenter of the inserted point is within the
+            # cell boundaries
+            self._check_point_in_cell(cell.barycenter, &cell)
+
+            if not cell.is_leaf:
+                # Compute the number of points in the children and compare
+                # it with the cell's cumulative_size.
+                n_points = 0
+                for idx in range(self.n_cells_per_cell):
+                    child_id = cell.children[idx]
+                    if child_id != -1:
+                        child = self.cells[child_id]
+                        n_points += child.cumulative_size
+                        assert child.cell_id == child_id, (
+                            "Cell id not correctly initialized.")
+                if n_points != cell.cumulative_size:
+                    raise ValueError(
+                        "Cell {} is incoherent. Size={} but found {} points "
+                        "in children. ({})"
+                        .format(cell.cell_id, cell.cumulative_size,
+                                n_points, cell.children))
+
+        # Make sure that the number of points in the tree corresponds to the
+        # cumulative size of the root cell.
+        if self.n_points != self.cells[0].cumulative_size:
+            raise ValueError(
+                "QuadTree is incoherent. Size={} but found {} points "
+                "in children."
+                .format(self.n_points, self.cells[0].cumulative_size))
+
+    cdef long summarize(self, float32_t[3] point, float32_t* results,
+                        float squared_theta=.5, intp_t cell_id=0, long idx=0
+                        ) noexcept nogil:
+        """Summarize the tree compared to a query point.
+
+        Input arguments
+        ---------------
+        point : array (n_dimensions)
+            query point to construct the summary.
+        cell_id : integer, optional (default: 0)
+            current cell of the tree summarized. This should be set to 0 for
+            external calls.
+        idx : integer, optional (default: 0)
+            current index in the result array. This should be set to 0 for
+            external calls.
+        squared_theta: float, optional (default: .5)
+            threshold to decide whether the node is sufficiently far
+            from the query point to be a good summary. The formula is such that
+            the node is a summary if
+            node_width^2 / dist_node_point^2 < squared_theta.
+            Note that the argument should be passed as theta^2 to avoid
+            computing square roots of the distances.
+
+        Output arguments
+        ----------------
+        results : array (n_samples * (n_dimensions+2))
+            result will contain a summary of the tree information compared to
+            the query point:
+            - results[idx:idx+n_dimensions] contains the coordinate-wise
+              difference between the query point and the summary cell idx.
+              This is useful in t-SNE to compute the negative forces.
+            - results[idx+n_dimensions] contains the squared euclidean
+              distance to the summary cell idx.
+            - results[idx+n_dimensions+1] contains the number of points of
+              the tree contained in the summary cell idx.
+
+        Return
+        ------
+        idx : integer
+            number of elements in the results array.
+        """
+        cdef:
+            int i, idx_d = idx + self.n_dimensions
+            bint duplicate = True
+            Cell* cell = &self.cells[cell_id]
+
+        results[idx_d] = 0.
+        for i in range(self.n_dimensions):
+            results[idx + i] = point[i] - cell.barycenter[i]
+            results[idx_d] += results[idx + i] * results[idx + i]
+            duplicate &= fabsf(results[idx + i]) <= EPSILON
+
+        # Do not compute self interactions
+        if duplicate and cell.is_leaf:
+            return idx
+
+        # Check whether we can use this node as a summary
+        # It's a summary node if the angular size as measured from the point
+        # is relatively small (w.r.t. theta) or if it is a leaf node.
+        # If it can be summarized, we use the cell center of mass
+        # Otherwise, we go to a higher level of resolution and into the leaves.
+        if cell.is_leaf or (
+                (cell.squared_max_width / results[idx_d]) < squared_theta):
+            results[idx_d + 1] = cell.cumulative_size
+            return idx + self.n_dimensions + 2
+
+        else:
+            # Recursively compute the summary in nodes
+            for c in range(self.n_cells_per_cell):
+                child_id = cell.children[c]
+                if child_id != -1:
+                    idx = self.summarize(point, results, squared_theta,
+                                         child_id, idx)
+
+        return idx
+
+    def get_cell(self, point):
+        """Return the id of the cell containing the query point, or raise a
+        ValueError if the point is not in the tree.
+        """
+        cdef float32_t[3] query_pt
+        cdef int i
+
+        assert len(point) == self.n_dimensions, (
+            "Query point should be a point in dimension {}."
+            .format(self.n_dimensions))
+
+        for i in range(self.n_dimensions):
+            query_pt[i] = point[i]
+
+        return self._get_cell(query_pt, 0)
+
+    cdef int _get_cell(self, float32_t[3] point, intp_t cell_id=0
+                       ) except -1 nogil:
+        """Guts of get_cell.
+ + Return the id of the cell containing the query point or raise ValueError + if the point is not in the tree""" + cdef: + intp_t selected_child + Cell* cell = &self.cells[cell_id] + + if cell.is_leaf: + if self._is_duplicate(cell.barycenter, point): + if self.verbose > 99: + printf("[QuadTree] Found point in cell: %li\n", + cell.cell_id) + return cell_id + with gil: + raise ValueError("Query point not in the Tree.") + + selected_child = self._select_child(point, cell) + if selected_child > 0: + if self.verbose > 99: + printf("[QuadTree] Selected_child: %li\n", selected_child) + return self._get_cell(point, selected_child) + with gil: + raise ValueError("Query point not in the Tree.") + + # Pickling primitives + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return (_QuadTree, (self.n_dimensions, self.verbose), self.__getstate__()) + + def __getstate__(self): + """Getstate re-implementation, for pickling.""" + d = {} + # capacity is inferred during the __setstate__ using nodes + d["max_depth"] = self.max_depth + d["cell_count"] = self.cell_count + d["capacity"] = self.capacity + d["n_points"] = self.n_points + d["cells"] = self._get_cell_ndarray().base + return d + + def __setstate__(self, d): + """Setstate re-implementation, for unpickling.""" + self.max_depth = d["max_depth"] + self.cell_count = d["cell_count"] + self.capacity = d["capacity"] + self.n_points = d["n_points"] + + if 'cells' not in d: + raise ValueError('You have loaded Tree version which ' + 'cannot be imported') + + cell_ndarray = d['cells'] + + if (cell_ndarray.ndim != 1 or + cell_ndarray.dtype != CELL_DTYPE or + not cell_ndarray.flags.c_contiguous): + raise ValueError('Did not recognise loaded array layout') + + self.capacity = cell_ndarray.shape[0] + if self._resize_c(self.capacity) != 0: + raise MemoryError("resizing tree to %d" % self.capacity) + + cdef Cell[:] cell_mem_view = cell_ndarray + memcpy( + pto=self.cells, + pfrom=&cell_mem_view[0], + size=self.capacity * sizeof(Cell), + ) + + # Array manipulation methods, to convert it to numpy or to resize + # self.cells array + + cdef Cell[:] _get_cell_ndarray(self): + """Wraps nodes as a NumPy struct array. + + The array keeps a reference to this Tree, which manages the underlying + memory. Individual fields are publicly accessible as properties of the + Tree. + """ + cdef cnp.npy_intp shape[1] + shape[0] = self.cell_count + cdef cnp.npy_intp strides[1] + strides[0] = sizeof(Cell) + cdef Cell[:] arr + Py_INCREF(CELL_DTYPE) + arr = PyArray_NewFromDescr( + subtype= np.ndarray, + descr=CELL_DTYPE, + nd=1, + dims=shape, + strides=strides, + data= self.cells, + flags=cnp.NPY_ARRAY_DEFAULT, + obj=None, + ) + Py_INCREF(self) + if PyArray_SetBaseObject(arr.base, self) < 0: + raise ValueError("Can't initialize array!") + return arr + + cdef int _resize(self, intp_t capacity) except -1 nogil: + """Resize all inner arrays to `capacity`, if `capacity` == -1, then + double the size of the inner arrays. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if self._resize_c(capacity) != 0: + # Acquire gil only if we need to raise + with gil: + raise MemoryError() + + cdef int _resize_c(self, intp_t capacity=SIZE_MAX) except -1 nogil: + """Guts of _resize + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. 
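+
+        Illustrative growth pattern (editor's note, derived from the body
+        below): repeated calls with the default capacity=SIZE_MAX grow the
+        cells array 0 -> 9 -> 18 -> 36 -> ..., i.e. a small fixed initial
+        allocation followed by doubling.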
+ """ + if capacity == self.capacity and self.cells != NULL: + return 0 + + if capacity == SIZE_MAX: + if self.capacity == 0: + capacity = 9 # default initial value to min + else: + capacity = 2 * self.capacity + + safe_realloc(&self.cells, capacity) + + # if capacity smaller than cell_count, adjust the counter + if capacity < self.cell_count: + self.cell_count = capacity + + self.capacity = capacity + return 0 + + def _py_summarize(self, float32_t[:] query_pt, float32_t[:, :] X, float angle): + # Used for testing summarize + cdef: + float32_t[:] summary + int n_samples + + n_samples = X.shape[0] + summary = np.empty(4 * n_samples, dtype=np.float32) + + idx = self.summarize(&query_pt[0], &summary[0], angle * angle) + return idx, summary diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_regression.py b/.venv/Lib/site-packages/sklearn/neighbors/_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..c28b40a243cf2218fad226c69f78812072760d7b --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_regression.py @@ -0,0 +1,513 @@ +"""Nearest Neighbor Regression.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings + +import numpy as np + +from ..base import RegressorMixin, _fit_context +from ..metrics import DistanceMetric +from ..utils._param_validation import StrOptions +from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin, _get_weights + + +class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): + """Regression based on k-nearest neighbors. + + The target is predicted by local interpolation of the targets + associated of the nearest neighbors in the training set. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.9 + + Parameters + ---------- + n_neighbors : int, default=5 + Number of neighbors to use by default for :meth:`kneighbors` queries. + + weights : {'uniform', 'distance'}, callable or None, default='uniform' + Weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood + are weighted equally. + - 'distance' : weight points by the inverse of their distance. + in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable] : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. + + Uniform weights are used by default. + + See the following example for a demonstration of the impact of + different weighting schemes on predictions: + :ref:`sphx_glr_auto_examples_neighbors_plot_regression.py`. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + p : float, default=2 + Power parameter for the Minkowski metric. 
When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric : str, DistanceMetric object or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + If metric is a DistanceMetric object, it will be passed directly to + the underlying computation routines. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + Doesn't affect :meth:`fit` method. + + Attributes + ---------- + effective_metric_ : str or callable + The distance metric to use. It will be same as the `metric` parameter + or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to + 'minkowski' and `p` parameter set to 2. + + effective_metric_params_ : dict + Additional keyword arguments for the metric function. For most metrics + will be same with `metric_params` parameter, but may also contain the + `p` parameter value if the `effective_metric_` attribute is set to + 'minkowski'. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + See Also + -------- + NearestNeighbors : Unsupervised learner for implementing neighbor searches. + RadiusNeighborsRegressor : Regression based on neighbors within a fixed radius. + KNeighborsClassifier : Classifier implementing the k-nearest neighbors vote. + RadiusNeighborsClassifier : Classifier implementing + a vote among neighbors within a given radius. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + .. warning:: + + Regarding the Nearest Neighbors algorithms, if it is found that two + neighbors, neighbor `k+1` and `k`, have identical distances but + different labels, the results will depend on the ordering of the + training data. + + https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm + + Examples + -------- + >>> X = [[0], [1], [2], [3]] + >>> y = [0, 0, 1, 1] + >>> from sklearn.neighbors import KNeighborsRegressor + >>> neigh = KNeighborsRegressor(n_neighbors=2) + >>> neigh.fit(X, y) + KNeighborsRegressor(...) 
+ >>> print(neigh.predict([[1.5]])) + [0.5] + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "weights": [StrOptions({"uniform", "distance"}), callable, None], + } + _parameter_constraints["metric"].append(DistanceMetric) + _parameter_constraints.pop("radius") + + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.weights = weights + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # For cross-validation routines to split data correctly + tags.input_tags.pairwise = self.metric == "precomputed" + return tags + + @_fit_context( + # KNeighborsRegressor.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y): + """Fit the k-nearest neighbors regressor from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_outputs) + Target values. + + Returns + ------- + self : KNeighborsRegressor + The fitted k-nearest neighbors regressor. + """ + return self._fit(X, y) + + def predict(self, X): + """Predict the target for the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int + Target values. + """ + if self.weights == "uniform": + # In that case, we do not need the distances to perform + # the weighting so we do not compute them. + neigh_ind = self.kneighbors(X, return_distance=False) + neigh_dist = None + else: + neigh_dist, neigh_ind = self.kneighbors(X) + + weights = _get_weights(neigh_dist, self.weights) + + _y = self._y + if _y.ndim == 1: + _y = _y.reshape((-1, 1)) + + if weights is None: + y_pred = np.mean(_y[neigh_ind], axis=1) + else: + y_pred = np.empty((neigh_dist.shape[0], _y.shape[1]), dtype=np.float64) + denom = np.sum(weights, axis=1) + + for j in range(_y.shape[1]): + num = np.sum(_y[neigh_ind, j] * weights, axis=1) + y_pred[:, j] = num / denom + + if self._y.ndim == 1: + y_pred = y_pred.ravel() + + return y_pred + + +class RadiusNeighborsRegressor(RadiusNeighborsMixin, RegressorMixin, NeighborsBase): + """Regression based on neighbors within a fixed radius. + + The target is predicted by local interpolation of the targets + associated of the nearest neighbors in the training set. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.9 + + Parameters + ---------- + radius : float, default=1.0 + Range of parameter space to use by default for :meth:`radius_neighbors` + queries. + + weights : {'uniform', 'distance'}, callable or None, default='uniform' + Weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood + are weighted equally. + - 'distance' : weight points by the inverse of their distance. 
+ in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable] : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. + + Uniform weights are used by default. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + p : float, default=2 + Power parameter for the Minkowski metric. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + effective_metric_ : str or callable + The distance metric to use. It will be same as the `metric` parameter + or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to + 'minkowski' and `p` parameter set to 2. + + effective_metric_params_ : dict + Additional keyword arguments for the metric function. For most metrics + will be same with `metric_params` parameter, but may also contain the + `p` parameter value if the `effective_metric_` attribute is set to + 'minkowski'. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + See Also + -------- + NearestNeighbors : Unsupervised learner for implementing neighbor searches. + KNeighborsRegressor : Regression based on k-nearest neighbors. + KNeighborsClassifier : Classifier based on the k-nearest neighbors. 
+ RadiusNeighborsClassifier : Classifier based on neighbors within a given radius. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm + + Examples + -------- + >>> X = [[0], [1], [2], [3]] + >>> y = [0, 0, 1, 1] + >>> from sklearn.neighbors import RadiusNeighborsRegressor + >>> neigh = RadiusNeighborsRegressor(radius=1.0) + >>> neigh.fit(X, y) + RadiusNeighborsRegressor(...) + >>> print(neigh.predict([[1.5]])) + [0.5] + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "weights": [StrOptions({"uniform", "distance"}), callable, None], + } + _parameter_constraints.pop("n_neighbors") + + def __init__( + self, + radius=1.0, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + ): + super().__init__( + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + p=p, + metric=metric, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.weights = weights + + @_fit_context( + # RadiusNeighborsRegressor.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y): + """Fit the radius neighbors regressor from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_outputs) + Target values. + + Returns + ------- + self : RadiusNeighborsRegressor + The fitted radius neighbors regressor. + """ + return self._fit(X, y) + + def predict(self, X): + """Predict the target for the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + y : ndarray of shape (n_queries,) or (n_queries, n_outputs), \ + dtype=double + Target values. + """ + neigh_dist, neigh_ind = self.radius_neighbors(X) + + weights = _get_weights(neigh_dist, self.weights) + + _y = self._y + if _y.ndim == 1: + _y = _y.reshape((-1, 1)) + + empty_obs = np.full_like(_y[0], np.nan) + + if weights is None: + y_pred = np.array( + [ + np.mean(_y[ind, :], axis=0) if len(ind) else empty_obs + for (i, ind) in enumerate(neigh_ind) + ] + ) + + else: + y_pred = np.array( + [ + ( + np.average(_y[ind, :], axis=0, weights=weights[i]) + if len(ind) + else empty_obs + ) + for (i, ind) in enumerate(neigh_ind) + ] + ) + + if np.any(np.isnan(y_pred)): + empty_warning_msg = ( + "One or more samples have no neighbors " + "within specified radius; predicting NaN." 
+ ) + warnings.warn(empty_warning_msg) + + if self._y.ndim == 1: + y_pred = y_pred.ravel() + + return y_pred diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_unsupervised.py b/.venv/Lib/site-packages/sklearn/neighbors/_unsupervised.py new file mode 100644 index 0000000000000000000000000000000000000000..b34af2c75367ad73e69b826ca968b2f53982e541 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_unsupervised.py @@ -0,0 +1,179 @@ +"""Unsupervised nearest neighbors learner""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ..base import _fit_context +from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin + + +class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase): + """Unsupervised learner for implementing neighbor searches. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.9 + + Parameters + ---------- + n_neighbors : int, default=5 + Number of neighbors to use by default for :meth:`kneighbors` queries. + + radius : float, default=1.0 + Range of parameter space to use by default for :meth:`radius_neighbors` + queries. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + p : float (positive), default=2 + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + effective_metric_ : str + Metric used to compute distances to neighbors. + + effective_metric_params_ : dict + Parameters for the metric used to compute distances to neighbors. 
+ + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + See Also + -------- + KNeighborsClassifier : Classifier implementing the k-nearest neighbors + vote. + RadiusNeighborsClassifier : Classifier implementing a vote among neighbors + within a given radius. + KNeighborsRegressor : Regression based on k-nearest neighbors. + RadiusNeighborsRegressor : Regression based on neighbors within a fixed + radius. + BallTree : Space partitioning data structure for organizing points in a + multi-dimensional space, used for nearest neighbor search. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm + + Examples + -------- + >>> import numpy as np + >>> from sklearn.neighbors import NearestNeighbors + >>> samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]] + >>> neigh = NearestNeighbors(n_neighbors=2, radius=0.4) + >>> neigh.fit(samples) + NearestNeighbors(...) + >>> neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False) + array([[2, 0]]...) + >>> nbrs = neigh.radius_neighbors( + ... [[0, 0, 1.3]], 0.4, return_distance=False + ... ) + >>> np.asarray(nbrs[0][0]) + array(2) + """ + + def __init__( + self, + *, + n_neighbors=5, + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + + @_fit_context( + # NearestNeighbors.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit the nearest neighbors estimator from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : NearestNeighbors + The fitted nearest neighbors estimator. + """ + return self._fit(X) diff --git a/.venv/Lib/site-packages/sklearn/neighbors/meson.build b/.venv/Lib/site-packages/sklearn/neighbors/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..06a869bb1fb671398f420e58a066f80ed040533b --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/meson.build @@ -0,0 +1,56 @@ +_binary_tree_pxi = custom_target( + '_binary_tree_pxi', + output: '_binary_tree.pxi', + input: '_binary_tree.pxi.tp', + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], +) + +# .pyx is generated so this is needed to make Cython compilation work. The pxi +# file is included avoid "missing dependency paths" with ninja -t missindeps +neighbors_cython_tree = [ + fs.copyfile('__init__.py'), + fs.copyfile('_partition_nodes.pxd'), + _binary_tree_pxi, +] + +name_list = ['_ball_tree', '_kd_tree'] + +foreach name: name_list + pyx = custom_target( + name + '_pyx', + output: name + '.pyx', + input: name + '.pyx.tp', + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. 
This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [neighbors_cython_tree, utils_cython_tree, metrics_cython_tree], + ) + py.extension_module( + name, + pyx, + dependencies: [np_dep], + cython_args: cython_args, + subdir: 'sklearn/neighbors', + install: true +) +endforeach + +neighbors_extension_metadata = { + '_partition_nodes': + {'sources': ['_partition_nodes.pyx'], + 'override_options': ['cython_language=cpp'], 'dependencies': [np_dep]}, + '_quad_tree': {'sources': ['_quad_tree.pyx'], 'dependencies': [np_dep]}, +} + +foreach ext_name, ext_dict : neighbors_extension_metadata + py.extension_module( + ext_name, + [ext_dict.get('sources'), utils_cython_tree], + dependencies: ext_dict.get('dependencies'), + override_options : ext_dict.get('override_options', []), + cython_args: cython_args, + subdir: 'sklearn/neighbors', + install: true + ) +endforeach diff --git a/.venv/Lib/site-packages/sklearn/neighbors/tests/test_ball_tree.py b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_ball_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..eab2af2d78f3db07f526c77e965c51356af1616c --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_ball_tree.py @@ -0,0 +1,200 @@ +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_equal + +from sklearn.neighbors._ball_tree import BallTree, BallTree32, BallTree64 +from sklearn.utils import check_random_state +from sklearn.utils._testing import _convert_container +from sklearn.utils.validation import check_array + +rng = np.random.RandomState(10) +V_mahalanobis = rng.rand(3, 3) +V_mahalanobis = np.dot(V_mahalanobis, V_mahalanobis.T) + +DIMENSION = 3 + +METRICS = { + "euclidean": {}, + "manhattan": {}, + "minkowski": dict(p=3), + "chebyshev": {}, +} + +DISCRETE_METRICS = ["hamming", "canberra", "braycurtis"] + +BOOLEAN_METRICS = [ + "jaccard", + "dice", + "rogerstanimoto", + "russellrao", + "sokalmichener", + "sokalsneath", +] + +BALL_TREE_CLASSES = [ + BallTree64, + BallTree32, +] + + +def brute_force_neighbors(X, Y, k, metric, **kwargs): + from sklearn.metrics import DistanceMetric + + X, Y = check_array(X), check_array(Y) + D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X) + ind = np.argsort(D, axis=1)[:, :k] + dist = D[np.arange(Y.shape[0])[:, None], ind] + return dist, ind + + +def test_BallTree_is_BallTree64_subclass(): + assert issubclass(BallTree, BallTree64) + + +@pytest.mark.parametrize("metric", itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS)) +@pytest.mark.parametrize("array_type", ["list", "array"]) +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_ball_tree_query_metrics(metric, array_type, BallTreeImplementation): + rng = check_random_state(0) + if metric in BOOLEAN_METRICS: + X = rng.random_sample((40, 10)).round(0) + Y = rng.random_sample((10, 10)).round(0) + elif metric in DISCRETE_METRICS: + X = (4 * rng.random_sample((40, 10))).round(0) + Y = (4 * rng.random_sample((10, 10))).round(0) + X = _convert_container(X, array_type) + Y = _convert_container(Y, array_type) + + k = 5 + + bt = BallTreeImplementation(X, leaf_size=1, metric=metric) + dist1, ind1 = bt.query(Y, k) + dist2, ind2 = brute_force_neighbors(X, Y, k, metric) + assert_array_almost_equal(dist1, dist2) + + +@pytest.mark.parametrize( + "BallTreeImplementation, decimal_tol", zip(BALL_TREE_CLASSES, [6, 5]) +) 
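+# The zipped tolerances above pair each tree class with how many decimal
+# places its haversine distances are checked to: expanded, the cases are
+# roughly
+#
+#     test_query_haversine(BallTree64, decimal_tol=6)
+#     test_query_haversine(BallTree32, decimal_tol=5)
+#
+# float32 gets the looser tolerance since single precision accumulates more
+# rounding error than double precision.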
+def test_query_haversine(BallTreeImplementation, decimal_tol): + rng = check_random_state(0) + X = 2 * np.pi * rng.random_sample((40, 2)) + bt = BallTreeImplementation(X, leaf_size=1, metric="haversine") + dist1, ind1 = bt.query(X, k=5) + dist2, ind2 = brute_force_neighbors(X, X, k=5, metric="haversine") + + assert_array_almost_equal(dist1, dist2, decimal=decimal_tol) + assert_array_almost_equal(ind1, ind2) + + +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_array_object_type(BallTreeImplementation): + """Check that we do not accept object dtype array.""" + X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) + with pytest.raises(ValueError, match="setting an array element with a sequence"): + BallTreeImplementation(X) + + +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_bad_pyfunc_metric(BallTreeImplementation): + def wrong_returned_value(x, y): + return "1" + + def one_arg_func(x): + return 1.0 # pragma: no cover + + X = np.ones((5, 2)) + msg = "Custom distance function must accept two vectors and return a float." + with pytest.raises(TypeError, match=msg): + BallTreeImplementation(X, metric=wrong_returned_value) + + msg = "takes 1 positional argument but 2 were given" + with pytest.raises(TypeError, match=msg): + BallTreeImplementation(X, metric=one_arg_func) + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +def test_ball_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 + ) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 5 + dist_64, ind_64 = bt_64.query(Y_64, k=k) + dist_32, ind_32 = bt_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32, rtol=1e-5) + assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 + + # Test consistency with respect to the `query_radius` method + r = 2.38 + ind_64 = bt_64.query_radius(Y_64, r=r) + ind_32 = bt_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) + + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = bt_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = bt_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +def test_kernel_density_numerical_consistency(global_random_seed, metric): + # Test consistency with respect to the `kernel_density` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + + kernel = "gaussian" + h = 0.1 + density64 = bt_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = 
bt_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 + + +def test_two_point_correlation_numerical_consistency(global_random_seed): + # Test consistency with respect to the `two_point_correlation` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + bt_64 = BallTree64(X_64, leaf_size=10) + bt_32 = BallTree32(X_32, leaf_size=10) + + r = np.linspace(0, 1, 10) + + counts_64 = bt_64.two_point_correlation(Y_64, r=r, dualtree=True) + counts_32 = bt_32.two_point_correlation(Y_32, r=r, dualtree=True) + assert_allclose(counts_64, counts_32) + + +def get_dataset_for_binary_tree(random_seed, features=3): + rng = np.random.RandomState(random_seed) + _X = rng.rand(100, features) + _Y = rng.rand(5, features) + + X_64 = _X.astype(dtype=np.float64, copy=False) + Y_64 = _Y.astype(dtype=np.float64, copy=False) + + X_32 = _X.astype(dtype=np.float32, copy=False) + Y_32 = _Y.astype(dtype=np.float32, copy=False) + + return X_64, X_32, Y_64, Y_32 diff --git a/.venv/Lib/site-packages/sklearn/neighbors/tests/test_graph.py b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..0862ecd748eb1042e0030a3c883e2925f8be8a26 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_graph.py @@ -0,0 +1,101 @@ +import numpy as np +import pytest + +from sklearn.metrics import euclidean_distances +from sklearn.neighbors import KNeighborsTransformer, RadiusNeighborsTransformer +from sklearn.neighbors._base import _is_sorted_by_data +from sklearn.utils._testing import assert_array_equal + + +def test_transformer_result(): + # Test the number of neighbors returned + n_neighbors = 5 + n_samples_fit = 20 + n_queries = 18 + n_features = 10 + + rng = np.random.RandomState(42) + X = rng.randn(n_samples_fit, n_features) + X2 = rng.randn(n_queries, n_features) + radius = np.percentile(euclidean_distances(X), 10) + + # with n_neighbors + for mode in ["distance", "connectivity"]: + add_one = mode == "distance" + nnt = KNeighborsTransformer(n_neighbors=n_neighbors, mode=mode) + Xt = nnt.fit_transform(X) + assert Xt.shape == (n_samples_fit, n_samples_fit) + assert Xt.data.shape == (n_samples_fit * (n_neighbors + add_one),) + assert Xt.format == "csr" + assert _is_sorted_by_data(Xt) + + X2t = nnt.transform(X2) + assert X2t.shape == (n_queries, n_samples_fit) + assert X2t.data.shape == (n_queries * (n_neighbors + add_one),) + assert X2t.format == "csr" + assert _is_sorted_by_data(X2t) + + # with radius + for mode in ["distance", "connectivity"]: + add_one = mode == "distance" + nnt = RadiusNeighborsTransformer(radius=radius, mode=mode) + Xt = nnt.fit_transform(X) + assert Xt.shape == (n_samples_fit, n_samples_fit) + assert not Xt.data.shape == (n_samples_fit * (n_neighbors + add_one),) + assert Xt.format == "csr" + assert _is_sorted_by_data(Xt) + + X2t = nnt.transform(X2) + assert X2t.shape == (n_queries, n_samples_fit) + assert not X2t.data.shape == (n_queries * (n_neighbors + add_one),) + assert X2t.format == "csr" + assert _is_sorted_by_data(X2t) + + +def _has_explicit_diagonal(X): + """Return True if the diagonal is explicitly stored""" + X = X.tocoo() + explicit = X.row[X.row == X.col] + return len(explicit) == X.shape[0] + + +def test_explicit_diagonal(): + # Test that the diagonal is explicitly stored in the sparse graph + n_neighbors = 5 + n_samples_fit, 
n_samples_transform, n_features = 20, 18, 10 + rng = np.random.RandomState(42) + X = rng.randn(n_samples_fit, n_features) + X2 = rng.randn(n_samples_transform, n_features) + + nnt = KNeighborsTransformer(n_neighbors=n_neighbors) + Xt = nnt.fit_transform(X) + assert _has_explicit_diagonal(Xt) + assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0) + + Xt = nnt.transform(X) + assert _has_explicit_diagonal(Xt) + assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0) + + # Using transform on new data should not always have zero diagonal + X2t = nnt.transform(X2) + assert not _has_explicit_diagonal(X2t) + + +@pytest.mark.parametrize("Klass", [KNeighborsTransformer, RadiusNeighborsTransformer]) +def test_graph_feature_names_out(Klass): + """Check `get_feature_names_out` for transformers defined in `_graph.py`.""" + + n_samples_fit = 20 + n_features = 10 + rng = np.random.RandomState(42) + X = rng.randn(n_samples_fit, n_features) + + est = Klass().fit(X) + names_out = est.get_feature_names_out() + + class_name_lower = Klass.__name__.lower() + expected_names_out = np.array( + [f"{class_name_lower}{i}" for i in range(est.n_samples_fit_)], + dtype=object, + ) + assert_array_equal(names_out, expected_names_out) diff --git a/.venv/Lib/site-packages/sklearn/neighbors/tests/test_kd_tree.py b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_kd_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..205c323ca7f13228384b25ff63bbbf2075cba71c --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_kd_tree.py @@ -0,0 +1,100 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_equal + +from sklearn.neighbors._kd_tree import KDTree, KDTree32, KDTree64 +from sklearn.neighbors.tests.test_ball_tree import get_dataset_for_binary_tree +from sklearn.utils.parallel import Parallel, delayed + +DIMENSION = 3 + +METRICS = {"euclidean": {}, "manhattan": {}, "chebyshev": {}, "minkowski": dict(p=3)} + +KD_TREE_CLASSES = [ + KDTree64, + KDTree32, +] + + +def test_KDTree_is_KDTree64_subclass(): + assert issubclass(KDTree, KDTree64) + + +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_array_object_type(BinarySearchTree): + """Check that we do not accept object dtype array.""" + X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) + with pytest.raises(ValueError, match="setting an array element with a sequence"): + BinarySearchTree(X) + + +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_kdtree_picklable_with_joblib(BinarySearchTree): + """Make sure that KDTree queries work when joblib memmaps. + + Non-regression test for #21685 and #21228.""" + rng = np.random.RandomState(0) + X = rng.random_sample((10, 3)) + tree = BinarySearchTree(X, leaf_size=2) + + # Call Parallel with max_nbytes=1 to trigger readonly memory mapping that + # use to raise "ValueError: buffer source array is read-only" in a previous + # version of the Cython code. + Parallel(n_jobs=2, max_nbytes=1)(delayed(tree.query)(data) for data in 2 * [X]) + + +@pytest.mark.parametrize("metric", METRICS) +def test_kd_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. 
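+    # Indices must match exactly between the two builds; distances are only
+    # required to agree to rtol=1e-5, which leaves headroom for float32
+    # rounding (machine epsilon ~1.2e-7) accumulated over the tree traversal.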
+ X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 + ) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 4 + dist_64, ind_64 = kd_64.query(Y_64, k=k) + dist_32, ind_32 = kd_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32, rtol=1e-5) + assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 + + # Test consistency with respect to the `query_radius` method + r = 2.38 + ind_64 = kd_64.query_radius(Y_64, r=r) + ind_32 = kd_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) + + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = kd_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = kd_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 + + +@pytest.mark.parametrize("metric", METRICS) +def test_kernel_density_numerical_consistency(global_random_seed, metric): + # Test consistency with respect to the `kernel_density` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + kernel = "gaussian" + h = 0.1 + density64 = kd_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = kd_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 diff --git a/.venv/Lib/site-packages/sklearn/neighbors/tests/test_kde.py b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_kde.py new file mode 100644 index 0000000000000000000000000000000000000000..d30194b95b34ec9246d3030b4d80573ae6654e8d --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_kde.py @@ -0,0 +1,252 @@ +import joblib +import numpy as np +import pytest + +from sklearn.datasets import make_blobs +from sklearn.exceptions import NotFittedError +from sklearn.model_selection import GridSearchCV +from sklearn.neighbors import KDTree, KernelDensity, NearestNeighbors +from sklearn.neighbors._ball_tree import kernel_norm +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.utils._testing import assert_allclose + + +# XXX Duplicated in test_neighbors_tree, test_kde +def compute_kernel_slow(Y, X, kernel, h): + if h == "scott": + h = X.shape[0] ** (-1 / (X.shape[1] + 4)) + elif h == "silverman": + h = (X.shape[0] * (X.shape[1] + 2) / 4) ** (-1 / (X.shape[1] + 4)) + + d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1)) + norm = kernel_norm(h, X.shape[1], kernel) / X.shape[0] + + if kernel == "gaussian": + return norm * np.exp(-0.5 * (d * d) / (h * h)).sum(-1) + elif kernel == "tophat": + return norm * (d < h).sum(-1) + elif kernel == "epanechnikov": + return norm * ((1.0 - (d * d) / (h * h)) * (d < h)).sum(-1) + elif kernel == "exponential": + return norm * 
(np.exp(-d / h)).sum(-1) + elif kernel == "linear": + return norm * ((1 - d / h) * (d < h)).sum(-1) + elif kernel == "cosine": + return norm * (np.cos(0.5 * np.pi * d / h) * (d < h)).sum(-1) + else: + raise ValueError("kernel not recognized") + + +def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true): + kde = KernelDensity(kernel=kernel, bandwidth=bandwidth, atol=atol, rtol=rtol) + log_dens = kde.fit(X).score_samples(Y) + assert_allclose(np.exp(log_dens), dens_true, atol=atol, rtol=max(1e-7, rtol)) + assert_allclose( + np.exp(kde.score(Y)), np.prod(dens_true), atol=atol, rtol=max(1e-7, rtol) + ) + + +@pytest.mark.parametrize( + "kernel", ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"] +) +@pytest.mark.parametrize("bandwidth", [0.01, 0.1, 1, "scott", "silverman"]) +def test_kernel_density(kernel, bandwidth): + n_samples, n_features = (100, 3) + + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) + Y = rng.randn(n_samples, n_features) + + dens_true = compute_kernel_slow(Y, X, kernel, bandwidth) + + for rtol in [0, 1e-5]: + for atol in [1e-6, 1e-2]: + for breadth_first in (True, False): + check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true) + + +def test_kernel_density_sampling(n_samples=100, n_features=3): + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) + + bandwidth = 0.2 + + for kernel in ["gaussian", "tophat"]: + # draw a tophat sample + kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X) + samp = kde.sample(100) + assert X.shape == samp.shape + + # check that samples are in the right range + nbrs = NearestNeighbors(n_neighbors=1).fit(X) + dist, ind = nbrs.kneighbors(X, return_distance=True) + + if kernel == "tophat": + assert np.all(dist < bandwidth) + elif kernel == "gaussian": + # 5 standard deviations is safe for 100 samples, but there's a + # very small chance this test could fail. + assert np.all(dist < 5 * bandwidth) + + # check unsupported kernels + for kernel in ["epanechnikov", "exponential", "linear", "cosine"]: + kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X) + with pytest.raises(NotImplementedError): + kde.sample(100) + + # non-regression test: used to return a scalar + X = rng.randn(4, 1) + kde = KernelDensity(kernel="gaussian").fit(X) + assert kde.sample().shape == (1, 1) + + +@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree"]) +@pytest.mark.parametrize( + "metric", ["euclidean", "minkowski", "manhattan", "chebyshev", "haversine"] +) +def test_kde_algorithm_metric_choice(algorithm, metric): + # Smoke test for various metrics and algorithms + rng = np.random.RandomState(0) + X = rng.randn(10, 2) # 2 features required for haversine dist. 
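+    # (haversine interprets each row as (latitude, longitude) in radians,
+    # hence exactly two features)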
+ Y = rng.randn(10, 2) + + kde = KernelDensity(algorithm=algorithm, metric=metric) + + if algorithm == "kd_tree" and metric not in KDTree.valid_metrics: + with pytest.raises(ValueError, match="invalid metric"): + kde.fit(X) + else: + kde.fit(X) + y_dens = kde.score_samples(Y) + assert y_dens.shape == Y.shape[:1] + + +def test_kde_score(n_samples=100, n_features=3): + pass + # FIXME + # rng = np.random.RandomState(0) + # X = rng.random_sample((n_samples, n_features)) + # Y = rng.random_sample((n_samples, n_features)) + + +def test_kde_sample_weights_error(): + kde = KernelDensity() + with pytest.raises(ValueError): + kde.fit(np.random.random((200, 10)), sample_weight=np.random.random((200, 10))) + with pytest.raises(ValueError): + kde.fit(np.random.random((200, 10)), sample_weight=-np.random.random(200)) + + +def test_kde_pipeline_gridsearch(): + # test that kde plays nice in pipelines and grid-searches + X, _ = make_blobs(cluster_std=0.1, random_state=1, centers=[[0, 1], [1, 0], [0, 0]]) + pipe1 = make_pipeline( + StandardScaler(with_mean=False, with_std=False), + KernelDensity(kernel="gaussian"), + ) + params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10]) + search = GridSearchCV(pipe1, param_grid=params) + search.fit(X) + assert search.best_params_["kerneldensity__bandwidth"] == 0.1 + + +def test_kde_sample_weights(): + n_samples = 400 + size_test = 20 + weights_neutral = np.full(n_samples, 3.0) + for d in [1, 2, 10]: + rng = np.random.RandomState(0) + X = rng.rand(n_samples, d) + weights = 1 + (10 * X.sum(axis=1)).astype(np.int8) + X_repetitions = np.repeat(X, weights, axis=0) + n_samples_test = size_test // d + test_points = rng.rand(n_samples_test, d) + for algorithm in ["auto", "ball_tree", "kd_tree"]: + for metric in ["euclidean", "minkowski", "manhattan", "chebyshev"]: + if algorithm != "kd_tree" or metric in KDTree.valid_metrics: + kde = KernelDensity(algorithm=algorithm, metric=metric) + + # Test that adding a constant sample weight has no effect + kde.fit(X, sample_weight=weights_neutral) + scores_const_weight = kde.score_samples(test_points) + sample_const_weight = kde.sample(random_state=1234) + kde.fit(X) + scores_no_weight = kde.score_samples(test_points) + sample_no_weight = kde.sample(random_state=1234) + assert_allclose(scores_const_weight, scores_no_weight) + assert_allclose(sample_const_weight, sample_no_weight) + + # Test equivalence between sampling and (integer) weights + kde.fit(X, sample_weight=weights) + scores_weight = kde.score_samples(test_points) + sample_weight = kde.sample(random_state=1234) + kde.fit(X_repetitions) + scores_ref_sampling = kde.score_samples(test_points) + sample_ref_sampling = kde.sample(random_state=1234) + assert_allclose(scores_weight, scores_ref_sampling) + assert_allclose(sample_weight, sample_ref_sampling) + + # Test that sample weights has a non-trivial effect + diff = np.max(np.abs(scores_no_weight - scores_weight)) + assert diff > 0.001 + + # Test invariance with respect to arbitrary scaling + scale_factor = rng.rand() + kde.fit(X, sample_weight=(scale_factor * weights)) + scores_scaled_weight = kde.score_samples(test_points) + assert_allclose(scores_scaled_weight, scores_weight) + + +@pytest.mark.parametrize("sample_weight", [None, [0.1, 0.2, 0.3]]) +def test_pickling(tmpdir, sample_weight): + # Make sure that predictions are the same before and after pickling. Used + # to be a bug because sample_weights wasn't pickled and the resulting tree + # would miss some info. 
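+    # The round-trip below therefore compares score_samples outputs rather
+    # than merely checking that unpickling succeeds: a silently dropped
+    # sample_weight would still unpickle but shift the estimated densities.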
+ + kde = KernelDensity() + data = np.reshape([1.0, 2.0, 3.0], (-1, 1)) + kde.fit(data, sample_weight=sample_weight) + + X = np.reshape([1.1, 2.1], (-1, 1)) + scores = kde.score_samples(X) + + file_path = str(tmpdir.join("dump.pkl")) + joblib.dump(kde, file_path) + kde = joblib.load(file_path) + scores_pickled = kde.score_samples(X) + + assert_allclose(scores, scores_pickled) + + +@pytest.mark.parametrize("method", ["score_samples", "sample"]) +def test_check_is_fitted(method): + # Check that predict raises an exception in an unfitted estimator. + # Unfitted estimators should raise a NotFittedError. + rng = np.random.RandomState(0) + X = rng.randn(10, 2) + kde = KernelDensity() + + with pytest.raises(NotFittedError): + getattr(kde, method)(X) + + +@pytest.mark.parametrize("bandwidth", ["scott", "silverman", 0.1]) +def test_bandwidth(bandwidth): + n_samples, n_features = (100, 3) + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) + kde = KernelDensity(bandwidth=bandwidth).fit(X) + samp = kde.sample(100) + kde_sc = kde.score_samples(X) + assert X.shape == samp.shape + assert kde_sc.shape == (n_samples,) + + # Test that the attribute self.bandwidth_ has the expected value + if bandwidth == "scott": + h = X.shape[0] ** (-1 / (X.shape[1] + 4)) + elif bandwidth == "silverman": + h = (X.shape[0] * (X.shape[1] + 2) / 4) ** (-1 / (X.shape[1] + 4)) + else: + h = bandwidth + assert kde.bandwidth_ == pytest.approx(h) diff --git a/.venv/Lib/site-packages/sklearn/neighbors/tests/test_lof.py b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_lof.py new file mode 100644 index 0000000000000000000000000000000000000000..ba22bbb44f35fe1f57a5ec1ede7dad9c2725f421 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_lof.py @@ -0,0 +1,394 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re +from math import sqrt + +import numpy as np +import pytest + +from sklearn import metrics, neighbors +from sklearn.datasets import load_iris +from sklearn.metrics import roc_auc_score +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.estimator_checks import ( + check_outlier_corruption, + parametrize_with_checks, +) +from sklearn.utils.fixes import CSR_CONTAINERS + +# load the iris dataset +# and randomly permute it +rng = check_random_state(0) +iris = load_iris() +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + + +def test_lof(global_dtype): + # Toy sample (the last two samples are outliers): + X = np.asarray( + [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [5, 3], [-4, 2]], + dtype=global_dtype, + ) + + # Test LocalOutlierFactor: + clf = neighbors.LocalOutlierFactor(n_neighbors=5) + score = clf.fit(X).negative_outlier_factor_ + assert_array_equal(clf._fit_X, X) + + # Assert largest outlier score is smaller than smallest inlier score: + assert np.min(score[:-2]) > np.max(score[-2:]) + + # Assert predict() works: + clf = neighbors.LocalOutlierFactor(contamination=0.25, n_neighbors=5).fit(X) + expected_predictions = 6 * [1] + 2 * [-1] + assert_array_equal(clf._predict(), expected_predictions) + assert_array_equal(clf.fit_predict(X), expected_predictions) + + +def test_lof_performance(global_dtype): + # Generate train/test data + rng = check_random_state(2) + X = 0.3 * rng.randn(120, 2).astype(global_dtype, copy=False) + X_train = X[:100] + + # Generate some abnormal novel observations + 
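+    # (uniform over [-4, 4]^2, i.e. well outside the 0.3-sigma training cloud)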
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2)).astype( + global_dtype, copy=False + ) + X_test = np.r_[X[100:], X_outliers] + y_test = np.array([0] * 20 + [1] * 20) + + # fit the model for novelty detection + clf = neighbors.LocalOutlierFactor(novelty=True).fit(X_train) + + # predict scores (the lower, the more normal) + y_pred = -clf.decision_function(X_test) + + # check that roc_auc is good + assert roc_auc_score(y_test, y_pred) > 0.99 + + +def test_lof_values(global_dtype): + # toy samples: + X_train = np.asarray([[1, 1], [1, 2], [2, 1]], dtype=global_dtype) + clf1 = neighbors.LocalOutlierFactor( + n_neighbors=2, contamination=0.1, novelty=True + ).fit(X_train) + clf2 = neighbors.LocalOutlierFactor(n_neighbors=2, novelty=True).fit(X_train) + s_0 = 2.0 * sqrt(2.0) / (1.0 + sqrt(2.0)) + s_1 = (1.0 + sqrt(2)) * (1.0 / (4.0 * sqrt(2.0)) + 1.0 / (2.0 + 2.0 * sqrt(2))) + # check predict() + assert_allclose(-clf1.negative_outlier_factor_, [s_0, s_1, s_1]) + assert_allclose(-clf2.negative_outlier_factor_, [s_0, s_1, s_1]) + # check predict(one sample not in train) + assert_allclose(-clf1.score_samples([[2.0, 2.0]]), [s_0]) + assert_allclose(-clf2.score_samples([[2.0, 2.0]]), [s_0]) + # check predict(one sample already in train) + assert_allclose(-clf1.score_samples([[1.0, 1.0]]), [s_1]) + assert_allclose(-clf2.score_samples([[1.0, 1.0]]), [s_1]) + + +def test_lof_precomputed(global_dtype, random_state=42): + """Tests LOF with a distance matrix.""" + # Note: smaller samples may result in spurious test success + rng = np.random.RandomState(random_state) + X = rng.random_sample((10, 4)).astype(global_dtype, copy=False) + Y = rng.random_sample((3, 4)).astype(global_dtype, copy=False) + DXX = metrics.pairwise_distances(X, metric="euclidean") + DYX = metrics.pairwise_distances(Y, X, metric="euclidean") + # As a feature matrix (n_samples by n_features) + lof_X = neighbors.LocalOutlierFactor(n_neighbors=3, novelty=True) + lof_X.fit(X) + pred_X_X = lof_X._predict() + pred_X_Y = lof_X.predict(Y) + + # As a dense distance matrix (n_samples by n_samples) + lof_D = neighbors.LocalOutlierFactor( + n_neighbors=3, algorithm="brute", metric="precomputed", novelty=True + ) + lof_D.fit(DXX) + pred_D_X = lof_D._predict() + pred_D_Y = lof_D.predict(DYX) + + assert_allclose(pred_X_X, pred_D_X) + assert_allclose(pred_X_Y, pred_D_Y) + + +def test_n_neighbors_attribute(): + X = iris.data + clf = neighbors.LocalOutlierFactor(n_neighbors=500).fit(X) + assert clf.n_neighbors_ == X.shape[0] - 1 + + clf = neighbors.LocalOutlierFactor(n_neighbors=500) + msg = "n_neighbors will be set to (n_samples - 1)" + with pytest.warns(UserWarning, match=re.escape(msg)): + clf.fit(X) + assert clf.n_neighbors_ == X.shape[0] - 1 + + +def test_score_samples(global_dtype): + X_train = np.asarray([[1, 1], [1, 2], [2, 1]], dtype=global_dtype) + X_test = np.asarray([[2.0, 2.0]], dtype=global_dtype) + clf1 = neighbors.LocalOutlierFactor( + n_neighbors=2, contamination=0.1, novelty=True + ).fit(X_train) + clf2 = neighbors.LocalOutlierFactor(n_neighbors=2, novelty=True).fit(X_train) + + clf1_scores = clf1.score_samples(X_test) + clf1_decisions = clf1.decision_function(X_test) + + clf2_scores = clf2.score_samples(X_test) + clf2_decisions = clf2.decision_function(X_test) + + assert_allclose( + clf1_scores, + clf1_decisions + clf1.offset_, + ) + assert_allclose( + clf2_scores, + clf2_decisions + clf2.offset_, + ) + assert_allclose(clf1_scores, clf2_scores) + + +def test_novelty_errors(): + X = iris.data + + # check errors for novelty=False + 
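+    # With novelty=False only fit_predict is available; the three prediction
+    # methods are surfaced lazily as AttributeError, with the novelty hint
+    # chained as __cause__, which is exactly what the loop below asserts.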
clf = neighbors.LocalOutlierFactor() + clf.fit(X) + # predict, decision_function and score_samples raise ValueError + for method in ["predict", "decision_function", "score_samples"]: + outer_msg = f"'LocalOutlierFactor' has no attribute '{method}'" + inner_msg = "{} is not available when novelty=False".format(method) + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + getattr(clf, method) + + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + # check errors for novelty=True + clf = neighbors.LocalOutlierFactor(novelty=True) + + outer_msg = "'LocalOutlierFactor' has no attribute 'fit_predict'" + inner_msg = "fit_predict is not available when novelty=True" + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + getattr(clf, "fit_predict") + + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + +def test_novelty_training_scores(global_dtype): + # check that the scores of the training samples are still accessible + # when novelty=True through the negative_outlier_factor_ attribute + X = iris.data.astype(global_dtype) + + # fit with novelty=False + clf_1 = neighbors.LocalOutlierFactor() + clf_1.fit(X) + scores_1 = clf_1.negative_outlier_factor_ + + # fit with novelty=True + clf_2 = neighbors.LocalOutlierFactor(novelty=True) + clf_2.fit(X) + scores_2 = clf_2.negative_outlier_factor_ + + assert_allclose(scores_1, scores_2) + + +def test_hasattr_prediction(): + # check availability of prediction methods depending on novelty value. + X = [[1, 1], [1, 2], [2, 1]] + + # when novelty=True + clf = neighbors.LocalOutlierFactor(novelty=True) + clf.fit(X) + assert hasattr(clf, "predict") + assert hasattr(clf, "decision_function") + assert hasattr(clf, "score_samples") + assert not hasattr(clf, "fit_predict") + + # when novelty=False + clf = neighbors.LocalOutlierFactor(novelty=False) + clf.fit(X) + assert hasattr(clf, "fit_predict") + assert not hasattr(clf, "predict") + assert not hasattr(clf, "decision_function") + assert not hasattr(clf, "score_samples") + + +@parametrize_with_checks([neighbors.LocalOutlierFactor(novelty=True)]) +def test_novelty_true_common_tests(estimator, check): + # the common tests are run for the default LOF (novelty=False). + # here we run these common tests for LOF when novelty=True + check(estimator) + + +@pytest.mark.parametrize("expected_outliers", [30, 53]) +def test_predicted_outlier_number(expected_outliers): + # the number of predicted outliers should be equal to the number of + # expected outliers unless there are ties in the abnormality scores. 
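+    # Worked example: expected_outliers=30 on the 150-sample iris data yields
+    # contamination = 30 / 150 = 0.2, so fit_predict should flag 30 points
+    # unless tied scores force a different cutoff.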
+ X = iris.data + n_samples = X.shape[0] + contamination = float(expected_outliers) / n_samples + + clf = neighbors.LocalOutlierFactor(contamination=contamination) + y_pred = clf.fit_predict(X) + + num_outliers = np.sum(y_pred != 1) + if num_outliers != expected_outliers: + y_dec = clf.negative_outlier_factor_ + check_outlier_corruption(num_outliers, expected_outliers, y_dec) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse(csr_container): + # LocalOutlierFactor must support CSR inputs + # TODO: compare results on dense and sparse data as proposed in: + # https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186 + X = csr_container(iris.data) + + lof = neighbors.LocalOutlierFactor(novelty=True) + lof.fit(X) + lof.predict(X) + lof.score_samples(X) + lof.decision_function(X) + + lof = neighbors.LocalOutlierFactor(novelty=False) + lof.fit_predict(X) + + +def test_lof_error_n_neighbors_too_large(): + """Check that we raise a proper error message when n_neighbors == n_samples. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/17207 + """ + X = np.ones((7, 7)) + + msg = ( + "Expected n_neighbors < n_samples_fit, but n_neighbors = 1, " + "n_samples_fit = 1, n_samples = 1" + ) + with pytest.raises(ValueError, match=msg): + lof = neighbors.LocalOutlierFactor(n_neighbors=1).fit(X[:1]) + + lof = neighbors.LocalOutlierFactor(n_neighbors=2).fit(X[:2]) + assert lof.n_samples_fit_ == 2 + + msg = ( + "Expected n_neighbors < n_samples_fit, but n_neighbors = 2, " + "n_samples_fit = 2, n_samples = 2" + ) + with pytest.raises(ValueError, match=msg): + lof.kneighbors(None, n_neighbors=2) + + distances, indices = lof.kneighbors(None, n_neighbors=1) + assert distances.shape == (2, 1) + assert indices.shape == (2, 1) + + msg = ( + "Expected n_neighbors <= n_samples_fit, but n_neighbors = 3, " + "n_samples_fit = 2, n_samples = 7" + ) + with pytest.raises(ValueError, match=msg): + lof.kneighbors(X, n_neighbors=3) + + ( + distances, + indices, + ) = lof.kneighbors(X, n_neighbors=2) + assert distances.shape == (7, 2) + assert indices.shape == (7, 2) + + +@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]) +@pytest.mark.parametrize("novelty", [True, False]) +@pytest.mark.parametrize("contamination", [0.5, "auto"]) +def test_lof_input_dtype_preservation(global_dtype, algorithm, contamination, novelty): + """Check that the fitted attributes are stored using the data type of X.""" + X = iris.data.astype(global_dtype, copy=False) + + iso = neighbors.LocalOutlierFactor( + n_neighbors=5, algorithm=algorithm, contamination=contamination, novelty=novelty + ) + iso.fit(X) + + assert iso.negative_outlier_factor_.dtype == global_dtype + + for method in ("score_samples", "decision_function"): + if hasattr(iso, method): + y_pred = getattr(iso, method)(X) + assert y_pred.dtype == global_dtype + + +@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]) +@pytest.mark.parametrize("novelty", [True, False]) +@pytest.mark.parametrize("contamination", [0.5, "auto"]) +def test_lof_dtype_equivalence(algorithm, novelty, contamination): + """Check the equivalence of the results with 32 and 64 bits input.""" + + inliers = iris.data[:50] # setosa iris are really distinct from others + outliers = iris.data[-5:] # virginica will be considered as outliers + # lower the precision of the input data to check that we have an equivalence when + # making the computation in 32 and 64 bits. 
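+    # Building X in float32 first means X_32 and X_64 start from the same
+    # already-rounded values, so any remaining discrepancy is attributable to
+    # the computation itself rather than to the inputs.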
+ X = np.concatenate([inliers, outliers], axis=0).astype(np.float32) + + lof_32 = neighbors.LocalOutlierFactor( + algorithm=algorithm, novelty=novelty, contamination=contamination + ) + X_32 = X.astype(np.float32, copy=True) + lof_32.fit(X_32) + + lof_64 = neighbors.LocalOutlierFactor( + algorithm=algorithm, novelty=novelty, contamination=contamination + ) + X_64 = X.astype(np.float64, copy=True) + lof_64.fit(X_64) + + assert_allclose(lof_32.negative_outlier_factor_, lof_64.negative_outlier_factor_) + + for method in ("score_samples", "decision_function", "predict", "fit_predict"): + if hasattr(lof_32, method): + y_pred_32 = getattr(lof_32, method)(X_32) + y_pred_64 = getattr(lof_64, method)(X_64) + assert_allclose(y_pred_32, y_pred_64, atol=0.0002) + + +def test_lof_duplicate_samples(): + """ + Check that LocalOutlierFactor raises a warning when duplicate values + in the training data cause inaccurate results. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27839 + """ + + rng = np.random.default_rng(0) + + x = rng.permutation( + np.hstack( + [ + [0.1] * 1000, # constant values + np.linspace(0.1, 0.3, num=3000), + rng.random(500) * 100, # the clear outliers + ] + ) + ) + X = x.reshape(-1, 1) + + error_msg = ( + "Duplicate values are leading to incorrect results. " + "Increase the number of neighbors for more accurate results." + ) + + lof = neighbors.LocalOutlierFactor(n_neighbors=5, contamination=0.1) + + # Catch the warning + with pytest.warns(UserWarning, match=re.escape(error_msg)): + lof.fit_predict(X) diff --git a/.venv/Lib/site-packages/sklearn/neural_network/__init__.py b/.venv/Lib/site-packages/sklearn/neural_network/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..adbcdbc687e4d059a7ee0094803cf765c2334881 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neural_network/__init__.py @@ -0,0 +1,9 @@ +"""Models based on neural networks.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._multilayer_perceptron import MLPClassifier, MLPRegressor +from ._rbm import BernoulliRBM + +__all__ = ["BernoulliRBM", "MLPClassifier", "MLPRegressor"] diff --git a/.venv/Lib/site-packages/sklearn/neural_network/_stochastic_optimizers.py b/.venv/Lib/site-packages/sklearn/neural_network/_stochastic_optimizers.py new file mode 100644 index 0000000000000000000000000000000000000000..fe1e72860681ad8f248d3781d3d68b546f8ceaa4 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neural_network/_stochastic_optimizers.py @@ -0,0 +1,287 @@ +"""Stochastic optimization methods for MLP""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np + + +class BaseOptimizer: + """Base (Stochastic) gradient descent optimizer + + Parameters + ---------- + learning_rate_init : float, default=0.1 + The initial learning rate used. It controls the step-size in updating + the weights + + Attributes + ---------- + learning_rate : float + the current learning rate + """ + + def __init__(self, learning_rate_init=0.1): + self.learning_rate_init = learning_rate_init + self.learning_rate = float(learning_rate_init) + + def update_params(self, params, grads): + """Update parameters with given gradients + + Parameters + ---------- + params : list of length = len(coefs_) + len(intercepts_) + The concatenated list containing coefs_ and intercepts_ in MLP + model. 
Used for initializing velocities and updating params + + grads : list of length = len(params) + Containing gradients with respect to coefs_ and intercepts_ in MLP + model. So length should be aligned with params + """ + updates = self._get_updates(grads) + for param, update in zip((p for p in params), updates): + param += update + + def iteration_ends(self, time_step): + """Perform update to learning rate and potentially other states at the + end of an iteration + """ + pass + + def trigger_stopping(self, msg, verbose): + """Decides whether it is time to stop training + + Parameters + ---------- + msg : str + Message passed in for verbose output + + verbose : bool + Print message to stdin if True + + Returns + ------- + is_stopping : bool + True if training needs to stop + """ + if verbose: + print(msg + " Stopping.") + return True + + +class SGDOptimizer(BaseOptimizer): + """Stochastic gradient descent optimizer with momentum + + Parameters + ---------- + params : list, length = len(coefs_) + len(intercepts_) + The concatenated list containing coefs_ and intercepts_ in MLP model. + Used for initializing velocities and updating params + + learning_rate_init : float, default=0.1 + The initial learning rate used. It controls the step-size in updating + the weights + + lr_schedule : {'constant', 'adaptive', 'invscaling'}, default='constant' + Learning rate schedule for weight updates. + + -'constant', is a constant learning rate given by + 'learning_rate_init'. + + -'invscaling' gradually decreases the learning rate 'learning_rate_' at + each time step 't' using an inverse scaling exponent of 'power_t'. + learning_rate_ = learning_rate_init / pow(t, power_t) + + -'adaptive', keeps the learning rate constant to + 'learning_rate_init' as long as the training keeps decreasing. + Each time 2 consecutive epochs fail to decrease the training loss by + tol, or fail to increase validation score by tol if 'early_stopping' + is on, the current learning rate is divided by 5. + + momentum : float, default=0.9 + Value of momentum used, must be larger than or equal to 0 + + nesterov : bool, default=True + Whether to use nesterov's momentum or not. Use nesterov's if True + + power_t : float, default=0.5 + Power of time step 't' in inverse scaling. See `lr_schedule` for + more details. + + Attributes + ---------- + learning_rate : float + the current learning rate + + velocities : list, length = len(params) + velocities that are used to update params + """ + + def __init__( + self, + params, + learning_rate_init=0.1, + lr_schedule="constant", + momentum=0.9, + nesterov=True, + power_t=0.5, + ): + super().__init__(learning_rate_init) + + self.lr_schedule = lr_schedule + self.momentum = momentum + self.nesterov = nesterov + self.power_t = power_t + self.velocities = [np.zeros_like(param) for param in params] + + def iteration_ends(self, time_step): + """Perform updates to learning rate and potential other states at the + end of an iteration + + Parameters + ---------- + time_step : int + number of training samples trained on so far, used to update + learning rate for 'invscaling' + """ + if self.lr_schedule == "invscaling": + self.learning_rate = ( + float(self.learning_rate_init) / (time_step + 1) ** self.power_t + ) + + def trigger_stopping(self, msg, verbose): + if self.lr_schedule != "adaptive": + if verbose: + print(msg + " Stopping.") + return True + + if self.learning_rate <= 1e-6: + if verbose: + print(msg + " Learning rate too small. 
Stopping.") + return True + + self.learning_rate /= 5.0 + if verbose: + print(msg + " Setting learning rate to %f" % self.learning_rate) + return False + + def _get_updates(self, grads): + """Get the values used to update params with given gradients + + Parameters + ---------- + grads : list, length = len(coefs_) + len(intercepts_) + Containing gradients with respect to coefs_ and intercepts_ in MLP + model. So length should be aligned with params + + Returns + ------- + updates : list, length = len(grads) + The values to add to params + """ + updates = [ + self.momentum * velocity - self.learning_rate * grad + for velocity, grad in zip(self.velocities, grads) + ] + self.velocities = updates + + if self.nesterov: + updates = [ + self.momentum * velocity - self.learning_rate * grad + for velocity, grad in zip(self.velocities, grads) + ] + + return updates + + +class AdamOptimizer(BaseOptimizer): + """Stochastic gradient descent optimizer with Adam + + Note: All default values are from the original Adam paper + + Parameters + ---------- + params : list, length = len(coefs_) + len(intercepts_) + The concatenated list containing coefs_ and intercepts_ in MLP model. + Used for initializing velocities and updating params + + learning_rate_init : float, default=0.001 + The initial learning rate used. It controls the step-size in updating + the weights + + beta_1 : float, default=0.9 + Exponential decay rate for estimates of first moment vector, should be + in [0, 1) + + beta_2 : float, default=0.999 + Exponential decay rate for estimates of second moment vector, should be + in [0, 1) + + epsilon : float, default=1e-8 + Value for numerical stability + + Attributes + ---------- + learning_rate : float + The current learning rate + + t : int + Timestep + + ms : list, length = len(params) + First moment vectors + + vs : list, length = len(params) + Second moment vectors + + References + ---------- + :arxiv:`Kingma, Diederik, and Jimmy Ba (2014) "Adam: A method for + stochastic optimization." <1412.6980> + """ + + def __init__( + self, params, learning_rate_init=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8 + ): + super().__init__(learning_rate_init) + + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.t = 0 + self.ms = [np.zeros_like(param) for param in params] + self.vs = [np.zeros_like(param) for param in params] + + def _get_updates(self, grads): + """Get the values used to update params with given gradients + + Parameters + ---------- + grads : list, length = len(coefs_) + len(intercepts_) + Containing gradients with respect to coefs_ and intercepts_ in MLP + model. 
So length should be aligned with params + + Returns + ------- + updates : list, length = len(grads) + The values to add to params + """ + self.t += 1 + self.ms = [ + self.beta_1 * m + (1 - self.beta_1) * grad + for m, grad in zip(self.ms, grads) + ] + self.vs = [ + self.beta_2 * v + (1 - self.beta_2) * (grad**2) + for v, grad in zip(self.vs, grads) + ] + self.learning_rate = ( + self.learning_rate_init + * np.sqrt(1 - self.beta_2**self.t) + / (1 - self.beta_1**self.t) + ) + updates = [ + -self.learning_rate * m / (np.sqrt(v) + self.epsilon) + for m, v in zip(self.ms, self.vs) + ] + return updates diff --git a/.venv/Lib/site-packages/sklearn/neural_network/tests/__init__.py b/.venv/Lib/site-packages/sklearn/neural_network/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/Lib/site-packages/sklearn/neural_network/tests/test_base.py b/.venv/Lib/site-packages/sklearn/neural_network/tests/test_base.py new file mode 100644 index 0000000000000000000000000000000000000000..5f40efd5af5af1ee689305babb7d931428e8c595 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neural_network/tests/test_base.py @@ -0,0 +1,29 @@ +import numpy as np +import pytest + +from sklearn.neural_network._base import binary_log_loss, log_loss + + +def test_binary_log_loss_1_prob_finite(): + # y_proba is equal to one should result in a finite logloss + y_true = np.array([[0, 0, 1]]).T + y_prob = np.array([[0.9, 1.0, 1.0]]).T + + loss = binary_log_loss(y_true, y_prob) + assert np.isfinite(loss) + + +@pytest.mark.parametrize( + "y_true, y_prob", + [ + ( + np.array([[1, 0, 0], [0, 1, 0]]), + np.array([[0.0, 1.0, 0.0], [0.9, 0.05, 0.05]]), + ), + (np.array([[0, 0, 1]]).T, np.array([[0.9, 1.0, 1.0]]).T), + ], +) +def test_log_loss_1_prob_finite(y_true, y_prob): + # y_proba is equal to 1 should result in a finite logloss + loss = log_loss(y_true, y_prob) + assert np.isfinite(loss) diff --git a/.venv/Lib/site-packages/sklearn/neural_network/tests/test_mlp.py b/.venv/Lib/site-packages/sklearn/neural_network/tests/test_mlp.py new file mode 100644 index 0000000000000000000000000000000000000000..3f94b25defde4747f720f2facb0d09b2059ecb26 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neural_network/tests/test_mlp.py @@ -0,0 +1,1021 @@ +""" +Testing for Multi-layer Perceptron module (sklearn.neural_network) +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re +import sys +import warnings +from io import StringIO + +import joblib +import numpy as np +import pytest + +from sklearn.datasets import ( + load_digits, + load_iris, + make_multilabel_classification, + make_regression, +) +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics import roc_auc_score +from sklearn.neural_network import MLPClassifier, MLPRegressor +from sklearn.preprocessing import LabelBinarizer, MinMaxScaler, scale +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.fixes import CSR_CONTAINERS + +ACTIVATION_TYPES = ["identity", "logistic", "tanh", "relu"] + +X_digits, y_digits = load_digits(n_class=3, return_X_y=True) + +X_digits_multi = MinMaxScaler().fit_transform(X_digits[:200]) +y_digits_multi = y_digits[:200] + +X_digits, y_digits = load_digits(n_class=2, return_X_y=True) + +X_digits_binary = MinMaxScaler().fit_transform(X_digits[:200]) +y_digits_binary = y_digits[:200] + +classification_datasets = [ + 
(X_digits_multi, y_digits_multi), + (X_digits_binary, y_digits_binary), +] + +X_reg, y_reg = make_regression( + n_samples=200, n_features=10, bias=20.0, noise=100.0, random_state=7 +) +y_reg = scale(y_reg) +regression_datasets = [(X_reg, y_reg)] + +iris = load_iris() + +X_iris = iris.data +y_iris = iris.target + + +def test_alpha(): + # Test that larger alpha yields weights closer to zero + X = X_digits_binary[:100] + y = y_digits_binary[:100] + + alpha_vectors = [] + alpha_values = np.arange(2) + absolute_sum = lambda x: np.sum(np.abs(x)) + + for alpha in alpha_values: + mlp = MLPClassifier(hidden_layer_sizes=10, alpha=alpha, random_state=1) + with ignore_warnings(category=ConvergenceWarning): + mlp.fit(X, y) + alpha_vectors.append( + np.array([absolute_sum(mlp.coefs_[0]), absolute_sum(mlp.coefs_[1])]) + ) + + for i in range(len(alpha_values) - 1): + assert (alpha_vectors[i] > alpha_vectors[i + 1]).all() + + +def test_fit(): + # Test that the algorithm solution is equal to a worked out example. + X = np.array([[0.6, 0.8, 0.7]]) + y = np.array([0]) + mlp = MLPClassifier( + solver="sgd", + learning_rate_init=0.1, + alpha=0.1, + activation="logistic", + random_state=1, + max_iter=1, + hidden_layer_sizes=2, + momentum=0, + ) + # set weights + mlp.coefs_ = [0] * 2 + mlp.intercepts_ = [0] * 2 + mlp.n_outputs_ = 1 + mlp.coefs_[0] = np.array([[0.1, 0.2], [0.3, 0.1], [0.5, 0]]) + mlp.coefs_[1] = np.array([[0.1], [0.2]]) + mlp.intercepts_[0] = np.array([0.1, 0.1]) + mlp.intercepts_[1] = np.array([1.0]) + mlp._coef_grads = [] * 2 + mlp._intercept_grads = [] * 2 + mlp.n_features_in_ = 3 + + # Initialize parameters + mlp.n_iter_ = 0 + mlp.learning_rate_ = 0.1 + + # Compute the number of layers + mlp.n_layers_ = 3 + + # Pre-allocate gradient matrices + mlp._coef_grads = [0] * (mlp.n_layers_ - 1) + mlp._intercept_grads = [0] * (mlp.n_layers_ - 1) + + mlp.out_activation_ = "logistic" + mlp.t_ = 0 + mlp.best_loss_ = np.inf + mlp.loss_curve_ = [] + mlp._no_improvement_count = 0 + mlp._intercept_velocity = [ + np.zeros_like(intercepts) for intercepts in mlp.intercepts_ + ] + mlp._coef_velocity = [np.zeros_like(coefs) for coefs in mlp.coefs_] + + mlp.partial_fit(X, y, classes=[0, 1]) + # Manually worked out example + # h1 = g(X1 * W_i1 + b11) = g(0.6 * 0.1 + 0.8 * 0.3 + 0.7 * 0.5 + 0.1) + # = 0.679178699175393 + # h2 = g(X2 * W_i2 + b12) = g(0.6 * 0.2 + 0.8 * 0.1 + 0.7 * 0 + 0.1) + # = 0.574442516811659 + # o1 = g(h * W2 + b21) = g(0.679 * 0.1 + 0.574 * 0.2 + 1) + # = 0.7654329236196236 + # d21 = -(0 - 0.765) = 0.765 + # d11 = (1 - 0.679) * 0.679 * 0.765 * 0.1 = 0.01667 + # d12 = (1 - 0.574) * 0.574 * 0.765 * 0.2 = 0.0374 + # W1grad11 = X1 * d11 + alpha * W11 = 0.6 * 0.01667 + 0.1 * 0.1 = 0.0200 + # W1grad11 = X1 * d12 + alpha * W12 = 0.6 * 0.0374 + 0.1 * 0.2 = 0.04244 + # W1grad21 = X2 * d11 + alpha * W13 = 0.8 * 0.01667 + 0.1 * 0.3 = 0.043336 + # W1grad22 = X2 * d12 + alpha * W14 = 0.8 * 0.0374 + 0.1 * 0.1 = 0.03992 + # W1grad31 = X3 * d11 + alpha * W15 = 0.6 * 0.01667 + 0.1 * 0.5 = 0.060002 + # W1grad32 = X3 * d12 + alpha * W16 = 0.6 * 0.0374 + 0.1 * 0 = 0.02244 + # W2grad1 = h1 * d21 + alpha * W21 = 0.679 * 0.765 + 0.1 * 0.1 = 0.5294 + # W2grad2 = h2 * d21 + alpha * W22 = 0.574 * 0.765 + 0.1 * 0.2 = 0.45911 + # b1grad1 = d11 = 0.01667 + # b1grad2 = d12 = 0.0374 + # b2grad = d21 = 0.765 + # W1 = W1 - eta * [W1grad11, .., W1grad32] = [[0.1, 0.2], [0.3, 0.1], + # [0.5, 0]] - 0.1 * [[0.0200, 0.04244], [0.043336, 0.03992], + # [0.060002, 0.02244]] = [[0.098, 0.195756], [0.2956664, + # 0.096008], [0.4939998, 
-0.002244]] + # W2 = W2 - eta * [W2grad1, W2grad2] = [[0.1], [0.2]] - 0.1 * + # [[0.5294], [0.45911]] = [[0.04706], [0.154089]] + # b1 = b1 - eta * [b1grad1, b1grad2] = 0.1 - 0.1 * [0.01667, 0.0374] + # = [0.098333, 0.09626] + # b2 = b2 - eta * b2grad = 1.0 - 0.1 * 0.765 = 0.9235 + assert_almost_equal( + mlp.coefs_[0], + np.array([[0.098, 0.195756], [0.2956664, 0.096008], [0.4939998, -0.002244]]), + decimal=3, + ) + assert_almost_equal(mlp.coefs_[1], np.array([[0.04706], [0.154089]]), decimal=3) + assert_almost_equal(mlp.intercepts_[0], np.array([0.098333, 0.09626]), decimal=3) + assert_almost_equal(mlp.intercepts_[1], np.array(0.9235), decimal=3) + # Testing output + # h1 = g(X1 * W_i1 + b11) = g(0.6 * 0.098 + 0.8 * 0.2956664 + + # 0.7 * 0.4939998 + 0.098333) = 0.677 + # h2 = g(X2 * W_i2 + b12) = g(0.6 * 0.195756 + 0.8 * 0.096008 + + # 0.7 * -0.002244 + 0.09626) = 0.572 + # o1 = h * W2 + b21 = 0.677 * 0.04706 + + # 0.572 * 0.154089 + 0.9235 = 1.043 + # prob = sigmoid(o1) = 0.739 + assert_almost_equal(mlp.predict_proba(X)[0, 1], 0.739, decimal=3) + + +def test_gradient(): + # Test gradient. + + # This makes sure that the activation functions and their derivatives + # are correct. The numerical and analytical computation of the gradient + # should be close. + for n_labels in [2, 3]: + n_samples = 5 + n_features = 10 + random_state = np.random.RandomState(seed=42) + X = random_state.rand(n_samples, n_features) + y = 1 + np.mod(np.arange(n_samples) + 1, n_labels) + Y = LabelBinarizer().fit_transform(y) + + for activation in ACTIVATION_TYPES: + mlp = MLPClassifier( + activation=activation, + hidden_layer_sizes=10, + solver="lbfgs", + alpha=1e-5, + learning_rate_init=0.2, + max_iter=1, + random_state=1, + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp.fit(X, y) + + theta = np.hstack([l.ravel() for l in mlp.coefs_ + mlp.intercepts_]) + + layer_units = [X.shape[1]] + [mlp.hidden_layer_sizes] + [mlp.n_outputs_] + + activations = [] + deltas = [] + coef_grads = [] + intercept_grads = [] + + activations.append(X) + for i in range(mlp.n_layers_ - 1): + activations.append(np.empty((X.shape[0], layer_units[i + 1]))) + deltas.append(np.empty((X.shape[0], layer_units[i + 1]))) + + fan_in = layer_units[i] + fan_out = layer_units[i + 1] + coef_grads.append(np.empty((fan_in, fan_out))) + intercept_grads.append(np.empty(fan_out)) + + # analytically compute the gradients + def loss_grad_fun(t): + return mlp._loss_grad_lbfgs( + t, X, Y, activations, deltas, coef_grads, intercept_grads + ) + + [value, grad] = loss_grad_fun(theta) + numgrad = np.zeros(np.size(theta)) + n = np.size(theta, 0) + E = np.eye(n) + epsilon = 1e-5 + # numerically compute the gradients + for i in range(n): + dtheta = E[:, i] * epsilon + numgrad[i] = ( + loss_grad_fun(theta + dtheta)[0] - loss_grad_fun(theta - dtheta)[0] + ) / (epsilon * 2.0) + assert_almost_equal(numgrad, grad) + + +@pytest.mark.parametrize("X,y", classification_datasets) +def test_lbfgs_classification(X, y): + # Test lbfgs on classification. + # It should achieve a score higher than 0.95 for the binary and multi-class + # versions of the digits dataset. 
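+    # The first 150 samples are used for fitting; the held-out remainder is
+    # only used to check the shape and dtype of the predictions, while the
+    # 0.95 accuracy threshold is evaluated on the training portion.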
+ X_train = X[:150] + y_train = y[:150] + X_test = X[150:] + expected_shape_dtype = (X_test.shape[0], y_train.dtype.kind) + + for activation in ACTIVATION_TYPES: + mlp = MLPClassifier( + solver="lbfgs", + hidden_layer_sizes=50, + max_iter=150, + shuffle=True, + random_state=1, + activation=activation, + ) + mlp.fit(X_train, y_train) + y_predict = mlp.predict(X_test) + assert mlp.score(X_train, y_train) > 0.95 + assert (y_predict.shape[0], y_predict.dtype.kind) == expected_shape_dtype + + +@pytest.mark.parametrize("X,y", regression_datasets) +def test_lbfgs_regression(X, y): + # Test lbfgs on the regression dataset. + for activation in ACTIVATION_TYPES: + mlp = MLPRegressor( + solver="lbfgs", + hidden_layer_sizes=50, + max_iter=200, + tol=1e-3, + shuffle=True, + random_state=1, + activation=activation, + ) + mlp.fit(X, y) + if activation == "identity": + assert mlp.score(X, y) > 0.80 + else: + # Non linear models perform much better than linear bottleneck: + assert mlp.score(X, y) > 0.98 + + +@pytest.mark.parametrize("X,y", classification_datasets) +def test_lbfgs_classification_maxfun(X, y): + # Test lbfgs parameter max_fun. + # It should independently limit the number of iterations for lbfgs. + max_fun = 10 + # classification tests + for activation in ACTIVATION_TYPES: + mlp = MLPClassifier( + solver="lbfgs", + hidden_layer_sizes=50, + max_iter=150, + max_fun=max_fun, + shuffle=True, + random_state=1, + activation=activation, + ) + with pytest.warns(ConvergenceWarning): + mlp.fit(X, y) + assert max_fun >= mlp.n_iter_ + + +@pytest.mark.parametrize("X,y", regression_datasets) +def test_lbfgs_regression_maxfun(X, y): + # Test lbfgs parameter max_fun. + # It should independently limit the number of iterations for lbfgs. + max_fun = 10 + # regression tests + for activation in ACTIVATION_TYPES: + mlp = MLPRegressor( + solver="lbfgs", + hidden_layer_sizes=50, + tol=0.0, + max_iter=150, + max_fun=max_fun, + shuffle=True, + random_state=1, + activation=activation, + ) + with pytest.warns(ConvergenceWarning): + mlp.fit(X, y) + assert max_fun >= mlp.n_iter_ + + +def test_learning_rate_warmstart(): + # Tests that warm_start reuse past solutions. + X = [[3, 2], [1, 6], [5, 6], [-2, -4]] + y = [1, 1, 1, 0] + for learning_rate in ["invscaling", "constant"]: + mlp = MLPClassifier( + solver="sgd", + hidden_layer_sizes=4, + learning_rate=learning_rate, + max_iter=1, + power_t=0.25, + warm_start=True, + ) + with ignore_warnings(category=ConvergenceWarning): + mlp.fit(X, y) + prev_eta = mlp._optimizer.learning_rate + mlp.fit(X, y) + post_eta = mlp._optimizer.learning_rate + + if learning_rate == "constant": + assert prev_eta == post_eta + elif learning_rate == "invscaling": + assert mlp.learning_rate_init / pow(8 + 1, mlp.power_t) == post_eta + + +def test_multilabel_classification(): + # Test that multi-label classification works as expected. 
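+    # In the multilabel setting a sample may carry several labels at once,
+    # so the output layer uses one independent logistic unit per label
+    # rather than a softmax over mutually exclusive classes.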
+ # test fit method + X, y = make_multilabel_classification( + n_samples=50, random_state=0, return_indicator=True + ) + mlp = MLPClassifier( + solver="lbfgs", + hidden_layer_sizes=50, + alpha=1e-5, + max_iter=150, + random_state=0, + activation="logistic", + learning_rate_init=0.2, + ) + mlp.fit(X, y) + assert mlp.score(X, y) > 0.97 + + # test partial fit method + mlp = MLPClassifier( + solver="sgd", + hidden_layer_sizes=50, + max_iter=150, + random_state=0, + activation="logistic", + alpha=1e-5, + learning_rate_init=0.2, + ) + for i in range(100): + mlp.partial_fit(X, y, classes=[0, 1, 2, 3, 4]) + assert mlp.score(X, y) > 0.9 + + # Make sure early stopping still work now that splitting is stratified by + # default (it is disabled for multilabel classification) + mlp = MLPClassifier(early_stopping=True) + mlp.fit(X, y).predict(X) + + +def test_multioutput_regression(): + # Test that multi-output regression works as expected + X, y = make_regression(n_samples=200, n_targets=5, random_state=11) + mlp = MLPRegressor( + solver="lbfgs", hidden_layer_sizes=50, max_iter=200, tol=1e-2, random_state=1 + ) + mlp.fit(X, y) + assert mlp.score(X, y) > 0.9 + + +def test_partial_fit_classes_error(): + # Tests that passing different classes to partial_fit raises an error + X = [[3, 2]] + y = [0] + clf = MLPClassifier(solver="sgd") + clf.partial_fit(X, y, classes=[0, 1]) + with pytest.raises(ValueError): + clf.partial_fit(X, y, classes=[1, 2]) + + +def test_partial_fit_classification(): + # Test partial_fit on classification. + # `partial_fit` should yield the same results as 'fit' for binary and + # multi-class classification. + for X, y in classification_datasets: + mlp = MLPClassifier( + solver="sgd", + max_iter=100, + random_state=1, + tol=0, + alpha=1e-5, + learning_rate_init=0.2, + ) + + with ignore_warnings(category=ConvergenceWarning): + mlp.fit(X, y) + pred1 = mlp.predict(X) + mlp = MLPClassifier( + solver="sgd", random_state=1, alpha=1e-5, learning_rate_init=0.2 + ) + for i in range(100): + mlp.partial_fit(X, y, classes=np.unique(y)) + pred2 = mlp.predict(X) + assert_array_equal(pred1, pred2) + assert mlp.score(X, y) > 0.95 + + +def test_partial_fit_unseen_classes(): + # Non regression test for bug 6994 + # Tests for labeling errors in partial fit + + clf = MLPClassifier(random_state=0) + clf.partial_fit([[1], [2], [3]], ["a", "b", "c"], classes=["a", "b", "c", "d"]) + clf.partial_fit([[4]], ["d"]) + assert clf.score([[1], [2], [3], [4]], ["a", "b", "c", "d"]) > 0 + + +def test_partial_fit_regression(): + # Test partial_fit on regression. + # `partial_fit` should yield the same results as 'fit' for regression. + X = X_reg + y = y_reg + + for momentum in [0, 0.9]: + mlp = MLPRegressor( + solver="sgd", + max_iter=100, + activation="relu", + random_state=1, + learning_rate_init=0.01, + batch_size=X.shape[0], + momentum=momentum, + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp.fit(X, y) + pred1 = mlp.predict(X) + mlp = MLPRegressor( + solver="sgd", + activation="relu", + learning_rate_init=0.01, + random_state=1, + batch_size=X.shape[0], + momentum=momentum, + ) + for i in range(100): + mlp.partial_fit(X, y) + + pred2 = mlp.predict(X) + assert_allclose(pred1, pred2) + score = mlp.score(X, y) + assert score > 0.65 + + +def test_partial_fit_errors(): + # Test partial_fit error handling. 
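+    # Two failure modes are covered: passing a `classes` argument that does
+    # not contain the labels actually present in y, and requesting
+    # partial_fit from the batch-only lbfgs solver, which does not expose
+    # the method at all.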
+ X = [[3, 2], [1, 6]] + y = [1, 0] + + # no classes passed + with pytest.raises(ValueError): + MLPClassifier(solver="sgd").partial_fit(X, y, classes=[2]) + + # lbfgs doesn't support partial_fit + assert not hasattr(MLPClassifier(solver="lbfgs"), "partial_fit") + + +def test_nonfinite_params(): + # Check that MLPRegressor throws ValueError when dealing with non-finite + # parameter values + rng = np.random.RandomState(0) + n_samples = 10 + fmax = np.finfo(np.float64).max + X = fmax * rng.uniform(size=(n_samples, 2)) + y = rng.standard_normal(size=n_samples) + + clf = MLPRegressor() + msg = ( + "Solver produced non-finite parameter weights. The input data may contain large" + " values and need to be preprocessed." + ) + with pytest.raises(ValueError, match=msg): + with warnings.catch_warnings(): + # RuntimeWarning: overflow encountered in square + warnings.simplefilter("ignore") + clf.fit(X, y) + + +def test_predict_proba_binary(): + # Test that predict_proba works as expected for binary class. + X = X_digits_binary[:50] + y = y_digits_binary[:50] + + clf = MLPClassifier(hidden_layer_sizes=5, activation="logistic", random_state=1) + with ignore_warnings(category=ConvergenceWarning): + clf.fit(X, y) + y_proba = clf.predict_proba(X) + y_log_proba = clf.predict_log_proba(X) + + (n_samples, n_classes) = y.shape[0], 2 + + proba_max = y_proba.argmax(axis=1) + proba_log_max = y_log_proba.argmax(axis=1) + + assert y_proba.shape == (n_samples, n_classes) + assert_array_equal(proba_max, proba_log_max) + assert_allclose(y_log_proba, np.log(y_proba)) + + assert roc_auc_score(y, y_proba[:, 1]) == 1.0 + + +def test_predict_proba_multiclass(): + # Test that predict_proba works as expected for multi class. + X = X_digits_multi[:10] + y = y_digits_multi[:10] + + clf = MLPClassifier(hidden_layer_sizes=5) + with ignore_warnings(category=ConvergenceWarning): + clf.fit(X, y) + y_proba = clf.predict_proba(X) + y_log_proba = clf.predict_log_proba(X) + + (n_samples, n_classes) = y.shape[0], np.unique(y).size + + proba_max = y_proba.argmax(axis=1) + proba_log_max = y_log_proba.argmax(axis=1) + + assert y_proba.shape == (n_samples, n_classes) + assert_array_equal(proba_max, proba_log_max) + assert_allclose(y_log_proba, np.log(y_proba)) + + +def test_predict_proba_multilabel(): + # Test that predict_proba works as expected for multilabel. 
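+    # As a concrete (hypothetical) illustration, [0.9, 0.8, 0.1] is a valid
+    # multilabel probability row: each label is an independent Bernoulli
+    # output, so rows need not sum to 1.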
+ # Multilabel should not use softmax which makes probabilities sum to 1 + X, Y = make_multilabel_classification( + n_samples=50, random_state=0, return_indicator=True + ) + n_samples, n_classes = Y.shape + + clf = MLPClassifier(solver="lbfgs", hidden_layer_sizes=30, random_state=0) + clf.fit(X, Y) + y_proba = clf.predict_proba(X) + + assert y_proba.shape == (n_samples, n_classes) + assert_array_equal(y_proba > 0.5, Y) + + y_log_proba = clf.predict_log_proba(X) + proba_max = y_proba.argmax(axis=1) + proba_log_max = y_log_proba.argmax(axis=1) + + assert (y_proba.sum(1) - 1).dot(y_proba.sum(1) - 1) > 1e-10 + assert_array_equal(proba_max, proba_log_max) + assert_allclose(y_log_proba, np.log(y_proba)) + + +def test_shuffle(): + # Test that the shuffle parameter affects the training process (it should) + X, y = make_regression(n_samples=50, n_features=5, n_targets=1, random_state=0) + + # The coefficients will be identical if both do or do not shuffle + for shuffle in [True, False]: + mlp1 = MLPRegressor( + hidden_layer_sizes=1, + max_iter=1, + batch_size=1, + random_state=0, + shuffle=shuffle, + ) + mlp2 = MLPRegressor( + hidden_layer_sizes=1, + max_iter=1, + batch_size=1, + random_state=0, + shuffle=shuffle, + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp1.fit(X, y) + mlp2.fit(X, y) + + assert np.array_equal(mlp1.coefs_[0], mlp2.coefs_[0]) + + # The coefficients will be slightly different if shuffle=True + mlp1 = MLPRegressor( + hidden_layer_sizes=1, max_iter=1, batch_size=1, random_state=0, shuffle=True + ) + mlp2 = MLPRegressor( + hidden_layer_sizes=1, max_iter=1, batch_size=1, random_state=0, shuffle=False + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp1.fit(X, y) + mlp2.fit(X, y) + + assert not np.array_equal(mlp1.coefs_[0], mlp2.coefs_[0]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_matrices(csr_container): + # Test that sparse and dense input matrices output the same results. + X = X_digits_binary[:50] + y = y_digits_binary[:50] + X_sparse = csr_container(X) + mlp = MLPClassifier(solver="lbfgs", hidden_layer_sizes=15, random_state=1) + mlp.fit(X, y) + pred1 = mlp.predict(X) + mlp.fit(X_sparse, y) + pred2 = mlp.predict(X_sparse) + assert_almost_equal(pred1, pred2) + pred1 = mlp.predict(X) + pred2 = mlp.predict(X_sparse) + assert_array_equal(pred1, pred2) + + +def test_tolerance(): + # Test tolerance. + # It should force the solver to exit the loop when it converges. + X = [[3, 2], [1, 6]] + y = [1, 0] + clf = MLPClassifier(tol=0.5, max_iter=3000, solver="sgd") + clf.fit(X, y) + assert clf.max_iter > clf.n_iter_ + + +def test_verbose_sgd(): + # Test verbose. 
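+    # stdout is temporarily swapped for a StringIO buffer so the test can
+    # assert that verbose fitting printed per-iteration progress lines.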
+ X = [[3, 2], [1, 6]] + y = [1, 0] + clf = MLPClassifier(solver="sgd", max_iter=2, verbose=10, hidden_layer_sizes=2) + old_stdout = sys.stdout + sys.stdout = output = StringIO() + + with ignore_warnings(category=ConvergenceWarning): + clf.fit(X, y) + clf.partial_fit(X, y) + + sys.stdout = old_stdout + assert "Iteration" in output.getvalue() + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +def test_early_stopping(MLPEstimator): + X = X_digits_binary[:100] + y = y_digits_binary[:100] + tol = 0.2 + mlp_estimator = MLPEstimator( + tol=tol, max_iter=3000, solver="sgd", early_stopping=True + ) + mlp_estimator.fit(X, y) + assert mlp_estimator.max_iter > mlp_estimator.n_iter_ + + assert mlp_estimator.best_loss_ is None + assert isinstance(mlp_estimator.validation_scores_, list) + + valid_scores = mlp_estimator.validation_scores_ + best_valid_score = mlp_estimator.best_validation_score_ + assert max(valid_scores) == best_valid_score + assert best_valid_score + tol > valid_scores[-2] + assert best_valid_score + tol > valid_scores[-1] + + # check that the attributes `validation_scores_` and `best_validation_score_` + # are set to None when `early_stopping=False` + mlp_estimator = MLPEstimator( + tol=tol, max_iter=3000, solver="sgd", early_stopping=False + ) + mlp_estimator.fit(X, y) + assert mlp_estimator.validation_scores_ is None + assert mlp_estimator.best_validation_score_ is None + assert mlp_estimator.best_loss_ is not None + + +def test_adaptive_learning_rate(): + X = [[3, 2], [1, 6]] + y = [1, 0] + clf = MLPClassifier(tol=0.5, max_iter=3000, solver="sgd", learning_rate="adaptive") + clf.fit(X, y) + assert clf.max_iter > clf.n_iter_ + assert 1e-6 > clf._optimizer.learning_rate + + +def test_warm_start(): + X = X_iris + y = y_iris + + y_2classes = np.array([0] * 75 + [1] * 75) + y_3classes = np.array([0] * 40 + [1] * 40 + [2] * 70) + y_3classes_alt = np.array([0] * 50 + [1] * 50 + [3] * 50) + y_4classes = np.array([0] * 37 + [1] * 37 + [2] * 38 + [3] * 38) + y_5classes = np.array([0] * 30 + [1] * 30 + [2] * 30 + [3] * 30 + [4] * 30) + + # No error raised + clf = MLPClassifier( + hidden_layer_sizes=2, solver="lbfgs", warm_start=True, random_state=42, tol=1e-2 + ).fit(X, y) + clf.fit(X, y) + clf.fit(X, y_3classes) + + for y_i in (y_2classes, y_3classes_alt, y_4classes, y_5classes): + clf = MLPClassifier( + hidden_layer_sizes=2, + solver="lbfgs", + warm_start=True, + random_state=42, + tol=1e-2, + ).fit(X, y) + message = ( + "warm_start can only be used where `y` has the same " + "classes as in the previous call to fit." + " Previously got [0 1 2], `y` has %s" % np.unique(y_i) + ) + with pytest.raises(ValueError, match=re.escape(message)): + clf.fit(X, y_i) + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +def test_warm_start_full_iteration(MLPEstimator): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/16812 + # Check that the MLP estimator accomplish `max_iter` with a + # warm started estimator. 
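+    # The regression guarded against here caused a warm-started estimator to
+    # resume from a stale iteration budget and run fewer than max_iter
+    # epochs on the second fit; both fits below must report
+    # n_iter_ == max_iter.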
+ X, y = X_iris, y_iris + max_iter = 3 + clf = MLPEstimator( + hidden_layer_sizes=2, solver="sgd", warm_start=True, max_iter=max_iter + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + clf.fit(X, y) + assert max_iter == clf.n_iter_ + clf.fit(X, y) + assert max_iter == clf.n_iter_ + + +def test_n_iter_no_change(): + # test n_iter_no_change using binary data set + # the classifying fitting process is not prone to loss curve fluctuations + X = X_digits_binary[:100] + y = y_digits_binary[:100] + tol = 0.01 + max_iter = 3000 + + # test multiple n_iter_no_change + for n_iter_no_change in [2, 5, 10, 50, 100]: + clf = MLPClassifier( + tol=tol, max_iter=max_iter, solver="sgd", n_iter_no_change=n_iter_no_change + ) + clf.fit(X, y) + + # validate n_iter_no_change + assert clf._no_improvement_count == n_iter_no_change + 1 + assert max_iter > clf.n_iter_ + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_n_iter_no_change_inf(): + # test n_iter_no_change using binary data set + # the fitting process should go to max_iter iterations + X = X_digits_binary[:100] + y = y_digits_binary[:100] + + # set a ridiculous tolerance + # this should always trigger _update_no_improvement_count() + tol = 1e9 + + # fit + n_iter_no_change = np.inf + max_iter = 3000 + clf = MLPClassifier( + tol=tol, max_iter=max_iter, solver="sgd", n_iter_no_change=n_iter_no_change + ) + clf.fit(X, y) + + # validate n_iter_no_change doesn't cause early stopping + assert clf.n_iter_ == max_iter + + # validate _update_no_improvement_count() was always triggered + assert clf._no_improvement_count == clf.n_iter_ - 1 + + +def test_early_stopping_stratified(): + # Make sure data splitting for early stopping is stratified + X = [[1, 2], [2, 3], [3, 4], [4, 5]] + y = [0, 0, 0, 1] + + mlp = MLPClassifier(early_stopping=True) + with pytest.raises( + ValueError, match="The least populated class in y has only 1 member" + ): + mlp.fit(X, y) + + +def test_mlp_classifier_dtypes_casting(): + # Compare predictions for different dtypes + mlp_64 = MLPClassifier( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=100, tol=1e-1 + ) + mlp_64.fit(X_digits[:300], y_digits[:300]) + pred_64 = mlp_64.predict(X_digits[300:]) + proba_64 = mlp_64.predict_proba(X_digits[300:]) + + mlp_32 = MLPClassifier( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=100, tol=1e-1 + ) + mlp_32.fit(X_digits[:300].astype(np.float32), y_digits[:300]) + pred_32 = mlp_32.predict(X_digits[300:].astype(np.float32)) + proba_32 = mlp_32.predict_proba(X_digits[300:].astype(np.float32)) + + assert_array_equal(pred_64, pred_32) + assert_allclose(proba_64, proba_32, rtol=1e-02) + + +def test_mlp_regressor_dtypes_casting(): + mlp_64 = MLPRegressor( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=150, tol=1e-3 + ) + mlp_64.fit(X_digits[:300], y_digits[:300]) + pred_64 = mlp_64.predict(X_digits[300:]) + + mlp_32 = MLPRegressor( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=150, tol=1e-3 + ) + mlp_32.fit(X_digits[:300].astype(np.float32), y_digits[:300]) + pred_32 = mlp_32.predict(X_digits[300:].astype(np.float32)) + + assert_allclose(pred_64, pred_32, rtol=5e-04) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("Estimator", [MLPClassifier, MLPRegressor]) +def test_mlp_param_dtypes(dtype, Estimator): + # Checks if input dtype is used for network parameters + # and predictions + X, y = X_digits.astype(dtype), 
y_digits + mlp = Estimator( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=50, tol=1e-1 + ) + mlp.fit(X[:300], y[:300]) + pred = mlp.predict(X[300:]) + + assert all([intercept.dtype == dtype for intercept in mlp.intercepts_]) + + assert all([coef.dtype == dtype for coef in mlp.coefs_]) + + if Estimator == MLPRegressor: + assert pred.dtype == dtype + + +def test_mlp_loading_from_joblib_partial_fit(tmp_path): + """Loading from MLP and partial fitting updates weights. Non-regression + test for #19626.""" + pre_trained_estimator = MLPRegressor( + hidden_layer_sizes=(42,), random_state=42, learning_rate_init=0.01, max_iter=200 + ) + features, target = [[2]], [4] + + # Fit on x=2, y=4 + pre_trained_estimator.fit(features, target) + + # dump and load model + pickled_file = tmp_path / "mlp.pkl" + joblib.dump(pre_trained_estimator, pickled_file) + load_estimator = joblib.load(pickled_file) + + # Train for a more epochs on point x=2, y=1 + fine_tune_features, fine_tune_target = [[2]], [1] + + for _ in range(200): + load_estimator.partial_fit(fine_tune_features, fine_tune_target) + + # finetuned model learned the new target + predicted_value = load_estimator.predict(fine_tune_features) + assert_allclose(predicted_value, fine_tune_target, rtol=1e-4) + + +@pytest.mark.parametrize("Estimator", [MLPClassifier, MLPRegressor]) +def test_preserve_feature_names(Estimator): + """Check that feature names are preserved when early stopping is enabled. + + Feature names are required for consistency checks during scoring. + + Non-regression test for gh-24846 + """ + pd = pytest.importorskip("pandas") + rng = np.random.RandomState(0) + + X = pd.DataFrame(data=rng.randn(10, 2), columns=["colname_a", "colname_b"]) + y = pd.Series(data=np.full(10, 1), name="colname_y") + + model = Estimator(early_stopping=True, validation_fraction=0.2) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + model.fit(X, y) + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +def test_mlp_warm_start_with_early_stopping(MLPEstimator): + """Check that early stopping works with warm start.""" + mlp = MLPEstimator( + max_iter=10, random_state=0, warm_start=True, early_stopping=True + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp.fit(X_iris, y_iris) + n_validation_scores = len(mlp.validation_scores_) + mlp.set_params(max_iter=20) + mlp.fit(X_iris, y_iris) + assert len(mlp.validation_scores_) > n_validation_scores + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +@pytest.mark.parametrize("solver", ["sgd", "adam", "lbfgs"]) +def test_mlp_warm_start_no_convergence(MLPEstimator, solver): + """Check that we stop the number of iteration at `max_iter` when warm starting. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/24764 + """ + model = MLPEstimator( + solver=solver, + warm_start=True, + early_stopping=False, + max_iter=10, + n_iter_no_change=np.inf, + random_state=0, + ) + + with pytest.warns(ConvergenceWarning): + model.fit(X_iris, y_iris) + assert model.n_iter_ == 10 + + model.set_params(max_iter=20) + with pytest.warns(ConvergenceWarning): + model.fit(X_iris, y_iris) + assert model.n_iter_ == 20 + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +def test_mlp_partial_fit_after_fit(MLPEstimator): + """Check partial fit does not fail after fit when early_stopping=True. + + Non-regression test for gh-25693. 
+ """ + mlp = MLPEstimator(early_stopping=True, random_state=0).fit(X_iris, y_iris) + + msg = "partial_fit does not support early_stopping=True" + with pytest.raises(ValueError, match=msg): + mlp.partial_fit(X_iris, y_iris) + + +def test_mlp_diverging_loss(): + """Test that a diverging model does not raise errors when early stopping is enabled. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/29504 + """ + mlp = MLPRegressor( + hidden_layer_sizes=100, + activation="identity", + solver="sgd", + alpha=0.0001, + learning_rate="constant", + learning_rate_init=1, + shuffle=True, + max_iter=20, + early_stopping=True, + n_iter_no_change=10, + random_state=0, + ) + + with warnings.catch_warnings(): + # RuntimeWarning: overflow encountered in matmul + # ConvergenceWarning: Stochastic Optimizer: Maximum iteration + warnings.simplefilter("ignore", RuntimeWarning) + warnings.simplefilter("ignore", ConvergenceWarning) + mlp.fit(X_iris, y_iris) + + # In python, float("nan") != float("nan") + assert str(mlp.validation_scores_[-1]) == str(np.nan) + assert isinstance(mlp.validation_scores_[-1], float) diff --git a/.venv/Lib/site-packages/sklearn/neural_network/tests/test_rbm.py b/.venv/Lib/site-packages/sklearn/neural_network/tests/test_rbm.py new file mode 100644 index 0000000000000000000000000000000000000000..7dec2d165daa0e557c63113afc29a9c65ddbc258 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neural_network/tests/test_rbm.py @@ -0,0 +1,251 @@ +import re +import sys +from io import StringIO + +import numpy as np +import pytest + +from sklearn.datasets import load_digits +from sklearn.neural_network import BernoulliRBM +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS, LIL_CONTAINERS +from sklearn.utils.validation import assert_all_finite + +Xdigits, _ = load_digits(return_X_y=True) +Xdigits -= Xdigits.min() +Xdigits /= Xdigits.max() + + +def test_fit(): + X = Xdigits.copy() + + rbm = BernoulliRBM( + n_components=64, learning_rate=0.1, batch_size=10, n_iter=7, random_state=9 + ) + rbm.fit(X) + + assert_almost_equal(rbm.score_samples(X).mean(), -21.0, decimal=0) + + # in-place tricks shouldn't have modified X + assert_array_equal(X, Xdigits) + + +def test_partial_fit(): + X = Xdigits.copy() + rbm = BernoulliRBM( + n_components=64, learning_rate=0.1, batch_size=20, random_state=9 + ) + n_samples = X.shape[0] + n_batches = int(np.ceil(float(n_samples) / rbm.batch_size)) + batch_slices = np.array_split(X, n_batches) + + for i in range(7): + for batch in batch_slices: + rbm.partial_fit(batch) + + assert_almost_equal(rbm.score_samples(X).mean(), -21.0, decimal=0) + assert_array_equal(X, Xdigits) + + +def test_transform(): + X = Xdigits[:100] + rbm1 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42) + rbm1.fit(X) + + Xt1 = rbm1.transform(X) + Xt2 = rbm1._mean_hiddens(X) + + assert_array_equal(Xt1, Xt2) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_small_sparse(csr_container): + # BernoulliRBM should work on small sparse matrices. 
+ X = csr_container(Xdigits[:4]) + BernoulliRBM().fit(X) # no exception + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_small_sparse_partial_fit(sparse_container): + X_sparse = sparse_container(Xdigits[:100]) + X = Xdigits[:100].copy() + + rbm1 = BernoulliRBM( + n_components=64, learning_rate=0.1, batch_size=10, random_state=9 + ) + rbm2 = BernoulliRBM( + n_components=64, learning_rate=0.1, batch_size=10, random_state=9 + ) + + rbm1.partial_fit(X_sparse) + rbm2.partial_fit(X) + + assert_almost_equal( + rbm1.score_samples(X).mean(), rbm2.score_samples(X).mean(), decimal=0 + ) + + +def test_sample_hiddens(): + rng = np.random.RandomState(0) + X = Xdigits[:100] + rbm1 = BernoulliRBM(n_components=2, batch_size=5, n_iter=5, random_state=42) + rbm1.fit(X) + + h = rbm1._mean_hiddens(X[0]) + hs = np.mean([rbm1._sample_hiddens(X[0], rng) for i in range(100)], 0) + + assert_almost_equal(h, hs, decimal=1) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_fit_gibbs(csc_container): + # XXX: this test is very seed-dependent! It probably needs to be rewritten. + + # Gibbs on the RBM hidden layer should be able to recreate [[0], [1]] + # from the same input + rng = np.random.RandomState(42) + X = np.array([[0.0], [1.0]]) + rbm1 = BernoulliRBM(n_components=2, batch_size=2, n_iter=42, random_state=rng) + # you need that much iters + rbm1.fit(X) + assert_almost_equal( + rbm1.components_, np.array([[0.02649814], [0.02009084]]), decimal=4 + ) + assert_almost_equal(rbm1.gibbs(X), X) + + # Gibbs on the RBM hidden layer should be able to recreate [[0], [1]] from + # the same input even when the input is sparse, and test against non-sparse + rng = np.random.RandomState(42) + X = csc_container([[0.0], [1.0]]) + rbm2 = BernoulliRBM(n_components=2, batch_size=2, n_iter=42, random_state=rng) + rbm2.fit(X) + assert_almost_equal( + rbm2.components_, np.array([[0.02649814], [0.02009084]]), decimal=4 + ) + assert_almost_equal(rbm2.gibbs(X), X.toarray()) + assert_almost_equal(rbm1.components_, rbm2.components_) + + +def test_gibbs_smoke(): + # Check if we don't get NaNs sampling the full digits dataset. + # Also check that sampling again will yield different results. + X = Xdigits + rbm1 = BernoulliRBM(n_components=42, batch_size=40, n_iter=20, random_state=42) + rbm1.fit(X) + X_sampled = rbm1.gibbs(X) + assert_all_finite(X_sampled) + X_sampled2 = rbm1.gibbs(X) + assert np.all((X_sampled != X_sampled2).max(axis=1)) + + +@pytest.mark.parametrize("lil_containers", LIL_CONTAINERS) +def test_score_samples(lil_containers): + # Test score_samples (pseudo-likelihood) method. + # Assert that pseudo-likelihood is computed without clipping. + # See Fabian's blog, http://bit.ly/1iYefRk + rng = np.random.RandomState(42) + X = np.vstack([np.zeros(1000), np.ones(1000)]) + rbm1 = BernoulliRBM(n_components=10, batch_size=2, n_iter=10, random_state=rng) + rbm1.fit(X) + assert (rbm1.score_samples(X) < -300).all() + + # Sparse vs. dense should not affect the output. Also test sparse input + # validation. + rbm1.random_state = 42 + d_score = rbm1.score_samples(X) + rbm1.random_state = 42 + s_score = rbm1.score_samples(lil_containers(X)) + assert_almost_equal(d_score, s_score) + + # Test numerical stability (#2785): would previously generate infinities + # and crash with an exception. 
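+    # The pseudo-likelihood involves softplus terms of the form
+    # log(1 + exp(x)); for inputs as large as arange(1000) * 100 a naive
+    # evaluation overflows, so the call below only has to complete without
+    # raising.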
+ with np.errstate(under="ignore"): + rbm1.score_samples([np.arange(1000) * 100]) + + +def test_rbm_verbose(): + rbm = BernoulliRBM(n_iter=2, verbose=10) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + rbm.fit(Xdigits) + finally: + sys.stdout = old_stdout + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_sparse_and_verbose(csc_container): + # Make sure RBM works with sparse input when verbose=True + old_stdout = sys.stdout + sys.stdout = StringIO() + + X = csc_container([[0.0], [1.0]]) + rbm = BernoulliRBM( + n_components=2, batch_size=2, n_iter=1, random_state=42, verbose=True + ) + try: + rbm.fit(X) + s = sys.stdout.getvalue() + # make sure output is sound + assert re.match( + r"\[BernoulliRBM\] Iteration 1," + r" pseudo-likelihood = -?(\d)+(\.\d+)?," + r" time = (\d|\.)+s", + s, + ) + finally: + sys.stdout = old_stdout + + +@pytest.mark.parametrize( + "dtype_in, dtype_out", + [(np.float32, np.float32), (np.float64, np.float64), (int, np.float64)], +) +def test_transformer_dtypes_casting(dtype_in, dtype_out): + X = Xdigits[:100].astype(dtype_in) + rbm = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42) + Xt = rbm.fit_transform(X) + + # dtype_in and dtype_out should be consistent + assert Xt.dtype == dtype_out, "transform dtype: {} - original dtype: {}".format( + Xt.dtype, X.dtype + ) + + +def test_convergence_dtype_consistency(): + # float 64 transformer + X_64 = Xdigits[:100].astype(np.float64) + rbm_64 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42) + Xt_64 = rbm_64.fit_transform(X_64) + + # float 32 transformer + X_32 = Xdigits[:100].astype(np.float32) + rbm_32 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42) + Xt_32 = rbm_32.fit_transform(X_32) + + # results and attributes should be close enough in 32 bit and 64 bit + assert_allclose(Xt_64, Xt_32, rtol=1e-06, atol=0) + assert_allclose( + rbm_64.intercept_hidden_, rbm_32.intercept_hidden_, rtol=1e-06, atol=0 + ) + assert_allclose( + rbm_64.intercept_visible_, rbm_32.intercept_visible_, rtol=1e-05, atol=0 + ) + assert_allclose(rbm_64.components_, rbm_32.components_, rtol=1e-03, atol=0) + assert_allclose(rbm_64.h_samples_, rbm_32.h_samples_) + + +@pytest.mark.parametrize("method", ["fit", "partial_fit"]) +def test_feature_names_out(method): + """Check `get_feature_names_out` for `BernoulliRBM`.""" + n_components = 10 + rbm = BernoulliRBM(n_components=n_components) + getattr(rbm, method)(Xdigits) + + names = rbm.get_feature_names_out() + expected_names = [f"bernoullirbm{i}" for i in range(n_components)] + assert_array_equal(expected_names, names) diff --git a/.venv/Lib/site-packages/sklearn/neural_network/tests/test_stochastic_optimizers.py b/.venv/Lib/site-packages/sklearn/neural_network/tests/test_stochastic_optimizers.py new file mode 100644 index 0000000000000000000000000000000000000000..1087da642c3c497602bdc90a7aaa9da0e069ee57 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neural_network/tests/test_stochastic_optimizers.py @@ -0,0 +1,112 @@ +import numpy as np + +from sklearn.neural_network._stochastic_optimizers import ( + AdamOptimizer, + BaseOptimizer, + SGDOptimizer, +) +from sklearn.utils._testing import assert_array_equal + +shapes = [(4, 6), (6, 8), (7, 8, 9)] + + +def test_base_optimizer(): + for lr in [10**i for i in range(-3, 4)]: + optimizer = BaseOptimizer(lr) + assert optimizer.trigger_stopping("", False) + + +def test_sgd_optimizer_no_momentum(): + params = [np.zeros(shape) for shape in shapes] + rng = 
np.random.RandomState(0) + + for lr in [10**i for i in range(-3, 4)]: + optimizer = SGDOptimizer(params, lr, momentum=0, nesterov=False) + grads = [rng.random_sample(shape) for shape in shapes] + expected = [param - lr * grad for param, grad in zip(params, grads)] + optimizer.update_params(params, grads) + + for exp, param in zip(expected, params): + assert_array_equal(exp, param) + + +def test_sgd_optimizer_momentum(): + params = [np.zeros(shape) for shape in shapes] + lr = 0.1 + rng = np.random.RandomState(0) + + for momentum in np.arange(0.5, 0.9, 0.1): + optimizer = SGDOptimizer(params, lr, momentum=momentum, nesterov=False) + velocities = [rng.random_sample(shape) for shape in shapes] + optimizer.velocities = velocities + grads = [rng.random_sample(shape) for shape in shapes] + updates = [ + momentum * velocity - lr * grad for velocity, grad in zip(velocities, grads) + ] + expected = [param + update for param, update in zip(params, updates)] + optimizer.update_params(params, grads) + + for exp, param in zip(expected, params): + assert_array_equal(exp, param) + + +def test_sgd_optimizer_trigger_stopping(): + params = [np.zeros(shape) for shape in shapes] + lr = 2e-6 + optimizer = SGDOptimizer(params, lr, lr_schedule="adaptive") + assert not optimizer.trigger_stopping("", False) + assert lr / 5 == optimizer.learning_rate + assert optimizer.trigger_stopping("", False) + + +def test_sgd_optimizer_nesterovs_momentum(): + params = [np.zeros(shape) for shape in shapes] + lr = 0.1 + rng = np.random.RandomState(0) + + for momentum in np.arange(0.5, 0.9, 0.1): + optimizer = SGDOptimizer(params, lr, momentum=momentum, nesterov=True) + velocities = [rng.random_sample(shape) for shape in shapes] + optimizer.velocities = velocities + grads = [rng.random_sample(shape) for shape in shapes] + updates = [ + momentum * velocity - lr * grad for velocity, grad in zip(velocities, grads) + ] + updates = [ + momentum * update - lr * grad for update, grad in zip(updates, grads) + ] + expected = [param + update for param, update in zip(params, updates)] + optimizer.update_params(params, grads) + + for exp, param in zip(expected, params): + assert_array_equal(exp, param) + + +def test_adam_optimizer(): + params = [np.zeros(shape) for shape in shapes] + lr = 0.001 + epsilon = 1e-8 + rng = np.random.RandomState(0) + + for beta_1 in np.arange(0.9, 1.0, 0.05): + for beta_2 in np.arange(0.995, 1.0, 0.001): + optimizer = AdamOptimizer(params, lr, beta_1, beta_2, epsilon) + ms = [rng.random_sample(shape) for shape in shapes] + vs = [rng.random_sample(shape) for shape in shapes] + t = 10 + optimizer.ms = ms + optimizer.vs = vs + optimizer.t = t - 1 + grads = [rng.random_sample(shape) for shape in shapes] + + ms = [beta_1 * m + (1 - beta_1) * grad for m, grad in zip(ms, grads)] + vs = [beta_2 * v + (1 - beta_2) * (grad**2) for v, grad in zip(vs, grads)] + learning_rate = lr * np.sqrt(1 - beta_2**t) / (1 - beta_1**t) + updates = [ + -learning_rate * m / (np.sqrt(v) + epsilon) for m, v in zip(ms, vs) + ] + expected = [param + update for param, update in zip(params, updates)] + + optimizer.update_params(params, grads) + for exp, param in zip(expected, params): + assert_array_equal(exp, param) diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/__init__.py b/.venv/Lib/site-packages/sklearn/preprocessing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9d48372874958cf43d13c4c4bebd90abc28344f3 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/__init__.py @@ -0,0 +1,63 @@ 
+"""Methods for scaling, centering, normalization, binarization, and more.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._data import ( + Binarizer, + KernelCenterer, + MaxAbsScaler, + MinMaxScaler, + Normalizer, + PowerTransformer, + QuantileTransformer, + RobustScaler, + StandardScaler, + add_dummy_feature, + binarize, + maxabs_scale, + minmax_scale, + normalize, + power_transform, + quantile_transform, + robust_scale, + scale, +) +from ._discretization import KBinsDiscretizer +from ._encoders import OneHotEncoder, OrdinalEncoder +from ._function_transformer import FunctionTransformer +from ._label import LabelBinarizer, LabelEncoder, MultiLabelBinarizer, label_binarize +from ._polynomial import PolynomialFeatures, SplineTransformer +from ._target_encoder import TargetEncoder + +__all__ = [ + "Binarizer", + "FunctionTransformer", + "KBinsDiscretizer", + "KernelCenterer", + "LabelBinarizer", + "LabelEncoder", + "MultiLabelBinarizer", + "MinMaxScaler", + "MaxAbsScaler", + "QuantileTransformer", + "Normalizer", + "OneHotEncoder", + "OrdinalEncoder", + "PowerTransformer", + "RobustScaler", + "SplineTransformer", + "StandardScaler", + "TargetEncoder", + "add_dummy_feature", + "PolynomialFeatures", + "binarize", + "normalize", + "scale", + "robust_scale", + "maxabs_scale", + "minmax_scale", + "label_binarize", + "quantile_transform", + "power_transform", +] diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.cp39-win_amd64.lib b/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.cp39-win_amd64.lib new file mode 100644 index 0000000000000000000000000000000000000000..bf29e3455aa6a7241418c917282ecbe907f7a300 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.cp39-win_amd64.lib differ diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.cp39-win_amd64.pyd b/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.cp39-win_amd64.pyd new file mode 100644 index 0000000000000000000000000000000000000000..e43d91b51982942b22afdec5ad9a70e3065d65d9 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.cp39-win_amd64.pyd differ diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.pyx new file mode 100644 index 0000000000000000000000000000000000000000..8a7260f2f48f6373981e9dd8fe8ecbaadb752bf6 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -0,0 +1,258 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ..utils._typedefs cimport uint8_t, int64_t, intp_t + +ctypedef uint8_t FLAG_t + +# We use the following verbatim block to determine whether the current +# platform's compiler supports 128-bit integer values intrinsically. +# This should work for GCC and CLANG on 64-bit architectures, but doesn't for +# MSVC on any architecture. We prefer to use 128-bit integers when possible +# because the intermediate calculations have a non-trivial risk of overflow. It +# is, however, very unlikely to come up on an average use case, hence 64-bit +# integers (i.e. `long long`) are "good enough" for most common cases. There is +# not much we can do to efficiently mitigate the overflow risk on the Windows +# platform at this time. 
Consider this a "best effort" design decision that +# could be revisited later in case someone comes up with a safer option that +# does not hurt the performance of the common cases. +# See `test_sizeof_LARGEST_INT_t()`for more information on exact type expectations. +cdef extern from *: + """ + #ifdef __SIZEOF_INT128__ + typedef __int128 LARGEST_INT_t; + #elif (__clang__ || __EMSCRIPTEN__) && !__i386__ + typedef _BitInt(128) LARGEST_INT_t; + #else + typedef long long LARGEST_INT_t; + #endif + """ + ctypedef long long LARGEST_INT_t + + +# Determine the size of `LARGEST_INT_t` at runtime. +# Used in `test_sizeof_LARGEST_INT_t`. +def _get_sizeof_LARGEST_INT_t(): + return sizeof(LARGEST_INT_t) + + +# TODO: use `{int,float}{32,64}_t` when cython#5230 is resolved: +# https://github.com/cython/cython/issues/5230 +ctypedef fused DATA_t: + float + double + int + long long +# INDEX_{A,B}_t are defined to generate a proper Cartesian product +# of types through Cython fused-type expansion. +ctypedef fused INDEX_A_t: + signed int + signed long long +ctypedef fused INDEX_B_t: + signed int + signed long long + +cdef inline int64_t _deg2_column( + LARGEST_INT_t n_features, + LARGEST_INT_t i, + LARGEST_INT_t j, + FLAG_t interaction_only +) nogil: + """Compute the index of the column for a degree 2 expansion + + n_features is the dimensionality of the input data, i and j are the indices + for the columns involved in the expansion. + """ + if interaction_only: + return n_features * i - i * (i + 3) / 2 - 1 + j + else: + return n_features * i - i* (i + 1) / 2 + j + + +cdef inline int64_t _deg3_column( + LARGEST_INT_t n_features, + LARGEST_INT_t i, + LARGEST_INT_t j, + LARGEST_INT_t k, + FLAG_t interaction_only +) nogil: + """Compute the index of the column for a degree 3 expansion + + n_features is the dimensionality of the input data, i, j and k are the indices + for the columns involved in the expansion. + """ + if interaction_only: + return ( + ( + (3 * n_features) * (n_features * i - i**2) + + i * (i**2 + 11) - (3 * j) * (j + 3) + ) / 6 + i**2 + n_features * (j - 1 - 2 * i) + k + ) + else: + return ( + ( + (3 * n_features) * (n_features * i - i**2) + + i ** 3 - i - (3 * j) * (j + 1) + ) / 6 + n_features * j + k + ) + + +def py_calc_expanded_nnz_deg2(n, interaction_only): + return n * (n + 1) // 2 - interaction_only * n + + +def py_calc_expanded_nnz_deg3(n, interaction_only): + return n * (n**2 + 3 * n + 2) // 6 - interaction_only * n**2 + + +cpdef int64_t _calc_expanded_nnz( + LARGEST_INT_t n, + FLAG_t interaction_only, + LARGEST_INT_t degree +): + """ + Calculates the number of non-zero interaction terms generated by the + non-zero elements of a single row. 
+ """ + # This is the maximum value before the intermediate computation + # d**2 + d overflows + # Solution to d**2 + d = maxint64 + # SymPy: solve(x**2 + x - int64_max, x) + cdef int64_t MAX_SAFE_INDEX_CALC_DEG2 = 3037000499 + + # This is the maximum value before the intermediate computation + # d**3 + 3 * d**2 + 2*d overflows + # Solution to d**3 + 3 * d**2 + 2*d = maxint64 + # SymPy: solve(x * (x**2 + 3 * x + 2) - int64_max, x) + cdef int64_t MAX_SAFE_INDEX_CALC_DEG3 = 2097151 + + if degree == 2: + # Only need to check when not using 128-bit integers + if sizeof(LARGEST_INT_t) < 16 and n <= MAX_SAFE_INDEX_CALC_DEG2: + return n * (n + 1) / 2 - interaction_only * n + return py_calc_expanded_nnz_deg2(n, interaction_only) + else: + # Only need to check when not using 128-bit integers + if sizeof(LARGEST_INT_t) < 16 and n <= MAX_SAFE_INDEX_CALC_DEG3: + return n * (n**2 + 3 * n + 2) / 6 - interaction_only * n**2 + return py_calc_expanded_nnz_deg3(n, interaction_only) + +cpdef int64_t _calc_total_nnz( + INDEX_A_t[:] indptr, + FLAG_t interaction_only, + int64_t degree, +): + """ + Calculates the number of non-zero interaction terms generated by the + non-zero elements across all rows for a single degree. + """ + cdef int64_t total_nnz=0 + cdef intp_t row_idx + for row_idx in range(len(indptr) - 1): + total_nnz += _calc_expanded_nnz( + indptr[row_idx + 1] - indptr[row_idx], + interaction_only, + degree + ) + return total_nnz + + +cpdef void _csr_polynomial_expansion( + const DATA_t[:] data, # IN READ-ONLY + const INDEX_A_t[:] indices, # IN READ-ONLY + const INDEX_A_t[:] indptr, # IN READ-ONLY + INDEX_A_t n_features, + DATA_t[:] result_data, # OUT + INDEX_B_t[:] result_indices, # OUT + INDEX_B_t[:] result_indptr, # OUT + FLAG_t interaction_only, + FLAG_t degree +): + """ + Perform a second or third degree polynomial or interaction expansion on a + compressed sparse row (CSR) matrix. The method used only takes products of + non-zero features. For a matrix with density :math:`d`, this results in a + speedup on the order of :math:`(1/d)^k` where :math:`k` is the degree of + the expansion, assuming all rows are of similar density. + + Parameters + ---------- + data : memory view on nd-array + The "data" attribute of the input CSR matrix. + + indices : memory view on nd-array + The "indices" attribute of the input CSR matrix. + + indptr : memory view on nd-array + The "indptr" attribute of the input CSR matrix. + + n_features : int + The dimensionality of the input CSR matrix. + + result_data : nd-array + The output CSR matrix's "data" attribute. + It is modified by this routine. + + result_indices : nd-array + The output CSR matrix's "indices" attribute. + It is modified by this routine. + + result_indptr : nd-array + The output CSR matrix's "indptr" attribute. + It is modified by this routine. + + interaction_only : int + 0 for a polynomial expansion, 1 for an interaction expansion. + + degree : int + The degree of the expansion. This must be either 2 or 3. + + References + ---------- + "Leveraging Sparsity to Speed Up Polynomial Feature Expansions of CSR + Matrices Using K-Simplex Numbers" by Andrew Nystrom and John Hughes. + """ + + # Make the arrays that will form the CSR matrix of the expansion. 
+ cdef INDEX_A_t row_i, row_starts, row_ends, i, j, k, i_ptr, j_ptr, k_ptr + cdef INDEX_B_t expanded_index=0, num_cols_in_row, col + with nogil: + result_indptr[0] = indptr[0] + for row_i in range(indptr.shape[0]-1): + row_starts = indptr[row_i] + row_ends = indptr[row_i + 1] + num_cols_in_row = 0 + for i_ptr in range(row_starts, row_ends): + i = indices[i_ptr] + for j_ptr in range(i_ptr + interaction_only, row_ends): + j = indices[j_ptr] + if degree == 2: + col = _deg2_column( + n_features, + i, j, + interaction_only + ) + result_indices[expanded_index] = col + result_data[expanded_index] = ( + data[i_ptr] * data[j_ptr] + ) + expanded_index += 1 + num_cols_in_row += 1 + else: + # degree == 3 + for k_ptr in range(j_ptr + interaction_only, row_ends): + k = indices[k_ptr] + col = _deg3_column( + n_features, + i, j, k, + interaction_only + ) + result_indices[expanded_index] = col + result_data[expanded_index] = ( + data[i_ptr] * data[j_ptr] * data[k_ptr] + ) + expanded_index += 1 + num_cols_in_row += 1 + + result_indptr[row_i+1] = result_indptr[row_i] + num_cols_in_row + return diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_data.py b/.venv/Lib/site-packages/sklearn/preprocessing/_data.py new file mode 100644 index 0000000000000000000000000000000000000000..b8f4ec3696dc3379521a359ecbc22db9a419a4d9 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_data.py @@ -0,0 +1,3683 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy import optimize, sparse, stats +from scipy.special import boxcox, inv_boxcox + +from sklearn.utils import metadata_routing + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + OneToOneFeatureMixin, + TransformerMixin, + _fit_context, +) +from ..utils import _array_api, check_array, resample +from ..utils._array_api import _modify_in_place_if_numpy, device, get_namespace +from ..utils._param_validation import Interval, Options, StrOptions, validate_params +from ..utils.extmath import _incremental_mean_and_var, row_norms +from ..utils.sparsefuncs import ( + incr_mean_variance_axis, + inplace_column_scale, + mean_variance_axis, + min_max_axis, +) +from ..utils.sparsefuncs_fast import ( + inplace_csr_row_normalize_l1, + inplace_csr_row_normalize_l2, +) +from ..utils.validation import ( + FLOAT_DTYPES, + _check_sample_weight, + check_is_fitted, + check_random_state, + validate_data, +) +from ._encoders import OneHotEncoder + +BOUNDS_THRESHOLD = 1e-7 + +__all__ = [ + "Binarizer", + "KernelCenterer", + "MinMaxScaler", + "MaxAbsScaler", + "Normalizer", + "OneHotEncoder", + "RobustScaler", + "StandardScaler", + "QuantileTransformer", + "PowerTransformer", + "add_dummy_feature", + "binarize", + "normalize", + "scale", + "robust_scale", + "maxabs_scale", + "minmax_scale", + "quantile_transform", + "power_transform", +] + + +def _is_constant_feature(var, mean, n_samples): + """Detect if a feature is indistinguishable from a constant feature. + + The detection is based on its computed variance and on the theoretical + error bounds of the '2 pass algorithm' for variance computation. + + See "Algorithms for computing the sample variance: analysis and + recommendations", by Chan, Golub, and LeVeque. + """ + # In scikit-learn, variance is always computed using float64 accumulators. 
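+    # The bound below sums the worst-case rounding error of the variance
+    # itself (~ n_samples * eps * var) and the squared rounding error that
+    # the accumulated mean can contribute (~ (n_samples * mean * eps) ** 2);
+    # a computed variance at or below this bound is treated as numerically
+    # zero.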
+ eps = np.finfo(np.float64).eps + + upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2 + return var <= upper_bound + + +def _handle_zeros_in_scale(scale, copy=True, constant_mask=None): + """Set scales of near constant features to 1. + + The goal is to avoid division by very small or zero values. + + Near constant features are detected automatically by identifying + scales close to machine precision unless they are precomputed by + the caller and passed with the `constant_mask` kwarg. + + Typically for standard scaling, the scales are the standard + deviation while near constant features are better detected on the + computed variances which are closer to machine precision by + construction. + """ + # if we are fitting on 1D arrays, scale might be a scalar + if np.isscalar(scale): + if scale == 0.0: + scale = 1.0 + return scale + # scale is an array + else: + xp, _ = get_namespace(scale) + if constant_mask is None: + # Detect near constant values to avoid dividing by a very small + # value that could lead to surprising results and numerical + # stability issues. + constant_mask = scale < 10 * xp.finfo(scale.dtype).eps + + if copy: + # New array to avoid side-effects + scale = xp.asarray(scale, copy=True) + scale[constant_mask] = 1.0 + return scale + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "axis": [Options(Integral, {0, 1})], + "with_mean": ["boolean"], + "with_std": ["boolean"], + "copy": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): + """Standardize a dataset along any axis. + + Center to the mean and component wise scale to unit variance. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to center and scale. + + axis : {0, 1}, default=0 + Axis used to compute the means and standard deviations along. If 0, + independently standardize each feature, otherwise (if 1) standardize + each sample. + + with_mean : bool, default=True + If True, center the data before scaling. + + with_std : bool, default=True + If True, scale the data to unit variance (or equivalently, + unit standard deviation). + + copy : bool, default=True + If False, try to avoid a copy and scale in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + StandardScaler : Performs scaling to unit variance using the Transformer + API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + This implementation will refuse to center scipy.sparse matrices + since it would make them non-sparse and would potentially crash the + program with memory exhaustion problems. + + Instead the caller is expected to either set explicitly + `with_mean=False` (in that case, only variance scaling will be + performed on the features of the CSC matrix) or to call `X.toarray()` + if he/she expects the materialized dense array to fit in memory. + + To avoid memory copy the caller should pass a CSC matrix. + + NaNs are treated as missing values: disregarded to compute the statistics, + and maintained during the data transformation. + + We use a biased estimator for the standard deviation, equivalent to + `numpy.std(x, ddof=0)`. 
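+    (That is, the variance is computed with ``n`` rather than ``n - 1`` in
+    the denominator, so no Bessel correction is applied.)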
Note that the choice of `ddof` is unlikely to affect model performance.
+
+    For a comparison of the different scalers, transformers, and normalizers,
+    see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
+
+    .. warning:: Risk of data leak
+
+        Do not use :func:`~sklearn.preprocessing.scale` unless you know
+        what you are doing. A common mistake is to apply it to the entire data
+        *before* splitting into training and test sets. This will bias the
+        model evaluation because information would have leaked from the test
+        set to the training set.
+        In general, we recommend using
+        :class:`~sklearn.preprocessing.StandardScaler` within a
+        :ref:`Pipeline ` in order to prevent most risks of data
+        leaking: `pipe = make_pipeline(StandardScaler(), LogisticRegression())`.
+
+    Examples
+    --------
+    >>> from sklearn.preprocessing import scale
+    >>> X = [[-2, 1, 2], [-1, 0, 1]]
+    >>> scale(X, axis=0)  # scaling each column independently
+    array([[-1., 1., 1.],
+           [ 1., -1., -1.]])
+    >>> scale(X, axis=1)  # scaling each row independently
+    array([[-1.37..., 0.39..., 0.98...],
+           [-1.22..., 0. , 1.22...]])
+    """
+    X = check_array(
+        X,
+        accept_sparse="csc",
+        copy=copy,
+        ensure_2d=False,
+        estimator="the scale function",
+        dtype=FLOAT_DTYPES,
+        ensure_all_finite="allow-nan",
+    )
+    if sparse.issparse(X):
+        if with_mean:
+            raise ValueError(
+                "Cannot center sparse matrices: pass `with_mean=False` instead."
+                " See docstring for motivation and alternatives."
+            )
+        if axis != 0:
+            raise ValueError(
+                "Can only scale sparse matrix on axis=0, got axis=%d" % axis
+            )
+        if with_std:
+            _, var = mean_variance_axis(X, axis=0)
+            var = _handle_zeros_in_scale(var, copy=False)
+            inplace_column_scale(X, 1 / np.sqrt(var))
+    else:
+        X = np.asarray(X)
+        if with_mean:
+            mean_ = np.nanmean(X, axis)
+        if with_std:
+            scale_ = np.nanstd(X, axis)
+        # Xr is a view on the original array that enables easy use of
+        # broadcasting on the axis in which we are interested
+        Xr = np.rollaxis(X, axis)
+        if with_mean:
+            Xr -= mean_
+            mean_1 = np.nanmean(Xr, axis=0)
+            # Verify that mean_1 is 'close to zero'. If X contains very
+            # large values, mean_1 can also be very large, due to a lack of
+            # precision of mean_. In this case, a pre-scaling of the
+            # concerned feature is efficient, for instance by its mean or
+            # maximum.
+            if not np.allclose(mean_1, 0):
+                warnings.warn(
+                    "Numerical issues were encountered "
+                    "when centering the data "
+                    "and might not be solved. Dataset may "
+                    "contain too large values. You may need "
+                    "to prescale your features."
+                )
+                Xr -= mean_1
+        if with_std:
+            scale_ = _handle_zeros_in_scale(scale_, copy=False)
+            Xr /= scale_
+            if with_mean:
+                mean_2 = np.nanmean(Xr, axis=0)
+                # If mean_2 is not 'close to zero', it comes from the fact that
+                # scale_ is very small so that mean_2 = mean_1/scale_ > 0, even
+                # if mean_1 was close to zero. The problem is thus essentially
+                # due to the lack of precision of mean_. A solution is then to
+                # subtract the mean again:
+                if not np.allclose(mean_2, 0):
+                    warnings.warn(
+                        "Numerical issues were encountered "
+                        "when scaling the data "
+                        "and might not be solved. The standard "
+                        "deviation of the data is probably "
+                        "very close to 0. "
+                    )
+                    Xr -= mean_2
+    return X
+
+
+class MinMaxScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
+    """Transform features by scaling each feature to a given range.
+
+    This estimator scales and translates each feature individually such
+    that it is in the given range on the training set, e.g.
between + zero and one. + + The transformation is given by:: + + X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) + X_scaled = X_std * (max - min) + min + + where min, max = feature_range. + + This transformation is often used as an alternative to zero mean, + unit variance scaling. + + `MinMaxScaler` doesn't reduce the effect of outliers, but it linearly + scales them down into a fixed range, where the largest occurring data point + corresponds to the maximum value and the smallest one corresponds to the + minimum value. For an example visualization, refer to :ref:`Compare + MinMaxScaler with other scalers `. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + feature_range : tuple (min, max), default=(0, 1) + Desired range of transformed data. + + copy : bool, default=True + Set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array). + + clip : bool, default=False + Set to True to clip transformed values of held-out data to + provided `feature range`. + + .. versionadded:: 0.24 + + Attributes + ---------- + min_ : ndarray of shape (n_features,) + Per feature adjustment for minimum. Equivalent to + ``min - X.min(axis=0) * self.scale_`` + + scale_ : ndarray of shape (n_features,) + Per feature relative scaling of the data. Equivalent to + ``(max - min) / (X.max(axis=0) - X.min(axis=0))`` + + .. versionadded:: 0.17 + *scale_* attribute. + + data_min_ : ndarray of shape (n_features,) + Per feature minimum seen in the data + + .. versionadded:: 0.17 + *data_min_* + + data_max_ : ndarray of shape (n_features,) + Per feature maximum seen in the data + + .. versionadded:: 0.17 + *data_max_* + + data_range_ : ndarray of shape (n_features,) + Per feature range ``(data_max_ - data_min_)`` seen in the data + + .. versionadded:: 0.17 + *data_range_* + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + n_samples_seen_ : int + The number of samples processed by the estimator. + It will be reset on new calls to fit, but increments across + ``partial_fit`` calls. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + minmax_scale : Equivalent function without the estimator API. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + Examples + -------- + >>> from sklearn.preprocessing import MinMaxScaler + >>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]] + >>> scaler = MinMaxScaler() + >>> print(scaler.fit(data)) + MinMaxScaler() + >>> print(scaler.data_max_) + [ 1. 18.] + >>> print(scaler.transform(data)) + [[0. 0. ] + [0.25 0.25] + [0.5 0.5 ] + [1. 1. ]] + >>> print(scaler.transform([[2, 2]])) + [[1.5 0. ]] + """ + + _parameter_constraints: dict = { + "feature_range": [tuple], + "copy": ["boolean"], + "clip": ["boolean"], + } + + def __init__(self, feature_range=(0, 1), *, copy=True, clip=False): + self.feature_range = feature_range + self.copy = copy + self.clip = clip + + def _reset(self): + """Reset internal data-dependent state of the scaler, if necessary. + + __init__ parameters are not touched. 
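+
+        :meth:`fit` calls this before delegating to :meth:`partial_fit`, so
+        only repeated ``partial_fit`` calls accumulate state across batches.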
+ """ + # Checking one attribute is enough, because they are all set together + # in partial_fit + if hasattr(self, "scale_"): + del self.scale_ + del self.min_ + del self.n_samples_seen_ + del self.data_min_ + del self.data_max_ + del self.data_range_ + + def fit(self, X, y=None): + """Compute the minimum and maximum to be used for later scaling. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to compute the per-feature minimum and maximum + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + # Reset internal state before fitting + self._reset() + return self.partial_fit(X, y) + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None): + """Online computation of min and max on X for later scaling. + + All of X is processed as a single batch. This is intended for cases + when :meth:`fit` is not feasible due to very large number of + `n_samples` or because X is read from a continuous stream. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + feature_range = self.feature_range + if feature_range[0] >= feature_range[1]: + raise ValueError( + "Minimum of desired feature range must be smaller than maximum. Got %s." + % str(feature_range) + ) + + if sparse.issparse(X): + raise TypeError( + "MinMaxScaler does not support sparse input. " + "Consider using MaxAbsScaler instead." + ) + + xp, _ = get_namespace(X) + + first_pass = not hasattr(self, "n_samples_seen_") + X = validate_data( + self, + X, + reset=first_pass, + dtype=_array_api.supported_float_dtypes(xp), + ensure_all_finite="allow-nan", + ) + + data_min = _array_api._nanmin(X, axis=0, xp=xp) + data_max = _array_api._nanmax(X, axis=0, xp=xp) + + if first_pass: + self.n_samples_seen_ = X.shape[0] + else: + data_min = xp.minimum(self.data_min_, data_min) + data_max = xp.maximum(self.data_max_, data_max) + self.n_samples_seen_ += X.shape[0] + + data_range = data_max - data_min + self.scale_ = (feature_range[1] - feature_range[0]) / _handle_zeros_in_scale( + data_range, copy=True + ) + self.min_ = feature_range[0] - data_min * self.scale_ + self.data_min_ = data_min + self.data_max_ = data_max + self.data_range_ = data_range + return self + + def transform(self, X): + """Scale features of X according to feature_range. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data that will be transformed. + + Returns + ------- + Xt : ndarray of shape (n_samples, n_features) + Transformed data. + """ + check_is_fitted(self) + + xp, _ = get_namespace(X) + + X = validate_data( + self, + X, + copy=self.copy, + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ensure_all_finite="allow-nan", + reset=False, + ) + + X *= self.scale_ + X += self.min_ + if self.clip: + device_ = device(X) + X = _modify_in_place_if_numpy( + xp, + xp.clip, + X, + xp.asarray(self.feature_range[0], dtype=X.dtype, device=device_), + xp.asarray(self.feature_range[1], dtype=X.dtype, device=device_), + out=X, + ) + return X + + def inverse_transform(self, X): + """Undo the scaling of X according to feature_range. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data that will be transformed. It cannot be sparse. 
+ + Returns + ------- + Xt : ndarray of shape (n_samples, n_features) + Transformed data. + """ + check_is_fitted(self) + + xp, _ = get_namespace(X) + + X = check_array( + X, + copy=self.copy, + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + X -= self.min_ + X /= self.scale_ + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + tags.array_api_support = True + return tags + + +@validate_params( + { + "X": ["array-like"], + "axis": [Options(Integral, {0, 1})], + }, + prefer_skip_nested_validation=False, +) +def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): + """Transform features by scaling each feature to a given range. + + This estimator scales and translates each feature individually such + that it is in the given range on the training set, i.e. between + zero and one. + + The transformation is given by (when ``axis=0``):: + + X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) + X_scaled = X_std * (max - min) + min + + where min, max = feature_range. + + The transformation is calculated as (when ``axis=0``):: + + X_scaled = scale * X + min - X.min(axis=0) * scale + where scale = (max - min) / (X.max(axis=0) - X.min(axis=0)) + + This transformation is often used as an alternative to zero mean, + unit variance scaling. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.17 + *minmax_scale* function interface + to :class:`~sklearn.preprocessing.MinMaxScaler`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + + feature_range : tuple (min, max), default=(0, 1) + Desired range of transformed data. + + axis : {0, 1}, default=0 + Axis used to scale along. If 0, independently scale each feature, + otherwise (if 1) scale each sample. + + copy : bool, default=True + If False, try to avoid a copy and scale in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_tr : ndarray of shape (n_samples, n_features) + The transformed data. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.minmax_scale` unless you know + what you are doing. A common mistake is to apply it to the entire data + *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.MinMaxScaler` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking: `pipe = make_pipeline(MinMaxScaler(), LogisticRegression())`. + + See Also + -------- + MinMaxScaler : Performs scaling to a given range using the Transformer + API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + Examples + -------- + >>> from sklearn.preprocessing import minmax_scale + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> minmax_scale(X, axis=0) # scale each column independently + array([[0., 1., 1.], + [1., 0., 0.]]) + >>> minmax_scale(X, axis=1) # scale each row independently + array([[0. , 0.75, 1. ], + [0. , 0.5 , 1. ]]) + """ + # Unlike the scaler object, this function allows 1d input. 
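+    # A 1d input is treated as a single feature: it is reshaped to a
+    # column below, scaled, and raveled back to 1d before returning.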
+ # If copy is required, it will be done inside the scaler object. + X = check_array( + X, + copy=False, + ensure_2d=False, + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + original_ndim = X.ndim + + if original_ndim == 1: + X = X.reshape(X.shape[0], 1) + + s = MinMaxScaler(feature_range=feature_range, copy=copy) + if axis == 0: + X = s.fit_transform(X) + else: + X = s.fit_transform(X.T).T + + if original_ndim == 1: + X = X.ravel() + + return X + + +class StandardScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Standardize features by removing the mean and scaling to unit variance. + + The standard score of a sample `x` is calculated as: + + .. code-block:: text + + z = (x - u) / s + + where `u` is the mean of the training samples or zero if `with_mean=False`, + and `s` is the standard deviation of the training samples or one if + `with_std=False`. + + Centering and scaling happen independently on each feature by computing + the relevant statistics on the samples in the training set. Mean and + standard deviation are then stored to be used on later data using + :meth:`transform`. + + Standardization of a dataset is a common requirement for many + machine learning estimators: they might behave badly if the + individual features do not more or less look like standard normally + distributed data (e.g. Gaussian with 0 mean and unit variance). + + For instance many elements used in the objective function of + a learning algorithm (such as the RBF kernel of Support Vector + Machines or the L1 and L2 regularizers of linear models) assume that + all features are centered around 0 and have variance in the same + order. If a feature has a variance that is orders of magnitude larger + than others, it might dominate the objective function and make the + estimator unable to learn from other features correctly as expected. + + `StandardScaler` is sensitive to outliers, and the features may scale + differently from each other in the presence of outliers. For an example + visualization, refer to :ref:`Compare StandardScaler with other scalers + `. + + This scaler can also be applied to sparse CSR or CSC matrices by passing + `with_mean=False` to avoid breaking the sparsity structure of the data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + copy : bool, default=True + If False, try to avoid a copy and do inplace scaling instead. + This is not guaranteed to always work inplace; e.g. if the data is + not a NumPy array or scipy.sparse CSR matrix, a copy may still be + returned. + + with_mean : bool, default=True + If True, center the data before scaling. + This does not work (and will raise an exception) when attempted on + sparse matrices, because centering them entails building a dense + matrix which in common use cases is likely to be too large to fit in + memory. + + with_std : bool, default=True + If True, scale the data to unit variance (or equivalently, + unit standard deviation). + + Attributes + ---------- + scale_ : ndarray of shape (n_features,) or None + Per feature relative scaling of the data to achieve zero mean and unit + variance. Generally this is calculated using `np.sqrt(var_)`. If a + variance is zero, we can't achieve unit variance, and the data is left + as-is, giving a scaling factor of 1. `scale_` is equal to `None` + when `with_std=False`. + + .. versionadded:: 0.17 + *scale_* + + mean_ : ndarray of shape (n_features,) or None + The mean value for each feature in the training set. 
+ Equal to ``None`` when ``with_mean=False`` and ``with_std=False``. + + var_ : ndarray of shape (n_features,) or None + The variance for each feature in the training set. Used to compute + `scale_`. Equal to ``None`` when ``with_mean=False`` and + ``with_std=False``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_seen_ : int or ndarray of shape (n_features,) + The number of samples processed by the estimator for each feature. + If there are no missing samples, the ``n_samples_seen`` will be an + integer, otherwise it will be an array of dtype int. If + `sample_weights` are used it will be a float (if no missing data) + or an array of dtype float that sums the weights seen so far. + Will be reset on new calls to fit, but increments across + ``partial_fit`` calls. + + See Also + -------- + scale : Equivalent function without the estimator API. + + :class:`~sklearn.decomposition.PCA` : Further removes the linear + correlation across features with 'whiten=True'. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + We use a biased estimator for the standard deviation, equivalent to + `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to + affect model performance. + + Examples + -------- + >>> from sklearn.preprocessing import StandardScaler + >>> data = [[0, 0], [0, 0], [1, 1], [1, 1]] + >>> scaler = StandardScaler() + >>> print(scaler.fit(data)) + StandardScaler() + >>> print(scaler.mean_) + [0.5 0.5] + >>> print(scaler.transform(data)) + [[-1. -1.] + [-1. -1.] + [ 1. 1.] + [ 1. 1.]] + >>> print(scaler.transform([[2, 2]])) + [[3. 3.]] + """ + + _parameter_constraints: dict = { + "copy": ["boolean"], + "with_mean": ["boolean"], + "with_std": ["boolean"], + } + + def __init__(self, *, copy=True, with_mean=True, with_std=True): + self.with_mean = with_mean + self.with_std = with_std + self.copy = copy + + def _reset(self): + """Reset internal data-dependent state of the scaler, if necessary. + + __init__ parameters are not touched. + """ + # Checking one attribute is enough, because they are all set together + # in partial_fit + if hasattr(self, "scale_"): + del self.scale_ + del self.n_samples_seen_ + del self.mean_ + del self.var_ + + def fit(self, X, y=None, sample_weight=None): + """Compute the mean and std to be used for later scaling. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + sample_weight : array-like of shape (n_samples,), default=None + Individual weights for each sample. + + .. versionadded:: 0.24 + parameter *sample_weight* support to StandardScaler. + + Returns + ------- + self : object + Fitted scaler. + """ + # Reset internal state before fitting + self._reset() + return self.partial_fit(X, y, sample_weight) + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None, sample_weight=None): + """Online computation of mean and std on X for later scaling. + + All of X is processed as a single batch. 
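+        Statistics from earlier calls are merged with those of the new batch,
+        so repeated calls are equivalent, up to floating-point rounding, to a
+        single fit on the concatenated data.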
This is intended for cases + when :meth:`fit` is not feasible due to very large number of + `n_samples` or because X is read from a continuous stream. + + The algorithm for incremental mean and std is given in Equation 1.5a,b + in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Algorithms + for computing the sample variance: Analysis and recommendations." + The American Statistician 37.3 (1983): 242-247: + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + sample_weight : array-like of shape (n_samples,), default=None + Individual weights for each sample. + + .. versionadded:: 0.24 + parameter *sample_weight* support to StandardScaler. + + Returns + ------- + self : object + Fitted scaler. + """ + first_call = not hasattr(self, "n_samples_seen_") + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + reset=first_call, + ) + n_features = X.shape[1] + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + # Even in the case of `with_mean=False`, we update the mean anyway + # This is needed for the incremental computation of the var + # See incr_mean_variance_axis and _incremental_mean_variance_axis + + # if n_samples_seen_ is an integer (i.e. no missing values), we need to + # transform it to a NumPy array of shape (n_features,) required by + # incr_mean_variance_axis and _incremental_variance_axis + dtype = np.int64 if sample_weight is None else X.dtype + if not hasattr(self, "n_samples_seen_"): + self.n_samples_seen_ = np.zeros(n_features, dtype=dtype) + elif np.size(self.n_samples_seen_) == 1: + self.n_samples_seen_ = np.repeat(self.n_samples_seen_, X.shape[1]) + self.n_samples_seen_ = self.n_samples_seen_.astype(dtype, copy=False) + + if sparse.issparse(X): + if self.with_mean: + raise ValueError( + "Cannot center sparse matrices: pass `with_mean=False` " + "instead. See docstring for motivation and alternatives." 
+                )
+            sparse_constructor = (
+                sparse.csr_matrix if X.format == "csr" else sparse.csc_matrix
+            )
+
+            if self.with_std:
+                # First pass
+                if not hasattr(self, "scale_"):
+                    self.mean_, self.var_, self.n_samples_seen_ = mean_variance_axis(
+                        X, axis=0, weights=sample_weight, return_sum_weights=True
+                    )
+                # Next passes
+                else:
+                    (
+                        self.mean_,
+                        self.var_,
+                        self.n_samples_seen_,
+                    ) = incr_mean_variance_axis(
+                        X,
+                        axis=0,
+                        last_mean=self.mean_,
+                        last_var=self.var_,
+                        last_n=self.n_samples_seen_,
+                        weights=sample_weight,
+                    )
+                # We force the mean and variance to float64 for large arrays
+                # See https://github.com/scikit-learn/scikit-learn/pull/12338
+                self.mean_ = self.mean_.astype(np.float64, copy=False)
+                self.var_ = self.var_.astype(np.float64, copy=False)
+            else:
+                self.mean_ = None  # as with_mean must be False for sparse
+                self.var_ = None
+                weights = _check_sample_weight(sample_weight, X)
+                sum_weights_nan = weights @ sparse_constructor(
+                    (np.isnan(X.data), X.indices, X.indptr), shape=X.shape
+                )
+                self.n_samples_seen_ += (np.sum(weights) - sum_weights_nan).astype(
+                    dtype
+                )
+        else:
+            # First pass
+            if not hasattr(self, "scale_"):
+                self.mean_ = 0.0
+                if self.with_std:
+                    self.var_ = 0.0
+                else:
+                    self.var_ = None
+
+            if not self.with_mean and not self.with_std:
+                self.mean_ = None
+                self.var_ = None
+                self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)
+
+            else:
+                self.mean_, self.var_, self.n_samples_seen_ = _incremental_mean_and_var(
+                    X,
+                    self.mean_,
+                    self.var_,
+                    self.n_samples_seen_,
+                    sample_weight=sample_weight,
+                )
+
+        # for backward-compatibility, reduce n_samples_seen_ to an integer
+        # if the number of samples is the same for each feature (i.e. no
+        # missing values)
+        if np.ptp(self.n_samples_seen_) == 0:
+            self.n_samples_seen_ = self.n_samples_seen_[0]
+
+        if self.with_std:
+            # Extract the list of near constant features on the raw variances,
+            # before taking the square root.
+            constant_mask = _is_constant_feature(
+                self.var_, self.mean_, self.n_samples_seen_
+            )
+            self.scale_ = _handle_zeros_in_scale(
+                np.sqrt(self.var_), copy=False, constant_mask=constant_mask
+            )
+        else:
+            self.scale_ = None
+
+        return self
+
+    def transform(self, X, copy=None):
+        """Perform standardization by centering and scaling.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The data used to scale along the features axis.
+        copy : bool, default=None
+            Copy the input X or not.
+
+        Returns
+        -------
+        X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
+            Transformed array.
+        """
+        check_is_fitted(self)
+
+        copy = copy if copy is not None else self.copy
+        X = validate_data(
+            self,
+            X,
+            reset=False,
+            accept_sparse="csr",
+            copy=copy,
+            dtype=FLOAT_DTYPES,
+            force_writeable=True,
+            ensure_all_finite="allow-nan",
+        )
+
+        if sparse.issparse(X):
+            if self.with_mean:
+                raise ValueError(
+                    "Cannot center sparse matrices: pass `with_mean=False` "
+                    "instead. See docstring for motivation and alternatives."
+                )
+            if self.scale_ is not None:
+                inplace_column_scale(X, 1 / self.scale_)
+        else:
+            if self.with_mean:
+                X -= self.mean_
+            if self.with_std:
+                X /= self.scale_
+        return X
+
+    def inverse_transform(self, X, copy=None):
+        """Scale back the data to the original representation.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The data used to scale along the features axis.
+        copy : bool, default=None
+            Copy the input X or not.
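+            If ``None``, the value of ``self.copy`` set at construction is
+            used.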
+
+        Returns
+        -------
+        X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
+            Transformed array.
+        """
+        check_is_fitted(self)
+
+        copy = copy if copy is not None else self.copy
+        X = check_array(
+            X,
+            accept_sparse="csr",
+            copy=copy,
+            dtype=FLOAT_DTYPES,
+            force_writeable=True,
+            ensure_all_finite="allow-nan",
+        )
+
+        if sparse.issparse(X):
+            if self.with_mean:
+                raise ValueError(
+                    "Cannot uncenter sparse matrices: pass `with_mean=False` "
+                    "instead. See docstring for motivation and alternatives."
+                )
+            if self.scale_ is not None:
+                inplace_column_scale(X, self.scale_)
+        else:
+            if self.with_std:
+                X *= self.scale_
+            if self.with_mean:
+                X += self.mean_
+        return X
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.allow_nan = True
+        tags.transformer_tags.preserves_dtype = ["float64", "float32"]
+        return tags
+
+
+class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
+    """Scale each feature by its maximum absolute value.
+
+    This estimator scales each feature individually such
+    that the maximal absolute value of each feature in the
+    training set will be 1.0. It does not shift/center the data, and
+    thus does not destroy any sparsity.
+
+    This scaler can also be applied to sparse CSR or CSC matrices.
+
+    `MaxAbsScaler` doesn't reduce the effect of outliers; it only linearly
+    scales them down. For an example visualization, refer to :ref:`Compare
+    MaxAbsScaler with other scalers `.
+
+    .. versionadded:: 0.17
+
+    Parameters
+    ----------
+    copy : bool, default=True
+        Set to False to perform inplace scaling and avoid a copy (if the input
+        is already a numpy array).
+
+    Attributes
+    ----------
+    scale_ : ndarray of shape (n_features,)
+        Per feature relative scaling of the data.
+
+        .. versionadded:: 0.17
+           *scale_* attribute.
+
+    max_abs_ : ndarray of shape (n_features,)
+        Per feature maximum absolute value.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    n_samples_seen_ : int
+        The number of samples processed by the estimator. Will be reset on
+        new calls to fit, but increments across ``partial_fit`` calls.
+
+    See Also
+    --------
+    maxabs_scale : Equivalent function without the estimator API.
+
+    Notes
+    -----
+    NaNs are treated as missing values: disregarded in fit, and maintained in
+    transform.
+
+    Examples
+    --------
+    >>> from sklearn.preprocessing import MaxAbsScaler
+    >>> X = [[ 1., -1., 2.],
+    ...      [ 2., 0., 0.],
+    ...      [ 0., 1., -1.]]
+    >>> transformer = MaxAbsScaler().fit(X)
+    >>> transformer
+    MaxAbsScaler()
+    >>> transformer.transform(X)
+    array([[ 0.5, -1. , 1. ],
+           [ 1. , 0. , 0. ],
+           [ 0. , 1. , -0.5]])
+    """
+
+    _parameter_constraints: dict = {"copy": ["boolean"]}
+
+    def __init__(self, *, copy=True):
+        self.copy = copy
+
+    def _reset(self):
+        """Reset internal data-dependent state of the scaler, if necessary.
+
+        __init__ parameters are not touched.
+        """
+        # Checking one attribute is enough, because they are all set together
+        # in partial_fit
+        if hasattr(self, "scale_"):
+            del self.scale_
+            del self.n_samples_seen_
+            del self.max_abs_
+
+    def fit(self, X, y=None):
+        """Compute the maximum absolute value to be used for later scaling.
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the per-feature minimum and maximum + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + # Reset internal state before fitting + self._reset() + return self.partial_fit(X, y) + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None): + """Online computation of max absolute value of X for later scaling. + + All of X is processed as a single batch. This is intended for cases + when :meth:`fit` is not feasible due to very large number of + `n_samples` or because X is read from a continuous stream. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + xp, _ = get_namespace(X) + + first_pass = not hasattr(self, "n_samples_seen_") + X = validate_data( + self, + X, + reset=first_pass, + accept_sparse=("csr", "csc"), + dtype=_array_api.supported_float_dtypes(xp), + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + mins, maxs = min_max_axis(X, axis=0, ignore_nan=True) + max_abs = np.maximum(np.abs(mins), np.abs(maxs)) + else: + max_abs = _array_api._nanmax(xp.abs(X), axis=0, xp=xp) + + if first_pass: + self.n_samples_seen_ = X.shape[0] + else: + max_abs = xp.maximum(self.max_abs_, max_abs) + self.n_samples_seen_ += X.shape[0] + + self.max_abs_ = max_abs + self.scale_ = _handle_zeros_in_scale(max_abs, copy=True) + return self + + def transform(self, X): + """Scale the data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data that should be scaled. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + check_is_fitted(self) + + xp, _ = get_namespace(X) + + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + copy=self.copy, + reset=False, + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + inplace_column_scale(X, 1.0 / self.scale_) + else: + X /= self.scale_ + return X + + def inverse_transform(self, X): + """Scale back the data to the original representation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data that should be transformed back. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + check_is_fitted(self) + + xp, _ = get_namespace(X) + + X = check_array( + X, + accept_sparse=("csr", "csc"), + copy=self.copy, + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + inplace_column_scale(X, self.scale_) + else: + X *= self.scale_ + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "axis": [Options(Integral, {0, 1})], + }, + prefer_skip_nested_validation=False, +) +def maxabs_scale(X, *, axis=0, copy=True): + """Scale each feature to the [-1, 1] range without breaking the sparsity. 
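+
+    This is a functional shortcut: internally a
+    :class:`~sklearn.preprocessing.MaxAbsScaler` is fitted and applied in a
+    single step.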
+ + This estimator scales each feature individually such + that the maximal absolute value of each feature in the + training set will be 1.0. + + This scaler can also be applied to sparse CSR or CSC matrices. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data. + + axis : {0, 1}, default=0 + Axis used to scale along. If 0, independently scale each feature, + otherwise (if 1) scale each sample. + + copy : bool, default=True + If False, try to avoid a copy and scale in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.maxabs_scale` unless you know + what you are doing. A common mistake is to apply it to the entire data + *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.MaxAbsScaler` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking: `pipe = make_pipeline(MaxAbsScaler(), LogisticRegression())`. + + See Also + -------- + MaxAbsScaler : Performs scaling to the [-1, 1] range using + the Transformer API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + NaNs are treated as missing values: disregarded to compute the statistics, + and maintained during the data transformation. + + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + Examples + -------- + >>> from sklearn.preprocessing import maxabs_scale + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> maxabs_scale(X, axis=0) # scale each column independently + array([[-1. , 1. , 1. ], + [-0.5, 0. , 0.5]]) + >>> maxabs_scale(X, axis=1) # scale each row independently + array([[-1. , 0.5, 1. ], + [-1. , 0. , 1. ]]) + """ + # Unlike the scaler object, this function allows 1d input. + + # If copy is required, it will be done inside the scaler object. + X = check_array( + X, + accept_sparse=("csr", "csc"), + copy=False, + ensure_2d=False, + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + original_ndim = X.ndim + + if original_ndim == 1: + X = X.reshape(X.shape[0], 1) + + s = MaxAbsScaler(copy=copy) + if axis == 0: + X = s.fit_transform(X) + else: + X = s.fit_transform(X.T).T + + if original_ndim == 1: + X = X.ravel() + + return X + + +class RobustScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Scale features using statistics that are robust to outliers. + + This Scaler removes the median and scales the data according to + the quantile range (defaults to IQR: Interquartile Range). + The IQR is the range between the 1st quartile (25th quantile) + and the 3rd quartile (75th quantile). + + Centering and scaling happen independently on each feature by + computing the relevant statistics on the samples in the training + set. Median and interquartile range are then stored to be used on + later data using the :meth:`transform` method. + + Standardization of a dataset is a common preprocessing for many machine + learning estimators. Typically this is done by removing the mean and + scaling to unit variance. 
However, outliers can often influence the sample
+    mean / variance in a negative way. In such cases, using the median and the
+    interquartile range often gives better results. For an example visualization
+    and comparison to other scalers, refer to :ref:`Compare RobustScaler with
+    other scalers `.
+
+    .. versionadded:: 0.17
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    with_centering : bool, default=True
+        If `True`, center the data before scaling.
+        This will cause :meth:`transform` to raise an exception when attempted
+        on sparse matrices, because centering them entails building a dense
+        matrix which in common use cases is likely to be too large to fit in
+        memory.
+
+    with_scaling : bool, default=True
+        If `True`, scale the data to interquartile range.
+
+    quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0, \
+        default=(25.0, 75.0)
+        Quantile range used to calculate `scale_`. By default this is equal to
+        the IQR, i.e., `q_min` is the first quartile and `q_max` is the third
+        quartile.
+
+        .. versionadded:: 0.18
+
+    copy : bool, default=True
+        If `False`, try to avoid a copy and do inplace scaling instead.
+        This is not guaranteed to always work inplace; e.g. if the data is
+        not a NumPy array or scipy.sparse CSR matrix, a copy may still be
+        returned.
+
+    unit_variance : bool, default=False
+        If `True`, scale data so that normally distributed features have a
+        variance of 1. In general, if the difference between the x-values of
+        `q_max` and `q_min` for a standard normal distribution is greater
+        than 1, the dataset will be scaled down. If less than 1, the dataset
+        will be scaled up.
+
+        .. versionadded:: 0.24
+
+    Attributes
+    ----------
+    center_ : array of floats
+        The median value for each feature in the training set.
+
+    scale_ : array of floats
+        The (scaled) interquartile range for each feature in the training set.
+
+        .. versionadded:: 0.17
+           *scale_* attribute.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    robust_scale : Equivalent function without the estimator API.
+    sklearn.decomposition.PCA : Further removes the linear correlation across
+        features with 'whiten=True'.
+
+    Notes
+    -----
+
+    https://en.wikipedia.org/wiki/Median
+    https://en.wikipedia.org/wiki/Interquartile_range
+
+    Examples
+    --------
+    >>> from sklearn.preprocessing import RobustScaler
+    >>> X = [[ 1., -2., 2.],
+    ...      [ -2., 1., 3.],
+    ...      [ 4., 1., -2.]]
+    >>> transformer = RobustScaler().fit(X)
+    >>> transformer
+    RobustScaler()
+    >>> transformer.transform(X)
+    array([[ 0. , -2. , 0. ],
+           [-1. , 0. , 0.4],
+           [ 1. , 0. , -1.6]])
+    """
+
+    _parameter_constraints: dict = {
+        "with_centering": ["boolean"],
+        "with_scaling": ["boolean"],
+        "quantile_range": [tuple],
+        "copy": ["boolean"],
+        "unit_variance": ["boolean"],
+    }
+
+    def __init__(
+        self,
+        *,
+        with_centering=True,
+        with_scaling=True,
+        quantile_range=(25.0, 75.0),
+        copy=True,
+        unit_variance=False,
+    ):
+        self.with_centering = with_centering
+        self.with_scaling = with_scaling
+        self.quantile_range = quantile_range
+        self.unit_variance = unit_variance
+        self.copy = copy
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None):
+        """Compute the median and quantiles to be used for scaling.
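+
+        The per-feature median becomes ``center_`` and the (possibly
+        rescaled) ``quantile_range`` spread becomes ``scale_``; both are
+        reused by :meth:`transform`.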
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the median and quantiles + used for later scaling along the features axis. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Fitted scaler. + """ + # at fit, convert sparse matrices to csc for optimized computation of + # the quantiles + X = validate_data( + self, + X, + accept_sparse="csc", + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + + q_min, q_max = self.quantile_range + if not 0 <= q_min <= q_max <= 100: + raise ValueError("Invalid quantile range: %s" % str(self.quantile_range)) + + if self.with_centering: + if sparse.issparse(X): + raise ValueError( + "Cannot center sparse matrices: use `with_centering=False`" + " instead. See docstring for motivation and alternatives." + ) + self.center_ = np.nanmedian(X, axis=0) + else: + self.center_ = None + + if self.with_scaling: + quantiles = [] + for feature_idx in range(X.shape[1]): + if sparse.issparse(X): + column_nnz_data = X.data[ + X.indptr[feature_idx] : X.indptr[feature_idx + 1] + ] + column_data = np.zeros(shape=X.shape[0], dtype=X.dtype) + column_data[: len(column_nnz_data)] = column_nnz_data + else: + column_data = X[:, feature_idx] + + quantiles.append(np.nanpercentile(column_data, self.quantile_range)) + + quantiles = np.transpose(quantiles) + + self.scale_ = quantiles[1] - quantiles[0] + self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False) + if self.unit_variance: + adjust = stats.norm.ppf(q_max / 100.0) - stats.norm.ppf(q_min / 100.0) + self.scale_ = self.scale_ / adjust + else: + self.scale_ = None + + return self + + def transform(self, X): + """Center and scale the data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the specified axis. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + check_is_fitted(self) + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + copy=self.copy, + dtype=FLOAT_DTYPES, + force_writeable=True, + reset=False, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + if self.with_scaling: + inplace_column_scale(X, 1.0 / self.scale_) + else: + if self.with_centering: + X -= self.center_ + if self.with_scaling: + X /= self.scale_ + return X + + def inverse_transform(self, X): + """Scale back the data to the original representation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The rescaled data to be transformed back. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. 
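+
+        Notes
+        -----
+        Inverts :meth:`transform`: the data is multiplied back by ``scale_``
+        (if scaling was enabled) and shifted back by ``center_`` (if
+        centering was enabled).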
+ """ + check_is_fitted(self) + X = check_array( + X, + accept_sparse=("csr", "csc"), + copy=self.copy, + dtype=FLOAT_DTYPES, + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + if self.with_scaling: + inplace_column_scale(X, self.scale_) + else: + if self.with_scaling: + X *= self.scale_ + if self.with_centering: + X += self.center_ + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +@validate_params( + {"X": ["array-like", "sparse matrix"], "axis": [Options(Integral, {0, 1})]}, + prefer_skip_nested_validation=False, +) +def robust_scale( + X, + *, + axis=0, + with_centering=True, + with_scaling=True, + quantile_range=(25.0, 75.0), + copy=True, + unit_variance=False, +): + """Standardize a dataset along any axis. + + Center to the median and component wise scale + according to the interquartile range. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_sample, n_features) + The data to center and scale. + + axis : int, default=0 + Axis used to compute the medians and IQR along. If 0, + independently scale each feature, otherwise (if 1) scale + each sample. + + with_centering : bool, default=True + If `True`, center the data before scaling. + + with_scaling : bool, default=True + If `True`, scale the data to unit variance (or equivalently, + unit standard deviation). + + quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0,\ + default=(25.0, 75.0) + Quantile range used to calculate `scale_`. By default this is equal to + the IQR, i.e., `q_min` is the first quantile and `q_max` is the third + quantile. + + .. versionadded:: 0.18 + + copy : bool, default=True + If False, try to avoid a copy and scale in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + unit_variance : bool, default=False + If `True`, scale data so that normally distributed features have a + variance of 1. In general, if the difference between the x-values of + `q_max` and `q_min` for a standard normal distribution is greater + than 1, the dataset will be scaled down. If less than 1, the dataset + will be scaled up. + + .. versionadded:: 0.24 + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + RobustScaler : Performs centering and scaling using the Transformer API + (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + This implementation will refuse to center scipy.sparse matrices + since it would make them non-sparse and would potentially crash the + program with memory exhaustion problems. + + Instead the caller is expected to either set explicitly + `with_centering=False` (in that case, only variance scaling will be + performed on the features of the CSR matrix) or to call `X.toarray()` + if he/she expects the materialized dense array to fit in memory. + + To avoid memory copy the caller should pass a CSR matrix. + + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.robust_scale` unless you know + what you are doing. A common mistake is to apply it to the entire data + *before* splitting into training and test sets. 
This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.RobustScaler` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking: `pipe = make_pipeline(RobustScaler(), LogisticRegression())`. + + Examples + -------- + >>> from sklearn.preprocessing import robust_scale + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> robust_scale(X, axis=0) # scale each column independently + array([[-1., 1., 1.], + [ 1., -1., -1.]]) + >>> robust_scale(X, axis=1) # scale each row independently + array([[-1.5, 0. , 0.5], + [-1. , 0. , 1. ]]) + """ + X = check_array( + X, + accept_sparse=("csr", "csc"), + copy=False, + ensure_2d=False, + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + original_ndim = X.ndim + + if original_ndim == 1: + X = X.reshape(X.shape[0], 1) + + s = RobustScaler( + with_centering=with_centering, + with_scaling=with_scaling, + quantile_range=quantile_range, + unit_variance=unit_variance, + copy=copy, + ) + if axis == 0: + X = s.fit_transform(X) + else: + X = s.fit_transform(X.T).T + + if original_ndim == 1: + X = X.ravel() + + return X + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "norm": [StrOptions({"l1", "l2", "max"})], + "axis": [Options(Integral, {0, 1})], + "copy": ["boolean"], + "return_norm": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def normalize(X, norm="l2", *, axis=1, copy=True, return_norm=False): + """Scale input vectors individually to unit norm (vector length). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to normalize, element by element. + scipy.sparse matrices should be in CSR format to avoid an + un-necessary copy. + + norm : {'l1', 'l2', 'max'}, default='l2' + The norm to use to normalize each non zero sample (or each non-zero + feature if axis is 0). + + axis : {0, 1}, default=1 + Define axis used to normalize the data along. If 1, independently + normalize each sample, otherwise (if 0) normalize each feature. + + copy : bool, default=True + If False, try to avoid a copy and normalize in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + return_norm : bool, default=False + Whether to return the computed norms. + + Returns + ------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Normalized input X. + + norms : ndarray of shape (n_samples, ) if axis=1 else (n_features, ) + An array of norms along given axis for X. + When X is sparse, a NotImplementedError will be raised + for norm 'l1' or 'l2'. + + See Also + -------- + Normalizer : Performs normalization using the Transformer API + (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + Examples + -------- + >>> from sklearn.preprocessing import normalize + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> normalize(X, norm="l1") # L1 normalization each row independently + array([[-0.4, 0.2, 0.4], + [-0.5, 0. , 0.5]]) + >>> normalize(X, norm="l2") # L2 normalization each row independently + array([[-0.66..., 0.33..., 0.66...], + [-0.70..., 0. 
, 0.70...]]) + """ + if axis == 0: + sparse_format = "csc" + else: # axis == 1: + sparse_format = "csr" + + xp, _ = get_namespace(X) + + X = check_array( + X, + accept_sparse=sparse_format, + copy=copy, + estimator="the normalize function", + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ) + if axis == 0: + X = X.T + + if sparse.issparse(X): + if return_norm and norm in ("l1", "l2"): + raise NotImplementedError( + "return_norm=True is not implemented " + "for sparse matrices with norm 'l1' " + "or norm 'l2'" + ) + if norm == "l1": + inplace_csr_row_normalize_l1(X) + elif norm == "l2": + inplace_csr_row_normalize_l2(X) + elif norm == "max": + mins, maxes = min_max_axis(X, 1) + norms = np.maximum(abs(mins), maxes) + norms_elementwise = norms.repeat(np.diff(X.indptr)) + mask = norms_elementwise != 0 + X.data[mask] /= norms_elementwise[mask] + else: + if norm == "l1": + norms = xp.sum(xp.abs(X), axis=1) + elif norm == "l2": + norms = row_norms(X) + elif norm == "max": + norms = xp.max(xp.abs(X), axis=1) + norms = _handle_zeros_in_scale(norms, copy=False) + X /= norms[:, None] + + if axis == 0: + X = X.T + + if return_norm: + return X, norms + else: + return X + + +class Normalizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Normalize samples individually to unit norm. + + Each sample (i.e. each row of the data matrix) with at least one + non zero component is rescaled independently of other samples so + that its norm (l1, l2 or inf) equals one. + + This transformer is able to work both with dense numpy arrays and + scipy.sparse matrix (use CSR format if you want to avoid the burden of + a copy / conversion). + + Scaling inputs to unit norms is a common operation for text + classification or clustering for instance. For instance the dot + product of two l2-normalized TF-IDF vectors is the cosine similarity + of the vectors and is the base similarity metric for the Vector + Space Model commonly used by the Information Retrieval community. + + For an example visualization, refer to :ref:`Compare Normalizer with other + scalers `. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + norm : {'l1', 'l2', 'max'}, default='l2' + The norm to use to normalize each non zero sample. If norm='max' + is used, values will be rescaled by the maximum of the absolute + values. + + copy : bool, default=True + Set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array or a scipy.sparse + CSR matrix). + + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + normalize : Equivalent function without the estimator API. + + Notes + ----- + This estimator is :term:`stateless` and does not need to be fitted. + However, we recommend to call :meth:`fit_transform` instead of + :meth:`transform`, as parameter validation is only performed in + :meth:`fit`. + + Examples + -------- + >>> from sklearn.preprocessing import Normalizer + >>> X = [[4, 1, 2, 2], + ... [1, 3, 9, 3], + ... [5, 7, 5, 1]] + >>> transformer = Normalizer().fit(X) # fit does nothing. 
+ >>> transformer + Normalizer() + >>> transformer.transform(X) + array([[0.8, 0.2, 0.4, 0.4], + [0.1, 0.3, 0.9, 0.3], + [0.5, 0.7, 0.5, 0.1]]) + """ + + _parameter_constraints: dict = { + "norm": [StrOptions({"l1", "l2", "max"})], + "copy": ["boolean"], + } + + def __init__(self, norm="l2", *, copy=True): + self.norm = norm + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Only validates estimator's parameters. + + This method allows to: (i) validate the estimator's parameters and + (ii) be consistent with the scikit-learn transformer API. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to estimate the normalization parameters. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Fitted transformer. + """ + validate_data(self, X, accept_sparse="csr") + return self + + def transform(self, X, copy=None): + """Scale each non zero row of X to unit norm. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to normalize, row by row. scipy.sparse matrices should be + in CSR format to avoid an un-necessary copy. + + copy : bool, default=None + Copy the input X or not. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + copy = copy if copy is not None else self.copy + X = validate_data( + self, X, accept_sparse="csr", force_writeable=True, copy=copy, reset=False + ) + return normalize(X, norm=self.norm, axis=1, copy=False) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.requires_fit = False + tags.array_api_support = True + return tags + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "threshold": [Interval(Real, None, None, closed="neither")], + "copy": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def binarize(X, *, threshold=0.0, copy=True): + """Boolean thresholding of array-like or scipy.sparse matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to binarize, element by element. + scipy.sparse matrices should be in CSR or CSC format to avoid an + un-necessary copy. + + threshold : float, default=0.0 + Feature values below or equal to this are replaced by 0, above it by 1. + Threshold may not be less than 0 for operations on sparse matrices. + + copy : bool, default=True + If False, try to avoid a copy and binarize in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an object dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + Binarizer : Performs binarization using the Transformer API + (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`). 
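+
+    Notes
+    -----
+    Values strictly greater than `threshold` map to 1; values equal to it map
+    to 0. A one-line boundary-case sketch (an illustrative addition):
+    ``binarize([[0.5]], threshold=0.5)`` returns ``array([[0.]])``.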
+ + Examples + -------- + >>> from sklearn.preprocessing import binarize + >>> X = [[0.4, 0.6, 0.5], [0.6, 0.1, 0.2]] + >>> binarize(X, threshold=0.5) + array([[0., 1., 0.], + [1., 0., 0.]]) + """ + X = check_array(X, accept_sparse=["csr", "csc"], force_writeable=True, copy=copy) + if sparse.issparse(X): + if threshold < 0: + raise ValueError("Cannot binarize a sparse matrix with threshold < 0") + cond = X.data > threshold + not_cond = np.logical_not(cond) + X.data[cond] = 1 + X.data[not_cond] = 0 + X.eliminate_zeros() + else: + cond = X > threshold + not_cond = np.logical_not(cond) + X[cond] = 1 + X[not_cond] = 0 + return X + + +class Binarizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Binarize data (set feature values to 0 or 1) according to a threshold. + + Values greater than the threshold map to 1, while values less than + or equal to the threshold map to 0. With the default threshold of 0, + only positive values map to 1. + + Binarization is a common operation on text count data where the + analyst can decide to only consider the presence or absence of a + feature rather than a quantified number of occurrences for instance. + + It can also be used as a pre-processing step for estimators that + consider boolean random variables (e.g. modelled using the Bernoulli + distribution in a Bayesian setting). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + threshold : float, default=0.0 + Feature values below or equal to this are replaced by 0, above it by 1. + Threshold may not be less than 0 for operations on sparse matrices. + + copy : bool, default=True + Set to False to perform inplace binarization and avoid a copy (if + the input is already a numpy array or a scipy.sparse CSR matrix). + + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + binarize : Equivalent function without the estimator API. + KBinsDiscretizer : Bin continuous data into intervals. + OneHotEncoder : Encode categorical features as a one-hot numeric array. + + Notes + ----- + If the input is a sparse matrix, only the non-zero values are subject + to update by the :class:`Binarizer` class. + + This estimator is :term:`stateless` and does not need to be fitted. + However, we recommend to call :meth:`fit_transform` instead of + :meth:`transform`, as parameter validation is only performed in + :meth:`fit`. + + Examples + -------- + >>> from sklearn.preprocessing import Binarizer + >>> X = [[ 1., -1., 2.], + ... [ 2., 0., 0.], + ... [ 0., 1., -1.]] + >>> transformer = Binarizer().fit(X) # fit does nothing. + >>> transformer + Binarizer() + >>> transformer.transform(X) + array([[1., 0., 1.], + [1., 0., 0.], + [0., 1., 0.]]) + """ + + _parameter_constraints: dict = { + "threshold": [Real], + "copy": ["boolean"], + } + + def __init__(self, *, threshold=0.0, copy=True): + self.threshold = threshold + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Only validates estimator's parameters. + + This method allows to: (i) validate the estimator's parameters and + (ii) be consistent with the scikit-learn transformer API. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data. + + y : None + Ignored. 
+
+        Returns
+        -------
+        self : object
+            Fitted transformer.
+        """
+        validate_data(self, X, accept_sparse="csr")
+        return self
+
+    def transform(self, X, copy=None):
+        """Binarize each element of X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The data to binarize, element by element.
+            scipy.sparse matrices should be in CSR format to avoid an
+            unnecessary copy.
+
+        copy : bool, default=None
+            Copy the input X or not.
+
+        Returns
+        -------
+        X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
+            Transformed array.
+        """
+        copy = copy if copy is not None else self.copy
+        # TODO: This should be refactored because binarize also calls
+        # check_array
+        X = validate_data(
+            self,
+            X,
+            accept_sparse=["csr", "csc"],
+            force_writeable=True,
+            copy=copy,
+            reset=False,
+        )
+        return binarize(X, threshold=self.threshold, copy=False)
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.requires_fit = False
+        return tags
+
+
+class KernelCenterer(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
+    r"""Center an arbitrary kernel matrix :math:`K`.
+
+    Let us define a kernel :math:`K` such that:
+
+    .. math::
+        K(X, Y) = \phi(X) . \phi(Y)^{T}
+
+    :math:`\phi(X)` is a function mapping the rows of :math:`X` to a
+    Hilbert space and :math:`K` is of shape `(n_samples, n_samples)`.
+
+    This class computes :math:`\tilde{K}(X, Y)` such that:
+
+    .. math::
+        \tilde{K}(X, Y) = \tilde{\phi}(X) . \tilde{\phi}(Y)^{T}
+
+    :math:`\tilde{\phi}(X)` is the centered mapped data in the Hilbert
+    space.
+
+    `KernelCenterer` centers the features without explicitly computing the
+    mapping :math:`\phi(\cdot)`. Working with centered kernels is sometimes
+    required in algebraic computations such as the eigendecomposition
+    performed by :class:`~sklearn.decomposition.KernelPCA`, for instance.
+
+    Read more in the :ref:`User Guide `.
+
+    Attributes
+    ----------
+    K_fit_rows_ : ndarray of shape (n_samples,)
+        Average of each column of kernel matrix.
+
+    K_fit_all_ : float
+        Average of kernel matrix.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    sklearn.kernel_approximation.Nystroem : Approximate a kernel map
+        using a subset of the training data.
+
+    References
+    ----------
+    .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller.
+       "Nonlinear component analysis as a kernel eigenvalue problem."
+       Neural computation 10.5 (1998): 1299-1319.
+       `_
+
+    Examples
+    --------
+    >>> from sklearn.preprocessing import KernelCenterer
+    >>> from sklearn.metrics.pairwise import pairwise_kernels
+    >>> X = [[ 1., -2., 2.],
+    ...      [ -2., 1., 3.],
+    ...      [ 4., 1., -2.]]
+    >>> K = pairwise_kernels(X, metric='linear')
+    >>> K
+    array([[  9.,   2.,  -2.],
+           [  2.,  14., -13.],
+           [ -2., -13.,  21.]])
+    >>> transformer = KernelCenterer().fit(K)
+    >>> transformer
+    KernelCenterer()
+    >>> transformer.transform(K)
+    array([[  5.,   0.,  -5.],
+           [  0.,  14., -14.],
+           [ -5., -14.,  19.]])
+    """
+
+    # X is called K in these methods.
+    __metadata_request__transform = {"K": metadata_routing.UNUSED}
+    __metadata_request__fit = {"K": metadata_routing.UNUSED}
+
+    def fit(self, K, y=None):
+        """Fit KernelCenterer.
+
+        Parameters
+        ----------
+        K : ndarray of shape (n_samples, n_samples)
+            Kernel matrix.
+
+        y : None
+            Ignored.
+
+        Returns
+        -------
+        self : object
+            Returns the instance itself.
+        """
+        xp, _ = get_namespace(K)
+
+        K = validate_data(self, K, dtype=_array_api.supported_float_dtypes(xp))
+
+        if K.shape[0] != K.shape[1]:
+            raise ValueError(
+                "Kernel matrix must be a square matrix."
+                " Input is a {}x{} matrix.".format(K.shape[0], K.shape[1])
+            )
+
+        n_samples = K.shape[0]
+        self.K_fit_rows_ = xp.sum(K, axis=0) / n_samples
+        self.K_fit_all_ = xp.sum(self.K_fit_rows_) / n_samples
+        return self
+
+    def transform(self, K, copy=True):
+        """Center kernel matrix.
+
+        Parameters
+        ----------
+        K : ndarray of shape (n_samples1, n_samples2)
+            Kernel matrix.
+
+        copy : bool, default=True
+            Set to False to perform inplace computation.
+
+        Returns
+        -------
+        K_new : ndarray of shape (n_samples1, n_samples2)
+            The centered kernel matrix.
+        """
+        check_is_fitted(self)
+
+        xp, _ = get_namespace(K)
+
+        K = validate_data(
+            self,
+            K,
+            copy=copy,
+            force_writeable=True,
+            dtype=_array_api.supported_float_dtypes(xp),
+            reset=False,
+        )
+
+        K_pred_cols = (xp.sum(K, axis=1) / self.K_fit_rows_.shape[0])[:, None]
+
+        K -= self.K_fit_rows_
+        K -= K_pred_cols
+        K += self.K_fit_all_
+
+        return K
+
+    @property
+    def _n_features_out(self):
+        """Number of transformed output features."""
+        # Used by ClassNamePrefixFeaturesOutMixin. This model preserves the
+        # number of input features but this is not a one-to-one mapping in the
+        # usual sense. Hence the choice not to use OneToOneFeatureMixin to
+        # implement get_feature_names_out for this class.
+        return self.n_features_in_
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.pairwise = True
+        tags.array_api_support = True
+        return tags
+
+
+@validate_params(
+    {
+        "X": ["array-like", "sparse matrix"],
+        "value": [Interval(Real, None, None, closed="neither")],
+    },
+    prefer_skip_nested_validation=True,
+)
+def add_dummy_feature(X, value=1.0):
+    """Augment dataset with an additional dummy feature.
+
+    This is useful for fitting an intercept term with implementations which
+    cannot otherwise fit it directly.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        Data.
+
+    value : float, default=1.0
+        Value to use for the dummy feature.
+
+    Returns
+    -------
+    X : {ndarray, sparse matrix} of shape (n_samples, n_features + 1)
+        Same data with dummy feature added as first column.
+
+    Examples
+    --------
+    >>> from sklearn.preprocessing import add_dummy_feature
+    >>> add_dummy_feature([[0, 1], [1, 0]])
+    array([[1., 0., 1.],
+           [1., 1., 0.]])
+    """
+    X = check_array(X, accept_sparse=["csc", "csr", "coo"], dtype=FLOAT_DTYPES)
+    n_samples, n_features = X.shape
+    shape = (n_samples, n_features + 1)
+    if sparse.issparse(X):
+        if X.format == "coo":
+            # Shift columns to the right.
+            col = X.col + 1
+            # Column indices of dummy feature are 0 everywhere.
+            col = np.concatenate((np.zeros(n_samples), col))
+            # Row indices of dummy feature are 0, ..., n_samples-1.
+            row = np.concatenate((np.arange(n_samples), X.row))
+            # Prepend the dummy feature n_samples times.
+            data = np.concatenate((np.full(n_samples, value), X.data))
+            return sparse.coo_matrix((data, (row, col)), shape)
+        elif X.format == "csc":
+            # Shift index pointers since we need to add n_samples elements.
+            indptr = X.indptr + n_samples
+            # indptr[0] must be 0.
+            indptr = np.concatenate((np.array([0]), indptr))
+            # Row indices of dummy feature are 0, ..., n_samples-1.
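+            # (The original row indices follow unchanged below: prepending a
+            # column only shifts the column bookkeeping, never the rows.)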
+            indices = np.concatenate((np.arange(n_samples), X.indices))
+            # Prepend the dummy feature n_samples times.
+            data = np.concatenate((np.full(n_samples, value), X.data))
+            return sparse.csc_matrix((data, indices, indptr), shape)
+        else:
+            klass = X.__class__
+            return klass(add_dummy_feature(X.tocoo(), value))
+    else:
+        return np.hstack((np.full((n_samples, 1), value), X))
+
+
+class QuantileTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
+    """Transform features using quantiles information.
+
+    This method transforms the features to follow a uniform or a normal
+    distribution. Therefore, for a given feature, this transformation tends
+    to spread out the most frequent values. It also reduces the impact of
+    (marginal) outliers: this is therefore a robust preprocessing scheme.
+
+    The transformation is applied on each feature independently. First an
+    estimate of the cumulative distribution function of a feature is
+    used to map the original values to a uniform distribution. The obtained
+    values are then mapped to the desired output distribution using the
+    associated quantile function. Feature values of new/unseen data that fall
+    below or above the fitted range will be mapped to the bounds of the output
+    distribution. Note that this transform is non-linear. It may distort linear
+    correlations between variables measured at the same scale but renders
+    variables measured at different scales more directly comparable.
+
+    For example visualizations, refer to :ref:`Compare QuantileTransformer with
+    other scalers `.
+
+    Read more in the :ref:`User Guide `.
+
+    .. versionadded:: 0.19
+
+    Parameters
+    ----------
+    n_quantiles : int, default=1000 or n_samples
+        Number of quantiles to be computed. It corresponds to the number
+        of landmarks used to discretize the cumulative distribution function.
+        If n_quantiles is larger than the number of samples, n_quantiles is set
+        to the number of samples as a larger number of quantiles does not give
+        a better approximation of the cumulative distribution function
+        estimator.
+
+    output_distribution : {'uniform', 'normal'}, default='uniform'
+        Marginal distribution for the transformed data. The choices are
+        'uniform' (default) or 'normal'.
+
+    ignore_implicit_zeros : bool, default=False
+        Only applies to sparse matrices. If True, the sparse entries of the
+        matrix are discarded to compute the quantile statistics. If False,
+        these entries are treated as zeros.
+
+    subsample : int or None, default=10_000
+        Maximum number of samples used to estimate the quantiles for
+        computational efficiency. Note that the subsampling procedure may
+        differ for value-identical sparse and dense matrices.
+        Disable subsampling by setting `subsample=None`.
+
+        .. versionadded:: 1.5
+           The option `None` to disable subsampling was added.
+
+    random_state : int, RandomState instance or None, default=None
+        Determines random number generation for subsampling and smoothing
+        noise.
+        Please see ``subsample`` for more details.
+        Pass an int for reproducible results across multiple function calls.
+        See :term:`Glossary `.
+
+    copy : bool, default=True
+        Set to False to perform inplace transformation and avoid a copy (if the
+        input is already a numpy array).
+
+    Attributes
+    ----------
+    n_quantiles_ : int
+        The actual number of quantiles used to discretize the cumulative
+        distribution function.
+
+    quantiles_ : ndarray of shape (n_quantiles, n_features)
+        The values corresponding to the quantiles of reference.
+ + references_ : ndarray of shape (n_quantiles, ) + Quantiles of references. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + quantile_transform : Equivalent function without the estimator API. + PowerTransformer : Perform mapping to a normal distribution using a power + transform. + StandardScaler : Perform standardization that is faster, but less robust + to outliers. + RobustScaler : Perform robust standardization that removes the influence + of outliers but does not put outliers and inliers on the same scale. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import QuantileTransformer + >>> rng = np.random.RandomState(0) + >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) + >>> qt = QuantileTransformer(n_quantiles=10, random_state=0) + >>> qt.fit_transform(X) + array([...]) + """ + + _parameter_constraints: dict = { + "n_quantiles": [Interval(Integral, 1, None, closed="left")], + "output_distribution": [StrOptions({"uniform", "normal"})], + "ignore_implicit_zeros": ["boolean"], + "subsample": [Interval(Integral, 1, None, closed="left"), None], + "random_state": ["random_state"], + "copy": ["boolean"], + } + + def __init__( + self, + *, + n_quantiles=1000, + output_distribution="uniform", + ignore_implicit_zeros=False, + subsample=10_000, + random_state=None, + copy=True, + ): + self.n_quantiles = n_quantiles + self.output_distribution = output_distribution + self.ignore_implicit_zeros = ignore_implicit_zeros + self.subsample = subsample + self.random_state = random_state + self.copy = copy + + def _dense_fit(self, X, random_state): + """Compute percentiles for dense matrices. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data used to scale along the features axis. + """ + if self.ignore_implicit_zeros: + warnings.warn( + "'ignore_implicit_zeros' takes effect only with" + " sparse matrix. This parameter has no effect." + ) + + n_samples, n_features = X.shape + references = self.references_ * 100 + + if self.subsample is not None and self.subsample < n_samples: + # Take a subsample of `X` + X = resample( + X, replace=False, n_samples=self.subsample, random_state=random_state + ) + + self.quantiles_ = np.nanpercentile(X, references, axis=0) + # Due to floating-point precision error in `np.nanpercentile`, + # make sure that quantiles are monotonically increasing. + # Upstream issue in numpy: + # https://github.com/numpy/numpy/issues/14685 + self.quantiles_ = np.maximum.accumulate(self.quantiles_) + + def _sparse_fit(self, X, random_state): + """Compute percentiles for sparse matrices. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features) + The data used to scale along the features axis. The sparse matrix + needs to be nonnegative. If a sparse matrix is provided, + it will be converted into a sparse ``csc_matrix``. 
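+            Implicit zeros count toward the quantile statistics unless
+            ``ignore_implicit_zeros`` is True.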
+ """ + n_samples, n_features = X.shape + references = self.references_ * 100 + + self.quantiles_ = [] + for feature_idx in range(n_features): + column_nnz_data = X.data[X.indptr[feature_idx] : X.indptr[feature_idx + 1]] + if self.subsample is not None and len(column_nnz_data) > self.subsample: + column_subsample = self.subsample * len(column_nnz_data) // n_samples + if self.ignore_implicit_zeros: + column_data = np.zeros(shape=column_subsample, dtype=X.dtype) + else: + column_data = np.zeros(shape=self.subsample, dtype=X.dtype) + column_data[:column_subsample] = random_state.choice( + column_nnz_data, size=column_subsample, replace=False + ) + else: + if self.ignore_implicit_zeros: + column_data = np.zeros(shape=len(column_nnz_data), dtype=X.dtype) + else: + column_data = np.zeros(shape=n_samples, dtype=X.dtype) + column_data[: len(column_nnz_data)] = column_nnz_data + + if not column_data.size: + # if no nnz, an error will be raised for computing the + # quantiles. Force the quantiles to be zeros. + self.quantiles_.append([0] * len(references)) + else: + self.quantiles_.append(np.nanpercentile(column_data, references)) + self.quantiles_ = np.transpose(self.quantiles_) + # due to floating-point precision error in `np.nanpercentile`, + # make sure the quantiles are monotonically increasing + # Upstream issue in numpy: + # https://github.com/numpy/numpy/issues/14685 + self.quantiles_ = np.maximum.accumulate(self.quantiles_) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Compute the quantiles used for transforming. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + nonnegative if `ignore_implicit_zeros` is False. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted transformer. + """ + if self.subsample is not None and self.n_quantiles > self.subsample: + raise ValueError( + "The number of quantiles cannot be greater than" + " the number of samples used. Got {} quantiles" + " and {} samples.".format(self.n_quantiles, self.subsample) + ) + + X = self._check_inputs(X, in_fit=True, copy=False) + n_samples = X.shape[0] + + if self.n_quantiles > n_samples: + warnings.warn( + "n_quantiles (%s) is greater than the total number " + "of samples (%s). n_quantiles is set to " + "n_samples." 
% (self.n_quantiles, n_samples) + ) + self.n_quantiles_ = max(1, min(self.n_quantiles, n_samples)) + + rng = check_random_state(self.random_state) + + # Create the quantiles of reference + self.references_ = np.linspace(0, 1, self.n_quantiles_, endpoint=True) + if sparse.issparse(X): + self._sparse_fit(X, rng) + else: + self._dense_fit(X, rng) + + return self + + def _transform_col(self, X_col, quantiles, inverse): + """Private function to transform a single feature.""" + + output_distribution = self.output_distribution + + if not inverse: + lower_bound_x = quantiles[0] + upper_bound_x = quantiles[-1] + lower_bound_y = 0 + upper_bound_y = 1 + else: + lower_bound_x = 0 + upper_bound_x = 1 + lower_bound_y = quantiles[0] + upper_bound_y = quantiles[-1] + # for inverse transform, match a uniform distribution + with np.errstate(invalid="ignore"): # hide NaN comparison warnings + if output_distribution == "normal": + X_col = stats.norm.cdf(X_col) + # else output distribution is already a uniform distribution + + # find index for lower and higher bounds + with np.errstate(invalid="ignore"): # hide NaN comparison warnings + if output_distribution == "normal": + lower_bounds_idx = X_col - BOUNDS_THRESHOLD < lower_bound_x + upper_bounds_idx = X_col + BOUNDS_THRESHOLD > upper_bound_x + if output_distribution == "uniform": + lower_bounds_idx = X_col == lower_bound_x + upper_bounds_idx = X_col == upper_bound_x + + isfinite_mask = ~np.isnan(X_col) + X_col_finite = X_col[isfinite_mask] + if not inverse: + # Interpolate in one direction and in the other and take the + # mean. This is in case of repeated values in the features + # and hence repeated quantiles + # + # If we don't do this, only one extreme of the duplicated is + # used (the upper when we do ascending, and the + # lower for descending). We take the mean of these two + X_col[isfinite_mask] = 0.5 * ( + np.interp(X_col_finite, quantiles, self.references_) + - np.interp(-X_col_finite, -quantiles[::-1], -self.references_[::-1]) + ) + else: + X_col[isfinite_mask] = np.interp(X_col_finite, self.references_, quantiles) + + X_col[upper_bounds_idx] = upper_bound_y + X_col[lower_bounds_idx] = lower_bound_y + # for forward transform, match the output distribution + if not inverse: + with np.errstate(invalid="ignore"): # hide NaN comparison warnings + if output_distribution == "normal": + X_col = stats.norm.ppf(X_col) + # find the value to clip the data to avoid mapping to + # infinity. Clip such that the inverse transform will be + # consistent + clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1)) + clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD - np.spacing(1))) + X_col = np.clip(X_col, clip_min, clip_max) + # else output distribution is uniform and the ppf is the + # identity function so we let X_col unchanged + + return X_col + + def _check_inputs(self, X, in_fit, accept_sparse_negative=False, copy=False): + """Check inputs before fit and transform.""" + X = validate_data( + self, + X, + reset=in_fit, + accept_sparse="csc", + copy=copy, + dtype=FLOAT_DTYPES, + # only set force_writeable for the validation at transform time because + # it's the only place where QuantileTransformer performs inplace operations. + force_writeable=True if not in_fit else None, + ensure_all_finite="allow-nan", + ) + # we only accept positive sparse matrix when ignore_implicit_zeros is + # false and that we call fit or transform. 
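+        # (Note: inverse_transform calls _check_inputs with
+        # accept_sparse_negative=True, so the non-negativity check below is
+        # skipped there.)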
+ with np.errstate(invalid="ignore"): # hide NaN comparison warnings + if ( + not accept_sparse_negative + and not self.ignore_implicit_zeros + and (sparse.issparse(X) and np.any(X.data < 0)) + ): + raise ValueError( + "QuantileTransformer only accepts non-negative sparse matrices." + ) + + return X + + def _transform(self, X, inverse=False): + """Forward and inverse transform. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data used to scale along the features axis. + + inverse : bool, default=False + If False, apply forward transform. If True, apply + inverse transform. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + Projected data. + """ + if sparse.issparse(X): + for feature_idx in range(X.shape[1]): + column_slice = slice(X.indptr[feature_idx], X.indptr[feature_idx + 1]) + X.data[column_slice] = self._transform_col( + X.data[column_slice], self.quantiles_[:, feature_idx], inverse + ) + else: + for feature_idx in range(X.shape[1]): + X[:, feature_idx] = self._transform_col( + X[:, feature_idx], self.quantiles_[:, feature_idx], inverse + ) + + return X + + def transform(self, X): + """Feature-wise transformation of the data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + nonnegative if `ignore_implicit_zeros` is False. + + Returns + ------- + Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) + The projected data. + """ + check_is_fitted(self) + X = self._check_inputs(X, in_fit=False, copy=self.copy) + + return self._transform(X, inverse=False) + + def inverse_transform(self, X): + """Back-projection to the original space. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + nonnegative if `ignore_implicit_zeros` is False. + + Returns + ------- + Xt : {ndarray, sparse matrix} of (n_samples, n_features) + The projected data. + """ + check_is_fitted(self) + X = self._check_inputs( + X, in_fit=False, accept_sparse_negative=True, copy=self.copy + ) + + return self._transform(X, inverse=True) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +@validate_params( + {"X": ["array-like", "sparse matrix"], "axis": [Options(Integral, {0, 1})]}, + prefer_skip_nested_validation=False, +) +def quantile_transform( + X, + *, + axis=0, + n_quantiles=1000, + output_distribution="uniform", + ignore_implicit_zeros=False, + subsample=int(1e5), + random_state=None, + copy=True, +): + """Transform features using quantiles information. + + This method transforms the features to follow a uniform or a normal + distribution. Therefore, for a given feature, this transformation tends + to spread out the most frequent values. It also reduces the impact of + (marginal) outliers: this is therefore a robust preprocessing scheme. + + The transformation is applied on each feature independently. First an + estimate of the cumulative distribution function of a feature is + used to map the original values to a uniform distribution. 
The obtained
+    values are then mapped to the desired output distribution using the
+    associated quantile function. Feature values of new/unseen data that fall
+    below or above the fitted range will be mapped to the bounds of the output
+    distribution. Note that this transform is non-linear. It may distort linear
+    correlations between variables measured at the same scale but renders
+    variables measured at different scales more directly comparable.
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        The data to transform.
+
+    axis : int, default=0
+        Axis along which the quantiles are computed. If 0,
+        transform each feature, otherwise (if 1) transform each sample.
+
+    n_quantiles : int, default=1000 or n_samples
+        Number of quantiles to be computed. It corresponds to the number
+        of landmarks used to discretize the cumulative distribution function.
+        If n_quantiles is larger than the number of samples, n_quantiles is set
+        to the number of samples as a larger number of quantiles does not give
+        a better approximation of the cumulative distribution function
+        estimator.
+
+    output_distribution : {'uniform', 'normal'}, default='uniform'
+        Marginal distribution for the transformed data. The choices are
+        'uniform' (default) or 'normal'.
+
+    ignore_implicit_zeros : bool, default=False
+        Only applies to sparse matrices. If True, the sparse entries of the
+        matrix are discarded to compute the quantile statistics. If False,
+        these entries are treated as zeros.
+
+    subsample : int or None, default=1e5
+        Maximum number of samples used to estimate the quantiles for
+        computational efficiency. Note that the subsampling procedure may
+        differ for value-identical sparse and dense matrices.
+        Disable subsampling by setting `subsample=None`.
+
+        .. versionadded:: 1.5
+           The option `None` to disable subsampling was added.
+
+    random_state : int, RandomState instance or None, default=None
+        Determines random number generation for subsampling and smoothing
+        noise.
+        Please see ``subsample`` for more details.
+        Pass an int for reproducible results across multiple function calls.
+        See :term:`Glossary `.
+
+    copy : bool, default=True
+        If False, try to avoid a copy and transform in place.
+        This is not guaranteed to always work in place; e.g. if the data is
+        a numpy array with an int dtype, a copy will be returned even with
+        copy=False.
+
+        .. versionchanged:: 0.23
+            The default value of `copy` changed from False to True in 0.23.
+
+    Returns
+    -------
+    Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)
+        The transformed data.
+
+    See Also
+    --------
+    QuantileTransformer : Performs quantile-based scaling using the
+        Transformer API (e.g. as part of a preprocessing
+        :class:`~sklearn.pipeline.Pipeline`).
+    power_transform : Maps data to a normal distribution using a
+        power transformation.
+    scale : Performs standardization that is faster, but less robust
+        to outliers.
+    robust_scale : Performs robust standardization that removes the influence
+        of outliers but does not put outliers and inliers on the same scale.
+
+    Notes
+    -----
+    NaNs are treated as missing values: disregarded in fit, and maintained in
+    transform.
+
+    .. warning:: Risk of data leak
+
+        Do not use :func:`~sklearn.preprocessing.quantile_transform` unless
+        you know what you are doing. A common mistake is to apply it
+        to the entire data *before* splitting into training and
+        test sets. This will bias the model evaluation because
+        information would have leaked from the test set to the
+        training set.
+        In general, we recommend using
+        :class:`~sklearn.preprocessing.QuantileTransformer` within a
+        :ref:`Pipeline ` in order to prevent most risks of data
+        leaking: `pipe = make_pipeline(QuantileTransformer(),
+        LogisticRegression())`.
+
+    For a comparison of the different scalers, transformers, and normalizers,
+    see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.preprocessing import quantile_transform
+    >>> rng = np.random.RandomState(0)
+    >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0)
+    >>> quantile_transform(X, n_quantiles=10, random_state=0, copy=True)
+    array([...])
+    """
+    n = QuantileTransformer(
+        n_quantiles=n_quantiles,
+        output_distribution=output_distribution,
+        subsample=subsample,
+        ignore_implicit_zeros=ignore_implicit_zeros,
+        random_state=random_state,
+        copy=copy,
+    )
+    if axis == 0:
+        X = n.fit_transform(X)
+    else:  # axis == 1
+        X = n.fit_transform(X.T).T
+    return X
+
+
+class PowerTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
+    """Apply a power transform featurewise to make data more Gaussian-like.
+
+    Power transforms are a family of parametric, monotonic transformations
+    that are applied to make data more Gaussian-like. This is useful for
+    modeling issues related to heteroscedasticity (non-constant variance),
+    or other situations where normality is desired.
+
+    Currently, PowerTransformer supports the Box-Cox transform and the
+    Yeo-Johnson transform. The optimal parameter for stabilizing variance and
+    minimizing skewness is estimated through maximum likelihood.
+
+    Box-Cox requires input data to be strictly positive, while Yeo-Johnson
+    supports both positive and negative data.
+
+    By default, zero-mean, unit-variance normalization is applied to the
+    transformed data.
+
+    For an example visualization, refer to :ref:`Compare PowerTransformer with
+    other scalers `. To see the
+    effect of Box-Cox and Yeo-Johnson transformations on different
+    distributions, see:
+    :ref:`sphx_glr_auto_examples_preprocessing_plot_map_data_to_normal.py`.
+
+    Read more in the :ref:`User Guide `.
+
+    .. versionadded:: 0.20
+
+    Parameters
+    ----------
+    method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson'
+        The power transform method. Available methods are:
+
+        - 'yeo-johnson' [1]_, works with positive and negative values
+        - 'box-cox' [2]_, only works with strictly positive values
+
+    standardize : bool, default=True
+        Set to True to apply zero-mean, unit-variance normalization to the
+        transformed output.
+
+    copy : bool, default=True
+        Set to False to perform inplace computation during transformation.
+
+    Attributes
+    ----------
+    lambdas_ : ndarray of float of shape (n_features,)
+        The parameters of the power transformation for the selected features.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    power_transform : Equivalent function without the estimator API.
+
+    QuantileTransformer : Maps data to a standard normal distribution with
+        the parameter `output_distribution='normal'`.
+ + Notes + ----- + NaNs are treated as missing values: disregarded in ``fit``, and maintained + in ``transform``. + + References + ---------- + + .. [1] :doi:`I.K. Yeo and R.A. Johnson, "A new family of power + transformations to improve normality or symmetry." Biometrika, + 87(4), pp.954-959, (2000). <10.1093/biomet/87.4.954>` + + .. [2] :doi:`G.E.P. Box and D.R. Cox, "An Analysis of Transformations", + Journal of the Royal Statistical Society B, 26, 211-252 (1964). + <10.1111/j.2517-6161.1964.tb00553.x>` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import PowerTransformer + >>> pt = PowerTransformer() + >>> data = [[1, 2], [3, 2], [4, 5]] + >>> print(pt.fit(data)) + PowerTransformer() + >>> print(pt.lambdas_) + [ 1.386... -3.100...] + >>> print(pt.transform(data)) + [[-1.316... -0.707...] + [ 0.209... -0.707...] + [ 1.106... 1.414...]] + """ + + _parameter_constraints: dict = { + "method": [StrOptions({"yeo-johnson", "box-cox"})], + "standardize": ["boolean"], + "copy": ["boolean"], + } + + def __init__(self, method="yeo-johnson", *, standardize=True, copy=True): + self.method = method + self.standardize = standardize + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Estimate the optimal parameter lambda for each feature. + + The optimal lambda parameter for minimizing skewness is estimated on + each feature independently using maximum likelihood. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to estimate the optimal transformation parameters. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted transformer. + """ + self._fit(X, y=y, force_transform=False) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Fit `PowerTransformer` to `X`, then transform `X`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to estimate the optimal transformation parameters + and to be transformed using a power transformation. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_features) + Transformed data. 
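+
+        Examples
+        --------
+        A minimal sketch (an illustrative addition; only the output shape is
+        asserted, since the fitted lambdas depend on the data):
+
+        >>> import numpy as np
+        >>> from sklearn.preprocessing import PowerTransformer
+        >>> PowerTransformer().fit_transform(np.array([[1.0], [2.0], [3.0]])).shape
+        (3, 1)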
+ """ + return self._fit(X, y, force_transform=True) + + def _fit(self, X, y=None, force_transform=False): + X = self._check_input(X, in_fit=True, check_positive=True) + + if not self.copy and not force_transform: # if call from fit() + X = X.copy() # force copy so that fit does not change X inplace + + n_samples = X.shape[0] + mean = np.mean(X, axis=0, dtype=np.float64) + var = np.var(X, axis=0, dtype=np.float64) + + optim_function = { + "box-cox": self._box_cox_optimize, + "yeo-johnson": self._yeo_johnson_optimize, + }[self.method] + + transform_function = { + "box-cox": boxcox, + "yeo-johnson": self._yeo_johnson_transform, + }[self.method] + + with np.errstate(invalid="ignore"): # hide NaN warnings + self.lambdas_ = np.empty(X.shape[1], dtype=X.dtype) + for i, col in enumerate(X.T): + # For yeo-johnson, leave constant features unchanged + # lambda=1 corresponds to the identity transformation + is_constant_feature = _is_constant_feature(var[i], mean[i], n_samples) + if self.method == "yeo-johnson" and is_constant_feature: + self.lambdas_[i] = 1.0 + continue + + self.lambdas_[i] = optim_function(col) + + if self.standardize or force_transform: + X[:, i] = transform_function(X[:, i], self.lambdas_[i]) + + if self.standardize: + self._scaler = StandardScaler(copy=False).set_output(transform="default") + if force_transform: + X = self._scaler.fit_transform(X) + else: + self._scaler.fit(X) + + return X + + def transform(self, X): + """Apply the power transform to each feature using the fitted lambdas. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to be transformed using a power transformation. + + Returns + ------- + X_trans : ndarray of shape (n_samples, n_features) + The transformed data. + """ + check_is_fitted(self) + X = self._check_input(X, in_fit=False, check_positive=True, check_shape=True) + + transform_function = { + "box-cox": boxcox, + "yeo-johnson": self._yeo_johnson_transform, + }[self.method] + for i, lmbda in enumerate(self.lambdas_): + with np.errstate(invalid="ignore"): # hide NaN warnings + X[:, i] = transform_function(X[:, i], lmbda) + + if self.standardize: + X = self._scaler.transform(X) + + return X + + def inverse_transform(self, X): + """Apply the inverse power transformation using the fitted lambdas. + + The inverse of the Box-Cox transformation is given by:: + + if lambda_ == 0: + X = exp(X_trans) + else: + X = (X_trans * lambda_ + 1) ** (1 / lambda_) + + The inverse of the Yeo-Johnson transformation is given by:: + + if X >= 0 and lambda_ == 0: + X = exp(X_trans) - 1 + elif X >= 0 and lambda_ != 0: + X = (X_trans * lambda_ + 1) ** (1 / lambda_) - 1 + elif X < 0 and lambda_ != 2: + X = 1 - (-(2 - lambda_) * X_trans + 1) ** (1 / (2 - lambda_)) + elif X < 0 and lambda_ == 2: + X = 1 - exp(-X_trans) + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The transformed data. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + The original data. 
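+
+        Examples
+        --------
+        A round-trip sketch (an illustrative addition; recovery is exact up
+        to floating point error):
+
+        >>> import numpy as np
+        >>> from sklearn.preprocessing import PowerTransformer
+        >>> X = np.array([[1.0], [2.0], [4.0]])
+        >>> pt = PowerTransformer().fit(X)
+        >>> np.allclose(pt.inverse_transform(pt.transform(X)), X)
+        True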
+ """ + check_is_fitted(self) + X = self._check_input(X, in_fit=False, check_shape=True) + + if self.standardize: + X = self._scaler.inverse_transform(X) + + inv_fun = { + "box-cox": inv_boxcox, + "yeo-johnson": self._yeo_johnson_inverse_transform, + }[self.method] + for i, lmbda in enumerate(self.lambdas_): + with np.errstate(invalid="ignore"): # hide NaN warnings + X[:, i] = inv_fun(X[:, i], lmbda) + + return X + + def _yeo_johnson_inverse_transform(self, x, lmbda): + """Return inverse-transformed input x following Yeo-Johnson inverse + transform with parameter lambda. + """ + x_inv = np.zeros_like(x) + pos = x >= 0 + + # when x >= 0 + if abs(lmbda) < np.spacing(1.0): + x_inv[pos] = np.exp(x[pos]) - 1 + else: # lmbda != 0 + x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1 + + # when x < 0 + if abs(lmbda - 2) > np.spacing(1.0): + x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1, 1 / (2 - lmbda)) + else: # lmbda == 2 + x_inv[~pos] = 1 - np.exp(-x[~pos]) + + return x_inv + + def _yeo_johnson_transform(self, x, lmbda): + """Return transformed input x following Yeo-Johnson transform with + parameter lambda. + """ + + out = np.zeros_like(x) + pos = x >= 0 # binary mask + + # when x >= 0 + if abs(lmbda) < np.spacing(1.0): + out[pos] = np.log1p(x[pos]) + else: # lmbda != 0 + out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda + + # when x < 0 + if abs(lmbda - 2) > np.spacing(1.0): + out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda) + else: # lmbda == 2 + out[~pos] = -np.log1p(-x[~pos]) + + return out + + def _box_cox_optimize(self, x): + """Find and return optimal lambda parameter of the Box-Cox transform by + MLE, for observed data x. + + We here use scipy builtins which uses the brent optimizer. + """ + mask = np.isnan(x) + if np.all(mask): + raise ValueError("Column must not be all nan.") + + # the computation of lambda is influenced by NaNs so we need to + # get rid of them + _, lmbda = stats.boxcox(x[~mask], lmbda=None) + + return lmbda + + def _yeo_johnson_optimize(self, x): + """Find and return optimal lambda parameter of the Yeo-Johnson + transform by MLE, for observed data x. + + Like for Box-Cox, MLE is done via the brent optimizer. + """ + x_tiny = np.finfo(np.float64).tiny + + def _neg_log_likelihood(lmbda): + """Return the negative log likelihood of the observed data x as a + function of lambda.""" + x_trans = self._yeo_johnson_transform(x, lmbda) + n_samples = x.shape[0] + x_trans_var = x_trans.var() + + # Reject transformed data that would raise a RuntimeWarning in np.log + if x_trans_var < x_tiny: + return np.inf + + log_var = np.log(x_trans_var) + loglike = -n_samples / 2 * log_var + loglike += (lmbda - 1) * (np.sign(x) * np.log1p(np.abs(x))).sum() + + return -loglike + + # the computation of lambda is influenced by NaNs so we need to + # get rid of them + x = x[~np.isnan(x)] + # choosing bracket -2, 2 like for boxcox + return optimize.brent(_neg_log_likelihood, brack=(-2, 2)) + + def _check_input(self, X, in_fit, check_positive=False, check_shape=False): + """Validate the input before fit and transform. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + in_fit : bool + Whether or not `_check_input` is called from `fit` or other + methods, e.g. `predict`, `transform`, etc. + + check_positive : bool, default=False + If True, check that all data is positive and non-zero (only if + ``self.method=='box-cox'``). 
+ + check_shape : bool, default=False + If True, check that n_features matches the length of self.lambdas_ + """ + X = validate_data( + self, + X, + ensure_2d=True, + dtype=FLOAT_DTYPES, + force_writeable=True, + copy=self.copy, + ensure_all_finite="allow-nan", + reset=in_fit, + ) + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", r"All-NaN (slice|axis) encountered") + if check_positive and self.method == "box-cox" and np.nanmin(X) <= 0: + raise ValueError( + "The Box-Cox transformation can only be " + "applied to strictly positive data" + ) + + if check_shape and not X.shape[1] == len(self.lambdas_): + raise ValueError( + "Input data has a different number of features " + "than fitting data. Should have {n}, data has {m}".format( + n=len(self.lambdas_), m=X.shape[1] + ) + ) + + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +@validate_params( + {"X": ["array-like"]}, + prefer_skip_nested_validation=False, +) +def power_transform(X, method="yeo-johnson", *, standardize=True, copy=True): + """Parametric, monotonic transformation to make data more Gaussian-like. + + Power transforms are a family of parametric, monotonic transformations + that are applied to make data more Gaussian-like. This is useful for + modeling issues related to heteroscedasticity (non-constant variance), + or other situations where normality is desired. + + Currently, power_transform supports the Box-Cox transform and the + Yeo-Johnson transform. The optimal parameter for stabilizing variance and + minimizing skewness is estimated through maximum likelihood. + + Box-Cox requires input data to be strictly positive, while Yeo-Johnson + supports both positive or negative data. + + By default, zero-mean, unit-variance normalization is applied to the + transformed data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to be transformed using a power transformation. + + method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson' + The power transform method. Available methods are: + + - 'yeo-johnson' [1]_, works with positive and negative values + - 'box-cox' [2]_, only works with strictly positive values + + .. versionchanged:: 0.23 + The default value of the `method` parameter changed from + 'box-cox' to 'yeo-johnson' in 0.23. + + standardize : bool, default=True + Set to True to apply zero-mean, unit-variance normalization to the + transformed output. + + copy : bool, default=True + If False, try to avoid a copy and transform in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_trans : ndarray of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + PowerTransformer : Equivalent transformation with the + Transformer API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + quantile_transform : Maps data to a standard normal distribution with + the parameter `output_distribution='normal'`. + + Notes + ----- + NaNs are treated as missing values: disregarded in ``fit``, and maintained + in ``transform``. + + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + References + ---------- + + .. [1] I.K. Yeo and R.A. 
Johnson, "A new family of power transformations to + improve normality or symmetry." Biometrika, 87(4), pp.954-959, + (2000). + + .. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal + of the Royal Statistical Society B, 26, 211-252 (1964). + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import power_transform + >>> data = [[1, 2], [3, 2], [4, 5]] + >>> print(power_transform(data, method='box-cox')) + [[-1.332... -0.707...] + [ 0.256... -0.707...] + [ 1.076... 1.414...]] + + .. warning:: Risk of data leak. + Do not use :func:`~sklearn.preprocessing.power_transform` unless you + know what you are doing. A common mistake is to apply it to the entire + data *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.PowerTransformer` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking, e.g.: `pipe = make_pipeline(PowerTransformer(), + LogisticRegression())`. + """ + pt = PowerTransformer(method=method, standardize=standardize, copy=copy) + return pt.fit_transform(X) diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_discretization.py b/.venv/Lib/site-packages/sklearn/preprocessing/_discretization.py new file mode 100644 index 0000000000000000000000000000000000000000..ee505239dd9deea2d8680b6393548749df14f151 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_discretization.py @@ -0,0 +1,464 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +import warnings +from numbers import Integral + +import numpy as np + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils import resample +from ..utils._param_validation import Interval, Options, StrOptions +from ..utils.deprecation import _deprecate_Xt_in_inverse_transform +from ..utils.stats import _weighted_percentile +from ..utils.validation import ( + _check_feature_names_in, + _check_sample_weight, + check_array, + check_is_fitted, + validate_data, +) +from ._encoders import OneHotEncoder + + +class KBinsDiscretizer(TransformerMixin, BaseEstimator): + """ + Bin continuous data into intervals. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + Parameters + ---------- + n_bins : int or array-like of shape (n_features,), default=5 + The number of bins to produce. Raises ValueError if ``n_bins < 2``. + + encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot' + Method used to encode the transformed result. + + - 'onehot': Encode the transformed result with one-hot encoding + and return a sparse matrix. Ignored features are always + stacked to the right. + - 'onehot-dense': Encode the transformed result with one-hot encoding + and return a dense array. Ignored features are always + stacked to the right. + - 'ordinal': Return the bin identifier encoded as an integer value. + + strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile' + Strategy used to define the widths of the bins. + + - 'uniform': All bins in each feature have identical widths. + - 'quantile': All bins in each feature have the same number of points. + - 'kmeans': Values in each bin have the same nearest center of a 1D + k-means cluster. + + For an example of the different strategies see: + :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`. 
+ + dtype : {np.float32, np.float64}, default=None + The desired data-type for the output. If None, output dtype is + consistent with input dtype. Only np.float32 and np.float64 are + supported. + + .. versionadded:: 0.24 + + subsample : int or None, default=200_000 + Maximum number of samples, used to fit the model, for computational + efficiency. + `subsample=None` means that all the training samples are used when + computing the quantiles that determine the binning thresholds. + Since quantile computation relies on sorting each column of `X` and + that sorting has an `n log(n)` time complexity, + it is recommended to use subsampling on datasets with a + very large number of samples. + + .. versionchanged:: 1.3 + The default value of `subsample` changed from `None` to `200_000` when + `strategy="quantile"`. + + .. versionchanged:: 1.5 + The default value of `subsample` changed from `None` to `200_000` when + `strategy="uniform"` or `strategy="kmeans"`. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for subsampling. + Pass an int for reproducible results across multiple function calls. + See the `subsample` parameter for more details. + See :term:`Glossary `. + + .. versionadded:: 1.1 + + Attributes + ---------- + bin_edges_ : ndarray of ndarray of shape (n_features,) + The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )`` + Ignored features will have empty arrays. + + n_bins_ : ndarray of shape (n_features,), dtype=np.int64 + Number of bins per feature. Bins whose width are too small + (i.e., <= 1e-8) are removed with a warning. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + Binarizer : Class used to bin values as ``0`` or + ``1`` based on a parameter ``threshold``. + + Notes + ----- + + For a visualization of discretization on different datasets refer to + :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`. + On the effect of discretization on linear models see: + :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`. + + In bin edges for feature ``i``, the first and last values are used only for + ``inverse_transform``. During transform, bin edges are extended to:: + + np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf]) + + You can combine ``KBinsDiscretizer`` with + :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess + part of the features. + + ``KBinsDiscretizer`` might produce constant features (e.g., when + ``encode = 'onehot'`` and certain bins do not contain any data). + These features can be removed with feature selection algorithms + (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`). + + Examples + -------- + >>> from sklearn.preprocessing import KBinsDiscretizer + >>> X = [[-2, 1, -4, -1], + ... [-1, 2, -3, -0.5], + ... [ 0, 3, -2, 0.5], + ... [ 1, 4, -1, 2]] + >>> est = KBinsDiscretizer( + ... n_bins=3, encode='ordinal', strategy='uniform' + ... ) + >>> est.fit(X) + KBinsDiscretizer(...) + >>> Xt = est.transform(X) + >>> Xt # doctest: +SKIP + array([[ 0., 0., 0., 0.], + [ 1., 1., 1., 0.], + [ 2., 2., 2., 1.], + [ 2., 2., 2., 2.]]) + + Sometimes it may be useful to convert the data back into the original + feature space. 
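# Illustrative sketch of the three `strategy` options documented above,
# using toy data (an assumption for the demo) and the fitted `bin_edges_`
# attribute: 'uniform' spaces edges evenly over the value range,
# 'quantile' balances samples per bin, and 'kmeans' puts edges midway
# between 1D cluster centers.
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X = np.array([[0.0], [0.1], [0.2], [1.0], [5.0], [9.0], [9.5], [10.0]])
for strategy in ("uniform", "quantile", "kmeans"):
    est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy).fit(X)
    print(strategy, est.bin_edges_[0])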
The ``inverse_transform`` function converts the binned + data into the original feature space. Each value will be equal to the mean + of the two bin edges. + + >>> est.bin_edges_[0] + array([-2., -1., 0., 1.]) + >>> est.inverse_transform(Xt) + array([[-1.5, 1.5, -3.5, -0.5], + [-0.5, 2.5, -2.5, -0.5], + [ 0.5, 3.5, -1.5, 0.5], + [ 0.5, 3.5, -1.5, 1.5]]) + """ + + _parameter_constraints: dict = { + "n_bins": [Interval(Integral, 2, None, closed="left"), "array-like"], + "encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})], + "strategy": [StrOptions({"uniform", "quantile", "kmeans"})], + "dtype": [Options(type, {np.float64, np.float32}), None], + "subsample": [Interval(Integral, 1, None, closed="left"), None], + "random_state": ["random_state"], + } + + def __init__( + self, + n_bins=5, + *, + encode="onehot", + strategy="quantile", + dtype=None, + subsample=200_000, + random_state=None, + ): + self.n_bins = n_bins + self.encode = encode + self.strategy = strategy + self.dtype = dtype + self.subsample = subsample + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, sample_weight=None): + """ + Fit the estimator. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data to be discretized. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + sample_weight : ndarray of shape (n_samples,) + Contains weight values to be associated with each sample. + Cannot be used when `strategy` is set to `"uniform"`. + + .. versionadded:: 1.3 + + Returns + ------- + self : object + Returns the instance itself. + """ + X = validate_data(self, X, dtype="numeric") + + if self.dtype in (np.float64, np.float32): + output_dtype = self.dtype + else: # self.dtype is None + output_dtype = X.dtype + + n_samples, n_features = X.shape + + if sample_weight is not None and self.strategy == "uniform": + raise ValueError( + "`sample_weight` was provided but it cannot be " + "used with strategy='uniform'. Got strategy=" + f"{self.strategy!r} instead." + ) + + if self.subsample is not None and n_samples > self.subsample: + # Take a subsample of `X` + X = resample( + X, + replace=False, + n_samples=self.subsample, + random_state=self.random_state, + ) + + n_features = X.shape[1] + n_bins = self._validate_n_bins(n_features) + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + bin_edges = np.zeros(n_features, dtype=object) + for jj in range(n_features): + column = X[:, jj] + col_min, col_max = column.min(), column.max() + + if col_min == col_max: + warnings.warn( + "Feature %d is constant and will be replaced with 0." 
% jj + ) + n_bins[jj] = 1 + bin_edges[jj] = np.array([-np.inf, np.inf]) + continue + + if self.strategy == "uniform": + bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1) + + elif self.strategy == "quantile": + quantiles = np.linspace(0, 100, n_bins[jj] + 1) + if sample_weight is None: + bin_edges[jj] = np.asarray(np.percentile(column, quantiles)) + else: + bin_edges[jj] = np.asarray( + [ + _weighted_percentile(column, sample_weight, q) + for q in quantiles + ], + dtype=np.float64, + ) + elif self.strategy == "kmeans": + from ..cluster import KMeans # fixes import loops + + # Deterministic initialization with uniform spacing + uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1) + init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5 + + # 1D k-means procedure + km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1) + centers = km.fit( + column[:, None], sample_weight=sample_weight + ).cluster_centers_[:, 0] + # Must sort, centers may be unsorted even with sorted init + centers.sort() + bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5 + bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max] + + # Remove bins whose width are too small (i.e., <= 1e-8) + if self.strategy in ("quantile", "kmeans"): + mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8 + bin_edges[jj] = bin_edges[jj][mask] + if len(bin_edges[jj]) - 1 != n_bins[jj]: + warnings.warn( + "Bins whose width are too small (i.e., <= " + "1e-8) in feature %d are removed. Consider " + "decreasing the number of bins." % jj + ) + n_bins[jj] = len(bin_edges[jj]) - 1 + + self.bin_edges_ = bin_edges + self.n_bins_ = n_bins + + if "onehot" in self.encode: + self._encoder = OneHotEncoder( + categories=[np.arange(i) for i in self.n_bins_], + sparse_output=self.encode == "onehot", + dtype=output_dtype, + ) + # Fit the OneHotEncoder with toy datasets + # so that it's ready for use after the KBinsDiscretizer is fitted + self._encoder.fit(np.zeros((1, len(self.n_bins_)))) + + return self + + def _validate_n_bins(self, n_features): + """Returns n_bins_, the number of bins per feature.""" + orig_bins = self.n_bins + if isinstance(orig_bins, Integral): + return np.full(n_features, orig_bins, dtype=int) + + n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False) + + if n_bins.ndim > 1 or n_bins.shape[0] != n_features: + raise ValueError("n_bins must be a scalar or array of shape (n_features,).") + + bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins) + + violating_indices = np.where(bad_nbins_value)[0] + if violating_indices.shape[0] > 0: + indices = ", ".join(str(i) for i in violating_indices) + raise ValueError( + "{} received an invalid number " + "of bins at indices {}. Number of bins " + "must be at least 2, and must be an int.".format( + KBinsDiscretizer.__name__, indices + ) + ) + return n_bins + + def transform(self, X): + """ + Discretize the data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data to be discretized. + + Returns + ------- + Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64} + Data in the binned space. Will be a sparse matrix if + `self.encode='onehot'` and ndarray otherwise. 
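# Illustrative sketch of the `sample_weight` handling in `fit` above:
# weights shift the 'quantile' bin edges (via _weighted_percentile), while
# strategy='uniform' rejects weights outright. The toy data and weights
# are assumptions for the demo.
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X = np.arange(10, dtype=float).reshape(-1, 1)
w = np.array([10, 10, 10, 1, 1, 1, 1, 1, 1, 1], dtype=float)

est = KBinsDiscretizer(n_bins=2, encode="ordinal", strategy="quantile")
# The heavily weighted low values pull the interior edge to the left.
print(est.fit(X, sample_weight=w).bin_edges_[0])

try:
    KBinsDiscretizer(n_bins=2, encode="ordinal", strategy="uniform").fit(
        X, sample_weight=w
    )
except ValueError as exc:
    print(exc)  # sample_weight cannot be used with strategy='uniform'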
+ """ + check_is_fitted(self) + + # check input and attribute dtypes + dtype = (np.float64, np.float32) if self.dtype is None else self.dtype + Xt = validate_data(self, X, copy=True, dtype=dtype, reset=False) + + bin_edges = self.bin_edges_ + for jj in range(Xt.shape[1]): + Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right") + + if self.encode == "ordinal": + return Xt + + dtype_init = None + if "onehot" in self.encode: + dtype_init = self._encoder.dtype + self._encoder.dtype = Xt.dtype + try: + Xt_enc = self._encoder.transform(Xt) + finally: + # revert the initial dtype to avoid modifying self. + self._encoder.dtype = dtype_init + return Xt_enc + + def inverse_transform(self, X=None, *, Xt=None): + """ + Transform discretized data back to original feature space. + + Note that this function does not regenerate the original data + due to discretization rounding. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Transformed data in the binned space. + + Xt : array-like of shape (n_samples, n_features) + Transformed data in the binned space. + + .. deprecated:: 1.5 + `Xt` was deprecated in 1.5 and will be removed in 1.7. Use `X` instead. + + Returns + ------- + Xinv : ndarray, dtype={np.float32, np.float64} + Data in the original feature space. + """ + X = _deprecate_Xt_in_inverse_transform(X, Xt) + + check_is_fitted(self) + + if "onehot" in self.encode: + X = self._encoder.inverse_transform(X) + + Xinv = check_array(X, copy=True, dtype=(np.float64, np.float32)) + n_features = self.n_bins_.shape[0] + if Xinv.shape[1] != n_features: + raise ValueError( + "Incorrect number of features. Expecting {}, received {}.".format( + n_features, Xinv.shape[1] + ) + ) + + for jj in range(n_features): + bin_edges = self.bin_edges_[jj] + bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5 + Xinv[:, jj] = bin_centers[(Xinv[:, jj]).astype(np.int64)] + + return Xinv + + def get_feature_names_out(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + check_is_fitted(self, "n_features_in_") + input_features = _check_feature_names_in(self, input_features) + if hasattr(self, "_encoder"): + return self._encoder.get_feature_names_out(input_features) + + # ordinal encoding + return input_features diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_encoders.py b/.venv/Lib/site-packages/sklearn/preprocessing/_encoders.py new file mode 100644 index 0000000000000000000000000000000000000000..58ad8bca28a1c13c2304a52364e1a7329eea56e5 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_encoders.py @@ -0,0 +1,1698 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import warnings +from numbers import Integral + +import numpy as np +from scipy import sparse + +from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin, _fit_context +from ..utils import _safe_indexing, check_array +from ..utils._encode import _check_unknown, _encode, _get_counts, _unique +from ..utils._mask import _get_mask +from ..utils._missing import is_scalar_nan +from ..utils._param_validation import Interval, RealNotInt, StrOptions +from ..utils._set_output import _get_output_config +from ..utils.validation import ( + _check_feature_names, + _check_feature_names_in, + _check_n_features, + check_is_fitted, +) + +__all__ = ["OneHotEncoder", "OrdinalEncoder"] + + +class _BaseEncoder(TransformerMixin, BaseEstimator): + """ + Base class for encoders that includes the code to categorize and + transform the input features. + + """ + + def _check_X(self, X, ensure_all_finite=True): + """ + Perform custom check_array: + - convert list of strings to object dtype + - check for missing values for object dtype data (check_array does + not do that) + - return list of features (arrays): this list of features is + constructed feature by feature to preserve the data types + of pandas DataFrame columns, as otherwise information is lost + and cannot be used, e.g. for the `categories_` attribute. + + """ + if not (hasattr(X, "iloc") and getattr(X, "ndim", 0) == 2): + # if not a dataframe, do normal check_array validation + X_temp = check_array(X, dtype=None, ensure_all_finite=ensure_all_finite) + if not hasattr(X, "dtype") and np.issubdtype(X_temp.dtype, np.str_): + X = check_array(X, dtype=object, ensure_all_finite=ensure_all_finite) + else: + X = X_temp + needs_validation = False + else: + # pandas dataframe, do validation later column by column, in order + # to keep the dtype information to be used in the encoder. + needs_validation = ensure_all_finite + + n_samples, n_features = X.shape + X_columns = [] + + for i in range(n_features): + Xi = _safe_indexing(X, indices=i, axis=1) + Xi = check_array( + Xi, ensure_2d=False, dtype=None, ensure_all_finite=needs_validation + ) + X_columns.append(Xi) + + return X_columns, n_samples, n_features + + def _fit( + self, + X, + handle_unknown="error", + ensure_all_finite=True, + return_counts=False, + return_and_ignore_missing_for_infrequent=False, + ): + self._check_infrequent_enabled() + _check_n_features(self, X, reset=True) + _check_feature_names(self, X, reset=True) + X_list, n_samples, n_features = self._check_X( + X, ensure_all_finite=ensure_all_finite + ) + self.n_features_in_ = n_features + + if self.categories != "auto": + if len(self.categories) != n_features: + raise ValueError( + "Shape mismatch: if categories is an array," + " it has to be of shape (n_features,)." 
+ ) + + self.categories_ = [] + category_counts = [] + compute_counts = return_counts or self._infrequent_enabled + + for i in range(n_features): + Xi = X_list[i] + + if self.categories == "auto": + result = _unique(Xi, return_counts=compute_counts) + if compute_counts: + cats, counts = result + category_counts.append(counts) + else: + cats = result + else: + if np.issubdtype(Xi.dtype, np.str_): + # Always convert string categories to objects to avoid + # unexpected string truncation for longer category labels + # passed in the constructor. + Xi_dtype = object + else: + Xi_dtype = Xi.dtype + + cats = np.array(self.categories[i], dtype=Xi_dtype) + if ( + cats.dtype == object + and isinstance(cats[0], bytes) + and Xi.dtype.kind != "S" + ): + msg = ( + f"In column {i}, the predefined categories have type 'bytes'" + " which is incompatible with values of type" + f" '{type(Xi[0]).__name__}'." + ) + raise ValueError(msg) + + # `nan` must be the last stated category + for category in cats[:-1]: + if is_scalar_nan(category): + raise ValueError( + "Nan should be the last element in user" + f" provided categories, see categories {cats}" + f" in column #{i}" + ) + + if cats.size != len(_unique(cats)): + msg = ( + f"In column {i}, the predefined categories" + " contain duplicate elements." + ) + raise ValueError(msg) + + if Xi.dtype.kind not in "OUS": + sorted_cats = np.sort(cats) + error_msg = ( + "Unsorted categories are not supported for numerical categories" + ) + # if there are nans, nan should be the last element + stop_idx = -1 if np.isnan(sorted_cats[-1]) else None + if np.any(sorted_cats[:stop_idx] != cats[:stop_idx]): + raise ValueError(error_msg) + + if handle_unknown == "error": + diff = _check_unknown(Xi, cats) + if diff: + msg = ( + "Found unknown categories {0} in column {1}" + " during fit".format(diff, i) + ) + raise ValueError(msg) + if compute_counts: + category_counts.append(_get_counts(Xi, cats)) + + self.categories_.append(cats) + + output = {"n_samples": n_samples} + if return_counts: + output["category_counts"] = category_counts + + missing_indices = {} + if return_and_ignore_missing_for_infrequent: + for feature_idx, categories_for_idx in enumerate(self.categories_): + if is_scalar_nan(categories_for_idx[-1]): + # `nan` values can only be placed in the latest position + missing_indices[feature_idx] = categories_for_idx.size - 1 + output["missing_indices"] = missing_indices + + if self._infrequent_enabled: + self._fit_infrequent_category_mapping( + n_samples, + category_counts, + missing_indices, + ) + return output + + def _transform( + self, + X, + handle_unknown="error", + ensure_all_finite=True, + warn_on_unknown=False, + ignore_category_indices=None, + ): + X_list, n_samples, n_features = self._check_X( + X, ensure_all_finite=ensure_all_finite + ) + _check_feature_names(self, X, reset=False) + _check_n_features(self, X, reset=False) + + X_int = np.zeros((n_samples, n_features), dtype=int) + X_mask = np.ones((n_samples, n_features), dtype=bool) + + columns_with_unknown = [] + for i in range(n_features): + Xi = X_list[i] + diff, valid_mask = _check_unknown(Xi, self.categories_[i], return_mask=True) + + if not np.all(valid_mask): + if handle_unknown == "error": + msg = ( + "Found unknown categories {0} in column {1}" + " during transform".format(diff, i) + ) + raise ValueError(msg) + else: + if warn_on_unknown: + columns_with_unknown.append(i) + # Set the problematic rows to an acceptable value and + # continue `The rows are marked `X_mask` and will be + # removed later. 
+ X_mask[:, i] = valid_mask + # cast Xi into the largest string type necessary + # to handle different lengths of numpy strings + if ( + self.categories_[i].dtype.kind in ("U", "S") + and self.categories_[i].itemsize > Xi.itemsize + ): + Xi = Xi.astype(self.categories_[i].dtype) + elif self.categories_[i].dtype.kind == "O" and Xi.dtype.kind == "U": + # categories are objects and Xi are numpy strings. + # Cast Xi to an object dtype to prevent truncation + # when setting invalid values. + Xi = Xi.astype("O") + else: + Xi = Xi.copy() + + Xi[~valid_mask] = self.categories_[i][0] + # We use check_unknown=False, since _check_unknown was + # already called above. + X_int[:, i] = _encode(Xi, uniques=self.categories_[i], check_unknown=False) + if columns_with_unknown: + warnings.warn( + ( + "Found unknown categories in columns " + f"{columns_with_unknown} during transform. These " + "unknown categories will be encoded as all zeros" + ), + UserWarning, + ) + + self._map_infrequent_categories(X_int, X_mask, ignore_category_indices) + return X_int, X_mask + + @property + def infrequent_categories_(self): + """Infrequent categories for each feature.""" + # raises an AttributeError if `_infrequent_indices` is not defined + infrequent_indices = self._infrequent_indices + return [ + None if indices is None else category[indices] + for category, indices in zip(self.categories_, infrequent_indices) + ] + + def _check_infrequent_enabled(self): + """ + This functions checks whether _infrequent_enabled is True or False. + This has to be called after parameter validation in the fit function. + """ + max_categories = getattr(self, "max_categories", None) + min_frequency = getattr(self, "min_frequency", None) + self._infrequent_enabled = ( + max_categories is not None and max_categories >= 1 + ) or min_frequency is not None + + def _identify_infrequent(self, category_count, n_samples, col_idx): + """Compute the infrequent indices. + + Parameters + ---------- + category_count : ndarray of shape (n_cardinality,) + Category counts. + + n_samples : int + Number of samples. + + col_idx : int + Index of the current category. Only used for the error message. + + Returns + ------- + output : ndarray of shape (n_infrequent_categories,) or None + If there are infrequent categories, indices of infrequent + categories. Otherwise None. + """ + if isinstance(self.min_frequency, numbers.Integral): + infrequent_mask = category_count < self.min_frequency + elif isinstance(self.min_frequency, numbers.Real): + min_frequency_abs = n_samples * self.min_frequency + infrequent_mask = category_count < min_frequency_abs + else: + infrequent_mask = np.zeros(category_count.shape[0], dtype=bool) + + n_current_features = category_count.size - infrequent_mask.sum() + 1 + if self.max_categories is not None and self.max_categories < n_current_features: + # max_categories includes the one infrequent category + frequent_category_count = self.max_categories - 1 + if frequent_category_count == 0: + # All categories are infrequent + infrequent_mask[:] = True + else: + # stable sort to preserve original count order + smallest_levels = np.argsort(category_count, kind="mergesort")[ + :-frequent_category_count + ] + infrequent_mask[smallest_levels] = True + + output = np.flatnonzero(infrequent_mask) + return output if output.size > 0 else None + + def _fit_infrequent_category_mapping( + self, n_samples, category_counts, missing_indices + ): + """Fit infrequent categories. + + Defines the private attribute: `_default_to_infrequent_mappings`. 
For + feature `i`, `_default_to_infrequent_mappings[i]` defines the mapping + from the integer encoding returned by `super().transform()` into + infrequent categories. If `_default_to_infrequent_mappings[i]` is None, + there were no infrequent categories in the training set. + + For example if categories 0, 2 and 4 were frequent, while categories + 1, 3, 5 were infrequent for feature 7, then these categories are mapped + to a single output: + `_default_to_infrequent_mappings[7] = array([0, 3, 1, 3, 2, 3])` + + Defines private attribute: `_infrequent_indices`. `_infrequent_indices[i]` + is an array of indices such that + `categories_[i][_infrequent_indices[i]]` are all the infrequent category + labels. If the feature `i` has no infrequent categories + `_infrequent_indices[i]` is None. + + .. versionadded:: 1.1 + + Parameters + ---------- + n_samples : int + Number of samples in training set. + category_counts: list of ndarray + `category_counts[i]` is the category counts corresponding to + `self.categories_[i]`. + missing_indices : dict + Dict mapping from feature_idx to category index with a missing value. + """ + # Remove missing value from counts, so it is not considered as infrequent + if missing_indices: + category_counts_ = [] + for feature_idx, count in enumerate(category_counts): + if feature_idx in missing_indices: + category_counts_.append( + np.delete(count, missing_indices[feature_idx]) + ) + else: + category_counts_.append(count) + else: + category_counts_ = category_counts + + self._infrequent_indices = [ + self._identify_infrequent(category_count, n_samples, col_idx) + for col_idx, category_count in enumerate(category_counts_) + ] + + # compute mapping from default mapping to infrequent mapping + self._default_to_infrequent_mappings = [] + + for feature_idx, infreq_idx in enumerate(self._infrequent_indices): + cats = self.categories_[feature_idx] + # no infrequent categories + if infreq_idx is None: + self._default_to_infrequent_mappings.append(None) + continue + + n_cats = len(cats) + if feature_idx in missing_indices: + # Missing index was removed from this category when computing + # infrequent indices, thus we need to decrease the number of + # total categories when considering the infrequent mapping. + n_cats -= 1 + + # infrequent indices exist + mapping = np.empty(n_cats, dtype=np.int64) + n_infrequent_cats = infreq_idx.size + + # infrequent categories are mapped to the last element. + n_frequent_cats = n_cats - n_infrequent_cats + mapping[infreq_idx] = n_frequent_cats + + frequent_indices = np.setdiff1d(np.arange(n_cats), infreq_idx) + mapping[frequent_indices] = np.arange(n_frequent_cats) + + self._default_to_infrequent_mappings.append(mapping) + + def _map_infrequent_categories(self, X_int, X_mask, ignore_category_indices): + """Map infrequent categories to integer representing the infrequent category. + + This modifies X_int in-place. Values that were invalid based on `X_mask` + are mapped to the infrequent category if there was an infrequent + category for that feature. + + Parameters + ---------- + X_int: ndarray of shape (n_samples, n_features) + Integer encoded categories. + + X_mask: ndarray of shape (n_samples, n_features) + Bool mask for valid values in `X_int`. + + ignore_category_indices : dict + Dictionary mapping from feature_idx to category index to ignore. + Ignored indexes will not be grouped and the original ordinal encoding + will remain. 
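# Public-API sketch of the infrequent-category bookkeeping described
# above: with an integer `min_frequency`, categories seen fewer times are
# grouped together and surfaced via `infrequent_categories_`. The toy
# data is an assumption for the demo.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T
enc = OneHotEncoder(min_frequency=4, sparse_output=False).fit(X)
print(enc.infrequent_categories_)  # [array(['d'], dtype=object)]
# A float min_frequency (e.g. 0.2) would use 0.2 * n_samples as the
# threshold instead, which here would also mark 'a' (5 of 38) infrequent.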
+ """ + if not self._infrequent_enabled: + return + + ignore_category_indices = ignore_category_indices or {} + + for col_idx in range(X_int.shape[1]): + infrequent_idx = self._infrequent_indices[col_idx] + if infrequent_idx is None: + continue + + X_int[~X_mask[:, col_idx], col_idx] = infrequent_idx[0] + if self.handle_unknown == "infrequent_if_exist": + # All the unknown values are now mapped to the + # infrequent_idx[0], which makes the unknown values valid + # This is needed in `transform` when the encoding is formed + # using `X_mask`. + X_mask[:, col_idx] = True + + # Remaps encoding in `X_int` where the infrequent categories are + # grouped together. + for i, mapping in enumerate(self._default_to_infrequent_mappings): + if mapping is None: + continue + + if i in ignore_category_indices: + # Update rows that are **not** ignored + rows_to_update = X_int[:, i] != ignore_category_indices[i] + else: + rows_to_update = slice(None) + + X_int[rows_to_update, i] = np.take(mapping, X_int[rows_to_update, i]) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.categorical = True + tags.input_tags.allow_nan = True + return tags + + +class OneHotEncoder(_BaseEncoder): + """ + Encode categorical features as a one-hot numeric array. + + The input to this transformer should be an array-like of integers or + strings, denoting the values taken on by categorical (discrete) features. + The features are encoded using a one-hot (aka 'one-of-K' or 'dummy') + encoding scheme. This creates a binary column for each category and + returns a sparse matrix or dense array (depending on the ``sparse_output`` + parameter). + + By default, the encoder derives the categories based on the unique values + in each feature. Alternatively, you can also specify the `categories` + manually. + + This encoding is needed for feeding categorical data to many scikit-learn + estimators, notably linear models and SVMs with the standard kernels. + + Note: a one-hot encoding of y labels should use a LabelBinarizer + instead. + + Read more in the :ref:`User Guide `. + For a comparison of different encoders, refer to: + :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. + + Parameters + ---------- + categories : 'auto' or a list of array-like, default='auto' + Categories (unique values) per feature: + + - 'auto' : Determine categories automatically from the training data. + - list : ``categories[i]`` holds the categories expected in the ith + column. The passed categories should not mix strings and numeric + values within a single feature, and should be sorted in case of + numeric values. + + The used categories can be found in the ``categories_`` attribute. + + .. versionadded:: 0.20 + + drop : {'first', 'if_binary'} or an array-like of shape (n_features,), \ + default=None + Specifies a methodology to use to drop one of the categories per + feature. This is useful in situations where perfectly collinear + features cause problems, such as when feeding the resulting data + into an unregularized linear regression model. + + However, dropping one category breaks the symmetry of the original + representation and can therefore induce a bias in downstream models, + for instance for penalized linear classification or regression models. + + - None : retain all features (the default). + - 'first' : drop the first category in each feature. If only one + category is present, the feature will be dropped entirely. + - 'if_binary' : drop the first category in each feature with two + categories. 
Features with 1 or more than 2 categories are + left intact. + - array : ``drop[i]`` is the category in feature ``X[:, i]`` that + should be dropped. + + When `max_categories` or `min_frequency` is configured to group + infrequent categories, the dropping behavior is handled after the + grouping. + + .. versionadded:: 0.21 + The parameter `drop` was added in 0.21. + + .. versionchanged:: 0.23 + The option `drop='if_binary'` was added in 0.23. + + .. versionchanged:: 1.1 + Support for dropping infrequent categories. + + sparse_output : bool, default=True + When ``True``, it returns a :class:`scipy.sparse.csr_matrix`, + i.e. a sparse matrix in "Compressed Sparse Row" (CSR) format. + + .. versionadded:: 1.2 + `sparse` was renamed to `sparse_output` + + dtype : number type, default=np.float64 + Desired dtype of output. + + handle_unknown : {'error', 'ignore', 'infrequent_if_exist', 'warn'}, \ + default='error' + Specifies the way unknown categories are handled during :meth:`transform`. + + - 'error' : Raise an error if an unknown category is present during transform. + - 'ignore' : When an unknown category is encountered during + transform, the resulting one-hot encoded columns for this feature + will be all zeros. In the inverse transform, an unknown category + will be denoted as None. + - 'infrequent_if_exist' : When an unknown category is encountered + during transform, the resulting one-hot encoded columns for this + feature will map to the infrequent category if it exists. The + infrequent category will be mapped to the last position in the + encoding. During inverse transform, an unknown category will be + mapped to the category denoted `'infrequent'` if it exists. If the + `'infrequent'` category does not exist, then :meth:`transform` and + :meth:`inverse_transform` will handle an unknown category as with + `handle_unknown='ignore'`. Infrequent categories exist based on + `min_frequency` and `max_categories`. Read more in the + :ref:`User Guide `. + - 'warn' : When an unknown category is encountered during transform + a warning is issued, and the encoding then proceeds as described for + `handle_unknown="infrequent_if_exist"`. + + .. versionchanged:: 1.1 + `'infrequent_if_exist'` was added to automatically handle unknown + categories and infrequent categories. + + .. versionadded:: 1.6 + The option `"warn"` was added in 1.6. + + min_frequency : int or float, default=None + Specifies the minimum frequency below which a category will be + considered infrequent. + + - If `int`, categories with a smaller cardinality will be considered + infrequent. + + - If `float`, categories with a smaller cardinality than + `min_frequency * n_samples` will be considered infrequent. + + .. versionadded:: 1.1 + Read more in the :ref:`User Guide `. + + max_categories : int, default=None + Specifies an upper limit to the number of output features for each input + feature when considering infrequent categories. If there are infrequent + categories, `max_categories` includes the category representing the + infrequent categories along with the frequent categories. If `None`, + there is no limit to the number of output features. + + .. versionadded:: 1.1 + Read more in the :ref:`User Guide `. + + feature_name_combiner : "concat" or callable, default="concat" + Callable with signature `def callable(input_feature, category)` that returns a + string. This is used to create feature names to be returned by + :meth:`get_feature_names_out`. 
+ + `"concat"` concatenates encoded feature name and category with + `feature + "_" + str(category)`.E.g. feature X with values 1, 6, 7 create + feature names `X_1, X_6, X_7`. + + .. versionadded:: 1.3 + + Attributes + ---------- + categories_ : list of arrays + The categories of each feature determined during fitting + (in order of the features in X and corresponding with the output + of ``transform``). This includes the category specified in ``drop`` + (if any). + + drop_idx_ : array of shape (n_features,) + - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category + to be dropped for each feature. + - ``drop_idx_[i] = None`` if no category is to be dropped from the + feature with index ``i``, e.g. when `drop='if_binary'` and the + feature isn't binary. + - ``drop_idx_ = None`` if all the transformed features will be + retained. + + If infrequent categories are enabled by setting `min_frequency` or + `max_categories` to a non-default value and `drop_idx[i]` corresponds + to a infrequent category, then the entire infrequent category is + dropped. + + .. versionchanged:: 0.23 + Added the possibility to contain `None` values. + + infrequent_categories_ : list of ndarray + Defined only if infrequent categories are enabled by setting + `min_frequency` or `max_categories` to a non-default value. + `infrequent_categories_[i]` are the infrequent categories for feature + `i`. If the feature `i` has no infrequent categories + `infrequent_categories_[i]` is None. + + .. versionadded:: 1.1 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 1.0 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + feature_name_combiner : callable or None + Callable with signature `def callable(input_feature, category)` that returns a + string. This is used to create feature names to be returned by + :meth:`get_feature_names_out`. + + .. versionadded:: 1.3 + + See Also + -------- + OrdinalEncoder : Performs an ordinal (integer) + encoding of the categorical features. + TargetEncoder : Encodes categorical features using the target. + sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of + dictionary items (also handles string-valued features). + sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot + encoding of dictionary items or strings. + LabelBinarizer : Binarizes labels in a one-vs-all + fashion. + MultiLabelBinarizer : Transforms between iterable of + iterables and a multilabel format, e.g. a (samples x classes) binary + matrix indicating the presence of a class label. + + Examples + -------- + Given a dataset with two features, we let the encoder find the unique + values per feature and transform the data to a binary one-hot encoding. 
+ + >>> from sklearn.preprocessing import OneHotEncoder + + One can discard categories not seen during `fit`: + + >>> enc = OneHotEncoder(handle_unknown='ignore') + >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] + >>> enc.fit(X) + OneHotEncoder(handle_unknown='ignore') + >>> enc.categories_ + [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] + >>> enc.transform([['Female', 1], ['Male', 4]]).toarray() + array([[1., 0., 1., 0., 0.], + [0., 1., 0., 0., 0.]]) + >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]]) + array([['Male', 1], + [None, 2]], dtype=object) + >>> enc.get_feature_names_out(['gender', 'group']) + array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], ...) + + One can always drop the first column for each feature: + + >>> drop_enc = OneHotEncoder(drop='first').fit(X) + >>> drop_enc.categories_ + [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] + >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray() + array([[0., 0., 0.], + [1., 1., 0.]]) + + Or drop a column for feature only having 2 categories: + + >>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X) + >>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray() + array([[0., 1., 0., 0.], + [1., 0., 1., 0.]]) + + One can change the way feature names are created. + + >>> def custom_combiner(feature, category): + ... return str(feature) + "_" + type(category).__name__ + "_" + str(category) + >>> custom_fnames_enc = OneHotEncoder(feature_name_combiner=custom_combiner).fit(X) + >>> custom_fnames_enc.get_feature_names_out() + array(['x0_str_Female', 'x0_str_Male', 'x1_int_1', 'x1_int_2', 'x1_int_3'], + dtype=object) + + Infrequent categories are enabled by setting `max_categories` or `min_frequency`. + + >>> import numpy as np + >>> X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T + >>> ohe = OneHotEncoder(max_categories=3, sparse_output=False).fit(X) + >>> ohe.infrequent_categories_ + [array(['a', 'd'], dtype=object)] + >>> ohe.transform([["a"], ["b"]]) + array([[0., 0., 1.], + [1., 0., 0.]]) + """ + + _parameter_constraints: dict = { + "categories": [StrOptions({"auto"}), list], + "drop": [StrOptions({"first", "if_binary"}), "array-like", None], + "dtype": "no_validation", # validation delegated to numpy + "handle_unknown": [ + StrOptions({"error", "ignore", "infrequent_if_exist", "warn"}) + ], + "max_categories": [Interval(Integral, 1, None, closed="left"), None], + "min_frequency": [ + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="neither"), + None, + ], + "sparse_output": ["boolean"], + "feature_name_combiner": [StrOptions({"concat"}), callable], + } + + def __init__( + self, + *, + categories="auto", + drop=None, + sparse_output=True, + dtype=np.float64, + handle_unknown="error", + min_frequency=None, + max_categories=None, + feature_name_combiner="concat", + ): + self.categories = categories + self.sparse_output = sparse_output + self.dtype = dtype + self.handle_unknown = handle_unknown + self.drop = drop + self.min_frequency = min_frequency + self.max_categories = max_categories + self.feature_name_combiner = feature_name_combiner + + def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx): + """Convert `drop_idx` into the index for infrequent categories. + + If there are no infrequent categories, then `drop_idx` is + returned. This method is called in `_set_drop_idx` when the `drop` + parameter is an array-like. 
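# Illustrative sketch of the drop/infrequent interaction handled by
# `_map_drop_idx_to_infrequent`: dropping a frequent category works after
# the grouping, while explicitly dropping a category that was grouped as
# infrequent raises. The toy data is an assumption for the demo.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T

enc = OneHotEncoder(max_categories=3, sparse_output=False, drop=["b"]).fit(X)
print(enc.drop_idx_)  # index of 'b' in categories_[0]

try:
    OneHotEncoder(max_categories=3, drop=["a"]).fit(X)
except ValueError as exc:
    print(exc)  # 'a' cannot be dropped because it is infrequent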
+ """ + if not self._infrequent_enabled: + return drop_idx + + default_to_infrequent = self._default_to_infrequent_mappings[feature_idx] + if default_to_infrequent is None: + return drop_idx + + # Raise error when explicitly dropping a category that is infrequent + infrequent_indices = self._infrequent_indices[feature_idx] + if infrequent_indices is not None and drop_idx in infrequent_indices: + categories = self.categories_[feature_idx] + raise ValueError( + f"Unable to drop category {categories[drop_idx].item()!r} from" + f" feature {feature_idx} because it is infrequent" + ) + return default_to_infrequent[drop_idx] + + def _set_drop_idx(self): + """Compute the drop indices associated with `self.categories_`. + + If `self.drop` is: + - `None`, No categories have been dropped. + - `'first'`, All zeros to drop the first category. + - `'if_binary'`, All zeros if the category is binary and `None` + otherwise. + - array-like, The indices of the categories that match the + categories in `self.drop`. If the dropped category is an infrequent + category, then the index for the infrequent category is used. This + means that the entire infrequent category is dropped. + + This methods defines a public `drop_idx_` and a private + `_drop_idx_after_grouping`. + + - `drop_idx_`: Public facing API that references the drop category in + `self.categories_`. + - `_drop_idx_after_grouping`: Used internally to drop categories *after* the + infrequent categories are grouped together. + + If there are no infrequent categories or drop is `None`, then + `drop_idx_=_drop_idx_after_grouping`. + """ + if self.drop is None: + drop_idx_after_grouping = None + elif isinstance(self.drop, str): + if self.drop == "first": + drop_idx_after_grouping = np.zeros(len(self.categories_), dtype=object) + elif self.drop == "if_binary": + n_features_out_no_drop = [len(cat) for cat in self.categories_] + if self._infrequent_enabled: + for i, infreq_idx in enumerate(self._infrequent_indices): + if infreq_idx is None: + continue + n_features_out_no_drop[i] -= infreq_idx.size - 1 + + drop_idx_after_grouping = np.array( + [ + 0 if n_features_out == 2 else None + for n_features_out in n_features_out_no_drop + ], + dtype=object, + ) + + else: + drop_array = np.asarray(self.drop, dtype=object) + droplen = len(drop_array) + + if droplen != len(self.categories_): + msg = ( + "`drop` should have length equal to the number " + "of features ({}), got {}" + ) + raise ValueError(msg.format(len(self.categories_), droplen)) + missing_drops = [] + drop_indices = [] + for feature_idx, (drop_val, cat_list) in enumerate( + zip(drop_array, self.categories_) + ): + if not is_scalar_nan(drop_val): + drop_idx = np.where(cat_list == drop_val)[0] + if drop_idx.size: # found drop idx + drop_indices.append( + self._map_drop_idx_to_infrequent(feature_idx, drop_idx[0]) + ) + else: + missing_drops.append((feature_idx, drop_val)) + continue + + # drop_val is nan, find nan in categories manually + if is_scalar_nan(cat_list[-1]): + drop_indices.append( + self._map_drop_idx_to_infrequent(feature_idx, cat_list.size - 1) + ) + else: # nan is missing + missing_drops.append((feature_idx, drop_val)) + + if any(missing_drops): + msg = ( + "The following categories were supposed to be " + "dropped, but were not found in the training " + "data.\n{}".format( + "\n".join( + [ + "Category: {}, Feature: {}".format(c, v) + for c, v in missing_drops + ] + ) + ) + ) + raise ValueError(msg) + drop_idx_after_grouping = np.array(drop_indices, dtype=object) + + # 
`_drop_idx_after_grouping` are the categories to drop *after* the infrequent + # categories are grouped together. If needed, we remap `drop_idx` back + # to the categories seen in `self.categories_`. + self._drop_idx_after_grouping = drop_idx_after_grouping + + if not self._infrequent_enabled or drop_idx_after_grouping is None: + self.drop_idx_ = self._drop_idx_after_grouping + else: + drop_idx_ = [] + for feature_idx, drop_idx in enumerate(drop_idx_after_grouping): + default_to_infrequent = self._default_to_infrequent_mappings[ + feature_idx + ] + if drop_idx is None or default_to_infrequent is None: + orig_drop_idx = drop_idx + else: + orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0] + + drop_idx_.append(orig_drop_idx) + + self.drop_idx_ = np.asarray(drop_idx_, dtype=object) + + def _compute_transformed_categories(self, i, remove_dropped=True): + """Compute the transformed categories used for column `i`. + + 1. If there are infrequent categories, the category is named + 'infrequent_sklearn'. + 2. Dropped columns are removed when remove_dropped=True. + """ + cats = self.categories_[i] + + if self._infrequent_enabled: + infreq_map = self._default_to_infrequent_mappings[i] + if infreq_map is not None: + frequent_mask = infreq_map < infreq_map.max() + infrequent_cat = "infrequent_sklearn" + # infrequent category is always at the end + cats = np.concatenate( + (cats[frequent_mask], np.array([infrequent_cat], dtype=object)) + ) + + if remove_dropped: + cats = self._remove_dropped_categories(cats, i) + return cats + + def _remove_dropped_categories(self, categories, i): + """Remove dropped categories.""" + if ( + self._drop_idx_after_grouping is not None + and self._drop_idx_after_grouping[i] is not None + ): + return np.delete(categories, self._drop_idx_after_grouping[i]) + return categories + + def _compute_n_features_outs(self): + """Compute the n_features_out for each input feature.""" + output = [len(cats) for cats in self.categories_] + + if self._drop_idx_after_grouping is not None: + for i, drop_idx in enumerate(self._drop_idx_after_grouping): + if drop_idx is not None: + output[i] -= 1 + + if not self._infrequent_enabled: + return output + + # infrequent is enabled, the number of features out are reduced + # because the infrequent categories are grouped together + for i, infreq_idx in enumerate(self._infrequent_indices): + if infreq_idx is None: + continue + output[i] -= infreq_idx.size - 1 + + return output + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """ + Fit OneHotEncoder to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + Returns + ------- + self + Fitted encoder. + """ + self._fit( + X, + handle_unknown=self.handle_unknown, + ensure_all_finite="allow-nan", + ) + self._set_drop_idx() + self._n_features_outs = self._compute_n_features_outs() + return self + + def transform(self, X): + """ + Transform X using one-hot encoding. + + If `sparse_output=True` (default), it returns an instance of + :class:`scipy.sparse._csr.csr_matrix` (CSR format). + + If there are infrequent categories for a feature, set by specifying + `max_categories` or `min_frequency`, the infrequent categories are + grouped into a single category. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to encode. 
+ + Returns + ------- + X_out : {ndarray, sparse matrix} of shape \ + (n_samples, n_encoded_features) + Transformed input. If `sparse_output=True`, a sparse matrix will be + returned. + """ + check_is_fitted(self) + transform_output = _get_output_config("transform", estimator=self)["dense"] + if transform_output != "default" and self.sparse_output: + capitalize_transform_output = transform_output.capitalize() + raise ValueError( + f"{capitalize_transform_output} output does not support sparse data." + f" Set sparse_output=False to output {transform_output} dataframes or" + f" disable {capitalize_transform_output} output via" + '` ohe.set_output(transform="default").' + ) + + # validation of X happens in _check_X called by _transform + if self.handle_unknown == "warn": + warn_on_unknown, handle_unknown = True, "infrequent_if_exist" + else: + warn_on_unknown = self.drop is not None and self.handle_unknown in { + "ignore", + "infrequent_if_exist", + } + handle_unknown = self.handle_unknown + X_int, X_mask = self._transform( + X, + handle_unknown=handle_unknown, + ensure_all_finite="allow-nan", + warn_on_unknown=warn_on_unknown, + ) + + n_samples, n_features = X_int.shape + + if self._drop_idx_after_grouping is not None: + to_drop = self._drop_idx_after_grouping.copy() + # We remove all the dropped categories from mask, and decrement all + # categories that occur after them to avoid an empty column. + keep_cells = X_int != to_drop + for i, cats in enumerate(self.categories_): + # drop='if_binary' but feature isn't binary + if to_drop[i] is None: + # set to cardinality to not drop from X_int + to_drop[i] = len(cats) + + to_drop = to_drop.reshape(1, -1) + X_int[X_int > to_drop] -= 1 + X_mask &= keep_cells + + mask = X_mask.ravel() + feature_indices = np.cumsum([0] + self._n_features_outs) + indices = (X_int + feature_indices[:-1]).ravel()[mask] + + indptr = np.empty(n_samples + 1, dtype=int) + indptr[0] = 0 + np.sum(X_mask, axis=1, out=indptr[1:], dtype=indptr.dtype) + np.cumsum(indptr[1:], out=indptr[1:]) + data = np.ones(indptr[-1]) + + out = sparse.csr_matrix( + (data, indices, indptr), + shape=(n_samples, feature_indices[-1]), + dtype=self.dtype, + ) + if not self.sparse_output: + return out.toarray() + else: + return out + + def inverse_transform(self, X): + """ + Convert the data back to the original representation. + + When unknown categories are encountered (all zeros in the + one-hot encoding), ``None`` is used to represent this category. If the + feature with the unknown category has a dropped category, the dropped + category will be its inverse. + + For a given input feature, if there is an infrequent category, + 'infrequent_sklearn' will be used to represent the infrequent category. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape \ + (n_samples, n_encoded_features) + The transformed data. + + Returns + ------- + X_tr : ndarray of shape (n_samples, n_features) + Inverse transformed array. + """ + check_is_fitted(self) + X = check_array(X, accept_sparse="csr") + + n_samples, _ = X.shape + n_features = len(self.categories_) + + n_features_out = np.sum(self._n_features_outs) + + # validate shape of passed X + msg = ( + "Shape of the passed X data is not correct. Expected {0} columns, got {1}." 
+ ) + if X.shape[1] != n_features_out: + raise ValueError(msg.format(n_features_out, X.shape[1])) + + transformed_features = [ + self._compute_transformed_categories(i, remove_dropped=False) + for i, _ in enumerate(self.categories_) + ] + + # create resulting array of appropriate dtype + dt = np.result_type(*[cat.dtype for cat in transformed_features]) + X_tr = np.empty((n_samples, n_features), dtype=dt) + + j = 0 + found_unknown = {} + + if self._infrequent_enabled: + infrequent_indices = self._infrequent_indices + else: + infrequent_indices = [None] * n_features + + for i in range(n_features): + cats_wo_dropped = self._remove_dropped_categories( + transformed_features[i], i + ) + n_categories = cats_wo_dropped.shape[0] + + # Only happens if there was a column with a unique + # category. In this case we just fill the column with this + # unique category value. + if n_categories == 0: + X_tr[:, i] = self.categories_[i][self._drop_idx_after_grouping[i]] + j += n_categories + continue + sub = X[:, j : j + n_categories] + # for sparse X argmax returns 2D matrix, ensure 1D array + labels = np.asarray(sub.argmax(axis=1)).flatten() + X_tr[:, i] = cats_wo_dropped[labels] + + if self.handle_unknown == "ignore" or ( + self.handle_unknown in ("infrequent_if_exist", "warn") + and infrequent_indices[i] is None + ): + unknown = np.asarray(sub.sum(axis=1) == 0).flatten() + # ignored unknown categories: we have a row of all zero + if unknown.any(): + # if categories were dropped then unknown categories will + # be mapped to the dropped category + if ( + self._drop_idx_after_grouping is None + or self._drop_idx_after_grouping[i] is None + ): + found_unknown[i] = unknown + else: + X_tr[unknown, i] = self.categories_[i][ + self._drop_idx_after_grouping[i] + ] + else: + dropped = np.asarray(sub.sum(axis=1) == 0).flatten() + if dropped.any(): + if self._drop_idx_after_grouping is None: + all_zero_samples = np.flatnonzero(dropped) + raise ValueError( + f"Samples {all_zero_samples} can not be inverted " + "when drop=None and handle_unknown='error' " + "because they contain all zeros" + ) + # we can safely assume that all of the nulls in each column + # are the dropped value + drop_idx = self._drop_idx_after_grouping[i] + X_tr[dropped, i] = transformed_features[i][drop_idx] + + j += n_categories + + # if ignored are found: potentially need to upcast result to + # insert None values + if found_unknown: + if X_tr.dtype != object: + X_tr = X_tr.astype(object) + + for idx, mask in found_unknown.items(): + X_tr[mask, idx] = None + + return X_tr + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
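# Illustrative sketch: with infrequent grouping enabled, the grouped
# column is reported as 'infrequent_sklearn' by get_feature_names_out,
# matching `_compute_transformed_categories` above. The toy data and the
# 'letter' feature name are assumptions for the demo.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T
enc = OneHotEncoder(max_categories=3, sparse_output=False).fit(X)
print(enc.get_feature_names_out(["letter"]))
# ['letter_b' 'letter_c' 'letter_infrequent_sklearn']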
+ """ + check_is_fitted(self) + input_features = _check_feature_names_in(self, input_features) + cats = [ + self._compute_transformed_categories(i) + for i, _ in enumerate(self.categories_) + ] + + name_combiner = self._check_get_feature_name_combiner() + feature_names = [] + for i in range(len(cats)): + names = [name_combiner(input_features[i], t) for t in cats[i]] + feature_names.extend(names) + + return np.array(feature_names, dtype=object) + + def _check_get_feature_name_combiner(self): + if self.feature_name_combiner == "concat": + return lambda feature, category: feature + "_" + str(category) + else: # callable + dry_run_combiner = self.feature_name_combiner("feature", "category") + if not isinstance(dry_run_combiner, str): + raise TypeError( + "When `feature_name_combiner` is a callable, it should return a " + f"Python string. Got {type(dry_run_combiner)} instead." + ) + return self.feature_name_combiner + + +class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder): + """ + Encode categorical features as an integer array. + + The input to this transformer should be an array-like of integers or + strings, denoting the values taken on by categorical (discrete) features. + The features are converted to ordinal integers. This results in + a single column of integers (0 to n_categories - 1) per feature. + + Read more in the :ref:`User Guide `. + For a comparison of different encoders, refer to: + :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. + + .. versionadded:: 0.20 + + Parameters + ---------- + categories : 'auto' or a list of array-like, default='auto' + Categories (unique values) per feature: + + - 'auto' : Determine categories automatically from the training data. + - list : ``categories[i]`` holds the categories expected in the ith + column. The passed categories should not mix strings and numeric + values, and should be sorted in case of numeric values. + + The used categories can be found in the ``categories_`` attribute. + + dtype : number type, default=np.float64 + Desired dtype of output. + + handle_unknown : {'error', 'use_encoded_value'}, default='error' + When set to 'error' an error will be raised in case an unknown + categorical feature is present during transform. When set to + 'use_encoded_value', the encoded value of unknown categories will be + set to the value given for the parameter `unknown_value`. In + :meth:`inverse_transform`, an unknown category will be denoted as None. + + .. versionadded:: 0.24 + + unknown_value : int or np.nan, default=None + When the parameter handle_unknown is set to 'use_encoded_value', this + parameter is required and will set the encoded value of unknown + categories. It has to be distinct from the values used to encode any of + the categories in `fit`. If set to np.nan, the `dtype` parameter must + be a float dtype. + + .. versionadded:: 0.24 + + encoded_missing_value : int or np.nan, default=np.nan + Encoded value of missing categories. If set to `np.nan`, then the `dtype` + parameter must be a float dtype. + + .. versionadded:: 1.1 + + min_frequency : int or float, default=None + Specifies the minimum frequency below which a category will be + considered infrequent. + + - If `int`, categories with a smaller cardinality will be considered + infrequent. + + - If `float`, categories with a smaller cardinality than + `min_frequency * n_samples` will be considered infrequent. + + .. versionadded:: 1.3 + Read more in the :ref:`User Guide `. 
+ + max_categories : int, default=None + Specifies an upper limit to the number of output categories for each input + feature when considering infrequent categories. If there are infrequent + categories, `max_categories` includes the category representing the + infrequent categories along with the frequent categories. If `None`, + there is no limit to the number of output features. + + `max_categories` do **not** take into account missing or unknown + categories. Setting `unknown_value` or `encoded_missing_value` to an + integer will increase the number of unique integer codes by one each. + This can result in up to `max_categories + 2` integer codes. + + .. versionadded:: 1.3 + Read more in the :ref:`User Guide `. + + Attributes + ---------- + categories_ : list of arrays + The categories of each feature determined during ``fit`` (in order of + the features in X and corresponding with the output of ``transform``). + This does not include categories that weren't seen during ``fit``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 1.0 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + infrequent_categories_ : list of ndarray + Defined only if infrequent categories are enabled by setting + `min_frequency` or `max_categories` to a non-default value. + `infrequent_categories_[i]` are the infrequent categories for feature + `i`. If the feature `i` has no infrequent categories + `infrequent_categories_[i]` is None. + + .. versionadded:: 1.3 + + See Also + -------- + OneHotEncoder : Performs a one-hot encoding of categorical features. This encoding + is suitable for low to medium cardinality categorical variables, both in + supervised and unsupervised settings. + TargetEncoder : Encodes categorical features using supervised signal + in a classification or regression pipeline. This encoding is typically + suitable for high cardinality categorical variables. + LabelEncoder : Encodes target labels with values between 0 and + ``n_classes-1``. + + Notes + ----- + With a high proportion of `nan` values, inferring categories becomes slow with + Python versions before 3.10. The handling of `nan` values was improved + from Python 3.10 onwards, (c.f. + `bpo-43475 `_). + + Examples + -------- + Given a dataset with two features, we let the encoder find the unique + values per feature and transform the data to an ordinal encoding. + + >>> from sklearn.preprocessing import OrdinalEncoder + >>> enc = OrdinalEncoder() + >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] + >>> enc.fit(X) + OrdinalEncoder() + >>> enc.categories_ + [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] + >>> enc.transform([['Female', 3], ['Male', 1]]) + array([[0., 2.], + [1., 0.]]) + + >>> enc.inverse_transform([[1, 0], [0, 1]]) + array([['Male', 1], + ['Female', 2]], dtype=object) + + By default, :class:`OrdinalEncoder` is lenient towards missing values by + propagating them. + + >>> import numpy as np + >>> X = [['Male', 1], ['Female', 3], ['Female', np.nan]] + >>> enc.fit_transform(X) + array([[ 1., 0.], + [ 0., 1.], + [ 0., nan]]) + + You can use the parameter `encoded_missing_value` to encode missing values. + + >>> enc.set_params(encoded_missing_value=-1).fit_transform(X) + array([[ 1., 0.], + [ 0., 1.], + [ 0., -1.]]) + + Infrequent categories are enabled by setting `max_categories` or `min_frequency`. 
+ In the following example, "a" and "d" are considered infrequent and grouped + together into a single category, "b" and "c" are their own categories, unknown + values are encoded as 3 and missing values are encoded as 4. + + >>> X_train = np.array( + ... [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3 + [np.nan]], + ... dtype=object).T + >>> enc = OrdinalEncoder( + ... handle_unknown="use_encoded_value", unknown_value=3, + ... max_categories=3, encoded_missing_value=4) + >>> _ = enc.fit(X_train) + >>> X_test = np.array([["a"], ["b"], ["c"], ["d"], ["e"], [np.nan]], dtype=object) + >>> enc.transform(X_test) + array([[2.], + [0.], + [1.], + [2.], + [3.], + [4.]]) + """ + + _parameter_constraints: dict = { + "categories": [StrOptions({"auto"}), list], + "dtype": "no_validation", # validation delegated to numpy + "encoded_missing_value": [Integral, type(np.nan)], + "handle_unknown": [StrOptions({"error", "use_encoded_value"})], + "unknown_value": [Integral, type(np.nan), None], + "max_categories": [Interval(Integral, 1, None, closed="left"), None], + "min_frequency": [ + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="neither"), + None, + ], + } + + def __init__( + self, + *, + categories="auto", + dtype=np.float64, + handle_unknown="error", + unknown_value=None, + encoded_missing_value=np.nan, + min_frequency=None, + max_categories=None, + ): + self.categories = categories + self.dtype = dtype + self.handle_unknown = handle_unknown + self.unknown_value = unknown_value + self.encoded_missing_value = encoded_missing_value + self.min_frequency = min_frequency + self.max_categories = max_categories + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """ + Fit the OrdinalEncoder to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + Returns + ------- + self : object + Fitted encoder. + """ + if self.handle_unknown == "use_encoded_value": + if is_scalar_nan(self.unknown_value): + if np.dtype(self.dtype).kind != "f": + raise ValueError( + "When unknown_value is np.nan, the dtype " + "parameter should be " + f"a float dtype. Got {self.dtype}." + ) + elif not isinstance(self.unknown_value, numbers.Integral): + raise TypeError( + "unknown_value should be an integer or " + "np.nan when " + "handle_unknown is 'use_encoded_value', " + f"got {self.unknown_value}." + ) + elif self.unknown_value is not None: + raise TypeError( + "unknown_value should only be set when " + "handle_unknown is 'use_encoded_value', " + f"got {self.unknown_value}." 
+ ) + + # `_fit` will only raise an error when `self.handle_unknown="error"` + fit_results = self._fit( + X, + handle_unknown=self.handle_unknown, + ensure_all_finite="allow-nan", + return_and_ignore_missing_for_infrequent=True, + ) + self._missing_indices = fit_results["missing_indices"] + + cardinalities = [len(categories) for categories in self.categories_] + if self._infrequent_enabled: + # Cardinality decreases because the infrequent categories are grouped + # together + for feature_idx, infrequent in enumerate(self.infrequent_categories_): + if infrequent is not None: + cardinalities[feature_idx] -= len(infrequent) + + # missing values are not considered part of the cardinality + # when considering unknown categories or encoded_missing_value + for cat_idx, categories_for_idx in enumerate(self.categories_): + if is_scalar_nan(categories_for_idx[-1]): + cardinalities[cat_idx] -= 1 + + if self.handle_unknown == "use_encoded_value": + for cardinality in cardinalities: + if 0 <= self.unknown_value < cardinality: + raise ValueError( + "The used value for unknown_value " + f"{self.unknown_value} is one of the " + "values already used for encoding the " + "seen categories." + ) + + if self._missing_indices: + if np.dtype(self.dtype).kind != "f" and is_scalar_nan( + self.encoded_missing_value + ): + raise ValueError( + "There are missing values in features " + f"{list(self._missing_indices)}. For OrdinalEncoder to " + f"encode missing values with dtype: {self.dtype}, set " + "encoded_missing_value to a non-nan value, or " + "set dtype to a float" + ) + + if not is_scalar_nan(self.encoded_missing_value): + # Features are invalid when they contain a missing category + # and encoded_missing_value was already used to encode a + # known category + invalid_features = [ + cat_idx + for cat_idx, cardinality in enumerate(cardinalities) + if cat_idx in self._missing_indices + and 0 <= self.encoded_missing_value < cardinality + ] + + if invalid_features: + # Use feature names if they are available + if hasattr(self, "feature_names_in_"): + invalid_features = self.feature_names_in_[invalid_features] + raise ValueError( + f"encoded_missing_value ({self.encoded_missing_value}) " + "is already used to encode a known category in features: " + f"{invalid_features}" + ) + + return self + + def transform(self, X): + """ + Transform X to ordinal codes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to encode. + + Returns + ------- + X_out : ndarray of shape (n_samples, n_features) + Transformed input. + """ + check_is_fitted(self, "categories_") + X_int, X_mask = self._transform( + X, + handle_unknown=self.handle_unknown, + ensure_all_finite="allow-nan", + ignore_category_indices=self._missing_indices, + ) + X_trans = X_int.astype(self.dtype, copy=False) + + for cat_idx, missing_idx in self._missing_indices.items(): + X_missing_mask = X_int[:, cat_idx] == missing_idx + X_trans[X_missing_mask, cat_idx] = self.encoded_missing_value + + # create separate category for unknown values + if self.handle_unknown == "use_encoded_value": + X_trans[~X_mask] = self.unknown_value + return X_trans + + def inverse_transform(self, X): + """ + Convert the data back to the original representation. + + Parameters + ---------- + X : array-like of shape (n_samples, n_encoded_features) + The transformed data. + + Returns + ------- + X_tr : ndarray of shape (n_samples, n_features) + Inverse transformed array. 
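+
+ Examples
+ --------
+ A minimal round-trip sketch on toy data (the output dtype follows the
+ dtypes of `categories_`):
+
+ >>> from sklearn.preprocessing import OrdinalEncoder
+ >>> enc = OrdinalEncoder().fit([['a'], ['b']])
+ >>> enc.inverse_transform([[1.0], [0.0]])
+ array([['b'],
+ ['a']], dtype=object)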
+ """ + check_is_fitted(self) + X = check_array(X, ensure_all_finite="allow-nan") + + n_samples, _ = X.shape + n_features = len(self.categories_) + + # validate shape of passed X + msg = ( + "Shape of the passed X data is not correct. Expected {0} columns, got {1}." + ) + if X.shape[1] != n_features: + raise ValueError(msg.format(n_features, X.shape[1])) + + # create resulting array of appropriate dtype + dt = np.result_type(*[cat.dtype for cat in self.categories_]) + X_tr = np.empty((n_samples, n_features), dtype=dt) + + found_unknown = {} + infrequent_masks = {} + + infrequent_indices = getattr(self, "_infrequent_indices", None) + + for i in range(n_features): + labels = X[:, i] + + # replace values of X[:, i] that were nan with actual indices + if i in self._missing_indices: + X_i_mask = _get_mask(labels, self.encoded_missing_value) + labels[X_i_mask] = self._missing_indices[i] + + rows_to_update = slice(None) + categories = self.categories_[i] + + if infrequent_indices is not None and infrequent_indices[i] is not None: + # Compute mask for frequent categories + infrequent_encoding_value = len(categories) - len(infrequent_indices[i]) + infrequent_masks[i] = labels == infrequent_encoding_value + rows_to_update = ~infrequent_masks[i] + + # Remap categories to be only frequent categories. The infrequent + # categories will be mapped to "infrequent_sklearn" later + frequent_categories_mask = np.ones_like(categories, dtype=bool) + frequent_categories_mask[infrequent_indices[i]] = False + categories = categories[frequent_categories_mask] + + if self.handle_unknown == "use_encoded_value": + unknown_labels = _get_mask(labels, self.unknown_value) + found_unknown[i] = unknown_labels + + known_labels = ~unknown_labels + if isinstance(rows_to_update, np.ndarray): + rows_to_update &= known_labels + else: + rows_to_update = known_labels + + labels_int = labels[rows_to_update].astype("int64", copy=False) + X_tr[rows_to_update, i] = categories[labels_int] + + if found_unknown or infrequent_masks: + X_tr = X_tr.astype(object, copy=False) + + # insert None values for unknown values + if found_unknown: + for idx, mask in found_unknown.items(): + X_tr[mask, idx] = None + + if infrequent_masks: + for idx, mask in infrequent_masks.items(): + X_tr[mask, idx] = "infrequent_sklearn" + + return X_tr diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_function_transformer.py b/.venv/Lib/site-packages/sklearn/preprocessing/_function_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..10bfcb34980b70a3438d003d6a511bf0c0887d34 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_function_transformer.py @@ -0,0 +1,445 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from functools import partial + +import numpy as np + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils._estimator_html_repr import _VisualBlock +from ..utils._param_validation import StrOptions +from ..utils._set_output import ( + _get_adapter_from_container, + _get_output_config, +) +from ..utils.metaestimators import available_if +from ..utils.validation import ( + _allclose_dense_sparse, + _check_feature_names, + _check_feature_names_in, + _check_n_features, + _get_feature_names, + _is_pandas_df, + _is_polars_df, + check_array, + validate_data, +) + + +def _identity(X): + """The identity function.""" + return X + + +class FunctionTransformer(TransformerMixin, BaseEstimator): + """Constructs a transformer from an 
arbitrary callable.
+
+ A FunctionTransformer forwards its X (and optionally y) arguments to a
+ user-defined function or function object and returns the result of this
+ function. This is useful for stateless transformations such as taking the
+ log of frequencies, doing custom scaling, etc.
+
+ Note: If a lambda is used as the function, then the resulting
+ transformer will not be pickleable.
+
+ .. versionadded:: 0.17
+
+ Read more in the :ref:`User Guide `.
+
+ Parameters
+ ----------
+ func : callable, default=None
+ The callable to use for the transformation. This will be passed
+ the same arguments as transform, with args and kwargs forwarded.
+ If func is None, then func will be the identity function.
+
+ inverse_func : callable, default=None
+ The callable to use for the inverse transformation. This will be
+ passed the same arguments as inverse transform, with args and
+ kwargs forwarded. If inverse_func is None, then inverse_func
+ will be the identity function.
+
+ validate : bool, default=False
+ Indicate that the input X array should be checked before calling
+ ``func``. The possibilities are:
+
+ - If False, there is no input validation.
+ - If True, then X will be converted to a 2-dimensional NumPy array or
+ sparse matrix. If the conversion is not possible an exception is
+ raised.
+
+ .. versionchanged:: 0.22
+ The default of ``validate`` changed from True to False.
+
+ accept_sparse : bool, default=False
+ Indicate that func accepts a sparse matrix as input. If validate is
+ False, this has no effect. Otherwise, if accept_sparse is False,
+ sparse matrix inputs will cause an exception to be raised.
+
+ check_inverse : bool, default=True
+ Whether to check that ``func`` followed by ``inverse_func`` leads to
+ the original inputs. It can be used for a sanity check, raising a
+ warning when the condition is not fulfilled.
+
+ .. versionadded:: 0.20
+
+ feature_names_out : callable, 'one-to-one' or None, default=None
+ Determines the list of feature names that will be returned by the
+ `get_feature_names_out` method. If it is 'one-to-one', then the output
+ feature names will be equal to the input feature names. If it is a
+ callable, then it must take two positional arguments: this
+ `FunctionTransformer` (`self`) and an array-like of input feature names
+ (`input_features`). It must return an array-like of output feature
+ names. The `get_feature_names_out` method is only defined if
+ `feature_names_out` is not None.
+
+ See ``get_feature_names_out`` for more details.
+
+ .. versionadded:: 1.1
+
+ kw_args : dict, default=None
+ Dictionary of additional keyword arguments to pass to func.
+
+ .. versionadded:: 0.18
+
+ inv_kw_args : dict, default=None
+ Dictionary of additional keyword arguments to pass to inverse_func.
+
+ .. versionadded:: 0.18
+
+ Attributes
+ ----------
+ n_features_in_ : int
+ Number of features seen during :term:`fit`.
+
+ .. versionadded:: 0.24
+
+ feature_names_in_ : ndarray of shape (`n_features_in_`,)
+ Names of features seen during :term:`fit`. Defined only when `X` has feature
+ names that are all strings.
+
+ .. versionadded:: 1.0
+
+ See Also
+ --------
+ MaxAbsScaler : Scale each feature by its maximum absolute value.
+ StandardScaler : Standardize features by removing the mean and
+ scaling to unit variance.
+ LabelBinarizer : Binarize labels in a one-vs-all fashion.
+ MultiLabelBinarizer : Transform between iterable of iterables
+ and a multilabel format.
+
+ Notes
+ -----
+ If `func` returns an output with a `columns` attribute, then the column names
+ are enforced to be consistent with the output of `get_feature_names_out`.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from sklearn.preprocessing import FunctionTransformer
+ >>> transformer = FunctionTransformer(np.log1p)
+ >>> X = np.array([[0, 1], [2, 3]])
+ >>> transformer.transform(X)
+ array([[0. , 0.6931...],
+ [1.0986..., 1.3862...]])
+ """
+
+ _parameter_constraints: dict = {
+ "func": [callable, None],
+ "inverse_func": [callable, None],
+ "validate": ["boolean"],
+ "accept_sparse": ["boolean"],
+ "check_inverse": ["boolean"],
+ "feature_names_out": [callable, StrOptions({"one-to-one"}), None],
+ "kw_args": [dict, None],
+ "inv_kw_args": [dict, None],
+ }
+
+ def __init__(
+ self,
+ func=None,
+ inverse_func=None,
+ *,
+ validate=False,
+ accept_sparse=False,
+ check_inverse=True,
+ feature_names_out=None,
+ kw_args=None,
+ inv_kw_args=None,
+ ):
+ self.func = func
+ self.inverse_func = inverse_func
+ self.validate = validate
+ self.accept_sparse = accept_sparse
+ self.check_inverse = check_inverse
+ self.feature_names_out = feature_names_out
+ self.kw_args = kw_args
+ self.inv_kw_args = inv_kw_args
+
+ def _check_input(self, X, *, reset):
+ if self.validate:
+ return validate_data(self, X, accept_sparse=self.accept_sparse, reset=reset)
+ elif reset:
+ # Set feature_names_in_ and n_features_in_ even if validate=False
+ # We run this only when reset==True to store the attributes but not
+ # validate them, because validate=False
+ _check_n_features(self, X, reset=reset)
+ _check_feature_names(self, X, reset=reset)
+ return X
+
+ def _check_inverse_transform(self, X):
+ """Check that func and inverse_func are the inverse."""
+ idx_selected = slice(None, None, max(1, X.shape[0] // 100))
+ X_round_trip = self.inverse_transform(self.transform(X[idx_selected]))
+
+ if hasattr(X, "dtype"):
+ dtypes = [X.dtype]
+ elif hasattr(X, "dtypes"):
+ # Dataframes can have multiple dtypes
+ dtypes = X.dtypes
+
+ if not all(np.issubdtype(d, np.number) for d in dtypes):
+ raise ValueError(
+ "'check_inverse' is only supported when all the elements in `X` are"
+ " numerical."
+ )
+
+ if not _allclose_dense_sparse(X[idx_selected], X_round_trip):
+ warnings.warn(
+ (
+ "The provided functions are not strictly"
+ " inverse of each other. If you are sure you"
+ " want to proceed regardless, set"
+ " 'check_inverse=False'."
+ ),
+ UserWarning,
+ )
+
+ @_fit_context(prefer_skip_nested_validation=True)
+ def fit(self, X, y=None):
+ """Fit transformer by checking X.
+
+ If ``validate`` is ``True``, ``X`` will be checked.
+
+ Parameters
+ ----------
+ X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
+ if `validate=True` else any object that `func` can handle
+ Input array.
+
+ y : Ignored
+ Not used, present here for API consistency by convention.
+
+ Returns
+ -------
+ self : object
+ FunctionTransformer class instance.
+ """
+ X = self._check_input(X, reset=True)
+ if self.check_inverse and not (self.func is None or self.inverse_func is None):
+ self._check_inverse_transform(X)
+ return self
+
+ def transform(self, X):
+ """Transform X using the forward function.
+
+ Parameters
+ ----------
+ X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
+ if `validate=True` else any object that `func` can handle
+ Input array.
+
+ Returns
+ -------
+ X_out : array-like, shape (n_samples, n_features)
+ Transformed input.
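+
+ Examples
+ --------
+ A minimal sketch with a stateless NumPy ufunc (no `fit` is needed since
+ `FunctionTransformer` is stateless):
+
+ >>> import numpy as np
+ >>> from sklearn.preprocessing import FunctionTransformer
+ >>> FunctionTransformer(np.sqrt).transform(np.array([[1.0, 4.0]]))
+ array([[1., 2.]])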
+ """ + X = self._check_input(X, reset=False) + out = self._transform(X, func=self.func, kw_args=self.kw_args) + output_config = _get_output_config("transform", self)["dense"] + + if hasattr(out, "columns") and self.feature_names_out is not None: + # check the consistency between the column provided by `transform` and + # the the column names provided by `get_feature_names_out`. + feature_names_out = self.get_feature_names_out() + if list(out.columns) != list(feature_names_out): + # we can override the column names of the output if it is inconsistent + # with the column names provided by `get_feature_names_out` in the + # following cases: + # * `func` preserved the column names between the input and the output + # * the input column names are all numbers + # * the output is requested to be a DataFrame (pandas or polars) + feature_names_in = getattr( + X, "feature_names_in_", _get_feature_names(X) + ) + same_feature_names_in_out = feature_names_in is not None and list( + feature_names_in + ) == list(out.columns) + not_all_str_columns = not all( + isinstance(col, str) for col in out.columns + ) + if same_feature_names_in_out or not_all_str_columns: + adapter = _get_adapter_from_container(out) + out = adapter.create_container( + X_output=out, + X_original=out, + columns=feature_names_out, + inplace=False, + ) + else: + raise ValueError( + "The output generated by `func` have different column names " + "than the ones provided by `get_feature_names_out`. " + f"Got output with columns names: {list(out.columns)} and " + "`get_feature_names_out` returned: " + f"{list(self.get_feature_names_out())}. " + "The column names can be overridden by setting " + "`set_output(transform='pandas')` or " + "`set_output(transform='polars')` such that the column names " + "are set to the names provided by `get_feature_names_out`." + ) + + if self.feature_names_out is None: + warn_msg = ( + "When `set_output` is configured to be '{0}', `func` should return " + "a {0} DataFrame to follow the `set_output` API or `feature_names_out`" + " should be defined." + ) + if output_config == "pandas" and not _is_pandas_df(out): + warnings.warn(warn_msg.format("pandas")) + elif output_config == "polars" and not _is_polars_df(out): + warnings.warn(warn_msg.format("polars")) + + return out + + def inverse_transform(self, X): + """Transform X using the inverse function. + + Parameters + ---------- + X : {array-like, sparse-matrix} of shape (n_samples, n_features) \ + if `validate=True` else any object that `inverse_func` can handle + Input array. + + Returns + ------- + X_out : array-like, shape (n_samples, n_features) + Transformed input. + """ + if self.validate: + X = check_array(X, accept_sparse=self.accept_sparse) + return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args) + + @available_if(lambda self: self.feature_names_out is not None) + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + This method is only defined if `feature_names_out` is not None. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input feature names. + + - If `input_features` is None, then `feature_names_in_` is + used as the input feature names. If `feature_names_in_` is not + defined, then names are generated: + `[x0, x1, ..., x(n_features_in_ - 1)]`. + - If `input_features` is array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. 
+
+ Returns
+ -------
+ feature_names_out : ndarray of str objects
+ Transformed feature names.
+
+ - If `feature_names_out` is 'one-to-one', the input feature names
+ are returned (see `input_features` above). This requires
+ `feature_names_in_` and/or `n_features_in_` to be defined, which
+ is done automatically if `validate=True`. Alternatively, you can
+ set them in `func`.
+ - If `feature_names_out` is a callable, then it is called with two
+ arguments, `self` and `input_features`, and its return value is
+ returned by this method.
+ """
+ if hasattr(self, "n_features_in_") or input_features is not None:
+ input_features = _check_feature_names_in(self, input_features)
+ if self.feature_names_out == "one-to-one":
+ names_out = input_features
+ elif callable(self.feature_names_out):
+ names_out = self.feature_names_out(self, input_features)
+ else:
+ raise ValueError(
+ f"feature_names_out={self.feature_names_out!r} is invalid. "
+ 'It must either be "one-to-one" or a callable with two '
+ "arguments: the function transformer and an array-like of "
+ "input feature names. The callable must return an array-like "
+ "of output feature names."
+ )
+ return np.asarray(names_out, dtype=object)
+
+ def _transform(self, X, func=None, kw_args=None):
+ if func is None:
+ func = _identity
+
+ return func(X, **(kw_args if kw_args else {}))
+
+ def __sklearn_is_fitted__(self):
+ """Return True since FunctionTransformer is stateless."""
+ return True
+
+ def __sklearn_tags__(self):
+ tags = super().__sklearn_tags__()
+ tags.no_validation = not self.validate
+ tags.requires_fit = False
+ return tags
+
+ def set_output(self, *, transform=None):
+ """Set output container.
+
+ See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
+ for an example on how to use the API.
+
+ Parameters
+ ----------
+ transform : {"default", "pandas", "polars"}, default=None
+ Configure output of `transform` and `fit_transform`.
+
+ - `"default"`: Default output format of a transformer
+ - `"pandas"`: DataFrame output
+ - `"polars"`: Polars output
+ - `None`: Transform configuration is unchanged
+
+ .. versionadded:: 1.4
+ `"polars"` option was added.
+
+ Returns
+ -------
+ self : estimator instance
+ Estimator instance.
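+
+ Examples
+ --------
+ A minimal sketch of requesting pandas output (this assumes pandas is
+ installed by the time `transform` is later called):
+
+ >>> from sklearn.preprocessing import FunctionTransformer
+ >>> ft = FunctionTransformer(feature_names_out="one-to-one")
+ >>> _ = ft.set_output(transform="pandas")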
+ """ + if not hasattr(self, "_sklearn_output_config"): + self._sklearn_output_config = {} + + self._sklearn_output_config["transform"] = transform + return self + + def _get_function_name(self): + """Get the name display of the `func` used in HTML representation.""" + if hasattr(self.func, "__name__"): + return self.func.__name__ + if isinstance(self.func, partial): + return self.func.func.__name__ + return f"{self.func.__class__.__name__}(...)" + + def _sk_visual_block_(self): + return _VisualBlock( + "single", + self, + names=self._get_function_name(), + name_details=str(self), + name_caption="FunctionTransformer", + doc_link_label="FunctionTransformer", + ) diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_label.py b/.venv/Lib/site-packages/sklearn/preprocessing/_label.py new file mode 100644 index 0000000000000000000000000000000000000000..57dcaa08c1ed3707d2feb066b42adb81f8ebfd74 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_label.py @@ -0,0 +1,963 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import array +import itertools +import warnings +from collections import defaultdict +from numbers import Integral + +import numpy as np +import scipy.sparse as sp + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils import column_or_1d +from ..utils._array_api import _setdiff1d, device, get_namespace +from ..utils._encode import _encode, _unique +from ..utils._param_validation import Interval, validate_params +from ..utils.multiclass import type_of_target, unique_labels +from ..utils.sparsefuncs import min_max_axis +from ..utils.validation import _num_samples, check_array, check_is_fitted + +__all__ = [ + "label_binarize", + "LabelBinarizer", + "LabelEncoder", + "MultiLabelBinarizer", +] + + +class LabelEncoder(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): + """Encode target labels with value between 0 and n_classes-1. + + This transformer should be used to encode target values, *i.e.* `y`, and + not the input `X`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.12 + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + Holds the label for each class. + + See Also + -------- + OrdinalEncoder : Encode categorical features using an ordinal encoding + scheme. + OneHotEncoder : Encode categorical features as a one-hot numeric array. + + Examples + -------- + `LabelEncoder` can be used to normalize labels. + + >>> from sklearn.preprocessing import LabelEncoder + >>> le = LabelEncoder() + >>> le.fit([1, 2, 2, 6]) + LabelEncoder() + >>> le.classes_ + array([1, 2, 6]) + >>> le.transform([1, 1, 2, 6]) + array([0, 0, 1, 2]...) + >>> le.inverse_transform([0, 0, 1, 2]) + array([1, 1, 2, 6]) + + It can also be used to transform non-numerical labels (as long as they are + hashable and comparable) to numerical labels. + + >>> le = LabelEncoder() + >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) + LabelEncoder() + >>> list(le.classes_) + [np.str_('amsterdam'), np.str_('paris'), np.str_('tokyo')] + >>> le.transform(["tokyo", "tokyo", "paris"]) + array([2, 2, 1]...) + >>> list(le.inverse_transform([2, 2, 1])) + [np.str_('tokyo'), np.str_('tokyo'), np.str_('paris')] + """ + + def fit(self, y): + """Fit label encoder. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + self : returns an instance of self. + Fitted label encoder. 
+ """ + y = column_or_1d(y, warn=True) + self.classes_ = _unique(y) + return self + + def fit_transform(self, y): + """Fit label encoder and return encoded labels. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + y : array-like of shape (n_samples,) + Encoded labels. + """ + y = column_or_1d(y, warn=True) + self.classes_, y = _unique(y, return_inverse=True) + return y + + def transform(self, y): + """Transform labels to normalized encoding. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + y : array-like of shape (n_samples,) + Labels as normalized encodings. + """ + check_is_fitted(self) + xp, _ = get_namespace(y) + y = column_or_1d(y, dtype=self.classes_.dtype, warn=True) + # transform of empty array is empty array + if _num_samples(y) == 0: + return xp.asarray([]) + + return _encode(y, uniques=self.classes_) + + def inverse_transform(self, y): + """Transform labels back to original encoding. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + y : ndarray of shape (n_samples,) + Original encoding. + """ + check_is_fitted(self) + xp, _ = get_namespace(y) + y = column_or_1d(y, warn=True) + # inverse transform of empty array is empty array + if _num_samples(y) == 0: + return xp.asarray([]) + + diff = _setdiff1d( + ar1=y, + ar2=xp.arange(self.classes_.shape[0], device=device(y)), + xp=xp, + ) + if diff.shape[0]: + raise ValueError("y contains previously unseen labels: %s" % str(diff)) + y = xp.asarray(y) + return xp.take(self.classes_, y, axis=0) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.array_api_support = True + tags.input_tags.two_d_array = False + tags.target_tags.one_d_labels = True + return tags + + +class LabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): + """Binarize labels in a one-vs-all fashion. + + Several regression and binary classification algorithms are + available in scikit-learn. A simple way to extend these algorithms + to the multi-class classification case is to use the so-called + one-vs-all scheme. + + At learning time, this simply consists in learning one regressor + or binary classifier per class. In doing so, one needs to convert + multi-class labels to binary labels (belong or does not belong + to the class). `LabelBinarizer` makes this process easy with the + transform method. + + At prediction time, one assigns the class for which the corresponding + model gave the greatest confidence. `LabelBinarizer` makes this easy + with the :meth:`inverse_transform` method. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + neg_label : int, default=0 + Value with which negative labels must be encoded. + + pos_label : int, default=1 + Value with which positive labels must be encoded. + + sparse_output : bool, default=False + True if the returned array from transform is desired to be in sparse + CSR format. + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + Holds the label for each class. + + y_type_ : str + Represents the type of the target data as evaluated by + :func:`~sklearn.utils.multiclass.type_of_target`. Possible type are + 'continuous', 'continuous-multioutput', 'binary', 'multiclass', + 'multiclass-multioutput', 'multilabel-indicator', and 'unknown'. + + sparse_input_ : bool + `True` if the input data to transform is given as a sparse matrix, + `False` otherwise. 
+
+ See Also
+ --------
+ label_binarize : Function to perform the transform operation of
+ LabelBinarizer with fixed classes.
+ OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
+ scheme.
+
+ Examples
+ --------
+ >>> from sklearn.preprocessing import LabelBinarizer
+ >>> lb = LabelBinarizer()
+ >>> lb.fit([1, 2, 6, 4, 2])
+ LabelBinarizer()
+ >>> lb.classes_
+ array([1, 2, 4, 6])
+ >>> lb.transform([1, 6])
+ array([[1, 0, 0, 0],
+ [0, 0, 0, 1]])
+
+ Binary targets transform to a column vector
+
+ >>> lb = LabelBinarizer()
+ >>> lb.fit_transform(['yes', 'no', 'no', 'yes'])
+ array([[1],
+ [0],
+ [0],
+ [1]])
+
+ Passing a 2D matrix for multilabel classification
+
+ >>> import numpy as np
+ >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]]))
+ LabelBinarizer()
+ >>> lb.classes_
+ array([0, 1, 2])
+ >>> lb.transform([0, 1, 2, 1])
+ array([[1, 0, 0],
+ [0, 1, 0],
+ [0, 0, 1],
+ [0, 1, 0]])
+ """
+
+ _parameter_constraints: dict = {
+ "neg_label": [Integral],
+ "pos_label": [Integral],
+ "sparse_output": ["boolean"],
+ }
+
+ def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False):
+ self.neg_label = neg_label
+ self.pos_label = pos_label
+ self.sparse_output = sparse_output
+
+ @_fit_context(prefer_skip_nested_validation=True)
+ def fit(self, y):
+ """Fit label binarizer.
+
+ Parameters
+ ----------
+ y : ndarray of shape (n_samples,) or (n_samples, n_classes)
+ Target values. The 2-d matrix should only contain 0 and 1,
+ representing multilabel classification.
+
+ Returns
+ -------
+ self : object
+ Returns the instance itself.
+ """
+ if self.neg_label >= self.pos_label:
+ raise ValueError(
+ f"neg_label={self.neg_label} must be strictly less than "
+ f"pos_label={self.pos_label}."
+ )
+
+ if self.sparse_output and (self.pos_label == 0 or self.neg_label != 0):
+ raise ValueError(
+ "Sparse binarization is only supported with non "
+ "zero pos_label and zero neg_label, got "
+ f"pos_label={self.pos_label} and neg_label={self.neg_label}"
+ )
+
+ self.y_type_ = type_of_target(y, input_name="y")
+
+ if "multioutput" in self.y_type_:
+ raise ValueError(
+ "Multioutput target data is not supported with label binarization"
+ )
+ if _num_samples(y) == 0:
+ raise ValueError("y has 0 samples: %r" % y)
+
+ self.sparse_input_ = sp.issparse(y)
+ self.classes_ = unique_labels(y)
+ return self
+
+ def fit_transform(self, y):
+ """Fit label binarizer/transform multi-class labels to binary labels.
+
+ The output of transform is sometimes referred to as
+ the 1-of-K coding scheme.
+
+ Parameters
+ ----------
+ y : {ndarray, sparse matrix} of shape (n_samples,) or \
+ (n_samples, n_classes)
+ Target values. The 2-d matrix should only contain 0 and 1,
+ representing multilabel classification. Sparse matrix can be
+ CSR, CSC, COO, DOK, or LIL.
+
+ Returns
+ -------
+ Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
+ Shape will be (n_samples, 1) for binary problems. Sparse matrix
+ will be of CSR format.
+ """
+ return self.fit(y).transform(y)
+
+ def transform(self, y):
+ """Transform multi-class labels to binary labels.
+
+ The output of transform is sometimes referred to by some authors as
+ the 1-of-K coding scheme.
+
+ Parameters
+ ----------
+ y : {array, sparse matrix} of shape (n_samples,) or \
+ (n_samples, n_classes)
+ Target values. The 2-d matrix should only contain 0 and 1,
+ representing multilabel classification. Sparse matrix can be
+ CSR, CSC, COO, DOK, or LIL.
+
+ Returns
+ -------
+ Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
+ Shape will be (n_samples, 1) for binary problems. Sparse matrix
+ will be of CSR format.
+ """
+ check_is_fitted(self)
+
+ y_is_multilabel = type_of_target(y).startswith("multilabel")
+ if y_is_multilabel and not self.y_type_.startswith("multilabel"):
+ raise ValueError("The object was not fitted with multilabel input.")
+
+ return label_binarize(
+ y,
+ classes=self.classes_,
+ pos_label=self.pos_label,
+ neg_label=self.neg_label,
+ sparse_output=self.sparse_output,
+ )
+
+ def inverse_transform(self, Y, threshold=None):
+ """Transform binary labels back to multi-class labels.
+
+ Parameters
+ ----------
+ Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
+ Target values. All sparse matrices are converted to CSR before
+ inverse transformation.
+
+ threshold : float, default=None
+ Threshold used in the binary and multi-label cases.
+
+ Use 0 when ``Y`` contains the output of :term:`decision_function`
+ (classifier).
+ Use 0.5 when ``Y`` contains the output of :term:`predict_proba`.
+
+ If None, the threshold is assumed to be halfway between
+ neg_label and pos_label.
+
+ Returns
+ -------
+ y : {ndarray, sparse matrix} of shape (n_samples,)
+ Target values. Sparse matrix will be of CSR format.
+
+ Notes
+ -----
+ In the case when the binary labels are fractional
+ (probabilistic), :meth:`inverse_transform` chooses the class with the
+ greatest value. Typically, this allows using the output of a
+ linear model's :term:`decision_function` method directly as the input
+ of :meth:`inverse_transform`.
+ """
+ check_is_fitted(self)
+
+ if threshold is None:
+ threshold = (self.pos_label + self.neg_label) / 2.0
+
+ if self.y_type_ == "multiclass":
+ y_inv = _inverse_binarize_multiclass(Y, self.classes_)
+ else:
+ y_inv = _inverse_binarize_thresholding(
+ Y, self.y_type_, self.classes_, threshold
+ )
+
+ if self.sparse_input_:
+ y_inv = sp.csr_matrix(y_inv)
+ elif sp.issparse(y_inv):
+ y_inv = y_inv.toarray()
+
+ return y_inv
+
+ def __sklearn_tags__(self):
+ tags = super().__sklearn_tags__()
+ tags.input_tags.two_d_array = False
+ tags.target_tags.one_d_labels = True
+ return tags
+
+
+ @validate_params(
+ {
+ "y": ["array-like", "sparse matrix"],
+ "classes": ["array-like"],
+ "neg_label": [Interval(Integral, None, None, closed="neither")],
+ "pos_label": [Interval(Integral, None, None, closed="neither")],
+ "sparse_output": ["boolean"],
+ },
+ prefer_skip_nested_validation=True,
+ )
+ def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False):
+ """Binarize labels in a one-vs-all fashion.
+
+ Several regression and binary classification algorithms are
+ available in scikit-learn. A simple way to extend these algorithms
+ to the multi-class classification case is to use the so-called
+ one-vs-all scheme.
+
+ This function makes it possible to compute this transformation for a
+ fixed set of class labels known ahead of time.
+
+ Parameters
+ ----------
+ y : array-like or sparse matrix
+ Sequence of integer labels or multilabel data to encode.
+
+ classes : array-like of shape (n_classes,)
+ Uniquely holds the label for each class.
+
+ neg_label : int, default=0
+ Value with which negative labels must be encoded.
+
+ pos_label : int, default=1
+ Value with which positive labels must be encoded.
+
+ sparse_output : bool, default=False
+ Set to True if output binary array is desired in CSR sparse format.
+ + Returns + ------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Shape will be (n_samples, 1) for binary problems. Sparse matrix will + be of CSR format. + + See Also + -------- + LabelBinarizer : Class used to wrap the functionality of label_binarize and + allow for fitting to classes independently of the transform operation. + + Examples + -------- + >>> from sklearn.preprocessing import label_binarize + >>> label_binarize([1, 6], classes=[1, 2, 4, 6]) + array([[1, 0, 0, 0], + [0, 0, 0, 1]]) + + The class ordering is preserved: + + >>> label_binarize([1, 6], classes=[1, 6, 4, 2]) + array([[1, 0, 0, 0], + [0, 1, 0, 0]]) + + Binary targets transform to a column vector + + >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes']) + array([[1], + [0], + [0], + [1]]) + """ + if not isinstance(y, list): + # XXX Workaround that will be removed when list of list format is + # dropped + y = check_array( + y, input_name="y", accept_sparse="csr", ensure_2d=False, dtype=None + ) + else: + if _num_samples(y) == 0: + raise ValueError("y has 0 samples: %r" % y) + if neg_label >= pos_label: + raise ValueError( + "neg_label={0} must be strictly less than pos_label={1}.".format( + neg_label, pos_label + ) + ) + + if sparse_output and (pos_label == 0 or neg_label != 0): + raise ValueError( + "Sparse binarization is only supported with non " + "zero pos_label and zero neg_label, got " + "pos_label={0} and neg_label={1}" + "".format(pos_label, neg_label) + ) + + # To account for pos_label == 0 in the dense case + pos_switch = pos_label == 0 + if pos_switch: + pos_label = -neg_label + + y_type = type_of_target(y) + if "multioutput" in y_type: + raise ValueError( + "Multioutput target data is not supported with label binarization" + ) + if y_type == "unknown": + raise ValueError("The type of target data is not known") + + n_samples = y.shape[0] if sp.issparse(y) else len(y) + n_classes = len(classes) + classes = np.asarray(classes) + + if y_type == "binary": + if n_classes == 1: + if sparse_output: + return sp.csr_matrix((n_samples, 1), dtype=int) + else: + Y = np.zeros((len(y), 1), dtype=int) + Y += neg_label + return Y + elif len(classes) >= 3: + y_type = "multiclass" + + sorted_class = np.sort(classes) + if y_type == "multilabel-indicator": + y_n_classes = y.shape[1] if hasattr(y, "shape") else len(y[0]) + if classes.size != y_n_classes: + raise ValueError( + "classes {0} mismatch with the labels {1} found in the data".format( + classes, unique_labels(y) + ) + ) + + if y_type in ("binary", "multiclass"): + y = column_or_1d(y) + + # pick out the known labels from y + y_in_classes = np.isin(y, classes) + y_seen = y[y_in_classes] + indices = np.searchsorted(sorted_class, y_seen) + indptr = np.hstack((0, np.cumsum(y_in_classes))) + + data = np.empty_like(indices) + data.fill(pos_label) + Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes)) + elif y_type == "multilabel-indicator": + Y = sp.csr_matrix(y) + if pos_label != 1: + data = np.empty_like(Y.data) + data.fill(pos_label) + Y.data = data + else: + raise ValueError( + "%s target data is not supported with label binarization" % y_type + ) + + if not sparse_output: + Y = Y.toarray() + Y = Y.astype(int, copy=False) + + if neg_label != 0: + Y[Y == 0] = neg_label + + if pos_switch: + Y[Y == pos_label] = 0 + else: + Y.data = Y.data.astype(int, copy=False) + + # preserve label ordering + if np.any(classes != sorted_class): + indices = np.searchsorted(sorted_class, classes) + Y = Y[:, indices] + + if y_type == 
"binary": + if sparse_output: + Y = Y.getcol(-1) + else: + Y = Y[:, -1].reshape((-1, 1)) + + return Y + + +def _inverse_binarize_multiclass(y, classes): + """Inverse label binarization transformation for multiclass. + + Multiclass uses the maximal score instead of a threshold. + """ + classes = np.asarray(classes) + + if sp.issparse(y): + # Find the argmax for each row in y where y is a CSR matrix + + y = y.tocsr() + n_samples, n_outputs = y.shape + outputs = np.arange(n_outputs) + row_max = min_max_axis(y, 1)[1] + row_nnz = np.diff(y.indptr) + + y_data_repeated_max = np.repeat(row_max, row_nnz) + # picks out all indices obtaining the maximum per row + y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data) + + # For corner case where last row has a max of 0 + if row_max[-1] == 0: + y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)]) + + # Gets the index of the first argmax in each row from y_i_all_argmax + index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1]) + # first argmax of each row + y_ind_ext = np.append(y.indices, [0]) + y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]] + # Handle rows of all 0 + y_i_argmax[np.where(row_nnz == 0)[0]] = 0 + + # Handles rows with max of 0 that contain negative numbers + samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)] + for i in samples: + ind = y.indices[y.indptr[i] : y.indptr[i + 1]] + y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0] + + return classes[y_i_argmax] + else: + return classes.take(y.argmax(axis=1), mode="clip") + + +def _inverse_binarize_thresholding(y, output_type, classes, threshold): + """Inverse label binarization transformation using thresholding.""" + + if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2: + raise ValueError("output_type='binary', but y.shape = {0}".format(y.shape)) + + if output_type != "binary" and y.shape[1] != len(classes): + raise ValueError( + "The number of class is not equal to the number of dimension of y." + ) + + classes = np.asarray(classes) + + # Perform thresholding + if sp.issparse(y): + if threshold > 0: + if y.format not in ("csr", "csc"): + y = y.tocsr() + y.data = np.array(y.data > threshold, dtype=int) + y.eliminate_zeros() + else: + y = np.array(y.toarray() > threshold, dtype=int) + else: + y = np.array(y > threshold, dtype=int) + + # Inverse transform data + if output_type == "binary": + if sp.issparse(y): + y = y.toarray() + if y.ndim == 2 and y.shape[1] == 2: + return classes[y[:, 1]] + else: + if len(classes) == 1: + return np.repeat(classes[0], len(y)) + else: + return classes[y.ravel()] + + elif output_type == "multilabel-indicator": + return y + + else: + raise ValueError("{0} format is not supported".format(output_type)) + + +class MultiLabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): + """Transform between iterable of iterables and a multilabel format. + + Although a list of sets or tuples is a very intuitive format for multilabel + data, it is unwieldy to process. This transformer converts between this + intuitive format and the supported multilabel format: a (samples x classes) + binary matrix indicating the presence of a class label. + + Parameters + ---------- + classes : array-like of shape (n_classes,), default=None + Indicates an ordering for the class labels. + All entries should be unique (cannot contain duplicate classes). + + sparse_output : bool, default=False + Set to True if output binary array is desired in CSR sparse format. 
+ + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + A copy of the `classes` parameter when provided. + Otherwise it corresponds to the sorted set of classes found + when fitting. + + See Also + -------- + OneHotEncoder : Encode categorical features using a one-hot aka one-of-K + scheme. + + Examples + -------- + >>> from sklearn.preprocessing import MultiLabelBinarizer + >>> mlb = MultiLabelBinarizer() + >>> mlb.fit_transform([(1, 2), (3,)]) + array([[1, 1, 0], + [0, 0, 1]]) + >>> mlb.classes_ + array([1, 2, 3]) + + >>> mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}]) + array([[0, 1, 1], + [1, 0, 0]]) + >>> list(mlb.classes_) + ['comedy', 'sci-fi', 'thriller'] + + A common mistake is to pass in a list, which leads to the following issue: + + >>> mlb = MultiLabelBinarizer() + >>> mlb.fit(['sci-fi', 'thriller', 'comedy']) + MultiLabelBinarizer() + >>> mlb.classes_ + array(['-', 'c', 'd', 'e', 'f', 'h', 'i', 'l', 'm', 'o', 'r', 's', 't', + 'y'], dtype=object) + + To correct this, the list of labels should be passed in as: + + >>> mlb = MultiLabelBinarizer() + >>> mlb.fit([['sci-fi', 'thriller', 'comedy']]) + MultiLabelBinarizer() + >>> mlb.classes_ + array(['comedy', 'sci-fi', 'thriller'], dtype=object) + """ + + _parameter_constraints: dict = { + "classes": ["array-like", None], + "sparse_output": ["boolean"], + } + + def __init__(self, *, classes=None, sparse_output=False): + self.classes = classes + self.sparse_output = sparse_output + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, y): + """Fit the label sets binarizer, storing :term:`classes_`. + + Parameters + ---------- + y : iterable of iterables + A set of labels (any orderable and hashable object) for each + sample. If the `classes` parameter is set, `y` will not be + iterated. + + Returns + ------- + self : object + Fitted estimator. + """ + self._cached_dict = None + + if self.classes is None: + classes = sorted(set(itertools.chain.from_iterable(y))) + elif len(set(self.classes)) < len(self.classes): + raise ValueError( + "The classes argument contains duplicate " + "classes. Remove these duplicates before passing " + "them to MultiLabelBinarizer." + ) + else: + classes = self.classes + dtype = int if all(isinstance(c, int) for c in classes) else object + self.classes_ = np.empty(len(classes), dtype=dtype) + self.classes_[:] = classes + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, y): + """Fit the label sets binarizer and transform the given label sets. + + Parameters + ---------- + y : iterable of iterables + A set of labels (any orderable and hashable object) for each + sample. If the `classes` parameter is set, `y` will not be + iterated. + + Returns + ------- + y_indicator : {ndarray, sparse matrix} of shape (n_samples, n_classes) + A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` + is in `y[i]`, and 0 otherwise. Sparse matrix will be of CSR + format. 
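+
+ Examples
+ --------
+ A minimal sketch on two toy label sets (classes are sorted during
+ fitting, here to ['a', 'b']):
+
+ >>> from sklearn.preprocessing import MultiLabelBinarizer
+ >>> MultiLabelBinarizer().fit_transform([{"a"}, {"a", "b"}])
+ array([[1, 0],
+ [1, 1]])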
+ """ + if self.classes is not None: + return self.fit(y).transform(y) + + self._cached_dict = None + + # Automatically increment on new class + class_mapping = defaultdict(int) + class_mapping.default_factory = class_mapping.__len__ + yt = self._transform(y, class_mapping) + + # sort classes and reorder columns + tmp = sorted(class_mapping, key=class_mapping.get) + + # (make safe for tuples) + dtype = int if all(isinstance(c, int) for c in tmp) else object + class_mapping = np.empty(len(tmp), dtype=dtype) + class_mapping[:] = tmp + self.classes_, inverse = np.unique(class_mapping, return_inverse=True) + # ensure yt.indices keeps its current dtype + yt.indices = np.asarray(inverse[yt.indices], dtype=yt.indices.dtype) + + if not self.sparse_output: + yt = yt.toarray() + + return yt + + def transform(self, y): + """Transform the given label sets. + + Parameters + ---------- + y : iterable of iterables + A set of labels (any orderable and hashable object) for each + sample. If the `classes` parameter is set, `y` will not be + iterated. + + Returns + ------- + y_indicator : array or CSR matrix, shape (n_samples, n_classes) + A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in + `y[i]`, and 0 otherwise. + """ + check_is_fitted(self) + + class_to_index = self._build_cache() + yt = self._transform(y, class_to_index) + + if not self.sparse_output: + yt = yt.toarray() + + return yt + + def _build_cache(self): + if self._cached_dict is None: + self._cached_dict = dict(zip(self.classes_, range(len(self.classes_)))) + + return self._cached_dict + + def _transform(self, y, class_mapping): + """Transforms the label sets with a given mapping. + + Parameters + ---------- + y : iterable of iterables + A set of labels (any orderable and hashable object) for each + sample. If the `classes` parameter is set, `y` will not be + iterated. + + class_mapping : Mapping + Maps from label to column index in label indicator matrix. + + Returns + ------- + y_indicator : sparse matrix of shape (n_samples, n_classes) + Label indicator matrix. Will be of CSR format. + """ + indices = array.array("i") + indptr = array.array("i", [0]) + unknown = set() + for labels in y: + index = set() + for label in labels: + try: + index.add(class_mapping[label]) + except KeyError: + unknown.add(label) + indices.extend(index) + indptr.append(len(indices)) + if unknown: + warnings.warn( + "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str)) + ) + data = np.ones(len(indices), dtype=int) + + return sp.csr_matrix( + (data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping)) + ) + + def inverse_transform(self, yt): + """Transform the given indicator matrix into label sets. + + Parameters + ---------- + yt : {ndarray, sparse matrix} of shape (n_samples, n_classes) + A matrix containing only 1s ands 0s. + + Returns + ------- + y : list of tuples + The set of labels for each sample such that `y[i]` consists of + `classes_[j]` for each `yt[i, j] == 1`. 
+ """ + check_is_fitted(self) + + if yt.shape[1] != len(self.classes_): + raise ValueError( + "Expected indicator for {0} classes, but got {1}".format( + len(self.classes_), yt.shape[1] + ) + ) + + if sp.issparse(yt): + yt = yt.tocsr() + if len(yt.data) != 0 and len(np.setdiff1d(yt.data, [0, 1])) > 0: + raise ValueError("Expected only 0s and 1s in label indicator.") + return [ + tuple(self.classes_.take(yt.indices[start:end])) + for start, end in zip(yt.indptr[:-1], yt.indptr[1:]) + ] + else: + unexpected = np.setdiff1d(yt, [0, 1]) + if len(unexpected) > 0: + raise ValueError( + "Expected only 0s and 1s in label indicator. Also got {0}".format( + unexpected + ) + ) + return [tuple(self.classes_.compress(indicators)) for indicators in yt] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.two_d_array = False + tags.target_tags.two_d_labels = True + return tags diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_polynomial.py b/.venv/Lib/site-packages/sklearn/preprocessing/_polynomial.py new file mode 100644 index 0000000000000000000000000000000000000000..457d17ab0695075c4e852d55394feab0cba4bd8d --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_polynomial.py @@ -0,0 +1,1173 @@ +""" +This file contains preprocessing tools based on polynomials. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import collections +from itertools import chain, combinations +from itertools import combinations_with_replacement as combinations_w_r +from numbers import Integral + +import numpy as np +from scipy import sparse +from scipy.interpolate import BSpline +from scipy.special import comb + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils import check_array +from ..utils._param_validation import Interval, StrOptions +from ..utils.fixes import parse_version, sp_version +from ..utils.stats import _weighted_percentile +from ..utils.validation import ( + FLOAT_DTYPES, + _check_feature_names_in, + _check_sample_weight, + check_is_fitted, + validate_data, +) +from ._csr_polynomial_expansion import ( + _calc_expanded_nnz, + _calc_total_nnz, + _csr_polynomial_expansion, +) + +__all__ = [ + "PolynomialFeatures", + "SplineTransformer", +] + + +def _create_expansion(X, interaction_only, deg, n_features, cumulative_size=0): + """Helper function for creating and appending sparse expansion matrices""" + + total_nnz = _calc_total_nnz(X.indptr, interaction_only, deg) + expanded_col = _calc_expanded_nnz(n_features, interaction_only, deg) + + if expanded_col == 0: + return None + # This only checks whether each block needs 64bit integers upon + # expansion. We prefer to keep int32 indexing where we can, + # since currently SciPy's CSR construction downcasts when possible, + # so we prefer to avoid an unnecessary cast. The dtype may still + # change in the concatenation process if needed. + # See: https://github.com/scipy/scipy/issues/16569 + max_indices = expanded_col - 1 + max_indptr = total_nnz + max_int32 = np.iinfo(np.int32).max + needs_int64 = max(max_indices, max_indptr) > max_int32 + index_dtype = np.int64 if needs_int64 else np.int32 + + # This is a pretty specific bug that is hard to work around by a user, + # hence we do not detail the entire bug and all possible avoidance + # mechnasisms. Instead we recommend upgrading scipy or shrinking their data. 
+ cumulative_size += expanded_col + if ( + sp_version < parse_version("1.8.0") + and cumulative_size - 1 > max_int32 + and not needs_int64 + ): + raise ValueError( + "In scipy versions `<1.8.0`, the function `scipy.sparse.hstack`" + " sometimes produces negative columns when the output shape contains" + " `n_cols` too large to be represented by a 32bit signed" + " integer. To avoid this error, either use a version" + " of scipy `>=1.8.0` or alter the `PolynomialFeatures`" + " transformer to produce fewer than 2^31 output features." + ) + + # Result of the expansion, modified in place by the + # `_csr_polynomial_expansion` routine. + expanded_data = np.empty(shape=total_nnz, dtype=X.data.dtype) + expanded_indices = np.empty(shape=total_nnz, dtype=index_dtype) + expanded_indptr = np.empty(shape=X.indptr.shape[0], dtype=index_dtype) + _csr_polynomial_expansion( + X.data, + X.indices, + X.indptr, + X.shape[1], + expanded_data, + expanded_indices, + expanded_indptr, + interaction_only, + deg, + ) + return sparse.csr_matrix( + (expanded_data, expanded_indices, expanded_indptr), + shape=(X.indptr.shape[0] - 1, expanded_col), + dtype=X.dtype, + ) + + +class PolynomialFeatures(TransformerMixin, BaseEstimator): + """Generate polynomial and interaction features. + + Generate a new feature matrix consisting of all polynomial combinations + of the features with degree less than or equal to the specified degree. + For example, if an input sample is two dimensional and of the form + [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + degree : int or tuple (min_degree, max_degree), default=2 + If a single int is given, it specifies the maximal degree of the + polynomial features. If a tuple `(min_degree, max_degree)` is passed, + then `min_degree` is the minimum and `max_degree` is the maximum + polynomial degree of the generated features. Note that `min_degree=0` + and `min_degree=1` are equivalent as outputting the degree zero term is + determined by `include_bias`. + + interaction_only : bool, default=False + If `True`, only interaction features are produced: features that are + products of at most `degree` *distinct* input features, i.e. terms with + power of 2 or higher of the same input feature are excluded: + + - included: `x[0]`, `x[1]`, `x[0] * x[1]`, etc. + - excluded: `x[0] ** 2`, `x[0] ** 2 * x[1]`, etc. + + include_bias : bool, default=True + If `True` (default), then include a bias column, the feature in which + all polynomial powers are zero (i.e. a column of ones - acts as an + intercept term in a linear model). + + order : {'C', 'F'}, default='C' + Order of output array in the dense case. `'F'` order is faster to + compute, but may slow down subsequent estimators. + + .. versionadded:: 0.21 + + Attributes + ---------- + powers_ : ndarray of shape (`n_output_features_`, `n_features_in_`) + `powers_[i, j]` is the exponent of the jth input in the ith output. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_output_features_ : int + The total number of polynomial output features. The number of output + features is computed by iterating over all suitably sized combinations + of input features. 
+ + See Also + -------- + SplineTransformer : Transformer that generates univariate B-spline bases + for features. + + Notes + ----- + Be aware that the number of features in the output array scales + polynomially in the number of features of the input array, and + exponentially in the degree. High degrees can cause overfitting. + + See :ref:`examples/linear_model/plot_polynomial_interpolation.py + ` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import PolynomialFeatures + >>> X = np.arange(6).reshape(3, 2) + >>> X + array([[0, 1], + [2, 3], + [4, 5]]) + >>> poly = PolynomialFeatures(2) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0., 0., 1.], + [ 1., 2., 3., 4., 6., 9.], + [ 1., 4., 5., 16., 20., 25.]]) + >>> poly = PolynomialFeatures(interaction_only=True) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0.], + [ 1., 2., 3., 6.], + [ 1., 4., 5., 20.]]) + """ + + _parameter_constraints: dict = { + "degree": [Interval(Integral, 0, None, closed="left"), "array-like"], + "interaction_only": ["boolean"], + "include_bias": ["boolean"], + "order": [StrOptions({"C", "F"})], + } + + def __init__( + self, degree=2, *, interaction_only=False, include_bias=True, order="C" + ): + self.degree = degree + self.interaction_only = interaction_only + self.include_bias = include_bias + self.order = order + + @staticmethod + def _combinations( + n_features, min_degree, max_degree, interaction_only, include_bias + ): + comb = combinations if interaction_only else combinations_w_r + start = max(1, min_degree) + iter = chain.from_iterable( + comb(range(n_features), i) for i in range(start, max_degree + 1) + ) + if include_bias: + iter = chain(comb(range(n_features), 0), iter) + return iter + + @staticmethod + def _num_combinations( + n_features, min_degree, max_degree, interaction_only, include_bias + ): + """Calculate number of terms in polynomial expansion + + This should be equivalent to counting the number of terms returned by + _combinations(...) but much faster. + """ + + if interaction_only: + combinations = sum( + [ + comb(n_features, i, exact=True) + for i in range(max(1, min_degree), min(max_degree, n_features) + 1) + ] + ) + else: + combinations = comb(n_features + max_degree, max_degree, exact=True) - 1 + if min_degree > 0: + d = min_degree - 1 + combinations -= comb(n_features + d, d, exact=True) - 1 + + if include_bias: + combinations += 1 + + return combinations + + @property + def powers_(self): + """Exponent for each of the inputs in the output.""" + check_is_fitted(self) + + combinations = self._combinations( + n_features=self.n_features_in_, + min_degree=self._min_degree, + max_degree=self._max_degree, + interaction_only=self.interaction_only, + include_bias=self.include_bias, + ) + return np.vstack( + [np.bincount(c, minlength=self.n_features_in_) for c in combinations] + ) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features is None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
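+
+ Examples
+ --------
+ A minimal sketch for two input features with `degree=2` (the bias
+ column is named "1"):
+
+ >>> import numpy as np
+ >>> from sklearn.preprocessing import PolynomialFeatures
+ >>> poly = PolynomialFeatures(degree=2).fit(np.arange(4).reshape(2, 2))
+ >>> poly.get_feature_names_out(["a", "b"])
+ array(['1', 'a', 'b', 'a^2', 'a b', 'b^2'], dtype=object)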
+ """ + powers = self.powers_ + input_features = _check_feature_names_in(self, input_features) + feature_names = [] + for row in powers: + inds = np.where(row)[0] + if len(inds): + name = " ".join( + ( + "%s^%d" % (input_features[ind], exp) + if exp != 1 + else input_features[ind] + ) + for ind, exp in zip(inds, row[inds]) + ) + else: + name = "1" + feature_names.append(name) + return np.asarray(feature_names, dtype=object) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """ + Compute number of output features. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Fitted transformer. + """ + _, n_features = validate_data(self, X, accept_sparse=True).shape + + if isinstance(self.degree, Integral): + if self.degree == 0 and not self.include_bias: + raise ValueError( + "Setting degree to zero and include_bias to False would result in" + " an empty output array." + ) + + self._min_degree = 0 + self._max_degree = self.degree + elif ( + isinstance(self.degree, collections.abc.Iterable) and len(self.degree) == 2 + ): + self._min_degree, self._max_degree = self.degree + if not ( + isinstance(self._min_degree, Integral) + and isinstance(self._max_degree, Integral) + and self._min_degree >= 0 + and self._min_degree <= self._max_degree + ): + raise ValueError( + "degree=(min_degree, max_degree) must " + "be non-negative integers that fulfil " + "min_degree <= max_degree, got " + f"{self.degree}." + ) + elif self._max_degree == 0 and not self.include_bias: + raise ValueError( + "Setting both min_degree and max_degree to zero and include_bias to" + " False would result in an empty output array." + ) + else: + raise ValueError( + "degree must be a non-negative int or tuple " + "(min_degree, max_degree), got " + f"{self.degree}." + ) + + self.n_output_features_ = self._num_combinations( + n_features=n_features, + min_degree=self._min_degree, + max_degree=self._max_degree, + interaction_only=self.interaction_only, + include_bias=self.include_bias, + ) + if self.n_output_features_ > np.iinfo(np.intp).max: + msg = ( + "The output that would result from the current configuration would" + f" have {self.n_output_features_} features which is too large to be" + f" indexed by {np.intp().dtype.name}. Please change some or all of the" + " following:\n- The number of features in the input, currently" + f" {n_features=}\n- The range of degrees to calculate, currently" + f" [{self._min_degree}, {self._max_degree}]\n- Whether to include only" + f" interaction terms, currently {self.interaction_only}\n- Whether to" + f" include a bias term, currently {self.include_bias}." + ) + if ( + np.intp == np.int32 + and self.n_output_features_ <= np.iinfo(np.int64).max + ): # pragma: nocover + msg += ( + "\nNote that the current Python runtime has a limited 32 bit " + "address space and that this configuration would have been " + "admissible if run on a 64 bit Python runtime." + ) + raise ValueError(msg) + # We also record the number of output features for + # _max_degree = 0 + self._n_out_full = self._num_combinations( + n_features=n_features, + min_degree=0, + max_degree=self._max_degree, + interaction_only=self.interaction_only, + include_bias=self.include_bias, + ) + + return self + + def transform(self, X): + """Transform data to polynomial features. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to transform, row by row. + + Prefer CSR over CSC for sparse input (for speed), but CSC is + required if the degree is 4 or higher. If the degree is less than + 4 and the input format is CSC, it will be converted to CSR, have + its polynomial features generated, then converted back to CSC. + + If the degree is 2 or 3, the method described in "Leveraging + Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices + Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is + used, which is much faster than the method used on CSC input. For + this reason, a CSC input will be converted to CSR, and the output + will be converted back to CSC prior to being returned, hence the + preference of CSR. + + Returns + ------- + XP : {ndarray, sparse matrix} of shape (n_samples, NP) + The matrix of features, where `NP` is the number of polynomial + features generated from the combination of inputs. If a sparse + matrix is provided, it will be converted into a sparse + `csr_matrix`. + """ + check_is_fitted(self) + + X = validate_data( + self, + X, + order="F", + dtype=FLOAT_DTYPES, + reset=False, + accept_sparse=("csr", "csc"), + ) + + n_samples, n_features = X.shape + max_int32 = np.iinfo(np.int32).max + if sparse.issparse(X) and X.format == "csr": + if self._max_degree > 3: + return self.transform(X.tocsc()).tocsr() + to_stack = [] + if self.include_bias: + to_stack.append( + sparse.csr_matrix(np.ones(shape=(n_samples, 1), dtype=X.dtype)) + ) + if self._min_degree <= 1 and self._max_degree > 0: + to_stack.append(X) + + cumulative_size = sum(mat.shape[1] for mat in to_stack) + for deg in range(max(2, self._min_degree), self._max_degree + 1): + expanded = _create_expansion( + X=X, + interaction_only=self.interaction_only, + deg=deg, + n_features=n_features, + cumulative_size=cumulative_size, + ) + if expanded is not None: + to_stack.append(expanded) + cumulative_size += expanded.shape[1] + if len(to_stack) == 0: + # edge case: deal with empty matrix + XP = sparse.csr_matrix((n_samples, 0), dtype=X.dtype) + else: + # `scipy.sparse.hstack` breaks in scipy<1.9.2 + # when `n_output_features_ > max_int32` + all_int32 = all(mat.indices.dtype == np.int32 for mat in to_stack) + if ( + sp_version < parse_version("1.9.2") + and self.n_output_features_ > max_int32 + and all_int32 + ): + raise ValueError( # pragma: no cover + "In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`" + " produces negative columns when:\n1. The output shape contains" + " `n_cols` too large to be represented by a 32bit signed" + " integer.\n2. 
All sub-matrices to be stacked have indices of" + " dtype `np.int32`.\nTo avoid this error, either use a version" + " of scipy `>=1.9.2` or alter the `PolynomialFeatures`" + " transformer to produce fewer than 2^31 output features" + ) + XP = sparse.hstack(to_stack, dtype=X.dtype, format="csr") + elif sparse.issparse(X) and X.format == "csc" and self._max_degree < 4: + return self.transform(X.tocsr()).tocsc() + elif sparse.issparse(X): + combinations = self._combinations( + n_features=n_features, + min_degree=self._min_degree, + max_degree=self._max_degree, + interaction_only=self.interaction_only, + include_bias=self.include_bias, + ) + columns = [] + for combi in combinations: + if combi: + out_col = 1 + for col_idx in combi: + out_col = X[:, [col_idx]].multiply(out_col) + columns.append(out_col) + else: + bias = sparse.csc_matrix(np.ones((X.shape[0], 1))) + columns.append(bias) + XP = sparse.hstack(columns, dtype=X.dtype).tocsc() + else: + # Do as if _min_degree = 0 and cut down array after the + # computation, i.e. use _n_out_full instead of n_output_features_. + XP = np.empty( + shape=(n_samples, self._n_out_full), dtype=X.dtype, order=self.order + ) + + # What follows is a faster implementation of: + # for i, comb in enumerate(combinations): + # XP[:, i] = X[:, comb].prod(1) + # This implementation uses two optimisations. + # First one is broadcasting, + # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1] + # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2] + # ... + # multiply ([X[:, start:end], X[:, start]) -> ... + # Second optimisation happens for degrees >= 3. + # Xi^3 is computed reusing previous computation: + # Xi^3 = Xi^2 * Xi. + + # degree 0 term + if self.include_bias: + XP[:, 0] = 1 + current_col = 1 + else: + current_col = 0 + + if self._max_degree == 0: + return XP + + # degree 1 term + XP[:, current_col : current_col + n_features] = X + index = list(range(current_col, current_col + n_features)) + current_col += n_features + index.append(current_col) + + # loop over degree >= 2 terms + for _ in range(2, self._max_degree + 1): + new_index = [] + end = index[-1] + for feature_idx in range(n_features): + start = index[feature_idx] + new_index.append(current_col) + if self.interaction_only: + start += index[feature_idx + 1] - index[feature_idx] + next_col = current_col + end - start + if next_col <= current_col: + break + # XP[:, start:end] are terms of degree d - 1 + # that exclude feature #feature_idx. + np.multiply( + XP[:, start:end], + X[:, feature_idx : feature_idx + 1], + out=XP[:, current_col:next_col], + casting="no", + ) + current_col = next_col + + new_index.append(current_col) + index = new_index + + if self._min_degree > 1: + n_XP, n_Xout = self._n_out_full, self.n_output_features_ + if self.include_bias: + Xout = np.empty( + shape=(n_samples, n_Xout), dtype=XP.dtype, order=self.order + ) + Xout[:, 0] = 1 + Xout[:, 1:] = XP[:, n_XP - n_Xout + 1 :] + else: + Xout = XP[:, n_XP - n_Xout :].copy() + XP = Xout + return XP + + +class SplineTransformer(TransformerMixin, BaseEstimator): + """Generate univariate B-spline bases for features. + + Generate a new feature matrix consisting of + `n_splines=n_knots + degree - 1` (`n_knots - 1` for + `extrapolation="periodic"`) spline basis functions + (B-splines) of polynomial order=`degree` for each feature. + + In order to learn more about the SplineTransformer class go to: + :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py` + + Read more in the :ref:`User Guide `. + + .. 
versionadded:: 1.0 + + Parameters + ---------- + n_knots : int, default=5 + Number of knots of the splines if `knots` equals one of + {'uniform', 'quantile'}. Must be larger or equal 2. Ignored if `knots` + is array-like. + + degree : int, default=3 + The polynomial degree of the spline basis. Must be a non-negative + integer. + + knots : {'uniform', 'quantile'} or array-like of shape \ + (n_knots, n_features), default='uniform' + Set knot positions such that first knot <= features <= last knot. + + - If 'uniform', `n_knots` number of knots are distributed uniformly + from min to max values of the features. + - If 'quantile', they are distributed uniformly along the quantiles of + the features. + - If an array-like is given, it directly specifies the sorted knot + positions including the boundary knots. Note that, internally, + `degree` number of knots are added before the first knot, the same + after the last knot. + + extrapolation : {'error', 'constant', 'linear', 'continue', 'periodic'}, \ + default='constant' + If 'error', values outside the min and max values of the training + features raises a `ValueError`. If 'constant', the value of the + splines at minimum and maximum value of the features is used as + constant extrapolation. If 'linear', a linear extrapolation is used. + If 'continue', the splines are extrapolated as is, i.e. option + `extrapolate=True` in :class:`scipy.interpolate.BSpline`. If + 'periodic', periodic splines with a periodicity equal to the distance + between the first and last knot are used. Periodic splines enforce + equal function values and derivatives at the first and last knot. + For example, this makes it possible to avoid introducing an arbitrary + jump between Dec 31st and Jan 1st in spline features derived from a + naturally periodic "day-of-year" input feature. In this case it is + recommended to manually set the knot values to control the period. + + include_bias : bool, default=True + If False, then the last spline element inside the data range + of a feature is dropped. As B-splines sum to one over the spline basis + functions for each data point, they implicitly include a bias term, + i.e. a column of ones. It acts as an intercept term in a linear models. + + order : {'C', 'F'}, default='C' + Order of output array in the dense case. `'F'` order is faster to compute, but + may slow down subsequent estimators. + + sparse_output : bool, default=False + Will return sparse CSR matrix if set True else will return an array. This + option is only available with `scipy>=1.8`. + + .. versionadded:: 1.2 + + Attributes + ---------- + bsplines_ : list of shape (n_features,) + List of BSplines objects, one for each feature. + + n_features_in_ : int + The total number of input features. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_features_out_ : int + The total number of output features, which is computed as + `n_features * n_splines`, where `n_splines` is + the number of bases elements of the B-splines, + `n_knots + degree - 1` for non-periodic splines and + `n_knots - 1` for periodic ones. + If `include_bias=False`, then it is only + `n_features * (n_splines - 1)`. + + See Also + -------- + KBinsDiscretizer : Transformer that bins continuous data into intervals. + + PolynomialFeatures : Transformer that generates polynomial and interaction + features. 
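# Arithmetic behind `n_features_out_` (standalone sketch): each feature gets
# n_knots + degree - 1 spline basis functions for non-periodic splines, minus
# one per feature when include_bias=False.
import numpy as np
from sklearn.preprocessing import SplineTransformer

X = np.linspace(0.0, 1.0, 20).reshape(-1, 2)  # 10 samples, 2 features
spl = SplineTransformer(n_knots=4, degree=3).fit(X)
print(spl.n_features_out_)  # 2 * (4 + 3 - 1) = 12

spl_no_bias = SplineTransformer(n_knots=4, degree=3, include_bias=False).fit(X)
print(spl_no_bias.n_features_out_)  # 2 * (4 + 3 - 1 - 1) = 10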
+ + Notes + ----- + High degrees and a high number of knots can cause overfitting. + + See :ref:`examples/linear_model/plot_polynomial_interpolation.py + `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import SplineTransformer + >>> X = np.arange(6).reshape(6, 1) + >>> spline = SplineTransformer(degree=2, n_knots=3) + >>> spline.fit_transform(X) + array([[0.5 , 0.5 , 0. , 0. ], + [0.18, 0.74, 0.08, 0. ], + [0.02, 0.66, 0.32, 0. ], + [0. , 0.32, 0.66, 0.02], + [0. , 0.08, 0.74, 0.18], + [0. , 0. , 0.5 , 0.5 ]]) + """ + + _parameter_constraints: dict = { + "n_knots": [Interval(Integral, 2, None, closed="left")], + "degree": [Interval(Integral, 0, None, closed="left")], + "knots": [StrOptions({"uniform", "quantile"}), "array-like"], + "extrapolation": [ + StrOptions({"error", "constant", "linear", "continue", "periodic"}) + ], + "include_bias": ["boolean"], + "order": [StrOptions({"C", "F"})], + "sparse_output": ["boolean"], + } + + def __init__( + self, + n_knots=5, + degree=3, + *, + knots="uniform", + extrapolation="constant", + include_bias=True, + order="C", + sparse_output=False, + ): + self.n_knots = n_knots + self.degree = degree + self.knots = knots + self.extrapolation = extrapolation + self.include_bias = include_bias + self.order = order + self.sparse_output = sparse_output + + @staticmethod + def _get_base_knot_positions(X, n_knots=10, knots="uniform", sample_weight=None): + """Calculate base knot positions. + + Base knots such that first knot <= feature <= last knot. For the + B-spline construction with scipy.interpolate.BSpline, 2*degree knots + beyond the base interval are added. + + Returns + ------- + knots : ndarray of shape (n_knots, n_features), dtype=np.float64 + Knot positions (points) of base interval. + """ + if knots == "quantile": + percentiles = 100 * np.linspace( + start=0, stop=1, num=n_knots, dtype=np.float64 + ) + + if sample_weight is None: + knots = np.percentile(X, percentiles, axis=0) + else: + knots = np.array( + [ + _weighted_percentile(X, sample_weight, percentile) + for percentile in percentiles + ] + ) + + else: + # knots == 'uniform': + # Note that the variable `knots` has already been validated and + # `else` is therefore safe. + # Disregard observations with zero weight. + mask = slice(None, None, 1) if sample_weight is None else sample_weight > 0 + x_min = np.amin(X[mask], axis=0) + x_max = np.amax(X[mask], axis=0) + + knots = np.linspace( + start=x_min, + stop=x_max, + num=n_knots, + endpoint=True, + dtype=np.float64, + ) + + return knots + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
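# The naming scheme of `get_feature_names_out` (standalone sketch): input
# feature i yields columns "<name>_sp_0" ... "<name>_sp_{n_splines - 1}".
import numpy as np
from sklearn.preprocessing import SplineTransformer

X = np.arange(10, dtype=float).reshape(-1, 1)
spl = SplineTransformer(n_knots=3, degree=2).fit(X)
print(spl.get_feature_names_out())
# ['x0_sp_0' 'x0_sp_1' 'x0_sp_2' 'x0_sp_3']   (n_splines = 3 + 2 - 1 = 4)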
+ """ + check_is_fitted(self, "n_features_in_") + n_splines = self.bsplines_[0].c.shape[1] + + input_features = _check_feature_names_in(self, input_features) + feature_names = [] + for i in range(self.n_features_in_): + for j in range(n_splines - 1 + self.include_bias): + feature_names.append(f"{input_features[i]}_sp_{j}") + return np.asarray(feature_names, dtype=object) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, sample_weight=None): + """Compute knot positions of splines. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + + y : None + Ignored. + + sample_weight : array-like of shape (n_samples,), default = None + Individual weights for each sample. Used to calculate quantiles if + `knots="quantile"`. For `knots="uniform"`, zero weighted + observations are ignored for finding the min and max of `X`. + + Returns + ------- + self : object + Fitted transformer. + """ + X = validate_data( + self, + X, + reset=True, + accept_sparse=False, + ensure_min_samples=2, + ensure_2d=True, + ) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + _, n_features = X.shape + + if isinstance(self.knots, str): + base_knots = self._get_base_knot_positions( + X, n_knots=self.n_knots, knots=self.knots, sample_weight=sample_weight + ) + else: + base_knots = check_array(self.knots, dtype=np.float64) + if base_knots.shape[0] < 2: + raise ValueError("Number of knots, knots.shape[0], must be >= 2.") + elif base_knots.shape[1] != n_features: + raise ValueError("knots.shape[1] == n_features is violated.") + elif not np.all(np.diff(base_knots, axis=0) > 0): + raise ValueError("knots must be sorted without duplicates.") + + if self.sparse_output and sp_version < parse_version("1.8.0"): + raise ValueError( + "Option sparse_output=True is only available with scipy>=1.8.0, " + f"but here scipy=={sp_version} is used." + ) + + # number of knots for base interval + n_knots = base_knots.shape[0] + + if self.extrapolation == "periodic" and n_knots <= self.degree: + raise ValueError( + "Periodic splines require degree < n_knots. Got n_knots=" + f"{n_knots} and degree={self.degree}." + ) + + # number of splines basis functions + if self.extrapolation != "periodic": + n_splines = n_knots + self.degree - 1 + else: + # periodic splines have self.degree less degrees of freedom + n_splines = n_knots - 1 + + degree = self.degree + n_out = n_features * n_splines + # We have to add degree number of knots below, and degree number knots + # above the base knots in order to make the spline basis complete. + if self.extrapolation == "periodic": + # For periodic splines the spacing of the first / last degree knots + # needs to be a continuation of the spacing of the last / first + # base knots. + period = base_knots[-1] - base_knots[0] + knots = np.r_[ + base_knots[-(degree + 1) : -1] - period, + base_knots, + base_knots[1 : (degree + 1)] + period, + ] + + else: + # Eilers & Marx in "Flexible smoothing with B-splines and + # penalties" https://doi.org/10.1214/ss/1038425655 advice + # against repeating first and last knot several times, which + # would have inferior behaviour at boundaries if combined with + # a penalty (hence P-Spline). We follow this advice even if our + # splines are unpenalized. 
Meaning we do not: + # knots = np.r_[ + # np.tile(base_knots.min(axis=0), reps=[degree, 1]), + # base_knots, + # np.tile(base_knots.max(axis=0), reps=[degree, 1]) + # ] + # Instead, we reuse the distance of the 2 fist/last knots. + dist_min = base_knots[1] - base_knots[0] + dist_max = base_knots[-1] - base_knots[-2] + + knots = np.r_[ + np.linspace( + base_knots[0] - degree * dist_min, + base_knots[0] - dist_min, + num=degree, + ), + base_knots, + np.linspace( + base_knots[-1] + dist_max, + base_knots[-1] + degree * dist_max, + num=degree, + ), + ] + + # With a diagonal coefficient matrix, we get back the spline basis + # elements, i.e. the design matrix of the spline. + # Note, BSpline appreciates C-contiguous float64 arrays as c=coef. + coef = np.eye(n_splines, dtype=np.float64) + if self.extrapolation == "periodic": + coef = np.concatenate((coef, coef[:degree, :])) + + extrapolate = self.extrapolation in ["periodic", "continue"] + + bsplines = [ + BSpline.construct_fast( + knots[:, i], coef, self.degree, extrapolate=extrapolate + ) + for i in range(n_features) + ] + self.bsplines_ = bsplines + + self.n_features_out_ = n_out - n_features * (1 - self.include_bias) + return self + + def transform(self, X): + """Transform each feature data to B-splines. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to transform. + + Returns + ------- + XBS : {ndarray, sparse matrix} of shape (n_samples, n_features * n_splines) + The matrix of features, where n_splines is the number of bases + elements of the B-splines, n_knots + degree - 1. + """ + check_is_fitted(self) + + X = validate_data(self, X, reset=False, accept_sparse=False, ensure_2d=True) + + n_samples, n_features = X.shape + n_splines = self.bsplines_[0].c.shape[1] + degree = self.degree + + # TODO: Remove this condition, once scipy 1.10 is the minimum version. + # Only scipy => 1.10 supports design_matrix(.., extrapolate=..). + # The default (implicit in scipy < 1.10) is extrapolate=False. + scipy_1_10 = sp_version >= parse_version("1.10.0") + # Note: self.bsplines_[0].extrapolate is True for extrapolation in + # ["periodic", "continue"] + if scipy_1_10: + use_sparse = self.sparse_output + kwargs_extrapolate = {"extrapolate": self.bsplines_[0].extrapolate} + else: + use_sparse = self.sparse_output and not self.bsplines_[0].extrapolate + kwargs_extrapolate = dict() + + # Note that scipy BSpline returns float64 arrays and converts input + # x=X[:, i] to c-contiguous float64. + n_out = self.n_features_out_ + n_features * (1 - self.include_bias) + if X.dtype in FLOAT_DTYPES: + dtype = X.dtype + else: + dtype = np.float64 + if use_sparse: + output_list = [] + else: + XBS = np.zeros((n_samples, n_out), dtype=dtype, order=self.order) + + for i in range(n_features): + spl = self.bsplines_[i] + + if self.extrapolation in ("continue", "error", "periodic"): + if self.extrapolation == "periodic": + # With periodic extrapolation we map x to the segment + # [spl.t[k], spl.t[n]]. + # This is equivalent to BSpline(.., extrapolate="periodic") + # for scipy>=1.0.0. + n = spl.t.size - spl.k - 1 + # Assign to new array to avoid inplace operation + x = spl.t[spl.k] + (X[:, i] - spl.t[spl.k]) % ( + spl.t[n] - spl.t[spl.k] + ) + else: + x = X[:, i] + + if use_sparse: + XBS_sparse = BSpline.design_matrix( + x, spl.t, spl.k, **kwargs_extrapolate + ) + if self.extrapolation == "periodic": + # See the construction of coef in fit. 
We need to add the last + # degree spline basis function to the first degree ones and + # then drop the last ones. + # Note: See comment about SparseEfficiencyWarning below. + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[:, :degree] += XBS_sparse[:, -degree:] + XBS_sparse = XBS_sparse[:, :-degree] + else: + XBS[:, (i * n_splines) : ((i + 1) * n_splines)] = spl(x) + else: # extrapolation in ("constant", "linear") + xmin, xmax = spl.t[degree], spl.t[-degree - 1] + # spline values at boundaries + f_min, f_max = spl(xmin), spl(xmax) + mask = (xmin <= X[:, i]) & (X[:, i] <= xmax) + if use_sparse: + mask_inv = ~mask + x = X[:, i].copy() + # Set some arbitrary values outside boundary that will be reassigned + # later. + x[mask_inv] = spl.t[self.degree] + XBS_sparse = BSpline.design_matrix(x, spl.t, spl.k) + # Note: Without converting to lil_matrix we would get: + # scipy.sparse._base.SparseEfficiencyWarning: Changing the sparsity + # structure of a csr_matrix is expensive. lil_matrix is more + # efficient. + if np.any(mask_inv): + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask_inv, :] = 0 + else: + XBS[mask, (i * n_splines) : ((i + 1) * n_splines)] = spl(X[mask, i]) + + # Note for extrapolation: + # 'continue' is already returned as is by scipy BSplines + if self.extrapolation == "error": + # BSpline with extrapolate=False does not raise an error, but + # outputs np.nan. + if (use_sparse and np.any(np.isnan(XBS_sparse.data))) or ( + not use_sparse + and np.any( + np.isnan(XBS[:, (i * n_splines) : ((i + 1) * n_splines)]) + ) + ): + raise ValueError( + "X contains values beyond the limits of the knots." + ) + elif self.extrapolation == "constant": + # Set all values beyond xmin and xmax to the value of the + # spline basis functions at those two positions. + # Only the first degree and last degree number of splines + # have non-zero values at the boundaries. + + mask = X[:, i] < xmin + if np.any(mask): + if use_sparse: + # Note: See comment about SparseEfficiencyWarning above. + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask, :degree] = f_min[:degree] + + else: + XBS[mask, (i * n_splines) : (i * n_splines + degree)] = f_min[ + :degree + ] + + mask = X[:, i] > xmax + if np.any(mask): + if use_sparse: + # Note: See comment about SparseEfficiencyWarning above. + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask, -degree:] = f_max[-degree:] + else: + XBS[ + mask, + ((i + 1) * n_splines - degree) : ((i + 1) * n_splines), + ] = f_max[-degree:] + + elif self.extrapolation == "linear": + # Continue the degree first and degree last spline bases + # linearly beyond the boundaries, with slope = derivative at + # the boundary. + # Note that all others have derivative = value = 0 at the + # boundaries. + + # spline derivatives = slopes at boundaries + fp_min, fp_max = spl(xmin, nu=1), spl(xmax, nu=1) + # Compute the linear continuation. + if degree <= 1: + # For degree=1, the derivative of 2nd spline is not zero at + # boundary. For degree=0 it is the same as 'constant'. + degree += 1 + for j in range(degree): + mask = X[:, i] < xmin + if np.any(mask): + linear_extr = f_min[j] + (X[mask, i] - xmin) * fp_min[j] + if use_sparse: + # Note: See comment about SparseEfficiencyWarning above. + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask, j] = linear_extr + else: + XBS[mask, i * n_splines + j] = linear_extr + + mask = X[:, i] > xmax + if np.any(mask): + k = n_splines - 1 - j + linear_extr = f_max[k] + (X[mask, i] - xmax) * fp_max[k] + if use_sparse: + # Note: See comment about SparseEfficiencyWarning above. 
+ XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask, k : k + 1] = linear_extr[:, None] + else: + XBS[mask, i * n_splines + k] = linear_extr + + if use_sparse: + XBS_sparse = XBS_sparse.tocsr() + output_list.append(XBS_sparse) + + if use_sparse: + # TODO: Remove this conditional error when the minimum supported version of + # SciPy is 1.9.2 + # `scipy.sparse.hstack` breaks in scipy<1.9.2 + # when `n_features_out_ > max_int32` + max_int32 = np.iinfo(np.int32).max + all_int32 = True + for mat in output_list: + all_int32 &= mat.indices.dtype == np.int32 + if ( + sp_version < parse_version("1.9.2") + and self.n_features_out_ > max_int32 + and all_int32 + ): + raise ValueError( + "In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`" + " produces negative columns when:\n1. The output shape contains" + " `n_cols` too large to be represented by a 32bit signed" + " integer.\n. All sub-matrices to be stacked have indices of" + " dtype `np.int32`.\nTo avoid this error, either use a version" + " of scipy `>=1.9.2` or alter the `SplineTransformer`" + " transformer to produce fewer than 2^31 output features" + ) + XBS = sparse.hstack(output_list, format="csr") + elif self.sparse_output: + # TODO: Remove ones scipy 1.10 is the minimum version. See comments above. + XBS = sparse.csr_matrix(XBS) + + if self.include_bias: + return XBS + else: + # We throw away one spline basis per feature. + # We chose the last one. + indices = [j for j in range(XBS.shape[1]) if (j + 1) % n_splines != 0] + return XBS[:, indices] diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder.py b/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..171aafe7a03ed5711cbf0d3181c686199e6d370c --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder.py @@ -0,0 +1,534 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np + +from ..base import OneToOneFeatureMixin, _fit_context +from ..utils._param_validation import Interval, StrOptions +from ..utils.multiclass import type_of_target +from ..utils.validation import ( + _check_feature_names_in, + _check_y, + check_consistent_length, + check_is_fitted, +) +from ._encoders import _BaseEncoder +from ._target_encoder_fast import _fit_encoding_fast, _fit_encoding_fast_auto_smooth + + +class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder): + """Target Encoder for regression and classification targets. + + Each category is encoded based on a shrunk estimate of the average target + values for observations belonging to the category. The encoding scheme mixes + the global target mean with the target mean conditioned on the value of the + category (see [MIC]_). + + When the target type is "multiclass", encodings are based + on the conditional probability estimate for each class. The target is first + binarized using the "one-vs-all" scheme via + :class:`~sklearn.preprocessing.LabelBinarizer`, then the average target + value for each class and each category is used for encoding, resulting in + `n_features` * `n_classes` encoded output features. + + :class:`TargetEncoder` considers missing values, such as `np.nan` or `None`, + as another category and encodes them like any other category. Categories + that are not seen during :meth:`fit` are encoded with the target mean, i.e. + `target_mean_`. 
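# Behaviour for unseen categories (standalone sketch, scikit-learn >= 1.3):
# with smooth=0.0 each seen category maps to its own target mean, while a
# category unseen during fit is mapped to the global `target_mean_`.
import numpy as np
from sklearn.preprocessing import TargetEncoder

X = np.array([["a"], ["a"], ["b"], ["b"]], dtype=object)
y = [10.0, 12.0, 20.0, 22.0]

enc = TargetEncoder(smooth=0.0).fit(X, y)
print(enc.target_mean_)  # 16.0, the global mean of y
print(enc.transform(np.array([["c"]], dtype=object)))  # [[16.]] for unseen "c"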
+ + For a demo on the importance of the `TargetEncoder` internal cross-fitting, + see + :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder_cross_val.py`. + For a comparison of different encoders, refer to + :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. Read + more in the :ref:`User Guide `. + + .. note:: + `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide ` for details. + + .. versionadded:: 1.3 + + Parameters + ---------- + categories : "auto" or list of shape (n_features,) of array-like, default="auto" + Categories (unique values) per feature: + + - `"auto"` : Determine categories automatically from the training data. + - list : `categories[i]` holds the categories expected in the i-th column. The + passed categories should not mix strings and numeric values within a single + feature, and should be sorted in case of numeric values. + + The used categories are stored in the `categories_` fitted attribute. + + target_type : {"auto", "continuous", "binary", "multiclass"}, default="auto" + Type of target. + + - `"auto"` : Type of target is inferred with + :func:`~sklearn.utils.multiclass.type_of_target`. + - `"continuous"` : Continuous target + - `"binary"` : Binary target + - `"multiclass"` : Multiclass target + + .. note:: + The type of target inferred with `"auto"` may not be the desired target + type used for modeling. For example, if the target consisted of integers + between 0 and 100, then :func:`~sklearn.utils.multiclass.type_of_target` + will infer the target as `"multiclass"`. In this case, setting + `target_type="continuous"` will specify the target as a regression + problem. The `target_type_` attribute gives the target type used by the + encoder. + + .. versionchanged:: 1.4 + Added the option 'multiclass'. + + smooth : "auto" or float, default="auto" + The amount of mixing of the target mean conditioned on the value of the + category with the global target mean. A larger `smooth` value will put + more weight on the global target mean. + If `"auto"`, then `smooth` is set to an empirical Bayes estimate. + + cv : int, default=5 + Determines the number of folds in the :term:`cross fitting` strategy used in + :meth:`fit_transform`. For classification targets, `StratifiedKFold` is used + and for continuous targets, `KFold` is used. + + shuffle : bool, default=True + Whether to shuffle the data in :meth:`fit_transform` before splitting into + folds. Note that the samples within each split will not be shuffled. + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold. Otherwise, this + parameter has no effect. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + encodings_ : list of shape (n_features,) or (n_features * n_classes) of \ + ndarray + Encodings learnt on all of `X`. + For feature `i`, `encodings_[i]` are the encodings matching the + categories listed in `categories_[i]`. When `target_type_` is + "multiclass", the encoding for feature `i` and class `j` is stored in + `encodings_[j + (i * len(classes_))]`. 
E.g., for 2 features (f) and + 3 classes (c), encodings are ordered: + f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2, + + categories_ : list of shape (n_features,) of ndarray + The categories of each input feature determined during fitting or + specified in `categories` + (in order of the features in `X` and corresponding with the output + of :meth:`transform`). + + target_type_ : str + Type of target. + + target_mean_ : float + The overall mean of the target. This value is only used in :meth:`transform` + to encode categories. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + classes_ : ndarray or None + If `target_type_` is 'binary' or 'multiclass', holds the label for each class, + otherwise `None`. + + See Also + -------- + OrdinalEncoder : Performs an ordinal (integer) encoding of the categorical features. + Contrary to TargetEncoder, this encoding is not supervised. Treating the + resulting encoding as a numerical features therefore lead arbitrarily + ordered values and therefore typically lead to lower predictive performance + when used as preprocessing for a classifier or regressor. + OneHotEncoder : Performs a one-hot encoding of categorical features. This + unsupervised encoding is better suited for low cardinality categorical + variables as it generate one new feature per unique category. + + References + ---------- + .. [MIC] :doi:`Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality + categorical attributes in classification and prediction problems" + SIGKDD Explor. Newsl. 3, 1 (July 2001), 27–32. <10.1145/507533.507538>` + + Examples + -------- + With `smooth="auto"`, the smoothing parameter is set to an empirical Bayes estimate: + + >>> import numpy as np + >>> from sklearn.preprocessing import TargetEncoder + >>> X = np.array([["dog"] * 20 + ["cat"] * 30 + ["snake"] * 38], dtype=object).T + >>> y = [90.3] * 5 + [80.1] * 15 + [20.4] * 5 + [20.1] * 25 + [21.2] * 8 + [49] * 30 + >>> enc_auto = TargetEncoder(smooth="auto") + >>> X_trans = enc_auto.fit_transform(X, y) + + >>> # A high `smooth` parameter puts more weight on global mean on the categorical + >>> # encodings: + >>> enc_high_smooth = TargetEncoder(smooth=5000.0).fit(X, y) + >>> enc_high_smooth.target_mean_ + np.float64(44...) + >>> enc_high_smooth.encodings_ + [array([44..., 44..., 44...])] + + >>> # On the other hand, a low `smooth` parameter puts more weight on target + >>> # conditioned on the value of the categorical: + >>> enc_low_smooth = TargetEncoder(smooth=1.0).fit(X, y) + >>> enc_low_smooth.encodings_ + [array([20..., 80..., 43...])] + """ + + _parameter_constraints: dict = { + "categories": [StrOptions({"auto"}), list], + "target_type": [StrOptions({"auto", "continuous", "binary", "multiclass"})], + "smooth": [StrOptions({"auto"}), Interval(Real, 0, None, closed="left")], + "cv": [Interval(Integral, 2, None, closed="left")], + "shuffle": ["boolean"], + "random_state": ["random_state"], + } + + def __init__( + self, + categories="auto", + target_type="auto", + smooth="auto", + cv=5, + shuffle=True, + random_state=None, + ): + self.categories = categories + self.smooth = smooth + self.target_type = target_type + self.cv = cv + self.shuffle = shuffle + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit the :class:`TargetEncoder` to X and y. 
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + y : array-like of shape (n_samples,) + The target data used to encode the categories. + + Returns + ------- + self : object + Fitted encoder. + """ + self._fit_encodings_all(X, y) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y): + """Fit :class:`TargetEncoder` and transform X with the target encoding. + + .. note:: + `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide `. for details. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + y : array-like of shape (n_samples,) + The target data used to encode the categories. + + Returns + ------- + X_trans : ndarray of shape (n_samples, n_features) or \ + (n_samples, (n_features * n_classes)) + Transformed input. + """ + from ..model_selection import KFold, StratifiedKFold # avoid circular import + + X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(X, y) + + # The cv splitter is voluntarily restricted to *KFold to enforce non + # overlapping validation folds, otherwise the fit_transform output will + # not be well-specified. + if self.target_type_ == "continuous": + cv = KFold(self.cv, shuffle=self.shuffle, random_state=self.random_state) + else: + cv = StratifiedKFold( + self.cv, shuffle=self.shuffle, random_state=self.random_state + ) + + # If 'multiclass' multiply axis=1 by num classes else keep shape the same + if self.target_type_ == "multiclass": + X_out = np.empty( + (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)), + dtype=np.float64, + ) + else: + X_out = np.empty_like(X_ordinal, dtype=np.float64) + + for train_idx, test_idx in cv.split(X, y): + X_train, y_train = X_ordinal[train_idx, :], y_encoded[train_idx] + y_train_mean = np.mean(y_train, axis=0) + + if self.target_type_ == "multiclass": + encodings = self._fit_encoding_multiclass( + X_train, + y_train, + n_categories, + y_train_mean, + ) + else: + encodings = self._fit_encoding_binary_or_continuous( + X_train, + y_train, + n_categories, + y_train_mean, + ) + self._transform_X_ordinal( + X_out, + X_ordinal, + ~X_known_mask, + test_idx, + encodings, + y_train_mean, + ) + return X_out + + def transform(self, X): + """Transform X with the target encoding. + + .. note:: + `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide `. for details. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + Returns + ------- + X_trans : ndarray of shape (n_samples, n_features) or \ + (n_samples, (n_features * n_classes)) + Transformed input. 
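# The cross-fitting note above in practice (standalone sketch): fit_transform
# encodes each fold with statistics from the other folds, so it generally
# differs from transform, which applies the encodings learnt on all of X.
import numpy as np
from sklearn.preprocessing import TargetEncoder

rng = np.random.RandomState(0)
X = rng.choice(["a", "b", "c"], size=(50, 1)).astype(object)
y = rng.normal(size=50)

enc = TargetEncoder(cv=5, shuffle=True, random_state=0)
X_cross = enc.fit_transform(X, y)  # fold-dependent, cross-fitted encodings
X_plain = enc.transform(X)         # encodings learnt on all of X
print(np.allclose(X_cross, X_plain))  # False in general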
+ """ + X_ordinal, X_known_mask = self._transform( + X, handle_unknown="ignore", ensure_all_finite="allow-nan" + ) + + # If 'multiclass' multiply axis=1 by num of classes else keep shape the same + if self.target_type_ == "multiclass": + X_out = np.empty( + (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)), + dtype=np.float64, + ) + else: + X_out = np.empty_like(X_ordinal, dtype=np.float64) + + self._transform_X_ordinal( + X_out, + X_ordinal, + ~X_known_mask, + slice(None), + self.encodings_, + self.target_mean_, + ) + return X_out + + def _fit_encodings_all(self, X, y): + """Fit a target encoding with all the data.""" + # avoid circular import + from ..preprocessing import ( + LabelBinarizer, + LabelEncoder, + ) + + check_consistent_length(X, y) + self._fit(X, handle_unknown="ignore", ensure_all_finite="allow-nan") + + if self.target_type == "auto": + accepted_target_types = ("binary", "multiclass", "continuous") + inferred_type_of_target = type_of_target(y, input_name="y") + if inferred_type_of_target not in accepted_target_types: + raise ValueError( + "Unknown label type: Target type was inferred to be " + f"{inferred_type_of_target!r}. Only {accepted_target_types} are " + "supported." + ) + self.target_type_ = inferred_type_of_target + else: + self.target_type_ = self.target_type + + self.classes_ = None + if self.target_type_ == "binary": + label_encoder = LabelEncoder() + y = label_encoder.fit_transform(y) + self.classes_ = label_encoder.classes_ + elif self.target_type_ == "multiclass": + label_binarizer = LabelBinarizer() + y = label_binarizer.fit_transform(y) + self.classes_ = label_binarizer.classes_ + else: # continuous + y = _check_y(y, y_numeric=True, estimator=self) + + self.target_mean_ = np.mean(y, axis=0) + + X_ordinal, X_known_mask = self._transform( + X, handle_unknown="ignore", ensure_all_finite="allow-nan" + ) + n_categories = np.fromiter( + (len(category_for_feature) for category_for_feature in self.categories_), + dtype=np.int64, + count=len(self.categories_), + ) + if self.target_type_ == "multiclass": + encodings = self._fit_encoding_multiclass( + X_ordinal, + y, + n_categories, + self.target_mean_, + ) + else: + encodings = self._fit_encoding_binary_or_continuous( + X_ordinal, + y, + n_categories, + self.target_mean_, + ) + self.encodings_ = encodings + + return X_ordinal, X_known_mask, y, n_categories + + def _fit_encoding_binary_or_continuous( + self, X_ordinal, y, n_categories, target_mean + ): + """Learn target encodings.""" + if self.smooth == "auto": + y_variance = np.var(y) + encodings = _fit_encoding_fast_auto_smooth( + X_ordinal, + y, + n_categories, + target_mean, + y_variance, + ) + else: + encodings = _fit_encoding_fast( + X_ordinal, + y, + n_categories, + self.smooth, + target_mean, + ) + return encodings + + def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean): + """Learn multiclass encodings. + + Learn encodings for each class (c) then reorder encodings such that + the same features (f) are grouped together. 
`reorder_index` enables + converting from: + f0_c0, f1_c0, f0_c1, f1_c1, f0_c2, f1_c2 + to: + f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2 + """ + n_features = self.n_features_in_ + n_classes = len(self.classes_) + + encodings = [] + for i in range(n_classes): + y_class = y[:, i] + encoding = self._fit_encoding_binary_or_continuous( + X_ordinal, + y_class, + n_categories, + target_mean[i], + ) + encodings.extend(encoding) + + reorder_index = ( + idx + for start in range(n_features) + for idx in range(start, (n_classes * n_features), n_features) + ) + return [encodings[idx] for idx in reorder_index] + + def _transform_X_ordinal( + self, + X_out, + X_ordinal, + X_unknown_mask, + row_indices, + encodings, + target_mean, + ): + """Transform X_ordinal using encodings. + + In the multiclass case, `X_ordinal` and `X_unknown_mask` have column + (axis=1) size `n_features`, while `encodings` has length of size + `n_features * n_classes`. `feat_idx` deals with this by repeating + feature indices by `n_classes` E.g., for 3 features, 2 classes: + 0,0,1,1,2,2 + + Additionally, `target_mean` is of shape (`n_classes`,) so `mean_idx` + cycles through 0 to `n_classes` - 1, `n_features` times. + """ + if self.target_type_ == "multiclass": + n_classes = len(self.classes_) + for e_idx, encoding in enumerate(encodings): + # Repeat feature indices by n_classes + feat_idx = e_idx // n_classes + # Cycle through each class + mean_idx = e_idx % n_classes + X_out[row_indices, e_idx] = encoding[X_ordinal[row_indices, feat_idx]] + X_out[X_unknown_mask[:, feat_idx], e_idx] = target_mean[mean_idx] + else: + for e_idx, encoding in enumerate(encodings): + X_out[row_indices, e_idx] = encoding[X_ordinal[row_indices, e_idx]] + X_out[X_unknown_mask[:, e_idx], e_idx] = target_mean + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. `feature_names_in_` is used unless it is + not defined, in which case the following input feature names are + generated: `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + When `type_of_target_` is "multiclass" the names are of the format + '_'. 
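# Multiclass output layout (standalone sketch, scikit-learn >= 1.4 for
# target_type="multiclass"): one column per (feature, class) pair, named by
# suffixing the class label onto the feature name.
import numpy as np
from sklearn.preprocessing import TargetEncoder

X = np.array([["a"], ["b"], ["a"], ["b"], ["a"], ["b"]], dtype=object)
y = [0, 1, 2, 0, 1, 2]

enc = TargetEncoder(target_type="multiclass", cv=2).fit(X, y)
print(enc.get_feature_names_out())  # ['x0_0' 'x0_1' 'x0_2']
print(enc.transform(X).shape)       # (6, 3): n_features * n_classes columns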
+ """ + check_is_fitted(self, "n_features_in_") + feature_names = _check_feature_names_in(self, input_features) + if self.target_type_ == "multiclass": + feature_names = [ + f"{feature_name}_{class_name}" + for feature_name in feature_names + for class_name in self.classes_ + ] + return np.asarray(feature_names, dtype=object) + else: + return feature_names + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.required = True + return tags diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.cp39-win_amd64.lib b/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.cp39-win_amd64.lib new file mode 100644 index 0000000000000000000000000000000000000000..9c58f0c23834feaba3bf6547c479c919a350b534 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.cp39-win_amd64.lib differ diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.cp39-win_amd64.pyd b/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.cp39-win_amd64.pyd new file mode 100644 index 0000000000000000000000000000000000000000..e654cb7f4b09e30332955c3be1eec2fb9e956caf Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.cp39-win_amd64.pyd differ diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.pyx b/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..cb07d531147bbd55434cfa435f5b180d1d144aee --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.pyx @@ -0,0 +1,167 @@ +from libc.math cimport isnan +from libcpp.vector cimport vector + +from ..utils._typedefs cimport float32_t, float64_t, int32_t, int64_t + +import numpy as np + + +ctypedef fused INT_DTYPE: + int64_t + int32_t + +ctypedef fused Y_DTYPE: + int64_t + int32_t + float64_t + float32_t + + +def _fit_encoding_fast( + INT_DTYPE[:, ::1] X_int, + const Y_DTYPE[:] y, + int64_t[::1] n_categories, + double smooth, + double y_mean, +): + """Fit a target encoding on X_int and y. + + This implementation uses Eq 7 from [1] to compute the encoding. + As stated in the paper, Eq 7 is the same as Eq 3. + + [1]: Micci-Barreca, Daniele. 
"A preprocessing scheme for high-cardinality + categorical attributes in classification and prediction problems" + """ + cdef: + int64_t sample_idx, feat_idx, cat_idx, n_cats + INT_DTYPE X_int_tmp + int n_samples = X_int.shape[0] + int n_features = X_int.shape[1] + double smooth_sum = smooth * y_mean + int64_t max_n_cats = np.max(n_categories) + double[::1] sums = np.empty(max_n_cats, dtype=np.float64) + double[::1] counts = np.empty(max_n_cats, dtype=np.float64) + list encodings = [] + double[::1] current_encoding + # Gives access to encodings without gil + vector[double*] encoding_vec + + encoding_vec.resize(n_features) + for feat_idx in range(n_features): + current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64) + encoding_vec[feat_idx] = ¤t_encoding[0] + encodings.append(np.asarray(current_encoding)) + + with nogil: + for feat_idx in range(n_features): + n_cats = n_categories[feat_idx] + + for cat_idx in range(n_cats): + sums[cat_idx] = smooth_sum + counts[cat_idx] = smooth + + for sample_idx in range(n_samples): + X_int_tmp = X_int[sample_idx, feat_idx] + # -1 are unknown categories, which are not counted + if X_int_tmp == -1: + continue + sums[X_int_tmp] += y[sample_idx] + counts[X_int_tmp] += 1.0 + + for cat_idx in range(n_cats): + if counts[cat_idx] == 0: + encoding_vec[feat_idx][cat_idx] = y_mean + else: + encoding_vec[feat_idx][cat_idx] = sums[cat_idx] / counts[cat_idx] + + return encodings + + +def _fit_encoding_fast_auto_smooth( + INT_DTYPE[:, ::1] X_int, + const Y_DTYPE[:] y, + int64_t[::1] n_categories, + double y_mean, + double y_variance, +): + """Fit a target encoding on X_int and y with auto smoothing. + + This implementation uses Eq 5 and 6 from [1]. + + [1]: Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality + categorical attributes in classification and prediction problems" + """ + cdef: + int64_t sample_idx, feat_idx, cat_idx, n_cats + INT_DTYPE X_int_tmp + double diff + int n_samples = X_int.shape[0] + int n_features = X_int.shape[1] + int64_t max_n_cats = np.max(n_categories) + double[::1] means = np.empty(max_n_cats, dtype=np.float64) + int64_t[::1] counts = np.empty(max_n_cats, dtype=np.int64) + double[::1] sum_of_squared_diffs = np.empty(max_n_cats, dtype=np.float64) + double lambda_ + list encodings = [] + double[::1] current_encoding + # Gives access to encodings without gil + vector[double*] encoding_vec + + encoding_vec.resize(n_features) + for feat_idx in range(n_features): + current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64) + encoding_vec[feat_idx] = ¤t_encoding[0] + encodings.append(np.asarray(current_encoding)) + + # TODO: parallelize this with OpenMP prange. When n_features >= n_threads, it's + # probably good to parallelize the outer loop. When n_features is too small, + # then it would probably better to parallelize the nested loops on n_samples and + # n_cats, but the code to handle thread-local temporary variables might be + # significantly more complex. 
+ with nogil: + for feat_idx in range(n_features): + n_cats = n_categories[feat_idx] + + for cat_idx in range(n_cats): + means[cat_idx] = 0.0 + counts[cat_idx] = 0 + sum_of_squared_diffs[cat_idx] = 0.0 + + # first pass to compute the mean + for sample_idx in range(n_samples): + X_int_tmp = X_int[sample_idx, feat_idx] + + # -1 are unknown categories, which are not counted + if X_int_tmp == -1: + continue + counts[X_int_tmp] += 1 + means[X_int_tmp] += y[sample_idx] + + for cat_idx in range(n_cats): + means[cat_idx] /= counts[cat_idx] + + # second pass to compute the sum of squared differences + for sample_idx in range(n_samples): + X_int_tmp = X_int[sample_idx, feat_idx] + if X_int_tmp == -1: + continue + diff = y[sample_idx] - means[X_int_tmp] + sum_of_squared_diffs[X_int_tmp] += diff * diff + + for cat_idx in range(n_cats): + lambda_ = ( + y_variance * counts[cat_idx] / + (y_variance * counts[cat_idx] + sum_of_squared_diffs[cat_idx] / + counts[cat_idx]) + ) + if isnan(lambda_): + # A nan can happen when: + # 1. counts[cat_idx] == 0 + # 2. y_variance == 0 and sum_of_squared_diffs[cat_idx] == 0 + encoding_vec[feat_idx][cat_idx] = y_mean + else: + encoding_vec[feat_idx][cat_idx] = ( + lambda_ * means[cat_idx] + (1 - lambda_) * y_mean + ) + + return encodings diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/meson.build b/.venv/Lib/site-packages/sklearn/preprocessing/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..fcd4cc8d50740bf09955ccf08b850dde1cd55293 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/meson.build @@ -0,0 +1,16 @@ +py.extension_module( + '_csr_polynomial_expansion', + ['_csr_polynomial_expansion.pyx', utils_cython_tree], + cython_args: cython_args, + subdir: 'sklearn/preprocessing', + install: true +) + +py.extension_module( + '_target_encoder_fast', + ['_target_encoder_fast.pyx', utils_cython_tree], + override_options: ['cython_language=cpp'], + cython_args: cython_args, + subdir: 'sklearn/preprocessing', + install: true +) diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_common.py b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_common.py new file mode 100644 index 0000000000000000000000000000000000000000..0d0fbd461bfe5d4158865bd956dbe4d2e12fbc3d --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_common.py @@ -0,0 +1,187 @@ +import warnings + +import numpy as np +import pytest + +from sklearn.base import clone +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import ( + MaxAbsScaler, + MinMaxScaler, + PowerTransformer, + QuantileTransformer, + RobustScaler, + StandardScaler, + maxabs_scale, + minmax_scale, + power_transform, + quantile_transform, + robust_scale, + scale, +) +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import ( + BSR_CONTAINERS, + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + DIA_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) + +iris = load_iris() + + +def _get_valid_samples_by_column(X, col): + """Get non NaN samples in column of X""" + return X[:, [col]][~np.isnan(X[:, col])] + + +@pytest.mark.parametrize( + "est, func, support_sparse, strictly_positive, omit_kwargs", + [ + (MaxAbsScaler(), maxabs_scale, True, False, []), + (MinMaxScaler(), minmax_scale, False, False, ["clip"]), + (StandardScaler(), scale, False, False, []), + (StandardScaler(with_mean=False), scale, True, False, []), + 
(PowerTransformer("yeo-johnson"), power_transform, False, False, []), + (PowerTransformer("box-cox"), power_transform, False, True, []), + (QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []), + (RobustScaler(), robust_scale, False, False, []), + (RobustScaler(with_centering=False), robust_scale, True, False, []), + ], +) +def test_missing_value_handling( + est, func, support_sparse, strictly_positive, omit_kwargs +): + # check that the preprocessing method let pass nan + rng = np.random.RandomState(42) + X = iris.data.copy() + n_missing = 50 + X[ + rng.randint(X.shape[0], size=n_missing), rng.randint(X.shape[1], size=n_missing) + ] = np.nan + if strictly_positive: + X += np.nanmin(X) + 0.1 + X_train, X_test = train_test_split(X, random_state=1) + # sanity check + assert not np.all(np.isnan(X_train), axis=0).any() + assert np.any(np.isnan(X_train), axis=0).all() + assert np.any(np.isnan(X_test), axis=0).all() + X_test[:, 0] = np.nan # make sure this boundary case is tested + + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + Xt = est.fit(X_train).transform(X_test) + # ensure no warnings are raised + # missing values should still be missing, and only them + assert_array_equal(np.isnan(Xt), np.isnan(X_test)) + + # check that the function leads to the same results as the class + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + Xt_class = est.transform(X_train) + kwargs = est.get_params() + # remove the parameters which should be omitted because they + # are not defined in the counterpart function of the preprocessing class + for kwarg in omit_kwargs: + _ = kwargs.pop(kwarg) + Xt_func = func(X_train, **kwargs) + assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class)) + assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)]) + + # check that the inverse transform keep NaN + Xt_inv = est.inverse_transform(Xt) + assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test)) + # FIXME: we can introduce equal_nan=True in recent version of numpy. + # For the moment which just check that non-NaN values are almost equal. 
+ assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)]) + + for i in range(X.shape[1]): + # train only on non-NaN + est.fit(_get_valid_samples_by_column(X_train, i)) + # check transforming with NaN works even when training without NaN + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + Xt_col = est.transform(X_test[:, [i]]) + assert_allclose(Xt_col, Xt[:, [i]]) + # check non-NaN is handled as before - the 1st column is all nan + if not np.isnan(X_test[:, i]).all(): + Xt_col_nonan = est.transform(_get_valid_samples_by_column(X_test, i)) + assert_array_equal(Xt_col_nonan, Xt_col[~np.isnan(Xt_col.squeeze())]) + + if support_sparse: + est_dense = clone(est) + est_sparse = clone(est) + + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + Xt_dense = est_dense.fit(X_train).transform(X_test) + Xt_inv_dense = est_dense.inverse_transform(Xt_dense) + + for sparse_container in ( + BSR_CONTAINERS + + COO_CONTAINERS + + CSC_CONTAINERS + + CSR_CONTAINERS + + DIA_CONTAINERS + + DOK_CONTAINERS + + LIL_CONTAINERS + ): + # check that the dense and sparse inputs lead to the same results + # precompute the matrix to avoid catching side warnings + X_train_sp = sparse_container(X_train) + X_test_sp = sparse_container(X_test) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", PendingDeprecationWarning) + warnings.simplefilter("error", RuntimeWarning) + Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp) + + assert_allclose(Xt_sp.toarray(), Xt_dense) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", PendingDeprecationWarning) + warnings.simplefilter("error", RuntimeWarning) + Xt_inv_sp = est_sparse.inverse_transform(Xt_sp) + + assert_allclose(Xt_inv_sp.toarray(), Xt_inv_dense) + + +@pytest.mark.parametrize( + "est, func", + [ + (MaxAbsScaler(), maxabs_scale), + (MinMaxScaler(), minmax_scale), + (StandardScaler(), scale), + (StandardScaler(with_mean=False), scale), + (PowerTransformer("yeo-johnson"), power_transform), + ( + PowerTransformer("box-cox"), + power_transform, + ), + (QuantileTransformer(n_quantiles=3), quantile_transform), + (RobustScaler(), robust_scale), + (RobustScaler(with_centering=False), robust_scale), + ], +) +def test_missing_value_pandas_na_support(est, func): + # Test pandas IntegerArray with pd.NA + pd = pytest.importorskip("pandas") + + X = np.array( + [ + [1, 2, 3, np.nan, np.nan, 4, 5, 1], + [np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8], + [1, 2, 3, 4, 5, 6, 7, 8], + ] + ).T + + # Creates dataframe with IntegerArrays with pd.NA + X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c"]) + X_df["c"] = X_df["c"].astype("int") + + X_trans = est.fit_transform(X) + X_df_trans = est.fit_transform(X_df) + + assert_allclose(X_trans, X_df_trans) diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_data.py b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_data.py new file mode 100644 index 0000000000000000000000000000000000000000..04c67a15cd92903e0d10c6dea3e6fda55abfb5d7 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_data.py @@ -0,0 +1,2621 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re +import warnings + +import numpy as np +import numpy.linalg as la +import pytest +from scipy import sparse, stats + +from sklearn import datasets +from sklearn.base import clone +from sklearn.exceptions import NotFittedError +from sklearn.metrics.pairwise import linear_kernel +from 
sklearn.model_selection import cross_val_predict
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import (
+    Binarizer,
+    KernelCenterer,
+    MaxAbsScaler,
+    MinMaxScaler,
+    Normalizer,
+    PowerTransformer,
+    QuantileTransformer,
+    RobustScaler,
+    StandardScaler,
+    add_dummy_feature,
+    maxabs_scale,
+    minmax_scale,
+    normalize,
+    power_transform,
+    quantile_transform,
+    robust_scale,
+    scale,
+)
+from sklearn.preprocessing._data import BOUNDS_THRESHOLD, _handle_zeros_in_scale
+from sklearn.svm import SVR
+from sklearn.utils import gen_batches, shuffle
+from sklearn.utils._array_api import (
+    yield_namespace_device_dtype_combinations,
+)
+from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids
+from sklearn.utils._testing import (
+    _convert_container,
+    assert_allclose,
+    assert_allclose_dense_sparse,
+    assert_almost_equal,
+    assert_array_almost_equal,
+    assert_array_equal,
+    assert_array_less,
+    skip_if_32bit,
+)
+from sklearn.utils.estimator_checks import (
+    check_array_api_input_and_values,
+)
+from sklearn.utils.fixes import (
+    COO_CONTAINERS,
+    CSC_CONTAINERS,
+    CSR_CONTAINERS,
+    LIL_CONTAINERS,
+)
+from sklearn.utils.sparsefuncs import mean_variance_axis
+
+iris = datasets.load_iris()
+
+# Make some data to be used many times
+rng = np.random.RandomState(0)
+n_features = 30
+n_samples = 1000
+offsets = rng.uniform(-1, 1, size=n_features)
+scales = rng.uniform(1, 10, size=n_features)
+X_2d = rng.randn(n_samples, n_features) * scales + offsets
+X_1row = X_2d[0, :].reshape(1, n_features)
+X_1col = X_2d[:, 0].reshape(n_samples, 1)
+X_list_1row = X_1row.tolist()
+X_list_1col = X_1col.tolist()
+
+
+def toarray(a):
+    if hasattr(a, "toarray"):
+        a = a.toarray()
+    return a
+
+
+def _check_dim_1axis(a):
+    return np.asarray(a).shape[0]
+
+
+def assert_correct_incr(i, batch_start, batch_stop, n, chunk_size, n_samples_seen):
+    if batch_stop != n:
+        assert (i + 1) * chunk_size == n_samples_seen
+    else:
+        assert i * chunk_size + (batch_stop - batch_start) == n_samples_seen
+
+
+def test_raises_value_error_if_sample_weights_greater_than_1d():
+    # Sample weights must be either scalar or 1D
+
+    n_sampless = [2, 3]
+    n_featuress = [3, 2]
+
+    for n_samples, n_features in zip(n_sampless, n_featuress):
+        X = rng.randn(n_samples, n_features)
+        y = rng.randn(n_samples)
+
+        scaler = StandardScaler()
+
+        # make sure an error is raised when sample weights have more than
+        # one dimension
+        sample_weight_notOK = rng.randn(n_samples, 1) ** 2
+        with pytest.raises(ValueError):
+            scaler.fit(X, y, sample_weight=sample_weight_notOK)
+
+
+@pytest.mark.parametrize(
+    ["Xw", "X", "sample_weight"],
+    [
+        ([[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [1, 2, 3], [4, 5, 6]], [2.0, 1.0]),
+        (
+            [[1, 0, 1], [0, 0, 1]],
+            [[1, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1]],
+            np.array([1, 3]),
+        ),
+        (
+            [[1, np.nan, 1], [np.nan, np.nan, 1]],
+            [
+                [1, np.nan, 1],
+                [np.nan, np.nan, 1],
+                [np.nan, np.nan, 1],
+                [np.nan, np.nan, 1],
+            ],
+            np.array([1, 3]),
+        ),
+    ],
+)
+@pytest.mark.parametrize("array_constructor", ["array", "sparse_csr", "sparse_csc"])
+def test_standard_scaler_sample_weight(Xw, X, sample_weight, array_constructor):
+    with_mean = not array_constructor.startswith("sparse")
+    X = _convert_container(X, array_constructor)
+    Xw = _convert_container(Xw, array_constructor)
+
+    # weighted StandardScaler
+    yw = np.ones(Xw.shape[0])
+    scaler_w = StandardScaler(with_mean=with_mean)
+    scaler_w.fit(Xw, yw, sample_weight=sample_weight)
+
+    # unweighted, but with repeated samples
+    y = np.ones(X.shape[0])
+    scaler =
StandardScaler(with_mean=with_mean)
+    scaler.fit(X, y)
+
+    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
+
+    assert_almost_equal(scaler.mean_, scaler_w.mean_)
+    assert_almost_equal(scaler.var_, scaler_w.var_)
+    assert_almost_equal(scaler.transform(X_test), scaler_w.transform(X_test))
+
+
+def test_standard_scaler_1d():
+    # Test scaling of dataset along single axis
+    for X in [X_1row, X_1col, X_list_1row, X_list_1col]:
+        scaler = StandardScaler()
+        X_scaled = scaler.fit(X).transform(X, copy=True)
+
+        if isinstance(X, list):
+            X = np.array(X)  # cast only after scaling done
+
+        if _check_dim_1axis(X) == 1:
+            assert_almost_equal(scaler.mean_, X.ravel())
+            assert_almost_equal(scaler.scale_, np.ones(n_features))
+            assert_array_almost_equal(X_scaled.mean(axis=0), np.zeros_like(n_features))
+            assert_array_almost_equal(X_scaled.std(axis=0), np.zeros_like(n_features))
+        else:
+            assert_almost_equal(scaler.mean_, X.mean())
+            assert_almost_equal(scaler.scale_, X.std())
+            assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
+            assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
+        assert scaler.n_samples_seen_ == X.shape[0]
+
+        # check inverse transform
+        X_scaled_back = scaler.inverse_transform(X_scaled)
+        assert_array_almost_equal(X_scaled_back, X)
+
+    # Constant feature
+    X = np.ones((5, 1))
+    scaler = StandardScaler()
+    X_scaled = scaler.fit(X).transform(X, copy=True)
+    assert_almost_equal(scaler.mean_, 1.0)
+    assert_almost_equal(scaler.scale_, 1.0)
+    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
+    assert_array_almost_equal(X_scaled.std(axis=0), 0.0)
+    assert scaler.n_samples_seen_ == X.shape[0]
+
+
+@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS)
+@pytest.mark.parametrize("add_sample_weight", [False, True])
+def test_standard_scaler_dtype(add_sample_weight, sparse_container):
+    # Ensure scaling does not affect dtype
+    rng = np.random.RandomState(0)
+    n_samples = 10
+    n_features = 3
+    if add_sample_weight:
+        sample_weight = np.ones(n_samples)
+    else:
+        sample_weight = None
+    with_mean = True
+    if sparse_container is not None:
+        # scipy sparse containers do not support float16, see
+        # https://github.com/scipy/scipy/issues/7408 for more details.
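A minimal sketch of the dtype contract checked by this test (the `*_demo`
names are illustrative):

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    X32_demo = np.random.RandomState(0).randn(10, 3).astype(np.float32)
    scaler_demo = StandardScaler().fit(X32_demo)
    assert scaler_demo.transform(X32_demo).dtype == np.float32
    assert scaler_demo.mean_.dtype == np.float64  # stats accumulated in float64

The float16 exclusion below follows from the scipy limitation just cited.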
+ supported_dtype = [np.float64, np.float32] + else: + supported_dtype = [np.float64, np.float32, np.float16] + for dtype in supported_dtype: + X = rng.randn(n_samples, n_features).astype(dtype) + if sparse_container is not None: + X = sparse_container(X) + with_mean = False + + scaler = StandardScaler(with_mean=with_mean) + X_scaled = scaler.fit(X, sample_weight=sample_weight).transform(X) + assert X.dtype == X_scaled.dtype + assert scaler.mean_.dtype == np.float64 + assert scaler.scale_.dtype == np.float64 + + +@pytest.mark.parametrize( + "scaler", + [ + StandardScaler(with_mean=False), + RobustScaler(with_centering=False), + ], +) +@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) +@pytest.mark.parametrize("add_sample_weight", [False, True]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("constant", [0, 1.0, 100.0]) +def test_standard_scaler_constant_features( + scaler, add_sample_weight, sparse_container, dtype, constant +): + if isinstance(scaler, RobustScaler) and add_sample_weight: + pytest.skip(f"{scaler.__class__.__name__} does not yet support sample_weight") + + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 1 + if add_sample_weight: + fit_params = dict(sample_weight=rng.uniform(size=n_samples) * 2) + else: + fit_params = {} + X_array = np.full(shape=(n_samples, n_features), fill_value=constant, dtype=dtype) + X = X_array if sparse_container is None else sparse_container(X_array) + X_scaled = scaler.fit(X, **fit_params).transform(X) + + if isinstance(scaler, StandardScaler): + # The variance info should be close to zero for constant features. + assert_allclose(scaler.var_, np.zeros(X.shape[1]), atol=1e-7) + + # Constant features should not be scaled (scale of 1.): + assert_allclose(scaler.scale_, np.ones(X.shape[1])) + + assert X_scaled is not X # make sure we make a copy + assert_allclose_dense_sparse(X_scaled, X) + + if isinstance(scaler, StandardScaler) and not add_sample_weight: + # Also check consistency with the standard scale function. + X_scaled_2 = scale(X, with_mean=scaler.with_mean) + assert X_scaled_2 is not X # make sure we did a copy + assert_allclose_dense_sparse(X_scaled_2, X) + + +@pytest.mark.parametrize("n_samples", [10, 100, 10_000]) +@pytest.mark.parametrize("average", [1e-10, 1, 1e10]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) +def test_standard_scaler_near_constant_features( + n_samples, sparse_container, average, dtype +): + # Check that when the variance is too small (var << mean**2) the feature + # is considered constant and not scaled. + + scale_min, scale_max = -30, 19 + scales = np.array([10**i for i in range(scale_min, scale_max + 1)], dtype=dtype) + + n_features = scales.shape[0] + X = np.empty((n_samples, n_features), dtype=dtype) + # Make a dataset of known var = scales**2 and mean = average + X[: n_samples // 2, :] = average + scales + X[n_samples // 2 :, :] = average - scales + X_array = X if sparse_container is None else sparse_container(X) + + scaler = StandardScaler(with_mean=False).fit(X_array) + + # StandardScaler uses float64 accumulators even if the data has a float32 + # dtype. + eps = np.finfo(np.float64).eps + + # if var < bound = N.eps.var + N².eps².mean², the feature is considered + # constant and the scale_ attribute is set to 1. 
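A worked instance of the bound above, assuming n_samples = 100 and
average = 1.0 (one of the parametrized combinations; numbers are float64
approximations):

    import numpy as np

    eps = np.finfo(np.float64).eps          # ~2.22e-16
    n, avg, s = 100, 1.0, 1e-20             # sample count, mean, feature scale
    bound = n * eps * s**2 + n**2 * eps**2 * avg**2   # ~4.9e-28 here
    assert s**2 < bound   # var = 1e-40 is below the bound -> treated as constant

A feature with scale 1e-10 instead has var = 1e-20 > bound and is scaled
normally.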
+    bounds = n_samples * eps * scales**2 + n_samples**2 * eps**2 * average**2
+    within_bounds = scales**2 <= bounds
+
+    # Check that scale_min is small enough to have some scales below the
+    # bound and therefore detected as constant:
+    assert np.any(within_bounds)
+
+    # Check that such features are actually treated as constant by the scaler:
+    assert all(scaler.var_[within_bounds] <= bounds[within_bounds])
+    assert_allclose(scaler.scale_[within_bounds], 1.0)
+
+    # Depending on the dtype of X, some features might not actually be
+    # representable as non constant for small scales (even if above the
+    # precision bound of the float64 variance estimate). Such features should
+    # be correctly detected as constant with 0 variance by StandardScaler.
+    representable_diff = X[0, :] - X[-1, :] != 0
+    assert_allclose(scaler.var_[np.logical_not(representable_diff)], 0)
+    assert_allclose(scaler.scale_[np.logical_not(representable_diff)], 1)
+
+    # The other features are scaled and scale_ is equal to sqrt(var_) assuming
+    # that scales are large enough for average + scale and average - scale to
+    # be distinct in X (depending on X's dtype).
+    common_mask = np.logical_and(scales**2 > bounds, representable_diff)
+    assert_allclose(scaler.scale_[common_mask], np.sqrt(scaler.var_)[common_mask])
+
+
+def test_scale_1d():
+    # 1-d inputs
+    X_list = [1.0, 3.0, 5.0, 0.0]
+    X_arr = np.array(X_list)
+
+    for X in [X_list, X_arr]:
+        X_scaled = scale(X)
+        assert_array_almost_equal(X_scaled.mean(), 0.0)
+        assert_array_almost_equal(X_scaled.std(), 1.0)
+        assert_array_equal(scale(X, with_mean=False, with_std=False), X)
+
+
+@skip_if_32bit
+def test_standard_scaler_numerical_stability():
+    # Test numerical stability of scaling
+    # np.log(1e-5) is taken because its floating point representation
+    # was empirically found to cause numerical problems with np.mean & np.std.
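A plausible reading of why 8 samples are safe but 10 are not (a hedged aside,
not a claim about exact numpy internals): the sum of 8 identical float64
values, 8 * v, is exact because multiplying by a power of two only shifts the
exponent, so the mean equals v exactly and the standard deviation is exactly
0. With 10 values, 10 * v is rounded, the mean differs from v by about one
ulp, and the computed standard deviation becomes a tiny non-zero number on
the order of eps * |v| instead of 0, which triggers the warning asserted
below.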
+    x = np.full(8, np.log(1e-5), dtype=np.float64)
+    # This does not raise a warning as the number of samples is too low
+    # to trigger the problem in recent numpy
+    with warnings.catch_warnings():
+        warnings.simplefilter("error", UserWarning)
+        scale(x)
+    assert_array_almost_equal(scale(x), np.zeros(8))
+
+    # with 2 more samples, the std computation runs into numerical issues:
+    x = np.full(10, np.log(1e-5), dtype=np.float64)
+    warning_message = "standard deviation of the data is probably very close to 0"
+    with pytest.warns(UserWarning, match=warning_message):
+        x_scaled = scale(x)
+    assert_array_almost_equal(x_scaled, np.zeros(10))
+
+    x = np.full(10, 1e-100, dtype=np.float64)
+    with warnings.catch_warnings():
+        warnings.simplefilter("error", UserWarning)
+        x_small_scaled = scale(x)
+    assert_array_almost_equal(x_small_scaled, np.zeros(10))
+
+    # Large values can cause (often recoverable) numerical stability issues:
+    x_big = np.full(10, 1e100, dtype=np.float64)
+    warning_message = "Dataset may contain too large values"
+    with pytest.warns(UserWarning, match=warning_message):
+        x_big_scaled = scale(x_big)
+    assert_array_almost_equal(x_big_scaled, np.zeros(10))
+    assert_array_almost_equal(x_big_scaled, x_small_scaled)
+    with pytest.warns(UserWarning, match=warning_message):
+        x_big_centered = scale(x_big, with_std=False)
+    assert_array_almost_equal(x_big_centered, np.zeros(10))
+    assert_array_almost_equal(x_big_centered, x_small_scaled)
+
+
+def test_scaler_2d_arrays():
+    # Test scaling of 2d array along first axis
+    rng = np.random.RandomState(0)
+    n_features = 5
+    n_samples = 4
+    X = rng.randn(n_samples, n_features)
+    X[:, 0] = 0.0  # first feature is always zero
+
+    scaler = StandardScaler()
+    X_scaled = scaler.fit(X).transform(X, copy=True)
+    assert not np.any(np.isnan(X_scaled))
+    assert scaler.n_samples_seen_ == n_samples
+
+    assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
+    assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
+    # Check that X has been copied
+    assert X_scaled is not X
+
+    # check inverse transform
+    X_scaled_back = scaler.inverse_transform(X_scaled)
+    assert X_scaled_back is not X
+    assert X_scaled_back is not X_scaled
+    assert_array_almost_equal(X_scaled_back, X)
+
+    X_scaled = scale(X, axis=1, with_std=False)
+    assert not np.any(np.isnan(X_scaled))
+    assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0])
+    X_scaled = scale(X, axis=1, with_std=True)
+    assert not np.any(np.isnan(X_scaled))
+    assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0])
+    assert_array_almost_equal(X_scaled.std(axis=1), n_samples * [1.0])
+    # Check that the data hasn't been modified
+    assert X_scaled is not X
+
+    X_scaled = scaler.fit(X).transform(X, copy=False)
+    assert not np.any(np.isnan(X_scaled))
+    assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
+    assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
+    # Check that X has not been copied
+    assert X_scaled is X
+
+    X = rng.randn(4, 5)
+    X[:, 0] = 1.0  # first feature is a constant, non zero feature
+    scaler = StandardScaler()
+    X_scaled = scaler.fit(X).transform(X, copy=True)
+    assert not np.any(np.isnan(X_scaled))
+    assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
+    assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
+    # Check that X has not been copied
+    assert X_scaled is not X
+
+
+def test_scaler_float16_overflow():
+    # Test that the scaler does not overflow on float16 numpy arrays
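A hedged illustration of the failure mode being guarded against: float16
overflows to inf near its maximum, e.g.

    import numpy as np

    big = np.float16(65500)
    with np.errstate(over="ignore"):
        assert np.isinf(big + big)  # the sum exceeds the float16 range

so a naive accumulation over 200000 samples of magnitude 5-10 (a total around
1.5e6) cannot be carried out in float16 itself.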
rng = np.random.RandomState(0)
+    # float16 has a maximum of 65500.0. In the worst case the accumulated sum
+    # is at least 5 * 200000 = 1,000,000, which is enough to overflow the
+    # data type
+    X = rng.uniform(5, 10, [200000, 1]).astype(np.float16)
+
+    with np.errstate(over="raise"):
+        scaler = StandardScaler().fit(X)
+        X_scaled = scaler.transform(X)
+
+    # Calculate the float64 equivalent to verify result
+    X_scaled_f64 = StandardScaler().fit_transform(X.astype(np.float64))
+
+    # Overflow calculations may cause -inf, inf, or nan. Since there is no nan
+    # input, all of the outputs should be finite. This may be redundant since a
+    # FloatingPointError exception will be thrown on overflow above.
+    assert np.all(np.isfinite(X_scaled))
+
+    # The normal distribution is very unlikely to go above 4. At 4.0-8.0 the
+    # float16 precision is 2^-8 which is around 0.004. Thus only 2 decimals are
+    # checked to account for precision differences.
+    assert_array_almost_equal(X_scaled, X_scaled_f64, decimal=2)
+
+
+def test_handle_zeros_in_scale():
+    s1 = np.array([0, 1e-16, 1, 2, 3])
+    s2 = _handle_zeros_in_scale(s1, copy=True)
+
+    assert_allclose(s1, np.array([0, 1e-16, 1, 2, 3]))
+    assert_allclose(s2, np.array([1, 1, 1, 2, 3]))
+
+
+def test_minmax_scaler_partial_fit():
+    # Test that partial_fit, run over many batches of size 1 and 50,
+    # gives the same results as fit
+    X = X_2d
+    n = X.shape[0]
+
+    for chunk_size in [1, 2, 50, n, n + 42]:
+        # Test mean at the end of the process
+        scaler_batch = MinMaxScaler().fit(X)
+
+        scaler_incr = MinMaxScaler()
+        for batch in gen_batches(n_samples, chunk_size):
+            scaler_incr = scaler_incr.partial_fit(X[batch])
+
+        assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_)
+        assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_)
+        assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
+        assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_)
+        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
+        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)
+
+        # Test std after 1 step
+        batch0 = slice(0, chunk_size)
+        scaler_batch = MinMaxScaler().fit(X[batch0])
+        scaler_incr = MinMaxScaler().partial_fit(X[batch0])
+
+        assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_)
+        assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_)
+        assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
+        assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_)
+        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
+        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)
+
+        # Test std until the end of the partial fits
+        scaler_batch = MinMaxScaler().fit(X)
+        scaler_incr = MinMaxScaler()  # Clean estimator
+        for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
+            scaler_incr = scaler_incr.partial_fit(X[batch])
+            assert_correct_incr(
+                i,
+                batch_start=batch.start,
+                batch_stop=batch.stop,
+                n=n,
+                chunk_size=chunk_size,
+                n_samples_seen=scaler_incr.n_samples_seen_,
+            )
+
+
+def test_standard_scaler_partial_fit():
+    # Test that partial_fit, run over many batches of size 1 and 50,
+    # gives the same results as fit
+    X = X_2d
+    n = X.shape[0]
+
+    for chunk_size in [1, 2, 50, n, n + 42]:
+        # Test mean at the end of the process
+        scaler_batch = StandardScaler(with_std=False).fit(X)
+
+        scaler_incr = StandardScaler(with_std=False)
+        for batch in gen_batches(n_samples, chunk_size):
+            scaler_incr = scaler_incr.partial_fit(X[batch])
+
assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_)
+        assert scaler_batch.var_ == scaler_incr.var_  # both are None
+        assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
+
+        # Test std after 1 step
+        batch0 = slice(0, chunk_size)
+        scaler_incr = StandardScaler().partial_fit(X[batch0])
+        if chunk_size == 1:
+            assert_array_almost_equal(
+                np.zeros(n_features, dtype=np.float64), scaler_incr.var_
+            )
+            assert_array_almost_equal(
+                np.ones(n_features, dtype=np.float64), scaler_incr.scale_
+            )
+        else:
+            assert_array_almost_equal(np.var(X[batch0], axis=0), scaler_incr.var_)
+            assert_array_almost_equal(
+                np.std(X[batch0], axis=0), scaler_incr.scale_
+            )  # no constants
+
+        # Test std until the end of the partial fits
+        scaler_batch = StandardScaler().fit(X)
+        scaler_incr = StandardScaler()  # Clean estimator
+        for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
+            scaler_incr = scaler_incr.partial_fit(X[batch])
+            assert_correct_incr(
+                i,
+                batch_start=batch.start,
+                batch_stop=batch.stop,
+                n=n,
+                chunk_size=chunk_size,
+                n_samples_seen=scaler_incr.n_samples_seen_,
+            )
+
+        assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_)
+        assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
+
+
+@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
+def test_standard_scaler_partial_fit_numerical_stability(sparse_container):
+    # Test that the incremental computation does not introduce significant
+    # errors for large datasets with values of large magnitude
+    rng = np.random.RandomState(0)
+    n_features = 2
+    n_samples = 100
+    offsets = rng.uniform(-1e15, 1e15, size=n_features)
+    scales = rng.uniform(1e3, 1e6, size=n_features)
+    X = rng.randn(n_samples, n_features) * scales + offsets
+
+    scaler_batch = StandardScaler().fit(X)
+    scaler_incr = StandardScaler()
+    for chunk in X:
+        scaler_incr = scaler_incr.partial_fit(chunk.reshape(1, n_features))
+
+    # Regardless of absolute values, they must not differ by more than
+    # 6 significant digits
+    tol = 10 ** (-6)
+    assert_allclose(scaler_incr.mean_, scaler_batch.mean_, rtol=tol)
+    assert_allclose(scaler_incr.var_, scaler_batch.var_, rtol=tol)
+    assert_allclose(scaler_incr.scale_, scaler_batch.scale_, rtol=tol)
+    # NOTE Be aware that for much larger offsets std is very unstable (last
+    # assert) while mean is OK.
+
+    # Sparse input
+    size = (100, 3)
+    scale = 1e20
+    X = sparse_container(rng.randint(0, 2, size).astype(np.float64) * scale)
+
+    # with_mean=False is required with sparse input
+    scaler = StandardScaler(with_mean=False).fit(X)
+    scaler_incr = StandardScaler(with_mean=False)
+
+    for chunk in X:
+        if chunk.ndim == 1:
+            # Sparse arrays can be 1D (in scipy 1.14 and later) while old
+            # sparse matrix instances are always 2D.
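Concretely (a hedged restatement of the comment above): iterating over a
scipy >= 1.14 csr_array yields 1-D rows of shape (n_features,), while
iterating over a legacy csr_matrix yields 2-D rows of shape (1, n_features);
partial_fit expects 2-D input, hence the reshape guard that follows.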
+            chunk = chunk.reshape(1, -1)
+        scaler_incr = scaler_incr.partial_fit(chunk)
+
+    # Regardless of magnitude, they must not differ by more than 6
+    # significant digits
+    tol = 10 ** (-6)
+    assert scaler.mean_ is not None
+    assert_allclose(scaler_incr.var_, scaler.var_, rtol=tol)
+    assert_allclose(scaler_incr.scale_, scaler.scale_, rtol=tol)
+
+
+@pytest.mark.parametrize("sample_weight", [True, None])
+@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
+def test_partial_fit_sparse_input(sample_weight, sparse_container):
+    # Check that sparsity is not destroyed
+    X = sparse_container(np.array([[1.0], [0.0], [0.0], [5.0]]))
+
+    if sample_weight:
+        sample_weight = rng.rand(X.shape[0])
+
+    null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
+    X_null = null_transform.partial_fit(X, sample_weight=sample_weight).transform(X)
+    assert_array_equal(X_null.toarray(), X.toarray())
+    X_orig = null_transform.inverse_transform(X_null)
+    assert_array_equal(X_orig.toarray(), X_null.toarray())
+    assert_array_equal(X_orig.toarray(), X.toarray())
+
+
+@pytest.mark.parametrize("sample_weight", [True, None])
+def test_standard_scaler_transform_with_partial_fit(sample_weight):
+    # Check some postconditions after applying partial_fit and transform
+    X = X_2d[:100, :]
+
+    if sample_weight:
+        sample_weight = rng.rand(X.shape[0])
+
+    scaler_incr = StandardScaler()
+    for i, batch in enumerate(gen_batches(X.shape[0], 1)):
+        X_sofar = X[: (i + 1), :]
+        chunks_copy = X_sofar.copy()
+        if sample_weight is None:
+            scaled_batch = StandardScaler().fit_transform(X_sofar)
+            scaler_incr = scaler_incr.partial_fit(X[batch])
+        else:
+            scaled_batch = StandardScaler().fit_transform(
+                X_sofar, sample_weight=sample_weight[: i + 1]
+            )
+            scaler_incr = scaler_incr.partial_fit(
+                X[batch], sample_weight=sample_weight[batch]
+            )
+        scaled_incr = scaler_incr.transform(X_sofar)
+
+        assert_array_almost_equal(scaled_batch, scaled_incr)
+        assert_array_almost_equal(X_sofar, chunks_copy)  # No change
+        right_input = scaler_incr.inverse_transform(scaled_incr)
+        assert_array_almost_equal(X_sofar, right_input)
+
+        zero = np.zeros(X.shape[1])
+        epsilon = np.finfo(float).eps
+        # effectively non-strict inequalities, thanks to epsilon
+        assert_array_less(zero, scaler_incr.var_ + epsilon)
+        assert_array_less(zero, scaler_incr.scale_ + epsilon)
+        if sample_weight is None:
+            # (i+1) because the scaler has already been fitted
+            assert (i + 1) == scaler_incr.n_samples_seen_
+        else:
+            assert np.sum(sample_weight[: i + 1]) == pytest.approx(
+                scaler_incr.n_samples_seen_
+            )
+
+
+def test_standard_check_array_of_inverse_transform():
+    # Check that StandardScaler's inverse_transform converts an integer
+    # array to float
+    x = np.array(
+        [
+            [1, 1, 1, 0, 1, 0],
+            [1, 1, 1, 0, 1, 0],
+            [0, 8, 0, 1, 0, 0],
+            [1, 4, 1, 1, 0, 0],
+            [0, 1, 0, 0, 1, 0],
+            [0, 4, 0, 1, 0, 1],
+        ],
+        dtype=np.int32,
+    )
+
+    scaler = StandardScaler()
+    scaler.fit(x)
+
+    # The input of inverse_transform should be converted
+    # to a float array.
+    # If not, X *= self.scale_ will fail.
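A minimal demonstration of that failure (the `a_demo` name is illustrative):

    import numpy as np

    a_demo = np.array([1, 2], dtype=np.int32)
    try:
        a_demo *= 0.5  # in-place multiply cannot cast the float64 result back
    except TypeError:
        pass  # numpy refuses the same-kind cast, as the comment above warns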
+    scaler.inverse_transform(x)
+
+
+@pytest.mark.parametrize(
+    "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations()
+)
+@pytest.mark.parametrize(
+    "check",
+    [check_array_api_input_and_values],
+    ids=_get_check_estimator_ids,
+)
+@pytest.mark.parametrize(
+    "estimator",
+    [
+        MaxAbsScaler(),
+        MinMaxScaler(),
+        MinMaxScaler(clip=True),
+        KernelCenterer(),
+        Normalizer(norm="l1"),
+        Normalizer(norm="l2"),
+        Normalizer(norm="max"),
+    ],
+    ids=_get_check_estimator_ids,
+)
+def test_scaler_array_api_compliance(
+    estimator, check, array_namespace, device, dtype_name
+):
+    name = estimator.__class__.__name__
+    check(name, estimator, array_namespace, device=device, dtype_name=dtype_name)
+
+
+def test_min_max_scaler_iris():
+    X = iris.data
+    scaler = MinMaxScaler()
+    # default params
+    X_trans = scaler.fit_transform(X)
+    assert_array_almost_equal(X_trans.min(axis=0), 0)
+    assert_array_almost_equal(X_trans.max(axis=0), 1)
+    X_trans_inv = scaler.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+
+    # not default params: min=1, max=2
+    scaler = MinMaxScaler(feature_range=(1, 2))
+    X_trans = scaler.fit_transform(X)
+    assert_array_almost_equal(X_trans.min(axis=0), 1)
+    assert_array_almost_equal(X_trans.max(axis=0), 2)
+    X_trans_inv = scaler.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+
+    # min=-.5, max=.6
+    scaler = MinMaxScaler(feature_range=(-0.5, 0.6))
+    X_trans = scaler.fit_transform(X)
+    assert_array_almost_equal(X_trans.min(axis=0), -0.5)
+    assert_array_almost_equal(X_trans.max(axis=0), 0.6)
+    X_trans_inv = scaler.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+
+    # raises on invalid range
+    scaler = MinMaxScaler(feature_range=(2, 1))
+    with pytest.raises(ValueError):
+        scaler.fit(X)
+
+
+def test_min_max_scaler_zero_variance_features():
+    # Check min max scaler on toy data with zero variance features
+    X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]]
+
+    X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]]
+
+    # default params
+    scaler = MinMaxScaler()
+    X_trans = scaler.fit_transform(X)
+    X_expected_0_1 = [[0.0, 0.0, 0.5], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0]]
+    assert_array_almost_equal(X_trans, X_expected_0_1)
+    X_trans_inv = scaler.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+
+    X_trans_new = scaler.transform(X_new)
+    X_expected_0_1_new = [[+0.0, 1.0, 0.500], [-1.0, 0.0, 0.083], [+0.0, 0.0, 1.333]]
+    assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2)
+
+    # not default params
+    scaler = MinMaxScaler(feature_range=(1, 2))
+    X_trans = scaler.fit_transform(X)
+    X_expected_1_2 = [[1.0, 1.0, 1.5], [1.0, 1.0, 1.0], [1.0, 1.0, 2.0]]
+    assert_array_almost_equal(X_trans, X_expected_1_2)
+
+    # function interface
+    X_trans = minmax_scale(X)
+    assert_array_almost_equal(X_trans, X_expected_0_1)
+    X_trans = minmax_scale(X, feature_range=(1, 2))
+    assert_array_almost_equal(X_trans, X_expected_1_2)
+
+
+def test_minmax_scale_axis1():
+    X = iris.data
+    X_trans = minmax_scale(X, axis=1)
+    assert_array_almost_equal(np.min(X_trans, axis=1), 0)
+    assert_array_almost_equal(np.max(X_trans, axis=1), 1)
+
+
+def test_min_max_scaler_1d():
+    # Test scaling of dataset along single axis
+    for X in [X_1row, X_1col, X_list_1row, X_list_1col]:
+        scaler = MinMaxScaler(copy=True)
+        X_scaled = scaler.fit(X).transform(X)
+
+        if isinstance(X, list):
+            X = np.array(X)  # cast only after scaling done
+
+        if _check_dim_1axis(X) == 1:
assert_array_almost_equal(X_scaled.min(axis=0), np.zeros(n_features))
+            assert_array_almost_equal(X_scaled.max(axis=0), np.zeros(n_features))
+        else:
+            assert_array_almost_equal(X_scaled.min(axis=0), 0.0)
+            assert_array_almost_equal(X_scaled.max(axis=0), 1.0)
+        assert scaler.n_samples_seen_ == X.shape[0]
+
+        # check inverse transform
+        X_scaled_back = scaler.inverse_transform(X_scaled)
+        assert_array_almost_equal(X_scaled_back, X)
+
+    # Constant feature
+    X = np.ones((5, 1))
+    scaler = MinMaxScaler()
+    X_scaled = scaler.fit(X).transform(X)
+    assert X_scaled.min() >= 0.0
+    assert X_scaled.max() <= 1.0
+    assert scaler.n_samples_seen_ == X.shape[0]
+
+    # Function interface
+    X_1d = X_1row.ravel()
+    min_ = X_1d.min()
+    max_ = X_1d.max()
+    assert_array_almost_equal(
+        (X_1d - min_) / (max_ - min_), minmax_scale(X_1d, copy=True)
+    )
+
+
+@pytest.mark.parametrize("sample_weight", [True, None])
+@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
+def test_scaler_without_centering(sample_weight, sparse_container):
+    rng = np.random.RandomState(42)
+    X = rng.randn(4, 5)
+    X[:, 0] = 0.0  # first feature is always zero
+    X_sparse = sparse_container(X)
+
+    if sample_weight:
+        sample_weight = rng.rand(X.shape[0])
+
+    with pytest.raises(ValueError):
+        StandardScaler().fit(X_sparse)
+
+    scaler = StandardScaler(with_mean=False).fit(X, sample_weight=sample_weight)
+    X_scaled = scaler.transform(X, copy=True)
+    assert not np.any(np.isnan(X_scaled))
+
+    scaler_sparse = StandardScaler(with_mean=False).fit(
+        X_sparse, sample_weight=sample_weight
+    )
+    X_sparse_scaled = scaler_sparse.transform(X_sparse, copy=True)
+    assert not np.any(np.isnan(X_sparse_scaled.data))
+
+    assert_array_almost_equal(scaler.mean_, scaler_sparse.mean_)
+    assert_array_almost_equal(scaler.var_, scaler_sparse.var_)
+    assert_array_almost_equal(scaler.scale_, scaler_sparse.scale_)
+    assert_array_almost_equal(scaler.n_samples_seen_, scaler_sparse.n_samples_seen_)
+
+    if sample_weight is None:
+        assert_array_almost_equal(
+            X_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2
+        )
+        assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
+
+    X_sparse_scaled_mean, X_sparse_scaled_var = mean_variance_axis(X_sparse_scaled, 0)
+    assert_array_almost_equal(X_sparse_scaled_mean, X_scaled.mean(axis=0))
+    assert_array_almost_equal(X_sparse_scaled_var, X_scaled.var(axis=0))
+
+    # Check that X has not been modified (copy)
+    assert X_scaled is not X
+    assert X_sparse_scaled is not X_sparse
+
+    X_scaled_back = scaler.inverse_transform(X_scaled)
+    assert X_scaled_back is not X
+    assert X_scaled_back is not X_scaled
+    assert_array_almost_equal(X_scaled_back, X)
+
+    X_sparse_scaled_back = scaler_sparse.inverse_transform(X_sparse_scaled)
+    assert X_sparse_scaled_back is not X_sparse
+    assert X_sparse_scaled_back is not X_sparse_scaled
+    assert_array_almost_equal(X_sparse_scaled_back.toarray(), X)
+
+    if sparse_container in CSR_CONTAINERS:
+        null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
+        X_null = null_transform.fit_transform(X_sparse)
+        assert_array_equal(X_null.data, X_sparse.data)
+        X_orig = null_transform.inverse_transform(X_null)
+        assert_array_equal(X_orig.data, X_sparse.data)
+
+
+@pytest.mark.parametrize("with_mean", [True, False])
+@pytest.mark.parametrize("with_std", [True, False])
+@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS)
+def test_scaler_n_samples_seen_with_nan(with_mean, with_std, sparse_container):
+    X = np.array(
[[0, 1, 3], [np.nan, 6, 10], [5, 4, np.nan], [8, 0, np.nan]], dtype=np.float64
+    )
+    if sparse_container is not None:
+        X = sparse_container(X)
+
+    if sparse.issparse(X) and with_mean:
+        pytest.skip("'with_mean=True' cannot be used with sparse matrix.")
+
+    transformer = StandardScaler(with_mean=with_mean, with_std=with_std)
+    transformer.fit(X)
+
+    assert_array_equal(transformer.n_samples_seen_, np.array([3, 4, 2]))
+
+
+def _check_identity_scalers_attributes(scaler_1, scaler_2):
+    assert scaler_1.mean_ is scaler_2.mean_ is None
+    assert scaler_1.var_ is scaler_2.var_ is None
+    assert scaler_1.scale_ is scaler_2.scale_ is None
+    assert scaler_1.n_samples_seen_ == scaler_2.n_samples_seen_
+
+
+@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
+def test_scaler_return_identity(sparse_container):
+    # test that the scaler returns the identity when with_mean and with_std
+    # are False
+    X_dense = np.array([[0, 1, 3], [5, 6, 0], [8, 0, 10]], dtype=np.float64)
+    X_sparse = sparse_container(X_dense)
+
+    transformer_dense = StandardScaler(with_mean=False, with_std=False)
+    X_trans_dense = transformer_dense.fit_transform(X_dense)
+    assert_allclose(X_trans_dense, X_dense)
+
+    transformer_sparse = clone(transformer_dense)
+    X_trans_sparse = transformer_sparse.fit_transform(X_sparse)
+    assert_allclose_dense_sparse(X_trans_sparse, X_sparse)
+
+    _check_identity_scalers_attributes(transformer_dense, transformer_sparse)
+
+    transformer_dense.partial_fit(X_dense)
+    transformer_sparse.partial_fit(X_sparse)
+    _check_identity_scalers_attributes(transformer_dense, transformer_sparse)
+
+    transformer_dense.fit(X_dense)
+    transformer_sparse.fit(X_sparse)
+    _check_identity_scalers_attributes(transformer_dense, transformer_sparse)
+
+
+@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
+def test_scaler_int(sparse_container):
+    # test that the scaler converts integer input to floating point
+    # for both sparse and dense matrices
+    rng = np.random.RandomState(42)
+    X = rng.randint(20, size=(4, 5))
+    X[:, 0] = 0  # first feature is always zero
+    X_sparse = sparse_container(X)
+
+    with warnings.catch_warnings(record=True):
+        scaler = StandardScaler(with_mean=False).fit(X)
+        X_scaled = scaler.transform(X, copy=True)
+    assert not np.any(np.isnan(X_scaled))
+
+    with warnings.catch_warnings(record=True):
+        scaler_sparse = StandardScaler(with_mean=False).fit(X_sparse)
+        X_sparse_scaled = scaler_sparse.transform(X_sparse, copy=True)
+    assert not np.any(np.isnan(X_sparse_scaled.data))
+
+    assert_array_almost_equal(scaler.mean_, scaler_sparse.mean_)
+    assert_array_almost_equal(scaler.var_, scaler_sparse.var_)
+    assert_array_almost_equal(scaler.scale_, scaler_sparse.scale_)
+
+    assert_array_almost_equal(
+        X_scaled.mean(axis=0), [0.0, 1.109, 1.856, 21.0, 1.559], 2
+    )
+    assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
+
+    X_sparse_scaled_mean, X_sparse_scaled_std = mean_variance_axis(
+        X_sparse_scaled.astype(float), 0
+    )
+    assert_array_almost_equal(X_sparse_scaled_mean, X_scaled.mean(axis=0))
+    assert_array_almost_equal(X_sparse_scaled_std, X_scaled.std(axis=0))
+
+    # Check that X has not been modified (copy)
+    assert X_scaled is not X
+    assert X_sparse_scaled is not X_sparse
+
+    X_scaled_back = scaler.inverse_transform(X_scaled)
+    assert X_scaled_back is not X
+    assert X_scaled_back is not X_scaled
+    assert_array_almost_equal(X_scaled_back, X)
+
+    X_sparse_scaled_back = scaler_sparse.inverse_transform(X_sparse_scaled)
+    assert X_sparse_scaled_back is not
X_sparse
+    assert X_sparse_scaled_back is not X_sparse_scaled
+    assert_array_almost_equal(X_sparse_scaled_back.toarray(), X)
+
+    if sparse_container in CSR_CONTAINERS:
+        null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
+        with warnings.catch_warnings(record=True):
+            X_null = null_transform.fit_transform(X_sparse)
+        assert_array_equal(X_null.data, X_sparse.data)
+        X_orig = null_transform.inverse_transform(X_null)
+        assert_array_equal(X_orig.data, X_sparse.data)
+
+
+@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
+def test_scaler_without_copy(sparse_container):
+    # Check that StandardScaler.fit does not change input
+    rng = np.random.RandomState(42)
+    X = rng.randn(4, 5)
+    X[:, 0] = 0.0  # first feature is always zero
+    X_sparse = sparse_container(X)
+
+    X_copy = X.copy()
+    StandardScaler(copy=False).fit(X)
+    assert_array_equal(X, X_copy)
+
+    X_sparse_copy = X_sparse.copy()
+    StandardScaler(with_mean=False, copy=False).fit(X_sparse)
+    assert_array_equal(X_sparse.toarray(), X_sparse_copy.toarray())
+
+
+@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
+def test_scale_sparse_with_mean_raise_exception(sparse_container):
+    rng = np.random.RandomState(42)
+    X = rng.randn(4, 5)
+    X_sparse = sparse_container(X)
+
+    # check scaling and fit with direct calls on sparse data
+    with pytest.raises(ValueError):
+        scale(X_sparse, with_mean=True)
+    with pytest.raises(ValueError):
+        StandardScaler(with_mean=True).fit(X_sparse)
+
+    # check transform and inverse_transform after a fit on a dense array
+    scaler = StandardScaler(with_mean=True).fit(X)
+    with pytest.raises(ValueError):
+        scaler.transform(X_sparse)
+
+    X_transformed_sparse = sparse_container(scaler.transform(X))
+    with pytest.raises(ValueError):
+        scaler.inverse_transform(X_transformed_sparse)
+
+
+def test_scale_input_finiteness_validation():
+    # Check that non-finite inputs raise ValueError
+    X = [[np.inf, 5, 6, 7, 8]]
+    with pytest.raises(
+        ValueError, match="Input contains infinity or a value too large"
+    ):
+        scale(X)
+
+
+def test_robust_scaler_error_sparse():
+    X_sparse = sparse.rand(1000, 10)
+    scaler = RobustScaler(with_centering=True)
+    err_msg = "Cannot center sparse matrices"
+    with pytest.raises(ValueError, match=err_msg):
+        scaler.fit(X_sparse)
+
+
+@pytest.mark.parametrize("with_centering", [True, False])
+@pytest.mark.parametrize("with_scaling", [True, False])
+@pytest.mark.parametrize("X", [np.random.randn(10, 3), sparse.rand(10, 3, density=0.5)])
+def test_robust_scaler_attributes(X, with_centering, with_scaling):
+    # check consistent type of attributes
+    if with_centering and sparse.issparse(X):
+        pytest.skip("RobustScaler cannot center sparse matrix")
+
+    scaler = RobustScaler(with_centering=with_centering, with_scaling=with_scaling)
+    scaler.fit(X)
+
+    if with_centering:
+        assert isinstance(scaler.center_, np.ndarray)
+    else:
+        assert scaler.center_ is None
+    if with_scaling:
+        assert isinstance(scaler.scale_, np.ndarray)
+    else:
+        assert scaler.scale_ is None
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_robust_scaler_col_zero_sparse(csr_container):
+    # check that the scaler works when there is no data materialized in a
+    # column of a sparse matrix
+    X = np.random.randn(10, 5)
+    X[:, 0] = 0
+    X = csr_container(X)
+
+    scaler = RobustScaler(with_centering=False)
+    scaler.fit(X)
+    assert scaler.scale_[0] == pytest.approx(1)
+
+    X_trans = scaler.transform(X)
+    assert_allclose(X[:, [0]].toarray(), X_trans[:,
[0]].toarray())
+
+
+def test_robust_scaler_2d_arrays():
+    # Test robust scaling of 2d array along first axis
+    rng = np.random.RandomState(0)
+    X = rng.randn(4, 5)
+    X[:, 0] = 0.0  # first feature is always zero
+
+    scaler = RobustScaler()
+    X_scaled = scaler.fit(X).transform(X)
+
+    assert_array_almost_equal(np.median(X_scaled, axis=0), 5 * [0.0])
+    assert_array_almost_equal(X_scaled.std(axis=0)[0], 0)
+
+
+@pytest.mark.parametrize("density", [0, 0.05, 0.1, 0.5, 1])
+@pytest.mark.parametrize("strictly_signed", ["positive", "negative", "zeros", None])
+def test_robust_scaler_equivalence_dense_sparse(density, strictly_signed):
+    # Check the equivalence of the fitting with dense and sparse matrices
+    X_sparse = sparse.rand(1000, 5, density=density).tocsc()
+    if strictly_signed == "positive":
+        X_sparse.data = np.abs(X_sparse.data)
+    elif strictly_signed == "negative":
+        X_sparse.data = -np.abs(X_sparse.data)
+    elif strictly_signed == "zeros":
+        X_sparse.data = np.zeros(X_sparse.data.shape, dtype=np.float64)
+    X_dense = X_sparse.toarray()
+
+    scaler_sparse = RobustScaler(with_centering=False)
+    scaler_dense = RobustScaler(with_centering=False)
+
+    scaler_sparse.fit(X_sparse)
+    scaler_dense.fit(X_dense)
+
+    assert_allclose(scaler_sparse.scale_, scaler_dense.scale_)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_robust_scaler_transform_one_row_csr(csr_container):
+    # Check RobustScaler on transforming a csr matrix with one row
+    rng = np.random.RandomState(0)
+    X = rng.randn(4, 5)
+    single_row = np.array([[0.1, 1.0, 2.0, 0.0, -1.0]])
+    scaler = RobustScaler(with_centering=False)
+    scaler = scaler.fit(X)
+    row_trans = scaler.transform(csr_container(single_row))
+    row_expected = single_row / scaler.scale_
+    assert_array_almost_equal(row_trans.toarray(), row_expected)
+    row_scaled_back = scaler.inverse_transform(row_trans)
+    assert_array_almost_equal(single_row, row_scaled_back.toarray())
+
+
+def test_robust_scaler_iris():
+    X = iris.data
+    scaler = RobustScaler()
+    X_trans = scaler.fit_transform(X)
+    assert_array_almost_equal(np.median(X_trans, axis=0), 0)
+    X_trans_inv = scaler.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+    q = np.percentile(X_trans, q=(25, 75), axis=0)
+    iqr = q[1] - q[0]
+    assert_array_almost_equal(iqr, 1)
+
+
+def test_robust_scaler_iris_quantiles():
+    X = iris.data
+    scaler = RobustScaler(quantile_range=(10, 90))
+    X_trans = scaler.fit_transform(X)
+    assert_array_almost_equal(np.median(X_trans, axis=0), 0)
+    X_trans_inv = scaler.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+    q = np.percentile(X_trans, q=(10, 90), axis=0)
+    q_range = q[1] - q[0]
+    assert_array_almost_equal(q_range, 1)
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_quantile_transform_iris(csc_container):
+    X = iris.data
+    # uniform output distribution
+    transformer = QuantileTransformer(n_quantiles=30)
+    X_trans = transformer.fit_transform(X)
+    X_trans_inv = transformer.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+    # normal output distribution
+    transformer = QuantileTransformer(n_quantiles=30, output_distribution="normal")
+    X_trans = transformer.fit_transform(X)
+    X_trans_inv = transformer.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+    # make sure it is possible to take the inverse of a sparse matrix
+    # which contains negative values; this is the case in the iris dataset
+    X_sparse = csc_container(X)
+    X_sparse_tran = transformer.fit_transform(X_sparse)
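Roughly how this works (a hedged sketch, not a claim about the exact
implementation): QuantileTransformer learns per-feature landmarks
`quantiles_` aligned with uniform `references_`, and transform is
approximately a per-feature interpolation of the empirical CDF, i.e.
something like

    np.interp(x_col, transformer.quantiles_[:, j], transformer.references_)

for feature j (with `x_col` and `j` as loose illustrative names), with the
inverse mapping interpolating the other way around, which is why
`inverse_transform` can recover the original values up to quantile resolution.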
+    X_sparse_tran_inv = transformer.inverse_transform(X_sparse_tran)
+    assert_array_almost_equal(X_sparse.toarray(), X_sparse_tran_inv.toarray())
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_quantile_transform_check_error(csc_container):
+    X = np.transpose(
+        [
+            [0, 25, 50, 0, 0, 0, 75, 0, 0, 100],
+            [2, 4, 0, 0, 6, 8, 0, 10, 0, 0],
+            [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1],
+        ]
+    )
+    X = csc_container(X)
+    X_neg = np.transpose(
+        [
+            [0, 25, 50, 0, 0, 0, 75, 0, 0, 100],
+            [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0],
+            [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1],
+        ]
+    )
+    X_neg = csc_container(X_neg)
+
+    err_msg = (
+        "The number of quantiles cannot be greater than "
+        "the number of samples used. Got 1000 quantiles "
+        "and 10 samples."
+    )
+    with pytest.raises(ValueError, match=err_msg):
+        QuantileTransformer(subsample=10).fit(X)
+
+    transformer = QuantileTransformer(n_quantiles=10)
+    err_msg = "QuantileTransformer only accepts non-negative sparse matrices."
+    with pytest.raises(ValueError, match=err_msg):
+        transformer.fit(X_neg)
+    transformer.fit(X)
+    err_msg = "QuantileTransformer only accepts non-negative sparse matrices."
+    with pytest.raises(ValueError, match=err_msg):
+        transformer.transform(X_neg)
+
+    X_bad_feat = np.transpose(
+        [[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]
+    )
+    err_msg = (
+        "X has 2 features, but QuantileTransformer is expecting 3 features as input."
+    )
+    with pytest.raises(ValueError, match=err_msg):
+        transformer.inverse_transform(X_bad_feat)
+
+    transformer = QuantileTransformer(n_quantiles=10).fit(X)
+    # check that an error is raised if input is scalar
+    with pytest.raises(ValueError, match="Expected 2D array, got scalar array instead"):
+        transformer.transform(10)
+    # check that a warning is raised if n_quantiles > n_samples
+    transformer = QuantileTransformer(n_quantiles=100)
+    warn_msg = "n_quantiles is set to n_samples"
+    with pytest.warns(UserWarning, match=warn_msg) as record:
+        transformer.fit(X)
+    assert len(record) == 1
+    assert transformer.n_quantiles_ == X.shape[0]
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_quantile_transform_sparse_ignore_zeros(csc_container):
+    X = np.array([[0, 1], [0, 0], [0, 2], [0, 2], [0, 1]])
+    X_sparse = csc_container(X)
+    transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5)
+
+    # dense case -> warning raised
+    warning_message = (
+        "'ignore_implicit_zeros' takes effect"
+        " only with sparse matrix. This parameter has no"
+        " effect."
+    )
+    with pytest.warns(UserWarning, match=warning_message):
+        transformer.fit(X)
+
+    X_expected = np.array([[0, 0], [0, 0], [0, 1], [0, 1], [0, 0]])
+    X_trans = transformer.fit_transform(X_sparse)
+    assert_almost_equal(X_expected, X_trans.toarray())
+
+    # consider the case where sparse entries are missing values and user-given
+    # zeros are to be considered
+    X_data = np.array([0, 0, 1, 0, 2, 2, 1, 0, 1, 2, 0])
+    X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])
+    X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6, 7, 8])
+    X_sparse = csc_container((X_data, (X_row, X_col)))
+    X_trans = transformer.fit_transform(X_sparse)
+    X_expected = np.array(
+        [
+            [0.0, 0.5],
+            [0.0, 0.0],
+            [0.0, 1.0],
+            [0.0, 1.0],
+            [0.0, 0.5],
+            [0.0, 0.0],
+            [0.0, 0.5],
+            [0.0, 1.0],
+            [0.0, 0.0],
+        ]
+    )
+    assert_almost_equal(X_expected, X_trans.toarray())
+
+    transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5)
+    X_data = np.array([-1, -1, 1, 0, 0, 0, 1, -1, 1])
+    X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1])
+    X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6])
+    X_sparse = csc_container((X_data, (X_row, X_col)))
+    X_trans = transformer.fit_transform(X_sparse)
+    X_expected = np.array(
+        [[0, 1], [0, 0.375], [0, 0.375], [0, 0.375], [0, 1], [0, 0], [0, 1]]
+    )
+    assert_almost_equal(X_expected, X_trans.toarray())
+    assert_almost_equal(
+        X_sparse.toarray(), transformer.inverse_transform(X_trans).toarray()
+    )
+
+    # check in conjunction with subsampling
+    transformer = QuantileTransformer(
+        ignore_implicit_zeros=True, n_quantiles=5, subsample=8, random_state=0
+    )
+    X_trans = transformer.fit_transform(X_sparse)
+    assert_almost_equal(X_expected, X_trans.toarray())
+    assert_almost_equal(
+        X_sparse.toarray(), transformer.inverse_transform(X_trans).toarray()
+    )
+
+
+def test_quantile_transform_dense_toy():
+    X = np.array(
+        [[0, 2, 2.6], [25, 4, 4.1], [50, 6, 2.3], [75, 8, 9.5], [100, 10, 0.1]]
+    )
+
+    transformer = QuantileTransformer(n_quantiles=5)
+    transformer.fit(X)
+
+    # using a uniform output, each entry of X should be mapped between 0 and 1
+    # and equally spaced
+    X_trans = transformer.fit_transform(X)
+    X_expected = np.tile(np.linspace(0, 1, num=5), (3, 1)).T
+    assert_almost_equal(np.sort(X_trans, axis=0), X_expected)
+
+    X_test = np.array(
+        [
+            [-1, 1, 0],
+            [101, 11, 10],
+        ]
+    )
+    X_expected = np.array(
+        [
+            [0, 0, 0],
+            [1, 1, 1],
+        ]
+    )
+    assert_array_almost_equal(transformer.transform(X_test), X_expected)
+
+    X_trans_inv = transformer.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+
+
+def test_quantile_transform_subsampling():
+    # Test that subsampling the input yields consistent results. We check
+    # that the computed quantiles are almost mapped to a [0, 1] vector of
+    # equally spaced values. The infinity norm is checked to be smaller
+    # than a given threshold. This is repeated 5 times.
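A hedged back-of-the-envelope for the thresholds asserted below: by the
Dvoretzky-Kiefer-Wolfowitz inequality, an empirical CDF built from m i.i.d.
subsamples deviates from the true CDF by more than eps with probability at
most 2 * exp(-2 * m * eps**2); with m = 100000 (n_samples // 10) and
eps = 1e-2 this is about 2 * exp(-20) ~ 4e-9, so the 1e-2 bound on the
infinity norm is comfortably safe.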
+
+    # dense support
+    n_samples = 1000000
+    n_quantiles = 1000
+    X = np.sort(np.random.sample((n_samples, 1)), axis=0)
+    ROUND = 5
+    inf_norm_arr = []
+    for random_state in range(ROUND):
+        transformer = QuantileTransformer(
+            random_state=random_state,
+            n_quantiles=n_quantiles,
+            subsample=n_samples // 10,
+        )
+        transformer.fit(X)
+        diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_)
+        inf_norm = np.max(np.abs(diff))
+        assert inf_norm < 1e-2
+        inf_norm_arr.append(inf_norm)
+    # each random subsampling yields a unique approximation of the expected
+    # linspace CDF
+    assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr)
+
+    # sparse support
+
+    X = sparse.rand(n_samples, 1, density=0.99, format="csc", random_state=0)
+    inf_norm_arr = []
+    for random_state in range(ROUND):
+        transformer = QuantileTransformer(
+            random_state=random_state,
+            n_quantiles=n_quantiles,
+            subsample=n_samples // 10,
+        )
+        transformer.fit(X)
+        diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_)
+        inf_norm = np.max(np.abs(diff))
+        assert inf_norm < 1e-1
+        inf_norm_arr.append(inf_norm)
+    # each random subsampling yields a unique approximation of the expected
+    # linspace CDF
+    assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr)
+
+
+def test_quantile_transform_subsampling_disabled():
+    """Check the behaviour of `QuantileTransformer` when `subsample=None`."""
+    X = np.random.RandomState(0).normal(size=(200, 1))
+
+    n_quantiles = 5
+    transformer = QuantileTransformer(n_quantiles=n_quantiles, subsample=None).fit(X)
+
+    expected_references = np.linspace(0, 1, n_quantiles)
+    assert_allclose(transformer.references_, expected_references)
+    expected_quantiles = np.quantile(X.ravel(), expected_references)
+    assert_allclose(transformer.quantiles_.ravel(), expected_quantiles)
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_quantile_transform_sparse_toy(csc_container):
+    X = np.array(
+        [
+            [0.0, 2.0, 0.0],
+            [25.0, 4.0, 0.0],
+            [50.0, 0.0, 2.6],
+            [0.0, 0.0, 4.1],
+            [0.0, 6.0, 0.0],
+            [0.0, 8.0, 0.0],
+            [75.0, 0.0, 2.3],
+            [0.0, 10.0, 0.0],
+            [0.0, 0.0, 9.5],
+            [100.0, 0.0, 0.1],
+        ]
+    )
+
+    X = csc_container(X)
+
+    transformer = QuantileTransformer(n_quantiles=10)
+    transformer.fit(X)
+
+    X_trans = transformer.fit_transform(X)
+    assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.0)
+    assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.0)
+
+    X_trans_inv = transformer.inverse_transform(X_trans)
+    assert_array_almost_equal(X.toarray(), X_trans_inv.toarray())
+
+    transformer_dense = QuantileTransformer(n_quantiles=10).fit(X.toarray())
+
+    X_trans = transformer_dense.transform(X)
+    assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.0)
+    assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.0)
+
+    X_trans_inv = transformer_dense.inverse_transform(X_trans)
+    assert_array_almost_equal(X.toarray(), X_trans_inv.toarray())
+
+
+def test_quantile_transform_axis1():
+    X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]])
+
+    X_trans_a0 = quantile_transform(X.T, axis=0, n_quantiles=5)
+    X_trans_a1 = quantile_transform(X, axis=1, n_quantiles=5)
+    assert_array_almost_equal(X_trans_a0, X_trans_a1.T)
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_quantile_transform_bounds(csc_container):
+    # Lower and upper bounds are manually mapped. We check that in the case
+    # of a constant feature and a binary feature, the bounds are properly mapped.
+    X_dense = np.array([[0, 0], [0, 0], [1, 0]])
+    X_sparse = csc_container(X_dense)
+
+    # check sparse and dense are consistent
+    X_trans = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform(X_dense)
+    assert_array_almost_equal(X_trans, X_dense)
+    X_trans_sp = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform(
+        X_sparse
+    )
+    assert_array_almost_equal(X_trans_sp.toarray(), X_dense)
+    assert_array_almost_equal(X_trans, X_trans_sp.toarray())
+
+    # check the consistency of the bounds by learning on 1 matrix
+    # and transforming another
+    X = np.array([[0, 1], [0, 0.5], [1, 0]])
+    X1 = np.array([[0, 0.1], [0, 0.5], [1, 0.1]])
+    transformer = QuantileTransformer(n_quantiles=3).fit(X)
+    X_trans = transformer.transform(X1)
+    assert_array_almost_equal(X_trans, X1)
+
+    # check that values outside of the range learned will be mapped properly.
+    X = np.random.random((1000, 1))
+    transformer = QuantileTransformer()
+    transformer.fit(X)
+    assert transformer.transform([[-10]]) == transformer.transform([[np.min(X)]])
+    assert transformer.transform([[10]]) == transformer.transform([[np.max(X)]])
+    assert transformer.inverse_transform([[-10]]) == transformer.inverse_transform(
+        [[np.min(transformer.references_)]]
+    )
+    assert transformer.inverse_transform([[10]]) == transformer.inverse_transform(
+        [[np.max(transformer.references_)]]
+    )
+
+
+def test_quantile_transform_and_inverse():
+    X_1 = iris.data
+    X_2 = np.array([[0.0], [BOUNDS_THRESHOLD / 10], [1.5], [2], [3], [3], [4]])
+    for X in [X_1, X_2]:
+        transformer = QuantileTransformer(n_quantiles=1000, random_state=0)
+        X_trans = transformer.fit_transform(X)
+        X_trans_inv = transformer.inverse_transform(X_trans)
+        assert_array_almost_equal(X, X_trans_inv, decimal=9)
+
+
+def test_quantile_transform_nan():
+    X = np.array([[np.nan, 0, 0, 1], [np.nan, np.nan, 0, 0.5], [np.nan, 1, 1, 0]])
+
+    transformer = QuantileTransformer(n_quantiles=10, random_state=42)
+    transformer.fit_transform(X)
+
+    # check that the quantiles of the first column are all NaN
+    assert np.isnan(transformer.quantiles_[:, 0]).all()
+    # all other columns should not contain NaN
+    assert not np.isnan(transformer.quantiles_[:, 1:]).any()
+
+
+@pytest.mark.parametrize("array_type", ["array", "sparse"])
+def test_quantile_transformer_sorted_quantiles(array_type):
+    # Non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/15733
+    # Taken from upstream bug report:
+    # https://github.com/numpy/numpy/issues/14685
+    X = np.array([0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 1, 1, 9, 9, 9, 8, 8, 7] * 10)
+    X = 0.1 * X.reshape(-1, 1)
+    X = _convert_container(X, array_type)
+
+    n_quantiles = 100
+    qt = QuantileTransformer(n_quantiles=n_quantiles).fit(X)
+
+    # Check that the estimated quantile thresholds are monotonically
+    # increasing:
+    quantiles = qt.quantiles_[:, 0]
+    assert len(quantiles) == 100
+    assert all(np.diff(quantiles) >= 0)
+
+
+def test_robust_scaler_invalid_range():
+    for range_ in [
+        (-1, 90),
+        (-2, -3),
+        (10, 101),
+        (100.5, 101),
+        (90, 50),
+    ]:
+        scaler = RobustScaler(quantile_range=range_)
+
+        with pytest.raises(ValueError, match=r"Invalid quantile range: \("):
+            scaler.fit(iris.data)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_scale_function_without_centering(csr_container):
+    rng = np.random.RandomState(42)
+    X = rng.randn(4, 5)
+    X[:, 0] = 0.0  # first feature is always zero
+    X_csr = csr_container(X)
+
+    X_scaled = scale(X, with_mean=False)
+    assert not np.any(np.isnan(X_scaled))
+
+    X_csr_scaled =
scale(X_csr, with_mean=False)
+    assert not np.any(np.isnan(X_csr_scaled.data))
+
+    # test csc has same outcome
+    X_csc_scaled = scale(X_csr.tocsc(), with_mean=False)
+    assert_array_almost_equal(X_scaled, X_csc_scaled.toarray())
+
+    # raises value error on axis != 0
+    with pytest.raises(ValueError):
+        scale(X_csr, with_mean=False, axis=1)
+
+    assert_array_almost_equal(
+        X_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2
+    )
+    assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
+    # Check that X has not been modified (copy)
+    assert X_scaled is not X
+
+    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0)
+    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
+    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))
+
+    # null scale
+    X_csr_scaled = scale(X_csr, with_mean=False, with_std=False, copy=True)
+    assert_array_almost_equal(X_csr.toarray(), X_csr_scaled.toarray())
+
+
+def test_robust_scale_axis1():
+    X = iris.data
+    X_trans = robust_scale(X, axis=1)
+    assert_array_almost_equal(np.median(X_trans, axis=1), 0)
+    q = np.percentile(X_trans, q=(25, 75), axis=1)
+    iqr = q[1] - q[0]
+    assert_array_almost_equal(iqr, 1)
+
+
+def test_robust_scale_1d_array():
+    X = iris.data[:, 1]
+    X_trans = robust_scale(X)
+    assert_array_almost_equal(np.median(X_trans), 0)
+    q = np.percentile(X_trans, q=(25, 75))
+    iqr = q[1] - q[0]
+    assert_array_almost_equal(iqr, 1)
+
+
+def test_robust_scaler_zero_variance_features():
+    # Check RobustScaler on toy data with zero variance features
+    X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]]
+
+    scaler = RobustScaler()
+    X_trans = scaler.fit_transform(X)
+
+    # NOTE: for such a small sample size, what we expect in the third column
+    # depends HEAVILY on the method used to calculate quantiles. The values
+    # here were calculated to fit the quantiles produced by np.percentile
+    # using numpy 1.9. Calculating quantiles with
+    # scipy.stats.mstats.scoreatquantile or scipy.stats.mstats.mquantiles
+    # would yield very different results!
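Worked arithmetic for the third column (using np.percentile's default linear
interpolation): the values [0.5, -0.1, 1.1] have median 0.5, q25 = 0.2 and
q75 = 0.8, hence IQR = 0.6 and the scaled column is
(x - 0.5) / 0.6 = [0.0, -1.0, +1.0], matching X_expected below.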
+ X_expected = [[0.0, 0.0, +0.0], [0.0, 0.0, -1.0], [0.0, 0.0, +1.0]] + assert_array_almost_equal(X_trans, X_expected) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # make sure new data gets transformed correctly + X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] + X_trans_new = scaler.transform(X_new) + X_expected_new = [[+0.0, 1.0, +0.0], [-1.0, 0.0, -0.83333], [+0.0, 0.0, +1.66667]] + assert_array_almost_equal(X_trans_new, X_expected_new, decimal=3) + + +def test_robust_scaler_unit_variance(): + # Check RobustScaler with unit_variance=True on standard normal data with + # outliers + rng = np.random.RandomState(42) + X = rng.randn(1000000, 1) + X_with_outliers = np.vstack([X, np.ones((100, 1)) * 100, np.ones((100, 1)) * -100]) + + quantile_range = (1, 99) + robust_scaler = RobustScaler(quantile_range=quantile_range, unit_variance=True).fit( + X_with_outliers + ) + X_trans = robust_scaler.transform(X) + + assert robust_scaler.center_ == pytest.approx(0, abs=1e-3) + assert robust_scaler.scale_ == pytest.approx(1, abs=1e-2) + assert X_trans.std() == pytest.approx(1, abs=1e-2) + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_maxabs_scaler_zero_variance_features(sparse_container): + # Check MaxAbsScaler on toy data with zero variance features + X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.3], [0.0, 1.0, +1.5], [0.0, 0.0, +0.0]] + + scaler = MaxAbsScaler() + X_trans = scaler.fit_transform(X) + X_expected = [ + [0.0, 1.0, 1.0 / 3.0], + [0.0, 1.0, -0.2], + [0.0, 1.0, 1.0], + [0.0, 0.0, 0.0], + ] + assert_array_almost_equal(X_trans, X_expected) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # make sure new data gets transformed correctly + X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] + X_trans_new = scaler.transform(X_new) + X_expected_new = [[+0.0, 2.0, 1.0 / 3.0], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.0]] + + assert_array_almost_equal(X_trans_new, X_expected_new, decimal=2) + + # function interface + X_trans = maxabs_scale(X) + assert_array_almost_equal(X_trans, X_expected) + + # sparse data + X_sparse = sparse_container(X) + X_trans_sparse = scaler.fit_transform(X_sparse) + X_expected = [ + [0.0, 1.0, 1.0 / 3.0], + [0.0, 1.0, -0.2], + [0.0, 1.0, 1.0], + [0.0, 0.0, 0.0], + ] + assert_array_almost_equal(X_trans_sparse.toarray(), X_expected) + X_trans_sparse_inv = scaler.inverse_transform(X_trans_sparse) + assert_array_almost_equal(X, X_trans_sparse_inv.toarray()) + + +def test_maxabs_scaler_large_negative_value(): + # Check MaxAbsScaler on toy data with a large negative value + X = [ + [0.0, 1.0, +0.5, -1.0], + [0.0, 1.0, -0.3, -0.5], + [0.0, 1.0, -100.0, 0.0], + [0.0, 0.0, +0.0, -2.0], + ] + + scaler = MaxAbsScaler() + X_trans = scaler.fit_transform(X) + X_expected = [ + [0.0, 1.0, 0.005, -0.5], + [0.0, 1.0, -0.003, -0.25], + [0.0, 1.0, -1.0, 0.0], + [0.0, 0.0, 0.0, -1.0], + ] + assert_array_almost_equal(X_trans, X_expected) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_maxabs_scaler_transform_one_row_csr(csr_container): + # Check MaxAbsScaler on transforming csr matrix with one row + X = csr_container([[0.5, 1.0, 1.0]]) + scaler = MaxAbsScaler() + scaler = scaler.fit(X) + X_trans = scaler.transform(X) + X_expected = csr_container([[1.0, 1.0, 1.0]]) + assert_array_almost_equal(X_trans.toarray(), X_expected.toarray()) + X_scaled_back = scaler.inverse_transform(X_trans) + 
assert_array_almost_equal(X.toarray(), X_scaled_back.toarray())
+
+
+def test_maxabs_scaler_1d():
+    # Test scaling of dataset along single axis
+    for X in [X_1row, X_1col, X_list_1row, X_list_1col]:
+        scaler = MaxAbsScaler(copy=True)
+        X_scaled = scaler.fit(X).transform(X)
+
+        if isinstance(X, list):
+            X = np.array(X)  # cast only after scaling done
+
+        if _check_dim_1axis(X) == 1:
+            assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), np.ones(n_features))
+        else:
+            assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.0)
+        assert scaler.n_samples_seen_ == X.shape[0]
+
+        # check inverse transform
+        X_scaled_back = scaler.inverse_transform(X_scaled)
+        assert_array_almost_equal(X_scaled_back, X)
+
+    # Constant feature
+    X = np.ones((5, 1))
+    scaler = MaxAbsScaler()
+    X_scaled = scaler.fit(X).transform(X)
+    assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.0)
+    assert scaler.n_samples_seen_ == X.shape[0]
+
+    # function interface
+    X_1d = X_1row.ravel()
+    max_abs = np.abs(X_1d).max()
+    assert_array_almost_equal(X_1d / max_abs, maxabs_scale(X_1d, copy=True))
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_maxabs_scaler_partial_fit(csr_container):
+    # Test that partial_fit, run over many batches of varying size,
+    # gives the same results as a single call to fit
+    X = X_2d[:100, :]
+    n = X.shape[0]
+
+    for chunk_size in [1, 2, 50, n, n + 42]:
+        # Test max_abs at the end of the process
+        scaler_batch = MaxAbsScaler().fit(X)
+
+        scaler_incr = MaxAbsScaler()
+        scaler_incr_csr = MaxAbsScaler()
+        scaler_incr_csc = MaxAbsScaler()
+        for batch in gen_batches(n, chunk_size):
+            scaler_incr = scaler_incr.partial_fit(X[batch])
+            X_csr = csr_container(X[batch])
+            scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr)
+            X_csc = csr_container(X[batch]).tocsc()
+            scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc)
+
+        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
+        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csr.max_abs_)
+        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csc.max_abs_)
+        assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
+        assert scaler_batch.n_samples_seen_ == scaler_incr_csr.n_samples_seen_
+        assert scaler_batch.n_samples_seen_ == scaler_incr_csc.n_samples_seen_
+        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
+        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csr.scale_)
+        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csc.scale_)
+        assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X))
+
+        # Test max_abs after 1 step
+        batch0 = slice(0, chunk_size)
+        scaler_batch = MaxAbsScaler().fit(X[batch0])
+        scaler_incr = MaxAbsScaler().partial_fit(X[batch0])
+
+        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
+        assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
+        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
+        assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X))
+
+        # Test max_abs until the end of the partial fits
+        scaler_batch = MaxAbsScaler().fit(X)
+        scaler_incr = MaxAbsScaler()  # Clean estimator
+        for i, batch in enumerate(gen_batches(n, chunk_size)):
+            scaler_incr = scaler_incr.partial_fit(X[batch])
+            assert_correct_incr(
+                i,
+                batch_start=batch.start,
+                batch_stop=batch.stop,
+                n=n,
+                chunk_size=chunk_size,
+                n_samples_seen=scaler_incr.n_samples_seen_,
+            )
+
+
+def check_normalizer(norm, X_norm):
+    """
+    Convenient checking function for `test_normalizer_l1_l2_max` and
`test_normalizer_l1_l2_max_non_csr` + """ + if norm == "l1": + row_sums = np.abs(X_norm).sum(axis=1) + for i in range(3): + assert_almost_equal(row_sums[i], 1.0) + assert_almost_equal(row_sums[3], 0.0) + elif norm == "l2": + for i in range(3): + assert_almost_equal(la.norm(X_norm[i]), 1.0) + assert_almost_equal(la.norm(X_norm[3]), 0.0) + elif norm == "max": + row_maxs = abs(X_norm).max(axis=1) + for i in range(3): + assert_almost_equal(row_maxs[i], 1.0) + assert_almost_equal(row_maxs[3], 0.0) + + +@pytest.mark.parametrize("norm", ["l1", "l2", "max"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_normalizer_l1_l2_max(norm, csr_container): + rng = np.random.RandomState(0) + X_dense = rng.randn(4, 5) + X_sparse_unpruned = csr_container(X_dense) + + # set the row number 3 to zero + X_dense[3, :] = 0.0 + + # set the row number 3 to zero without pruning (can happen in real life) + indptr_3 = X_sparse_unpruned.indptr[3] + indptr_4 = X_sparse_unpruned.indptr[4] + X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0 + + # build the pruned variant using the regular constructor + X_sparse_pruned = csr_container(X_dense) + + # check inputs that support the no-copy optim + for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): + normalizer = Normalizer(norm=norm, copy=True) + X_norm1 = normalizer.transform(X) + assert X_norm1 is not X + X_norm1 = toarray(X_norm1) + + normalizer = Normalizer(norm=norm, copy=False) + X_norm2 = normalizer.transform(X) + assert X_norm2 is X + X_norm2 = toarray(X_norm2) + + for X_norm in (X_norm1, X_norm2): + check_normalizer(norm, X_norm) + + +@pytest.mark.parametrize("norm", ["l1", "l2", "max"]) +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + LIL_CONTAINERS +) +def test_normalizer_l1_l2_max_non_csr(norm, sparse_container): + rng = np.random.RandomState(0) + X_dense = rng.randn(4, 5) + + # set the row number 3 to zero + X_dense[3, :] = 0.0 + + X = sparse_container(X_dense) + X_norm = Normalizer(norm=norm, copy=False).transform(X) + + assert X_norm is not X + assert sparse.issparse(X_norm) and X_norm.format == "csr" + + X_norm = toarray(X_norm) + check_normalizer(norm, X_norm) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_normalizer_max_sign(csr_container): + # check that we normalize by a positive number even for negative data + rng = np.random.RandomState(0) + X_dense = rng.randn(4, 5) + # set the row number 3 to zero + X_dense[3, :] = 0.0 + # check for mixed data where the value with + # largest magnitude is negative + X_dense[2, abs(X_dense[2, :]).argmax()] *= -1 + X_all_neg = -np.abs(X_dense) + X_all_neg_sparse = csr_container(X_all_neg) + + for X in (X_dense, X_all_neg, X_all_neg_sparse): + normalizer = Normalizer(norm="max") + X_norm = normalizer.transform(X) + assert X_norm is not X + X_norm = toarray(X_norm) + assert_array_equal(np.sign(X_norm), np.sign(toarray(X))) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_normalize(csr_container): + # Test normalize function + # Only tests functionality not used by the tests for Normalizer. 
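+    # Note (illustrative): normalizing the rows of X (the default axis=1) is
+    # the same as normalizing the columns of X.T, which is what the first
+    # assertion below exercises via normalize(X.T, axis=0).T.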
+ X = np.random.RandomState(37).randn(3, 2) + assert_array_equal(normalize(X, copy=False), normalize(X.T, axis=0, copy=False).T) + + rs = np.random.RandomState(0) + X_dense = rs.randn(10, 5) + X_sparse = csr_container(X_dense) + ones = np.ones((10)) + for X in (X_dense, X_sparse): + for dtype in (np.float32, np.float64): + for norm in ("l1", "l2"): + X = X.astype(dtype) + X_norm = normalize(X, norm=norm) + assert X_norm.dtype == dtype + + X_norm = toarray(X_norm) + if norm == "l1": + row_sums = np.abs(X_norm).sum(axis=1) + else: + X_norm_squared = X_norm**2 + row_sums = X_norm_squared.sum(axis=1) + + assert_array_almost_equal(row_sums, ones) + + # Test return_norm + X_dense = np.array([[3.0, 0, 4.0], [1.0, 0.0, 0.0], [2.0, 3.0, 0.0]]) + for norm in ("l1", "l2", "max"): + _, norms = normalize(X_dense, norm=norm, return_norm=True) + if norm == "l1": + assert_array_almost_equal(norms, np.array([7.0, 1.0, 5.0])) + elif norm == "l2": + assert_array_almost_equal(norms, np.array([5.0, 1.0, 3.60555127])) + else: + assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0])) + + X_sparse = csr_container(X_dense) + for norm in ("l1", "l2"): + with pytest.raises(NotImplementedError): + normalize(X_sparse, norm=norm, return_norm=True) + _, norms = normalize(X_sparse, norm="max", return_norm=True) + assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0])) + + +@pytest.mark.parametrize( + "constructor", [np.array, list] + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_binarizer(constructor): + X_ = np.array([[1, 0, 5], [2, 3, -1]]) + X = constructor(X_.copy()) + + binarizer = Binarizer(threshold=2.0, copy=True) + X_bin = toarray(binarizer.transform(X)) + assert np.sum(X_bin == 0) == 4 + assert np.sum(X_bin == 1) == 2 + X_bin = binarizer.transform(X) + assert sparse.issparse(X) == sparse.issparse(X_bin) + + binarizer = Binarizer(copy=True).fit(X) + X_bin = toarray(binarizer.transform(X)) + assert X_bin is not X + assert np.sum(X_bin == 0) == 2 + assert np.sum(X_bin == 1) == 4 + + binarizer = Binarizer(copy=True) + X_bin = binarizer.transform(X) + assert X_bin is not X + X_bin = toarray(X_bin) + assert np.sum(X_bin == 0) == 2 + assert np.sum(X_bin == 1) == 4 + + binarizer = Binarizer(copy=False) + X_bin = binarizer.transform(X) + if constructor is not list: + assert X_bin is X + + binarizer = Binarizer(copy=False) + X_float = np.array([[1, 0, 5], [2, 3, -1]], dtype=np.float64) + X_bin = binarizer.transform(X_float) + if constructor is not list: + assert X_bin is X_float + + X_bin = toarray(X_bin) + assert np.sum(X_bin == 0) == 2 + assert np.sum(X_bin == 1) == 4 + + binarizer = Binarizer(threshold=-0.5, copy=True) + if constructor in (np.array, list): + X = constructor(X_.copy()) + + X_bin = toarray(binarizer.transform(X)) + assert np.sum(X_bin == 0) == 1 + assert np.sum(X_bin == 1) == 5 + X_bin = binarizer.transform(X) + + # Cannot use threshold < 0 for sparse + if constructor in CSC_CONTAINERS: + with pytest.raises(ValueError): + binarizer.transform(constructor(X)) + + +def test_center_kernel(): + # Test that KernelCenterer is equivalent to StandardScaler + # in feature space + rng = np.random.RandomState(0) + X_fit = rng.random_sample((5, 4)) + scaler = StandardScaler(with_std=False) + scaler.fit(X_fit) + X_fit_centered = scaler.transform(X_fit) + K_fit = np.dot(X_fit, X_fit.T) + + # center fit time matrix + centerer = KernelCenterer() + K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T) + K_fit_centered2 = centerer.fit_transform(K_fit) + assert_array_almost_equal(K_fit_centered, K_fit_centered2) 
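+    # Note (illustrative): for a linear kernel K = X X^T, centering the
+    # features (x_i - mean) expands to the double-centering of the Gram
+    # matrix, K - 1_M K - K 1_M + 1_M K 1_M; this identity is asserted
+    # explicitly against equation (B.3) further below.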
+
+    # center predict time matrix
+    X_pred = rng.random_sample((2, 4))
+    K_pred = np.dot(X_pred, X_fit.T)
+    X_pred_centered = scaler.transform(X_pred)
+    K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T)
+    K_pred_centered2 = centerer.transform(K_pred)
+    assert_array_almost_equal(K_pred_centered, K_pred_centered2)
+
+    # check the results coherence with the method proposed in:
+    # B. Schölkopf, A. Smola, and K.R. Müller,
+    # "Nonlinear component analysis as a kernel eigenvalue problem"
+    # equation (B.3)
+
+    # K_centered3 = (I - 1_M) K (I - 1_M)
+    #             = K - 1_M K - K 1_M + 1_M K 1_M
+    ones_M = np.ones_like(K_fit) / K_fit.shape[0]
+    K_fit_centered3 = K_fit - ones_M @ K_fit - K_fit @ ones_M + ones_M @ K_fit @ ones_M
+    assert_allclose(K_fit_centered, K_fit_centered3)
+
+    # K_test_centered3 = (K_test - 1'_M K)(I - 1_M)
+    #                  = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M
+    ones_prime_M = np.ones_like(K_pred) / K_fit.shape[0]
+    K_pred_centered3 = (
+        K_pred - ones_prime_M @ K_fit - K_pred @ ones_M + ones_prime_M @ K_fit @ ones_M
+    )
+    assert_allclose(K_pred_centered, K_pred_centered3)
+
+
+def test_kernelcenterer_non_linear_kernel():
+    """Check kernel centering for a non-linear kernel."""
+    rng = np.random.RandomState(0)
+    X, X_test = rng.randn(100, 50), rng.randn(20, 50)
+
+    def phi(X):
+        """Our mapping function phi."""
+        return np.vstack(
+            [
+                np.clip(X, a_min=0, a_max=None),
+                -np.clip(X, a_min=None, a_max=0),
+            ]
+        )
+
+    phi_X = phi(X)
+    phi_X_test = phi(X_test)
+
+    # center the projection
+    scaler = StandardScaler(with_std=False)
+    phi_X_center = scaler.fit_transform(phi_X)
+    phi_X_test_center = scaler.transform(phi_X_test)
+
+    # create the different kernels
+    K = phi_X @ phi_X.T
+    K_test = phi_X_test @ phi_X.T
+    K_center = phi_X_center @ phi_X_center.T
+    K_test_center = phi_X_test_center @ phi_X_center.T
+
+    kernel_centerer = KernelCenterer()
+    kernel_centerer.fit(K)
+
+    assert_allclose(kernel_centerer.transform(K), K_center)
+    assert_allclose(kernel_centerer.transform(K_test), K_test_center)
+
+    # check the results coherence with the method proposed in:
+    # B. Schölkopf, A. Smola, and K.R. Müller,
+    # "Nonlinear component analysis as a kernel eigenvalue problem"
+    # equation (B.3)
+
+    # K_centered = (I - 1_M) K (I - 1_M)
+    #            = K - 1_M K - K 1_M + 1_M K 1_M
+    ones_M = np.ones_like(K) / K.shape[0]
+    K_centered = K - ones_M @ K - K @ ones_M + ones_M @ K @ ones_M
+    assert_allclose(kernel_centerer.transform(K), K_centered)
+
+    # K_test_centered = (K_test - 1'_M K)(I - 1_M)
+    #                 = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M
+    ones_prime_M = np.ones_like(K_test) / K.shape[0]
+    K_test_centered = (
+        K_test - ones_prime_M @ K - K_test @ ones_M + ones_prime_M @ K @ ones_M
+    )
+    assert_allclose(kernel_centerer.transform(K_test), K_test_centered)
+
+
+def test_cv_pipeline_precomputed():
+    # Cross-validate a regression on four coplanar points with the same
+    # value. Use precomputed kernel to ensure Pipeline with KernelCenterer
+    # is treated as a pairwise operation.
+    X = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]])
+    y_true = np.ones((4,))
+    K = X.dot(X.T)
+    kcent = KernelCenterer()
+    pipeline = Pipeline([("kernel_centerer", kcent), ("svr", SVR())])
+
+    # did the pipeline set the pairwise attribute?
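+    # Note (illustrative): the pairwise input tag tells cross-validation
+    # utilities that X is an (n_samples, n_samples) kernel matrix, so
+    # train/test splits must index both rows and columns of K, not rows only.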
+ assert pipeline.__sklearn_tags__().input_tags.pairwise + + # test cross-validation, score should be almost perfect + # NB: this test is pretty vacuous -- it's mainly to test integration + # of Pipeline and KernelCenterer + y_pred = cross_val_predict(pipeline, K, y_true, cv=2) + assert_array_almost_equal(y_true, y_pred) + + +def test_fit_transform(): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + for obj in (StandardScaler(), Normalizer(), Binarizer()): + X_transformed = obj.fit(X).transform(X) + X_transformed2 = obj.fit_transform(X) + assert_array_equal(X_transformed, X_transformed2) + + +def test_add_dummy_feature(): + X = [[1, 0], [0, 1], [0, 1]] + X = add_dummy_feature(X) + assert_array_equal(X, [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) + + +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_add_dummy_feature_sparse(sparse_container): + X = sparse_container([[1, 0], [0, 1], [0, 1]]) + desired_format = X.format + X = add_dummy_feature(X) + assert sparse.issparse(X) and X.format == desired_format, X + assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) + + +def test_fit_cold_start(): + X = iris.data + X_2d = X[:, :2] + + # Scalers that have a partial_fit method + scalers = [ + StandardScaler(with_mean=False, with_std=False), + MinMaxScaler(), + MaxAbsScaler(), + ] + + for scaler in scalers: + scaler.fit_transform(X) + # with a different shape, this may break the scaler unless the internal + # state is reset + scaler.fit_transform(X_2d) + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +def test_power_transformer_notfitted(method): + pt = PowerTransformer(method=method) + X = np.abs(X_1col) + with pytest.raises(NotFittedError): + pt.transform(X) + with pytest.raises(NotFittedError): + pt.inverse_transform(X) + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +@pytest.mark.parametrize("standardize", [True, False]) +@pytest.mark.parametrize("X", [X_1col, X_2d]) +def test_power_transformer_inverse(method, standardize, X): + # Make sure we get the original input when applying transform and then + # inverse transform + X = np.abs(X) if method == "box-cox" else X + pt = PowerTransformer(method=method, standardize=standardize) + X_trans = pt.fit_transform(X) + assert_almost_equal(X, pt.inverse_transform(X_trans)) + + +def test_power_transformer_1d(): + X = np.abs(X_1col) + + for standardize in [True, False]: + pt = PowerTransformer(method="box-cox", standardize=standardize) + + X_trans = pt.fit_transform(X) + X_trans_func = power_transform(X, method="box-cox", standardize=standardize) + + X_expected, lambda_expected = stats.boxcox(X.flatten()) + + if standardize: + X_expected = scale(X_expected) + + assert_almost_equal(X_expected.reshape(-1, 1), X_trans) + assert_almost_equal(X_expected.reshape(-1, 1), X_trans_func) + + assert_almost_equal(X, pt.inverse_transform(X_trans)) + assert_almost_equal(lambda_expected, pt.lambdas_[0]) + + assert len(pt.lambdas_) == X.shape[1] + assert isinstance(pt.lambdas_, np.ndarray) + + +def test_power_transformer_2d(): + X = np.abs(X_2d) + + for standardize in [True, False]: + pt = PowerTransformer(method="box-cox", standardize=standardize) + + X_trans_class = pt.fit_transform(X) + X_trans_func = power_transform(X, method="box-cox", standardize=standardize) + + for X_trans in [X_trans_class, X_trans_func]: + for j in range(X_trans.shape[1]): + X_expected, lmbda = stats.boxcox(X[:, j].flatten()) + + if standardize: + X_expected = 
scale(X_expected)
+
+                assert_almost_equal(X_trans[:, j], X_expected)
+                assert_almost_equal(lmbda, pt.lambdas_[j])
+
+            # Test inverse transformation
+            X_inv = pt.inverse_transform(X_trans)
+            assert_array_almost_equal(X_inv, X)
+
+        assert len(pt.lambdas_) == X.shape[1]
+        assert isinstance(pt.lambdas_, np.ndarray)
+
+
+def test_power_transformer_boxcox_strictly_positive_exception():
+    # Exceptions should be raised for negative arrays and zero arrays when
+    # method is boxcox
+
+    pt = PowerTransformer(method="box-cox")
+    pt.fit(np.abs(X_2d))
+    X_with_negatives = X_2d
+    not_positive_message = "strictly positive"
+
+    with pytest.raises(ValueError, match=not_positive_message):
+        pt.transform(X_with_negatives)
+
+    with pytest.raises(ValueError, match=not_positive_message):
+        pt.fit(X_with_negatives)
+
+    with pytest.raises(ValueError, match=not_positive_message):
+        power_transform(X_with_negatives, method="box-cox")
+
+    with pytest.raises(ValueError, match=not_positive_message):
+        pt.transform(np.zeros(X_2d.shape))
+
+    with pytest.raises(ValueError, match=not_positive_message):
+        pt.fit(np.zeros(X_2d.shape))
+
+    with pytest.raises(ValueError, match=not_positive_message):
+        power_transform(np.zeros(X_2d.shape), method="box-cox")
+
+
+@pytest.mark.parametrize("X", [X_2d, np.abs(X_2d), -np.abs(X_2d), np.zeros(X_2d.shape)])
+def test_power_transformer_yeojohnson_any_input(X):
+    # Yeo-Johnson method should support any kind of input
+    power_transform(X, method="yeo-johnson")
+
+
+@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
+def test_power_transformer_shape_exception(method):
+    pt = PowerTransformer(method=method)
+    X = np.abs(X_2d)
+    pt.fit(X)
+
+    # Exceptions should be raised for arrays with different num_columns
+    # than during fitting
+    wrong_shape_message = (
+        r"X has \d+ features, but PowerTransformer is " r"expecting \d+ features"
+    )
+
+    with pytest.raises(ValueError, match=wrong_shape_message):
+        pt.transform(X[:, 0:1])
+
+    with pytest.raises(ValueError, match=wrong_shape_message):
+        pt.inverse_transform(X[:, 0:1])
+
+
+def test_power_transformer_lambda_zero():
+    pt = PowerTransformer(method="box-cox", standardize=False)
+    X = np.abs(X_2d)[:, 0:1]
+
+    # Test the lambda = 0 case
+    pt.lambdas_ = np.array([0])
+    X_trans = pt.transform(X)
+    assert_array_almost_equal(pt.inverse_transform(X_trans), X)
+
+
+def test_power_transformer_lambda_one():
+    # Make sure lambda = 1 corresponds to the identity for yeo-johnson
+    pt = PowerTransformer(method="yeo-johnson", standardize=False)
+    X = np.abs(X_2d)[:, 0:1]
+
+    pt.lambdas_ = np.array([1])
+    X_trans = pt.transform(X)
+    assert_array_almost_equal(X_trans, X)
+
+
+@pytest.mark.parametrize(
+    "method, lmbda",
+    [
+        ("box-cox", 0.1),
+        ("box-cox", 0.5),
+        ("yeo-johnson", 0.1),
+        ("yeo-johnson", 0.5),
+        ("yeo-johnson", 1.0),
+    ],
+)
+def test_optimization_power_transformer(method, lmbda):
+    # Test the optimization procedure:
+    # - set a predefined value for lambda
+    # - apply inverse_transform to a normal dist (we get X_inv)
+    # - apply fit_transform to X_inv (we get X_inv_trans)
+    # - check that X_inv_trans is roughly equal to X
+
+    rng = np.random.RandomState(0)
+    n_samples = 20000
+    X = rng.normal(loc=0, scale=1, size=(n_samples, 1))
+
+    if method == "box-cox":
+        # For box-cox, the inverse transform requires lmbda * y + 1 > 0,
+        # i.e. y > -1 / lmbda. Clip the data here to make sure the
+        # inequality is valid.
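+        # Note (illustrative): for lmbda != 0 the box-cox inverse is
+        # x = (lmbda * y + 1) ** (1 / lmbda), which is real-valued only
+        # when lmbda * y + 1 > 0, hence the clipping just below.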
+        X = np.clip(X, -1 / lmbda + 1e-5, None)
+
+    pt = PowerTransformer(method=method, standardize=False)
+    pt.lambdas_ = [lmbda]
+    X_inv = pt.inverse_transform(X)
+
+    pt = PowerTransformer(method=method, standardize=False)
+    X_inv_trans = pt.fit_transform(X_inv)
+
+    assert_almost_equal(0, np.linalg.norm(X - X_inv_trans) / n_samples, decimal=2)
+    assert_almost_equal(0, X_inv_trans.mean(), decimal=1)
+    assert_almost_equal(1, X_inv_trans.std(), decimal=1)
+
+
+def test_inverse_box_cox():
+    # The inverse transform should output nan if the input is invalid
+    pt = PowerTransformer(method="box-cox", standardize=False)
+    pt.lambdas_ = [0.5]
+    X_inv = pt.inverse_transform([[-2.1]])
+    assert np.isnan(X_inv)
+
+
+def test_yeo_johnson_darwin_example():
+    # Test from the original paper "A new family of power transformations to
+    # improve normality or symmetry" by Yeo and Johnson.
+    X = [6.1, -8.4, 1.0, 2.0, 0.7, 2.9, 3.5, 5.1, 1.8, 3.6, 7.0, 3.0, 9.3, 7.5, -6.0]
+    X = np.array(X).reshape(-1, 1)
+    lmbda = PowerTransformer(method="yeo-johnson").fit(X).lambdas_
+    assert np.allclose(lmbda, 1.305, atol=1e-3)
+
+
+@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
+def test_power_transformer_nans(method):
+    # Make sure lambda estimation is not influenced by NaN values
+    # and that transform() supports NaN silently
+
+    X = np.abs(X_1col)
+    pt = PowerTransformer(method=method)
+    pt.fit(X)
+    lmbda_no_nans = pt.lambdas_[0]
+
+    # concat nans at the end and check lambda stays the same
+    X = np.concatenate([X, np.full_like(X, np.nan)])
+    X = shuffle(X, random_state=0)
+
+    pt.fit(X)
+    lmbda_nans = pt.lambdas_[0]
+
+    assert_almost_equal(lmbda_no_nans, lmbda_nans, decimal=5)
+
+    X_trans = pt.transform(X)
+    assert_array_equal(np.isnan(X_trans), np.isnan(X))
+
+
+@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
+@pytest.mark.parametrize("standardize", [True, False])
+def test_power_transformer_fit_transform(method, standardize):
+    # check that fit_transform() and fit().transform() return the same values
+    X = X_1col
+    if method == "box-cox":
+        X = np.abs(X)
+
+    pt = PowerTransformer(method, standardize=standardize)
+    assert_array_almost_equal(pt.fit(X).transform(X), pt.fit_transform(X))
+
+
+@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
+@pytest.mark.parametrize("standardize", [True, False])
+def test_power_transformer_copy_True(method, standardize):
+    # Check that neither fit, transform, fit_transform nor inverse_transform
+    # modify X inplace when copy=True
+    X = X_1col
+    if method == "box-cox":
+        X = np.abs(X)
+
+    X_original = X.copy()
+    assert X is not X_original  # sanity checks
+    assert_array_almost_equal(X, X_original)
+
+    pt = PowerTransformer(method, standardize=standardize, copy=True)
+
+    pt.fit(X)
+    assert_array_almost_equal(X, X_original)
+    X_trans = pt.transform(X)
+    assert X_trans is not X
+
+    X_trans = pt.fit_transform(X)
+    assert_array_almost_equal(X, X_original)
+    assert X_trans is not X
+
+    X_inv_trans = pt.inverse_transform(X_trans)
+    assert X_trans is not X_inv_trans
+
+
+@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
+@pytest.mark.parametrize("standardize", [True, False])
+def test_power_transformer_copy_False(method, standardize):
+    # check that when copy=False fit doesn't change X inplace, but transform,
+    # fit_transform and inverse_transform do.
+    X = X_1col
+    if method == "box-cox":
+        X = np.abs(X)
+
+    X_original = X.copy()
+    assert X is not X_original  # sanity checks
+    assert_array_almost_equal(X, X_original)
+
+    pt = PowerTransformer(method, standardize=standardize, copy=False)
+
+    pt.fit(X)
+    assert_array_almost_equal(X, X_original)  # fit didn't change X
+
+    X_trans = pt.transform(X)
+    assert X_trans is X
+
+    if method == "box-cox":
+        X = np.abs(X)
+    X_trans = pt.fit_transform(X)
+    assert X_trans is X
+
+    X_inv_trans = pt.inverse_transform(X_trans)
+    assert X_trans is X_inv_trans
+
+
+def test_power_transformer_box_cox_raise_all_nans_col():
+    """Check that box-cox raises an informative error when a column contains
+    all nans.
+
+    Non-regression test for gh-26303
+    """
+    X = rng.random_sample((4, 5))
+    X[:, 0] = np.nan
+
+    err_msg = "Column must not be all nan."
+
+    pt = PowerTransformer(method="box-cox")
+    with pytest.raises(ValueError, match=err_msg):
+        pt.fit_transform(X)
+
+
+@pytest.mark.parametrize(
+    "X_2",
+    [sparse.random(10, 1, density=0.8, random_state=0)]
+    + [
+        csr_container(np.full((10, 1), fill_value=np.nan))
+        for csr_container in CSR_CONTAINERS
+    ],
+)
+def test_standard_scaler_sparse_partial_fit_finite_variance(X_2):
+    # non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/16448
+    X_1 = sparse.random(5, 1, density=0.8)
+    scaler = StandardScaler(with_mean=False)
+    scaler.fit(X_1).partial_fit(X_2)
+    assert np.isfinite(scaler.var_[0])
+
+
+@pytest.mark.parametrize("feature_range", [(0, 1), (-10, 10)])
+def test_minmax_scaler_clip(feature_range):
+    # test behaviour of the parameter 'clip' in MinMaxScaler
+    X = iris.data
+    scaler = MinMaxScaler(feature_range=feature_range, clip=True).fit(X)
+    X_min, X_max = np.min(X, axis=0), np.max(X, axis=0)
+    X_test = [np.r_[X_min[:2] - 10, X_max[2:] + 10]]
+    X_transformed = scaler.transform(X_test)
+    assert_allclose(
+        X_transformed,
+        [[feature_range[0], feature_range[0], feature_range[1], feature_range[1]]],
+    )
+
+
+def test_standard_scaler_raise_error_for_1d_input():
+    """Check that `inverse_transform` from `StandardScaler` raises an error
+    with 1D array.
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/19518
+    """
+    scaler = StandardScaler().fit(X_2d)
+    err_msg = "Expected 2D array, got 1D array instead"
+    with pytest.raises(ValueError, match=err_msg):
+        scaler.inverse_transform(X_2d[:, 0])
+
+
+def test_power_transformer_significantly_non_gaussian():
+    """Check that significantly non-Gaussian data is transformed correctly.
+
+    For some explored lambdas, the transformed data may be constant and will
+    be rejected.
Non-regression test for
+    https://github.com/scikit-learn/scikit-learn/issues/14959
+    """
+
+    X_non_gaussian = 1e6 * np.array(
+        [0.6, 2.0, 3.0, 4.0] * 4 + [11, 12, 12, 16, 17, 20, 85, 90], dtype=np.float64
+    ).reshape(-1, 1)
+    pt = PowerTransformer()
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("error", RuntimeWarning)
+        X_trans = pt.fit_transform(X_non_gaussian)
+
+    assert not np.any(np.isnan(X_trans))
+    assert X_trans.mean() == pytest.approx(0.0)
+    assert X_trans.std() == pytest.approx(1.0)
+    assert X_trans.min() > -2
+    assert X_trans.max() < 2
+
+
+@pytest.mark.parametrize(
+    "Transformer",
+    [
+        MinMaxScaler,
+        MaxAbsScaler,
+        RobustScaler,
+        StandardScaler,
+        QuantileTransformer,
+        PowerTransformer,
+    ],
+)
+def test_one_to_one_features(Transformer):
+    """Check one-to-one transformers give correct feature names."""
+    tr = Transformer().fit(iris.data)
+    names_out = tr.get_feature_names_out(iris.feature_names)
+    assert_array_equal(names_out, iris.feature_names)
+
+
+@pytest.mark.parametrize(
+    "Transformer",
+    [
+        MinMaxScaler,
+        MaxAbsScaler,
+        RobustScaler,
+        StandardScaler,
+        QuantileTransformer,
+        PowerTransformer,
+        Normalizer,
+        Binarizer,
+    ],
+)
+def test_one_to_one_features_pandas(Transformer):
+    """Check one-to-one transformers give correct feature names with pandas."""
+    pd = pytest.importorskip("pandas")
+
+    df = pd.DataFrame(iris.data, columns=iris.feature_names)
+    tr = Transformer().fit(df)
+
+    names_out_df_default = tr.get_feature_names_out()
+    assert_array_equal(names_out_df_default, iris.feature_names)
+
+    names_out_df_valid_in = tr.get_feature_names_out(iris.feature_names)
+    assert_array_equal(names_out_df_valid_in, iris.feature_names)
+
+    msg = re.escape("input_features is not equal to feature_names_in_")
+    with pytest.raises(ValueError, match=msg):
+        invalid_names = list("abcd")
+        tr.get_feature_names_out(invalid_names)
+
+
+def test_kernel_centerer_feature_names_out():
+    """Check `get_feature_names_out` for `KernelCenterer`."""
+
+    rng = np.random.RandomState(0)
+    X = rng.random_sample((6, 4))
+    X_pairwise = linear_kernel(X)
+    centerer = KernelCenterer().fit(X_pairwise)
+
+    names_out = centerer.get_feature_names_out()
+    samples_out2 = X_pairwise.shape[1]
+    assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)])
+
+
+@pytest.mark.parametrize("standardize", [True, False])
+def test_power_transformer_constant_feature(standardize):
+    """Check that PowerTransformer leaves constant features unchanged."""
+    X = [[-2, 0, 2], [-2, 0, 2], [-2, 0, 2]]
+
+    pt = PowerTransformer(method="yeo-johnson", standardize=standardize).fit(X)
+
+    assert_allclose(pt.lambdas_, [1, 1, 1])
+
+    Xft = pt.fit_transform(X)
+    Xt = pt.transform(X)
+
+    for Xt_ in [Xft, Xt]:
+        if standardize:
+            assert_allclose(Xt_, np.zeros_like(X))
+        else:
+            assert_allclose(Xt_, X)
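+
+
+# Illustrative sketch (an addition, not part of the upstream scikit-learn test
+# suite): the yeo-johnson transform used above reduces to the identity at
+# lambda = 1, which is why the constant-feature test expects
+# lambdas_ == [1, 1, 1]. The helper `_yeo_johnson_reference` below is a
+# minimal reimplementation assuming the standard definition from
+# Yeo & Johnson (2000), for lmbda not in {0, 2}.
+def _yeo_johnson_reference(x, lmbda):
+    # x >= 0: ((x + 1) ** lmbda - 1) / lmbda                   (lmbda != 0)
+    # x <  0: -(((1 - x) ** (2 - lmbda)) - 1) / (2 - lmbda)    (lmbda != 2)
+    x = np.asarray(x, dtype=float)
+    out = np.empty_like(x)
+    pos = x >= 0
+    out[pos] = ((x[pos] + 1) ** lmbda - 1) / lmbda
+    out[~pos] = -(((1 - x[~pos]) ** (2 - lmbda)) - 1) / (2 - lmbda)
+    return out
+
+
+def test_yeo_johnson_reference_identity_at_lambda_one():
+    # At lambda = 1 both branches simplify to x, so the transform is the
+    # identity; cross-check against PowerTransformer with lambdas_ forced,
+    # mirroring the pattern used in test_power_transformer_lambda_one.
+    X = np.array([[-2.0], [-0.5], [0.0], [1.5], [3.0]])
+    assert_allclose(_yeo_johnson_reference(X, 1.0), X)
+
+    pt = PowerTransformer(method="yeo-johnson", standardize=False)
+    pt.lambdas_ = np.array([1.0])
+    assert_allclose(pt.transform(X), X)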