diff --git a/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_2/data-v1-dl-1666876.arff.gz b/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_2/data-v1-dl-1666876.arff.gz
new file mode 100644
index 0000000000000000000000000000000000000000..ee6e378589d722771363d186944ed1f0f78c9836
--- /dev/null
+++ b/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_2/data-v1-dl-1666876.arff.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfe8945b949770b0da42daf58ce67d1c5fee25cf7b4fd145161837c2abc09429
+size 1841
diff --git a/.venv/Lib/site-packages/sklearn/mixture/__init__.py b/.venv/Lib/site-packages/sklearn/mixture/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..af69fc1649307bba5a43fda5772eaa8c03f7aa1b
--- /dev/null
+++ b/.venv/Lib/site-packages/sklearn/mixture/__init__.py
@@ -0,0 +1,9 @@
+"""Mixture modeling algorithms."""
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+from ._bayesian_mixture import BayesianGaussianMixture
+from ._gaussian_mixture import GaussianMixture
+
+__all__ = ["GaussianMixture", "BayesianGaussianMixture"]
diff --git a/.venv/Lib/site-packages/sklearn/mixture/tests/__init__.py b/.venv/Lib/site-packages/sklearn/mixture/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/Lib/site-packages/sklearn/mixture/tests/test_bayesian_mixture.py b/.venv/Lib/site-packages/sklearn/mixture/tests/test_bayesian_mixture.py
new file mode 100644
index 0000000000000000000000000000000000000000..306c6a7fc2b16a7c9d6970d1fabd620b6f198f46
--- /dev/null
+++ b/.venv/Lib/site-packages/sklearn/mixture/tests/test_bayesian_mixture.py
@@ -0,0 +1,464 @@
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+import copy
+
+import numpy as np
+import pytest
+from scipy.special import gammaln
+
+from sklearn.exceptions import NotFittedError
+from sklearn.metrics.cluster import adjusted_rand_score
+from sklearn.mixture import BayesianGaussianMixture
+from sklearn.mixture._bayesian_mixture import _log_dirichlet_norm, _log_wishart_norm
+from sklearn.mixture.tests.test_gaussian_mixture import RandomData
+from sklearn.utils._testing import (
+    assert_almost_equal,
+    assert_array_equal,
+)
+
+COVARIANCE_TYPE = ["full", "tied", "diag", "spherical"]
+PRIOR_TYPE = ["dirichlet_process", "dirichlet_distribution"]
+
+
+def test_log_dirichlet_norm():
+    rng = np.random.RandomState(0)
+
+    weight_concentration = rng.rand(2)
+    expected_norm = gammaln(np.sum(weight_concentration)) - np.sum(
+        gammaln(weight_concentration)
+    )
+    predicted_norm = _log_dirichlet_norm(weight_concentration)
+
+    assert_almost_equal(expected_norm, predicted_norm)
+
+
+def test_log_wishart_norm():
+    rng = np.random.RandomState(0)
+
+    n_components, n_features = 5, 2
+    degrees_of_freedom = np.abs(rng.rand(n_components)) + 1.0
+    log_det_precisions_chol = n_features * np.log(range(2, 2 + n_components))
+
+    expected_norm = np.empty(5)
+    for k, (degrees_of_freedom_k, log_det_k) in enumerate(
+        zip(degrees_of_freedom, log_det_precisions_chol)
+    ):
+        expected_norm[k] = -(
+            degrees_of_freedom_k * (log_det_k + 0.5 * n_features * np.log(2.0))
+            + np.sum(
+                gammaln(
+                    0.5
+                    * (degrees_of_freedom_k - np.arange(0, n_features)[:, np.newaxis])
+                ),
+                0,
+            )
+        ).item()
+    predicted_norm = _log_wishart_norm(
+        degrees_of_freedom, log_det_precisions_chol, n_features
+    )
+
+    assert_almost_equal(expected_norm, predicted_norm)
+
+
+def test_bayesian_mixture_weights_prior_initialisation():
+    rng = np.random.RandomState(0)
+    n_samples, n_components, n_features = 10, 5, 2
+    X = rng.rand(n_samples, n_features)
+
+    # Check correct init for a given value of weight_concentration_prior
+    weight_concentration_prior = rng.rand()
+    bgmm = BayesianGaussianMixture(
+        weight_concentration_prior=weight_concentration_prior, random_state=rng
+    ).fit(X)
+    assert_almost_equal(weight_concentration_prior, bgmm.weight_concentration_prior_)
+
+    # Check correct init for the default value of weight_concentration_prior
+    bgmm = BayesianGaussianMixture(n_components=n_components, random_state=rng).fit(X)
+    assert_almost_equal(1.0 / n_components, bgmm.weight_concentration_prior_)
+
+
+def test_bayesian_mixture_mean_prior_initialisation():
+    rng = np.random.RandomState(0)
+    n_samples, n_components, n_features = 10, 3, 2
+    X = rng.rand(n_samples, n_features)
+
+    # Check correct init for a given value of mean_precision_prior
+    mean_precision_prior = rng.rand()
+    bgmm = BayesianGaussianMixture(
+        mean_precision_prior=mean_precision_prior, random_state=rng
+    ).fit(X)
+    assert_almost_equal(mean_precision_prior, bgmm.mean_precision_prior_)
+
+    # Check correct init for the default value of mean_precision_prior
+    bgmm = BayesianGaussianMixture(random_state=rng).fit(X)
+    assert_almost_equal(1.0, bgmm.mean_precision_prior_)
+
+    # Check correct init for a given value of mean_prior
+    mean_prior = rng.rand(n_features)
+    bgmm = BayesianGaussianMixture(
+        n_components=n_components, mean_prior=mean_prior, random_state=rng
+    ).fit(X)
+    assert_almost_equal(mean_prior, bgmm.mean_prior_)
+
+    # Check correct init for the default value of mean_prior
+    bgmm = BayesianGaussianMixture(n_components=n_components, random_state=rng).fit(X)
+    assert_almost_equal(X.mean(axis=0), bgmm.mean_prior_)
+
+
+def test_bayesian_mixture_precisions_prior_initialisation():
+    rng = np.random.RandomState(0)
+    n_samples, n_features = 10, 2
+    X = rng.rand(n_samples, n_features)
+
+    # Check the error message for a bad value of degrees_of_freedom_prior
+    bad_degrees_of_freedom_prior_ = n_features - 1.0
+    bgmm = BayesianGaussianMixture(
+        degrees_of_freedom_prior=bad_degrees_of_freedom_prior_, random_state=rng
+    )
+    msg = (
+        "The parameter 'degrees_of_freedom_prior' should be greater than"
+        f" {n_features -1}, but got {bad_degrees_of_freedom_prior_:.3f}."
+ ) + with pytest.raises(ValueError, match=msg): + bgmm.fit(X) + + # Check correct init for a given value of degrees_of_freedom_prior + degrees_of_freedom_prior = rng.rand() + n_features - 1.0 + bgmm = BayesianGaussianMixture( + degrees_of_freedom_prior=degrees_of_freedom_prior, random_state=rng + ).fit(X) + assert_almost_equal(degrees_of_freedom_prior, bgmm.degrees_of_freedom_prior_) + + # Check correct init for the default value of degrees_of_freedom_prior + degrees_of_freedom_prior_default = n_features + bgmm = BayesianGaussianMixture( + degrees_of_freedom_prior=degrees_of_freedom_prior_default, random_state=rng + ).fit(X) + assert_almost_equal( + degrees_of_freedom_prior_default, bgmm.degrees_of_freedom_prior_ + ) + + # Check correct init for a given value of covariance_prior + covariance_prior = { + "full": np.cov(X.T, bias=1) + 10, + "tied": np.cov(X.T, bias=1) + 5, + "diag": np.diag(np.atleast_2d(np.cov(X.T, bias=1))) + 3, + "spherical": rng.rand(), + } + + bgmm = BayesianGaussianMixture(random_state=rng) + for cov_type in ["full", "tied", "diag", "spherical"]: + bgmm.covariance_type = cov_type + bgmm.covariance_prior = covariance_prior[cov_type] + bgmm.fit(X) + assert_almost_equal(covariance_prior[cov_type], bgmm.covariance_prior_) + + # Check correct init for the default value of covariance_prior + covariance_prior_default = { + "full": np.atleast_2d(np.cov(X.T)), + "tied": np.atleast_2d(np.cov(X.T)), + "diag": np.var(X, axis=0, ddof=1), + "spherical": np.var(X, axis=0, ddof=1).mean(), + } + + bgmm = BayesianGaussianMixture(random_state=0) + for cov_type in ["full", "tied", "diag", "spherical"]: + bgmm.covariance_type = cov_type + bgmm.fit(X) + assert_almost_equal(covariance_prior_default[cov_type], bgmm.covariance_prior_) + + +def test_bayesian_mixture_check_is_fitted(): + rng = np.random.RandomState(0) + n_samples, n_features = 10, 2 + + # Check raise message + bgmm = BayesianGaussianMixture(random_state=rng) + X = rng.rand(n_samples, n_features) + + msg = "This BayesianGaussianMixture instance is not fitted yet." 
+    with pytest.raises(ValueError, match=msg):
+        bgmm.score(X)
+
+
+def test_bayesian_mixture_weights():
+    rng = np.random.RandomState(0)
+    n_samples, n_features = 10, 2
+
+    X = rng.rand(n_samples, n_features)
+
+    # Case Dirichlet distribution for the weight concentration prior type
+    bgmm = BayesianGaussianMixture(
+        weight_concentration_prior_type="dirichlet_distribution",
+        n_components=3,
+        random_state=rng,
+    ).fit(X)
+
+    expected_weights = bgmm.weight_concentration_ / np.sum(bgmm.weight_concentration_)
+    assert_almost_equal(expected_weights, bgmm.weights_)
+    assert_almost_equal(np.sum(bgmm.weights_), 1.0)
+
+    # Case Dirichlet process for the weight concentration prior type
+    dpgmm = BayesianGaussianMixture(
+        weight_concentration_prior_type="dirichlet_process",
+        n_components=3,
+        random_state=rng,
+    ).fit(X)
+    weight_dirichlet_sum = (
+        dpgmm.weight_concentration_[0] + dpgmm.weight_concentration_[1]
+    )
+    tmp = dpgmm.weight_concentration_[1] / weight_dirichlet_sum
+    expected_weights = (
+        dpgmm.weight_concentration_[0]
+        / weight_dirichlet_sum
+        * np.hstack((1, np.cumprod(tmp[:-1])))
+    )
+    expected_weights /= np.sum(expected_weights)
+    assert_almost_equal(expected_weights, dpgmm.weights_)
+    assert_almost_equal(np.sum(dpgmm.weights_), 1.0)
+
+
+@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
+def test_monotonic_likelihood():
+    # We check that each step of variational inference without regularization
+    # monotonically improves the lower bound on the training set
+    rng = np.random.RandomState(0)
+    rand_data = RandomData(rng, scale=20)
+    n_components = rand_data.n_components
+
+    for prior_type in PRIOR_TYPE:
+        for covar_type in COVARIANCE_TYPE:
+            X = rand_data.X[covar_type]
+            bgmm = BayesianGaussianMixture(
+                weight_concentration_prior_type=prior_type,
+                n_components=2 * n_components,
+                covariance_type=covar_type,
+                warm_start=True,
+                max_iter=1,
+                random_state=rng,
+                tol=1e-3,
+            )
+            current_lower_bound = -np.inf
+            # Do one training iteration at a time so we can make sure that the
+            # training log likelihood increases after each iteration.
+            for _ in range(600):
+                prev_lower_bound = current_lower_bound
+                current_lower_bound = bgmm.fit(X).lower_bound_
+                assert current_lower_bound >= prev_lower_bound
+
+                if bgmm.converged_:
+                    break
+            assert bgmm.converged_
+
+
+def test_compare_covar_type():
+    # We can compare the 'full' precision with the other cov_type if we apply
+    # 1 iter of the M-step (done during _initialize_parameters).
+ rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + X = rand_data.X["full"] + n_components = rand_data.n_components + + for prior_type in PRIOR_TYPE: + # Computation of the full_covariance + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type="full", + max_iter=1, + random_state=0, + tol=1e-7, + ) + bgmm._check_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + full_covariances = ( + bgmm.covariances_ * bgmm.degrees_of_freedom_[:, np.newaxis, np.newaxis] + ) + + # Check tied_covariance = mean(full_covariances, 0) + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type="tied", + max_iter=1, + random_state=0, + tol=1e-7, + ) + bgmm._check_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + + tied_covariance = bgmm.covariances_ * bgmm.degrees_of_freedom_ + assert_almost_equal(tied_covariance, np.mean(full_covariances, 0)) + + # Check diag_covariance = diag(full_covariances) + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type="diag", + max_iter=1, + random_state=0, + tol=1e-7, + ) + bgmm._check_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + + diag_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_[:, np.newaxis] + assert_almost_equal( + diag_covariances, np.array([np.diag(cov) for cov in full_covariances]) + ) + + # Check spherical_covariance = np.mean(diag_covariances, 0) + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type="spherical", + max_iter=1, + random_state=0, + tol=1e-7, + ) + bgmm._check_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + + spherical_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_ + assert_almost_equal(spherical_covariances, np.mean(diag_covariances, 1)) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_check_covariance_precision(): + # We check that the dot product of the covariance and the precision + # matrices is identity. 
+ rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + n_components, n_features = 2 * rand_data.n_components, 2 + + # Computation of the full_covariance + bgmm = BayesianGaussianMixture( + n_components=n_components, max_iter=100, random_state=rng, tol=1e-3, reg_covar=0 + ) + for covar_type in COVARIANCE_TYPE: + bgmm.covariance_type = covar_type + bgmm.fit(rand_data.X[covar_type]) + + if covar_type == "full": + for covar, precision in zip(bgmm.covariances_, bgmm.precisions_): + assert_almost_equal(np.dot(covar, precision), np.eye(n_features)) + elif covar_type == "tied": + assert_almost_equal( + np.dot(bgmm.covariances_, bgmm.precisions_), np.eye(n_features) + ) + + elif covar_type == "diag": + assert_almost_equal( + bgmm.covariances_ * bgmm.precisions_, + np.ones((n_components, n_features)), + ) + + else: + assert_almost_equal( + bgmm.covariances_ * bgmm.precisions_, np.ones(n_components) + ) + + +def test_invariant_translation(): + # We check here that adding a constant in the data change correctly the + # parameters of the mixture + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=100) + n_components = 2 * rand_data.n_components + + for prior_type in PRIOR_TYPE: + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + bgmm1 = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=n_components, + max_iter=100, + random_state=0, + tol=1e-3, + reg_covar=0, + ).fit(X) + bgmm2 = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=n_components, + max_iter=100, + random_state=0, + tol=1e-3, + reg_covar=0, + ).fit(X + 100) + + assert_almost_equal(bgmm1.means_, bgmm2.means_ - 100) + assert_almost_equal(bgmm1.weights_, bgmm2.weights_) + assert_almost_equal(bgmm1.covariances_, bgmm2.covariances_) + + +@pytest.mark.filterwarnings("ignore:.*did not converge.*") +@pytest.mark.parametrize( + "seed, max_iter, tol", + [ + (0, 2, 1e-7), # strict non-convergence + (1, 2, 1e-1), # loose non-convergence + (3, 300, 1e-7), # strict convergence + (4, 300, 1e-1), # loose convergence + ], +) +def test_bayesian_mixture_fit_predict(seed, max_iter, tol): + rng = np.random.RandomState(seed) + rand_data = RandomData(rng, n_samples=50, scale=7) + n_components = 2 * rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + bgmm1 = BayesianGaussianMixture( + n_components=n_components, + max_iter=max_iter, + random_state=rng, + tol=tol, + reg_covar=0, + ) + bgmm1.covariance_type = covar_type + bgmm2 = copy.deepcopy(bgmm1) + X = rand_data.X[covar_type] + + Y_pred1 = bgmm1.fit(X).predict(X) + Y_pred2 = bgmm2.fit_predict(X) + assert_array_equal(Y_pred1, Y_pred2) + + +def test_bayesian_mixture_fit_predict_n_init(): + # Check that fit_predict is equivalent to fit.predict, when n_init > 1 + X = np.random.RandomState(0).randn(50, 5) + gm = BayesianGaussianMixture(n_components=5, n_init=10, random_state=0) + y_pred1 = gm.fit_predict(X) + y_pred2 = gm.predict(X) + assert_array_equal(y_pred1, y_pred2) + + +def test_bayesian_mixture_predict_predict_proba(): + # this is the same test as test_gaussian_mixture_predict_predict_proba() + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + for prior_type in PRIOR_TYPE: + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + Y = rand_data.Y + bgmm = BayesianGaussianMixture( + n_components=rand_data.n_components, + random_state=rng, + weight_concentration_prior_type=prior_type, + covariance_type=covar_type, + ) + + # Check a warning message arrive 
if we don't do fit + msg = ( + "This BayesianGaussianMixture instance is not fitted yet. " + "Call 'fit' with appropriate arguments before using this " + "estimator." + ) + with pytest.raises(NotFittedError, match=msg): + bgmm.predict(X) + + bgmm.fit(X) + Y_pred = bgmm.predict(X) + Y_pred_proba = bgmm.predict_proba(X).argmax(axis=1) + assert_array_equal(Y_pred, Y_pred_proba) + assert adjusted_rand_score(Y, Y_pred) >= 0.95 diff --git a/.venv/Lib/site-packages/sklearn/mixture/tests/test_gaussian_mixture.py b/.venv/Lib/site-packages/sklearn/mixture/tests/test_gaussian_mixture.py new file mode 100644 index 0000000000000000000000000000000000000000..aa678797c82d75b3b5b0c036d4138c143f2d8325 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/mixture/tests/test_gaussian_mixture.py @@ -0,0 +1,1420 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import copy +import itertools +import re +import sys +import warnings +from io import StringIO +from unittest.mock import Mock + +import numpy as np +import pytest +from scipy import linalg, stats + +import sklearn +from sklearn.cluster import KMeans +from sklearn.covariance import EmpiricalCovariance +from sklearn.datasets import make_spd_matrix +from sklearn.exceptions import ConvergenceWarning, NotFittedError +from sklearn.metrics.cluster import adjusted_rand_score +from sklearn.mixture import GaussianMixture +from sklearn.mixture._gaussian_mixture import ( + _compute_log_det_cholesky, + _compute_precision_cholesky, + _estimate_gaussian_covariances_diag, + _estimate_gaussian_covariances_full, + _estimate_gaussian_covariances_spherical, + _estimate_gaussian_covariances_tied, + _estimate_gaussian_parameters, +) +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.extmath import fast_logdet + +COVARIANCE_TYPE = ["full", "tied", "diag", "spherical"] + + +def generate_data(n_samples, n_features, weights, means, precisions, covariance_type): + rng = np.random.RandomState(0) + + X = [] + if covariance_type == "spherical": + for _, (w, m, c) in enumerate(zip(weights, means, precisions["spherical"])): + X.append( + rng.multivariate_normal( + m, c * np.eye(n_features), int(np.round(w * n_samples)) + ) + ) + if covariance_type == "diag": + for _, (w, m, c) in enumerate(zip(weights, means, precisions["diag"])): + X.append( + rng.multivariate_normal(m, np.diag(c), int(np.round(w * n_samples))) + ) + if covariance_type == "tied": + for _, (w, m) in enumerate(zip(weights, means)): + X.append( + rng.multivariate_normal( + m, precisions["tied"], int(np.round(w * n_samples)) + ) + ) + if covariance_type == "full": + for _, (w, m, c) in enumerate(zip(weights, means, precisions["full"])): + X.append(rng.multivariate_normal(m, c, int(np.round(w * n_samples)))) + + X = np.vstack(X) + return X + + +class RandomData: + def __init__(self, rng, n_samples=200, n_components=2, n_features=2, scale=50): + self.n_samples = n_samples + self.n_components = n_components + self.n_features = n_features + + self.weights = rng.rand(n_components) + self.weights = self.weights / self.weights.sum() + self.means = rng.rand(n_components, n_features) * scale + self.covariances = { + "spherical": 0.5 + rng.rand(n_components), + "diag": (0.5 + rng.rand(n_components, n_features)) ** 2, + "tied": make_spd_matrix(n_features, random_state=rng), + "full": np.array( + [ + make_spd_matrix(n_features, random_state=rng) * 0.5 + for _ in range(n_components) + ] + ), + } + 
self.precisions = { + "spherical": 1.0 / self.covariances["spherical"], + "diag": 1.0 / self.covariances["diag"], + "tied": linalg.inv(self.covariances["tied"]), + "full": np.array( + [linalg.inv(covariance) for covariance in self.covariances["full"]] + ), + } + + self.X = dict( + zip( + COVARIANCE_TYPE, + [ + generate_data( + n_samples, + n_features, + self.weights, + self.means, + self.covariances, + covar_type, + ) + for covar_type in COVARIANCE_TYPE + ], + ) + ) + self.Y = np.hstack( + [ + np.full(int(np.round(w * n_samples)), k, dtype=int) + for k, w in enumerate(self.weights) + ] + ) + + +def test_gaussian_mixture_attributes(): + # test bad parameters + rng = np.random.RandomState(0) + X = rng.rand(10, 2) + + # test good parameters + n_components, tol, n_init, max_iter, reg_covar = 2, 1e-4, 3, 30, 1e-1 + covariance_type, init_params = "full", "random" + gmm = GaussianMixture( + n_components=n_components, + tol=tol, + n_init=n_init, + max_iter=max_iter, + reg_covar=reg_covar, + covariance_type=covariance_type, + init_params=init_params, + ).fit(X) + + assert gmm.n_components == n_components + assert gmm.covariance_type == covariance_type + assert gmm.tol == tol + assert gmm.reg_covar == reg_covar + assert gmm.max_iter == max_iter + assert gmm.n_init == n_init + assert gmm.init_params == init_params + + +def test_check_weights(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + + n_components = rand_data.n_components + X = rand_data.X["full"] + + g = GaussianMixture(n_components=n_components) + + # Check bad shape + weights_bad_shape = rng.rand(n_components, 1) + g.weights_init = weights_bad_shape + msg = re.escape( + "The parameter 'weights' should have the shape of " + f"({n_components},), but got {str(weights_bad_shape.shape)}" + ) + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check bad range + weights_bad_range = rng.rand(n_components) + 1 + g.weights_init = weights_bad_range + msg = re.escape( + "The parameter 'weights' should be in the range [0, 1], but got" + f" max value {np.min(weights_bad_range):.5f}, " + f"min value {np.max(weights_bad_range):.5f}" + ) + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check bad normalization + weights_bad_norm = rng.rand(n_components) + weights_bad_norm = weights_bad_norm / (weights_bad_norm.sum() + 1) + g.weights_init = weights_bad_norm + msg = re.escape( + "The parameter 'weights' should be normalized, " + f"but got sum(weights) = {np.sum(weights_bad_norm):.5f}" + ) + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check good weights matrix + weights = rand_data.weights + g = GaussianMixture(weights_init=weights, n_components=n_components) + g.fit(X) + assert_array_equal(weights, g.weights_init) + + +def test_check_means(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + + n_components, n_features = rand_data.n_components, rand_data.n_features + X = rand_data.X["full"] + + g = GaussianMixture(n_components=n_components) + + # Check means bad shape + means_bad_shape = rng.rand(n_components + 1, n_features) + g.means_init = means_bad_shape + msg = "The parameter 'means' should have the shape of " + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check good means matrix + means = rand_data.means + g.means_init = means + g.fit(X) + assert_array_equal(means, g.means_init) + + +def test_check_precisions(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + + n_components, n_features = rand_data.n_components, rand_data.n_features + + # Define the bad 
precisions for each covariance_type + precisions_bad_shape = { + "full": np.ones((n_components + 1, n_features, n_features)), + "tied": np.ones((n_features + 1, n_features + 1)), + "diag": np.ones((n_components + 1, n_features)), + "spherical": np.ones((n_components + 1)), + } + + # Define not positive-definite precisions + precisions_not_pos = np.ones((n_components, n_features, n_features)) + precisions_not_pos[0] = np.eye(n_features) + precisions_not_pos[0, 0, 0] = -1.0 + + precisions_not_positive = { + "full": precisions_not_pos, + "tied": precisions_not_pos[0], + "diag": np.full((n_components, n_features), -1.0), + "spherical": np.full(n_components, -1.0), + } + + not_positive_errors = { + "full": "symmetric, positive-definite", + "tied": "symmetric, positive-definite", + "diag": "positive", + "spherical": "positive", + } + + for covar_type in COVARIANCE_TYPE: + X = RandomData(rng).X[covar_type] + g = GaussianMixture( + n_components=n_components, covariance_type=covar_type, random_state=rng + ) + + # Check precisions with bad shapes + g.precisions_init = precisions_bad_shape[covar_type] + msg = f"The parameter '{covar_type} precision' should have the shape of" + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check not positive precisions + g.precisions_init = precisions_not_positive[covar_type] + msg = f"'{covar_type} precision' should be {not_positive_errors[covar_type]}" + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check the correct init of precisions_init + g.precisions_init = rand_data.precisions[covar_type] + g.fit(X) + assert_array_equal(rand_data.precisions[covar_type], g.precisions_init) + + +def test_suffstat_sk_full(): + # compare the precision matrix compute from the + # EmpiricalCovariance.covariance fitted on X*sqrt(resp) + # with _sufficient_sk_full, n_components=1 + rng = np.random.RandomState(0) + n_samples, n_features = 500, 2 + + # special case 1, assuming data is "centered" + X = rng.rand(n_samples, n_features) + resp = rng.rand(n_samples, 1) + X_resp = np.sqrt(resp) * X + nk = np.array([n_samples]) + xk = np.zeros((1, n_features)) + covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) + ecov = EmpiricalCovariance(assume_centered=True) + ecov.fit(X_resp) + assert_almost_equal(ecov.error_norm(covars_pred[0], norm="frobenius"), 0) + assert_almost_equal(ecov.error_norm(covars_pred[0], norm="spectral"), 0) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred, "full") + precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred]) + precs_est = np.array([linalg.inv(cov) for cov in covars_pred]) + assert_array_almost_equal(precs_est, precs_pred) + + # special case 2, assuming resp are all ones + resp = np.ones((n_samples, 1)) + nk = np.array([n_samples]) + xk = X.mean(axis=0).reshape((1, -1)) + covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) + ecov = EmpiricalCovariance(assume_centered=False) + ecov.fit(X) + assert_almost_equal(ecov.error_norm(covars_pred[0], norm="frobenius"), 0) + assert_almost_equal(ecov.error_norm(covars_pred[0], norm="spectral"), 0) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred, "full") + precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred]) + precs_est = np.array([linalg.inv(cov) for cov in covars_pred]) + assert_array_almost_equal(precs_est, precs_pred) + + +def test_suffstat_sk_tied(): + # use equation Nk * Sk / N = S_tied + rng = 
np.random.RandomState(0) + n_samples, n_features, n_components = 500, 2, 2 + + resp = rng.rand(n_samples, n_components) + resp = resp / resp.sum(axis=1)[:, np.newaxis] + X = rng.rand(n_samples, n_features) + nk = resp.sum(axis=0) + xk = np.dot(resp.T, X) / nk[:, np.newaxis] + + covars_pred_full = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) + covars_pred_full = ( + np.sum(nk[:, np.newaxis, np.newaxis] * covars_pred_full, 0) / n_samples + ) + + covars_pred_tied = _estimate_gaussian_covariances_tied(resp, X, nk, xk, 0) + + ecov = EmpiricalCovariance() + ecov.covariance_ = covars_pred_full + assert_almost_equal(ecov.error_norm(covars_pred_tied, norm="frobenius"), 0) + assert_almost_equal(ecov.error_norm(covars_pred_tied, norm="spectral"), 0) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred_tied, "tied") + precs_pred = np.dot(precs_chol_pred, precs_chol_pred.T) + precs_est = linalg.inv(covars_pred_tied) + assert_array_almost_equal(precs_est, precs_pred) + + +def test_suffstat_sk_diag(): + # test against 'full' case + rng = np.random.RandomState(0) + n_samples, n_features, n_components = 500, 2, 2 + + resp = rng.rand(n_samples, n_components) + resp = resp / resp.sum(axis=1)[:, np.newaxis] + X = rng.rand(n_samples, n_features) + nk = resp.sum(axis=0) + xk = np.dot(resp.T, X) / nk[:, np.newaxis] + covars_pred_full = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) + covars_pred_diag = _estimate_gaussian_covariances_diag(resp, X, nk, xk, 0) + + ecov = EmpiricalCovariance() + for cov_full, cov_diag in zip(covars_pred_full, covars_pred_diag): + ecov.covariance_ = np.diag(np.diag(cov_full)) + cov_diag = np.diag(cov_diag) + assert_almost_equal(ecov.error_norm(cov_diag, norm="frobenius"), 0) + assert_almost_equal(ecov.error_norm(cov_diag, norm="spectral"), 0) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred_diag, "diag") + assert_almost_equal(covars_pred_diag, 1.0 / precs_chol_pred**2) + + +def test_gaussian_suffstat_sk_spherical(): + # computing spherical covariance equals to the variance of one-dimension + # data after flattening, n_components=1 + rng = np.random.RandomState(0) + n_samples, n_features = 500, 2 + + X = rng.rand(n_samples, n_features) + X = X - X.mean() + resp = np.ones((n_samples, 1)) + nk = np.array([n_samples]) + xk = X.mean() + covars_pred_spherical = _estimate_gaussian_covariances_spherical(resp, X, nk, xk, 0) + covars_pred_spherical2 = np.dot(X.flatten().T, X.flatten()) / ( + n_features * n_samples + ) + assert_almost_equal(covars_pred_spherical, covars_pred_spherical2) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred_spherical, "spherical") + assert_almost_equal(covars_pred_spherical, 1.0 / precs_chol_pred**2) + + +def test_compute_log_det_cholesky(): + n_features = 2 + rand_data = RandomData(np.random.RandomState(0)) + + for covar_type in COVARIANCE_TYPE: + covariance = rand_data.covariances[covar_type] + + if covar_type == "full": + predected_det = np.array([linalg.det(cov) for cov in covariance]) + elif covar_type == "tied": + predected_det = linalg.det(covariance) + elif covar_type == "diag": + predected_det = np.array([np.prod(cov) for cov in covariance]) + elif covar_type == "spherical": + predected_det = covariance**n_features + + # We compute the cholesky decomposition of the covariance matrix + expected_det = _compute_log_det_cholesky( + _compute_precision_cholesky(covariance, covar_type), + covar_type, + 
n_features=n_features, + ) + assert_array_almost_equal(expected_det, -0.5 * np.log(predected_det)) + + +def _naive_lmvnpdf_diag(X, means, covars): + resp = np.empty((len(X), len(means))) + stds = np.sqrt(covars) + for i, (mean, std) in enumerate(zip(means, stds)): + resp[:, i] = stats.norm.logpdf(X, mean, std).sum(axis=1) + return resp + + +def test_gaussian_mixture_log_probabilities(): + from sklearn.mixture._gaussian_mixture import _estimate_log_gaussian_prob + + # test against with _naive_lmvnpdf_diag + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_samples = 500 + n_features = rand_data.n_features + n_components = rand_data.n_components + + means = rand_data.means + covars_diag = rng.rand(n_components, n_features) + X = rng.rand(n_samples, n_features) + log_prob_naive = _naive_lmvnpdf_diag(X, means, covars_diag) + + # full covariances + precs_full = np.array([np.diag(1.0 / np.sqrt(x)) for x in covars_diag]) + + log_prob = _estimate_log_gaussian_prob(X, means, precs_full, "full") + assert_array_almost_equal(log_prob, log_prob_naive) + + # diag covariances + precs_chol_diag = 1.0 / np.sqrt(covars_diag) + log_prob = _estimate_log_gaussian_prob(X, means, precs_chol_diag, "diag") + assert_array_almost_equal(log_prob, log_prob_naive) + + # tied + covars_tied = np.array([x for x in covars_diag]).mean(axis=0) + precs_tied = np.diag(np.sqrt(1.0 / covars_tied)) + + log_prob_naive = _naive_lmvnpdf_diag(X, means, [covars_tied] * n_components) + log_prob = _estimate_log_gaussian_prob(X, means, precs_tied, "tied") + + assert_array_almost_equal(log_prob, log_prob_naive) + + # spherical + covars_spherical = covars_diag.mean(axis=1) + precs_spherical = 1.0 / np.sqrt(covars_diag.mean(axis=1)) + log_prob_naive = _naive_lmvnpdf_diag( + X, means, [[k] * n_features for k in covars_spherical] + ) + log_prob = _estimate_log_gaussian_prob(X, means, precs_spherical, "spherical") + assert_array_almost_equal(log_prob, log_prob_naive) + + +# skip tests on weighted_log_probabilities, log_weights + + +def test_gaussian_mixture_estimate_log_prob_resp(): + # test whether responsibilities are normalized + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=5) + n_samples = rand_data.n_samples + n_features = rand_data.n_features + n_components = rand_data.n_components + + X = rng.rand(n_samples, n_features) + for covar_type in COVARIANCE_TYPE: + weights = rand_data.weights + means = rand_data.means + precisions = rand_data.precisions[covar_type] + g = GaussianMixture( + n_components=n_components, + random_state=rng, + weights_init=weights, + means_init=means, + precisions_init=precisions, + covariance_type=covar_type, + ) + g.fit(X) + resp = g.predict_proba(X) + assert_array_almost_equal(resp.sum(axis=1), np.ones(n_samples)) + assert_array_equal(g.weights_init, weights) + assert_array_equal(g.means_init, means) + assert_array_equal(g.precisions_init, precisions) + + +def test_gaussian_mixture_predict_predict_proba(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + Y = rand_data.Y + g = GaussianMixture( + n_components=rand_data.n_components, + random_state=rng, + weights_init=rand_data.weights, + means_init=rand_data.means, + precisions_init=rand_data.precisions[covar_type], + covariance_type=covar_type, + ) + + # Check a warning message arrive if we don't do fit + msg = ( + "This GaussianMixture instance is not fitted yet. Call 'fit' " + "with appropriate arguments before using this estimator." 
+ ) + with pytest.raises(NotFittedError, match=msg): + g.predict(X) + + g.fit(X) + Y_pred = g.predict(X) + Y_pred_proba = g.predict_proba(X).argmax(axis=1) + assert_array_equal(Y_pred, Y_pred_proba) + assert adjusted_rand_score(Y, Y_pred) > 0.95 + + +@pytest.mark.filterwarnings("ignore:.*did not converge.*") +@pytest.mark.parametrize( + "seed, max_iter, tol", + [ + (0, 2, 1e-7), # strict non-convergence + (1, 2, 1e-1), # loose non-convergence + (3, 300, 1e-7), # strict convergence + (4, 300, 1e-1), # loose convergence + ], +) +def test_gaussian_mixture_fit_predict(seed, max_iter, tol): + rng = np.random.RandomState(seed) + rand_data = RandomData(rng) + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + Y = rand_data.Y + g = GaussianMixture( + n_components=rand_data.n_components, + random_state=rng, + weights_init=rand_data.weights, + means_init=rand_data.means, + precisions_init=rand_data.precisions[covar_type], + covariance_type=covar_type, + max_iter=max_iter, + tol=tol, + ) + + # check if fit_predict(X) is equivalent to fit(X).predict(X) + f = copy.deepcopy(g) + Y_pred1 = f.fit(X).predict(X) + Y_pred2 = g.fit_predict(X) + assert_array_equal(Y_pred1, Y_pred2) + assert adjusted_rand_score(Y, Y_pred2) > 0.95 + + +def test_gaussian_mixture_fit_predict_n_init(): + # Check that fit_predict is equivalent to fit.predict, when n_init > 1 + X = np.random.RandomState(0).randn(1000, 5) + gm = GaussianMixture(n_components=5, n_init=5, random_state=0) + y_pred1 = gm.fit_predict(X) + y_pred2 = gm.predict(X) + assert_array_equal(y_pred1, y_pred2) + + +def test_gaussian_mixture_fit(): + # recover the ground truth + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_features = rand_data.n_features + n_components = rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + g = GaussianMixture( + n_components=n_components, + n_init=20, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + g.fit(X) + + # needs more data to pass the test with rtol=1e-7 + assert_allclose( + np.sort(g.weights_), np.sort(rand_data.weights), rtol=0.1, atol=1e-2 + ) + + arg_idx1 = g.means_[:, 0].argsort() + arg_idx2 = rand_data.means[:, 0].argsort() + assert_allclose( + g.means_[arg_idx1], rand_data.means[arg_idx2], rtol=0.1, atol=1e-2 + ) + + if covar_type == "full": + prec_pred = g.precisions_ + prec_test = rand_data.precisions["full"] + elif covar_type == "tied": + prec_pred = np.array([g.precisions_] * n_components) + prec_test = np.array([rand_data.precisions["tied"]] * n_components) + elif covar_type == "spherical": + prec_pred = np.array([np.eye(n_features) * c for c in g.precisions_]) + prec_test = np.array( + [np.eye(n_features) * c for c in rand_data.precisions["spherical"]] + ) + elif covar_type == "diag": + prec_pred = np.array([np.diag(d) for d in g.precisions_]) + prec_test = np.array([np.diag(d) for d in rand_data.precisions["diag"]]) + + arg_idx1 = np.trace(prec_pred, axis1=1, axis2=2).argsort() + arg_idx2 = np.trace(prec_test, axis1=1, axis2=2).argsort() + for k, h in zip(arg_idx1, arg_idx2): + ecov = EmpiricalCovariance() + ecov.covariance_ = prec_test[h] + # the accuracy depends on the number of data and randomness, rng + assert_allclose(ecov.error_norm(prec_pred[k]), 0, atol=0.15) + + +def test_gaussian_mixture_fit_best_params(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_components = rand_data.n_components + n_init = 10 + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + g = 
GaussianMixture(
+            n_components=n_components,
+            n_init=1,
+            reg_covar=0,
+            random_state=rng,
+            covariance_type=covar_type,
+        )
+        ll = []
+        for _ in range(n_init):
+            g.fit(X)
+            ll.append(g.score(X))
+        ll = np.array(ll)
+        g_best = GaussianMixture(
+            n_components=n_components,
+            n_init=n_init,
+            reg_covar=0,
+            random_state=rng,
+            covariance_type=covar_type,
+        )
+        g_best.fit(X)
+        assert_almost_equal(ll.min(), g_best.score(X))
+
+
+def test_gaussian_mixture_fit_convergence_warning():
+    rng = np.random.RandomState(0)
+    rand_data = RandomData(rng, scale=1)
+    n_components = rand_data.n_components
+    max_iter = 1
+    for covar_type in COVARIANCE_TYPE:
+        X = rand_data.X[covar_type]
+        g = GaussianMixture(
+            n_components=n_components,
+            n_init=1,
+            max_iter=max_iter,
+            reg_covar=0,
+            random_state=rng,
+            covariance_type=covar_type,
+        )
+        msg = (
+            "Best performing initialization did not converge. "
+            "Try different init parameters, or increase max_iter, "
+            "tol, or check for degenerate data."
+        )
+        with pytest.warns(ConvergenceWarning, match=msg):
+            g.fit(X)
+
+
+def test_multiple_init():
+    # Test that multiple inits do not perform much worse than a single one
+    rng = np.random.RandomState(0)
+    n_samples, n_features, n_components = 50, 5, 2
+    X = rng.randn(n_samples, n_features)
+    for cv_type in COVARIANCE_TYPE:
+        train1 = (
+            GaussianMixture(
+                n_components=n_components, covariance_type=cv_type, random_state=0
+            )
+            .fit(X)
+            .score(X)
+        )
+        train2 = (
+            GaussianMixture(
+                n_components=n_components,
+                covariance_type=cv_type,
+                random_state=0,
+                n_init=5,
+            )
+            .fit(X)
+            .score(X)
+        )
+        assert train2 >= train1
+
+
+def test_gaussian_mixture_n_parameters():
+    # Test that the right number of parameters is estimated
+    rng = np.random.RandomState(0)
+    n_samples, n_features, n_components = 50, 5, 2
+    X = rng.randn(n_samples, n_features)
+    n_params = {"spherical": 13, "diag": 21, "tied": 26, "full": 41}
+    for cv_type in COVARIANCE_TYPE:
+        g = GaussianMixture(
+            n_components=n_components, covariance_type=cv_type, random_state=rng
+        ).fit(X)
+        assert g._n_parameters() == n_params[cv_type]
+
+
+def test_bic_1d_1component():
+    # Test all of the covariance_types return the same BIC score for
+    # 1-dimensional, 1 component fits.
+ rng = np.random.RandomState(0) + n_samples, n_dim, n_components = 100, 1, 1 + X = rng.randn(n_samples, n_dim) + bic_full = ( + GaussianMixture( + n_components=n_components, covariance_type="full", random_state=rng + ) + .fit(X) + .bic(X) + ) + for covariance_type in ["tied", "diag", "spherical"]: + bic = ( + GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + random_state=rng, + ) + .fit(X) + .bic(X) + ) + assert_almost_equal(bic_full, bic) + + +def test_gaussian_mixture_aic_bic(): + # Test the aic and bic criteria + rng = np.random.RandomState(0) + n_samples, n_features, n_components = 50, 3, 2 + X = rng.randn(n_samples, n_features) + # standard gaussian entropy + sgh = 0.5 * ( + fast_logdet(np.cov(X.T, bias=1)) + n_features * (1 + np.log(2 * np.pi)) + ) + for cv_type in COVARIANCE_TYPE: + g = GaussianMixture( + n_components=n_components, + covariance_type=cv_type, + random_state=rng, + max_iter=200, + ) + g.fit(X) + aic = 2 * n_samples * sgh + 2 * g._n_parameters() + bic = 2 * n_samples * sgh + np.log(n_samples) * g._n_parameters() + bound = n_features / np.sqrt(n_samples) + assert (g.aic(X) - aic) / n_samples < bound + assert (g.bic(X) - bic) / n_samples < bound + + +def test_gaussian_mixture_verbose(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_components = rand_data.n_components + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + g = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + verbose=1, + ) + h = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + verbose=2, + ) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + g.fit(X) + h.fit(X) + finally: + sys.stdout = old_stdout + + +@pytest.mark.filterwarnings("ignore:.*did not converge.*") +@pytest.mark.parametrize("seed", (0, 1, 2)) +def test_warm_start(seed): + random_state = seed + rng = np.random.RandomState(random_state) + n_samples, n_features, n_components = 500, 2, 2 + X = rng.rand(n_samples, n_features) + + # Assert the warm_start give the same result for the same number of iter + g = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=2, + reg_covar=0, + random_state=random_state, + warm_start=False, + ) + h = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=1, + reg_covar=0, + random_state=random_state, + warm_start=True, + ) + + g.fit(X) + score1 = h.fit(X).score(X) + score2 = h.fit(X).score(X) + + assert_almost_equal(g.weights_, h.weights_) + assert_almost_equal(g.means_, h.means_) + assert_almost_equal(g.precisions_, h.precisions_) + assert score2 > score1 + + # Assert that by using warm_start we can converge to a good solution + g = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=5, + reg_covar=0, + random_state=random_state, + warm_start=False, + tol=1e-6, + ) + h = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=5, + reg_covar=0, + random_state=random_state, + warm_start=True, + tol=1e-6, + ) + + g.fit(X) + assert not g.converged_ + + h.fit(X) + # depending on the data there is large variability in the number of + # refit necessary to converge due to the complete randomness of the + # data + for _ in range(1000): + h.fit(X) + if h.converged_: + break + assert h.converged_ + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_convergence_detected_with_warm_start(): + # We check 
that convergence is detected when warm_start=True + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_components = rand_data.n_components + X = rand_data.X["full"] + + for max_iter in (1, 2, 50): + gmm = GaussianMixture( + n_components=n_components, + warm_start=True, + max_iter=max_iter, + random_state=rng, + ) + for _ in range(100): + gmm.fit(X) + if gmm.converged_: + break + assert gmm.converged_ + assert max_iter >= gmm.n_iter_ + + +def test_score(): + covar_type = "full" + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + n_components = rand_data.n_components + X = rand_data.X[covar_type] + + # Check the error message if we don't call fit + gmm1 = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + msg = ( + "This GaussianMixture instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this estimator." + ) + with pytest.raises(NotFittedError, match=msg): + gmm1.score(X) + + # Check score value + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + gmm1.fit(X) + gmm_score = gmm1.score(X) + gmm_score_proba = gmm1.score_samples(X).mean() + assert_almost_equal(gmm_score, gmm_score_proba) + + # Check if the score increase + gmm2 = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ).fit(X) + assert gmm2.score(X) > gmm1.score(X) + + +def test_score_samples(): + covar_type = "full" + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + n_components = rand_data.n_components + X = rand_data.X[covar_type] + + # Check the error message if we don't call fit + gmm = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + msg = ( + "This GaussianMixture instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this estimator." + ) + with pytest.raises(NotFittedError, match=msg): + gmm.score_samples(X) + + gmm_score_samples = gmm.fit(X).score_samples(X) + assert gmm_score_samples.shape[0] == rand_data.n_samples + + +def test_monotonic_likelihood(): + # We check that each step of the EM without regularization improve + # monotonically the training set likelihood + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + n_components = rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + gmm = GaussianMixture( + n_components=n_components, + covariance_type=covar_type, + reg_covar=0, + warm_start=True, + max_iter=1, + random_state=rng, + tol=1e-7, + ) + current_log_likelihood = -np.inf + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + # Do one training iteration at a time so we can make sure that the + # training log likelihood increases after each iteration. + for _ in range(600): + prev_log_likelihood = current_log_likelihood + current_log_likelihood = gmm.fit(X).score(X) + assert current_log_likelihood >= prev_log_likelihood + + if gmm.converged_: + break + + assert gmm.converged_ + + +def test_regularisation(): + # We train the GaussianMixture on degenerate data by defining two clusters + # of a 0 covariance. 
+ rng = np.random.RandomState(0) + n_samples, n_features = 10, 5 + + X = np.vstack( + (np.ones((n_samples // 2, n_features)), np.zeros((n_samples // 2, n_features))) + ) + + for covar_type in COVARIANCE_TYPE: + gmm = GaussianMixture( + n_components=n_samples, + reg_covar=0, + covariance_type=covar_type, + random_state=rng, + ) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + msg = re.escape( + "Fitting the mixture model failed because some components have" + " ill-defined empirical covariance (for instance caused by " + "singleton or collapsed samples). Try to decrease the number " + "of components, or increase reg_covar." + ) + with pytest.raises(ValueError, match=msg): + gmm.fit(X) + + gmm.set_params(reg_covar=1e-6).fit(X) + + +def test_property(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + n_components = rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + gmm = GaussianMixture( + n_components=n_components, + covariance_type=covar_type, + random_state=rng, + n_init=5, + ) + gmm.fit(X) + if covar_type == "full": + for prec, covar in zip(gmm.precisions_, gmm.covariances_): + assert_array_almost_equal(linalg.inv(prec), covar) + elif covar_type == "tied": + assert_array_almost_equal(linalg.inv(gmm.precisions_), gmm.covariances_) + else: + assert_array_almost_equal(gmm.precisions_, 1.0 / gmm.covariances_) + + +def test_sample(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7, n_components=3) + n_features, n_components = rand_data.n_features, rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + + gmm = GaussianMixture( + n_components=n_components, covariance_type=covar_type, random_state=rng + ) + # To sample we need that GaussianMixture is fitted + msg = "This GaussianMixture instance is not fitted" + with pytest.raises(NotFittedError, match=msg): + gmm.sample(0) + gmm.fit(X) + + msg = "Invalid value for 'n_samples'" + with pytest.raises(ValueError, match=msg): + gmm.sample(0) + + # Just to make sure the class samples correctly + n_samples = 20000 + X_s, y_s = gmm.sample(n_samples) + + for k in range(n_components): + if covar_type == "full": + assert_array_almost_equal( + gmm.covariances_[k], np.cov(X_s[y_s == k].T), decimal=1 + ) + elif covar_type == "tied": + assert_array_almost_equal( + gmm.covariances_, np.cov(X_s[y_s == k].T), decimal=1 + ) + elif covar_type == "diag": + assert_array_almost_equal( + gmm.covariances_[k], np.diag(np.cov(X_s[y_s == k].T)), decimal=1 + ) + else: + assert_array_almost_equal( + gmm.covariances_[k], + np.var(X_s[y_s == k] - gmm.means_[k]), + decimal=1, + ) + + means_s = np.array([np.mean(X_s[y_s == k], 0) for k in range(n_components)]) + assert_array_almost_equal(gmm.means_, means_s, decimal=1) + + # Check shapes of sampled data, see + # https://github.com/scikit-learn/scikit-learn/issues/7701 + assert X_s.shape == (n_samples, n_features) + + for sample_size in range(1, 100): + X_s, _ = gmm.sample(sample_size) + assert X_s.shape == (sample_size, n_features) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_init(): + # We check that by increasing the n_init number we have a better solution + for random_state in range(15): + rand_data = RandomData( + np.random.RandomState(random_state), n_samples=50, scale=1 + ) + n_components = rand_data.n_components + X = rand_data.X["full"] + + gmm1 = GaussianMixture( + n_components=n_components, n_init=1, 
max_iter=1, random_state=random_state + ).fit(X) + gmm2 = GaussianMixture( + n_components=n_components, n_init=10, max_iter=1, random_state=random_state + ).fit(X) + + assert gmm2.lower_bound_ >= gmm1.lower_bound_ + + +def test_gaussian_mixture_setting_best_params(): + """`GaussianMixture`'s best_parameters, `n_iter_` and `lower_bound_` + must be set appropriately in the case of divergence. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/18216 + """ + rnd = np.random.RandomState(0) + n_samples = 30 + X = rnd.uniform(size=(n_samples, 3)) + + # following initialization parameters were found to lead to divergence + means_init = np.array( + [ + [0.670637869618158, 0.21038256107384043, 0.12892629765485303], + [0.09394051075844147, 0.5759464955561779, 0.929296197576212], + [0.5033230372781258, 0.9569852381759425, 0.08654043447295741], + [0.18578301420435747, 0.5531158970919143, 0.19388943970532435], + [0.4548589928173794, 0.35182513658825276, 0.568146063202464], + [0.609279894978321, 0.7929063819678847, 0.9620097270828052], + ] + ) + precisions_init = np.array( + [ + 999999.999604483, + 999999.9990869573, + 553.7603944542167, + 204.78596008931834, + 15.867423501783637, + 85.4595728389735, + ] + ) + weights_init = [ + 0.03333333333333341, + 0.03333333333333341, + 0.06666666666666674, + 0.06666666666666674, + 0.7000000000000001, + 0.10000000000000007, + ] + + gmm = GaussianMixture( + covariance_type="spherical", + reg_covar=0, + means_init=means_init, + weights_init=weights_init, + random_state=rnd, + n_components=len(weights_init), + precisions_init=precisions_init, + max_iter=1, + ) + # ensure that no error is thrown during fit + gmm.fit(X) + + # check that the fit did not converge + assert not gmm.converged_ + + # check that parameters are set for gmm + for attr in [ + "weights_", + "means_", + "covariances_", + "precisions_cholesky_", + "n_iter_", + "lower_bound_", + ]: + assert hasattr(gmm, attr) + + +@pytest.mark.parametrize( + "init_params", ["random", "random_from_data", "k-means++", "kmeans"] +) +def test_init_means_not_duplicated(init_params, global_random_seed): + # Check that all initialisations provide not duplicated starting means + rng = np.random.RandomState(global_random_seed) + rand_data = RandomData(rng, scale=5) + n_components = rand_data.n_components + X = rand_data.X["full"] + + gmm = GaussianMixture( + n_components=n_components, init_params=init_params, random_state=rng, max_iter=0 + ) + gmm.fit(X) + + means = gmm.means_ + for i_mean, j_mean in itertools.combinations(means, r=2): + assert not np.allclose(i_mean, j_mean) + + +@pytest.mark.parametrize( + "init_params", ["random", "random_from_data", "k-means++", "kmeans"] +) +def test_means_for_all_inits(init_params, global_random_seed): + # Check fitted means properties for all initializations + rng = np.random.RandomState(global_random_seed) + rand_data = RandomData(rng, scale=5) + n_components = rand_data.n_components + X = rand_data.X["full"] + + gmm = GaussianMixture( + n_components=n_components, init_params=init_params, random_state=rng + ) + gmm.fit(X) + + assert gmm.means_.shape == (n_components, X.shape[1]) + assert np.all(X.min(axis=0) <= gmm.means_) + assert np.all(gmm.means_ <= X.max(axis=0)) + assert gmm.converged_ + + +def test_max_iter_zero(): + # Check that max_iter=0 returns initialisation as expected + # Pick arbitrary initial means and check equal to max_iter=0 + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=5) + n_components = rand_data.n_components + 
X = rand_data.X["full"] + means_init = [[20, 30], [30, 25]] + gmm = GaussianMixture( + n_components=n_components, + random_state=rng, + means_init=means_init, + tol=1e-06, + max_iter=0, + ) + gmm.fit(X) + + assert_allclose(gmm.means_, means_init) + + +def test_gaussian_mixture_precisions_init_diag(): + """Check that we properly initialize `precision_cholesky_` when we manually + provide the precision matrix. + + In this regard, we check the consistency between estimating the precision + matrix and providing the same precision matrix as initialization. It should + lead to the same results with the same number of iterations. + + If the initialization is wrong then the number of iterations will increase. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/16944 + """ + # generate a toy dataset + n_samples = 300 + rng = np.random.RandomState(0) + shifted_gaussian = rng.randn(n_samples, 2) + np.array([20, 20]) + C = np.array([[0.0, -0.7], [3.5, 0.7]]) + stretched_gaussian = np.dot(rng.randn(n_samples, 2), C) + X = np.vstack([shifted_gaussian, stretched_gaussian]) + + # common parameters to check the consistency of precision initialization + n_components, covariance_type, reg_covar, random_state = 2, "diag", 1e-6, 0 + + # execute the manual initialization to compute the precision matrix: + # - run KMeans to have an initial guess + # - estimate the covariance + # - compute the precision matrix from the estimated covariance + resp = np.zeros((X.shape[0], n_components)) + label = ( + KMeans(n_clusters=n_components, n_init=1, random_state=random_state) + .fit(X) + .labels_ + ) + resp[np.arange(X.shape[0]), label] = 1 + _, _, covariance = _estimate_gaussian_parameters( + X, resp, reg_covar=reg_covar, covariance_type=covariance_type + ) + precisions_init = 1 / covariance + + gm_with_init = GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + reg_covar=reg_covar, + precisions_init=precisions_init, + random_state=random_state, + ).fit(X) + + gm_without_init = GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + reg_covar=reg_covar, + random_state=random_state, + ).fit(X) + + assert gm_without_init.n_iter_ == gm_with_init.n_iter_ + assert_allclose( + gm_with_init.precisions_cholesky_, gm_without_init.precisions_cholesky_ + ) + + +def _generate_data(seed, n_samples, n_features, n_components): + """Randomly generate samples and responsibilities.""" + rs = np.random.RandomState(seed) + X = rs.random_sample((n_samples, n_features)) + resp = rs.random_sample((n_samples, n_components)) + resp /= resp.sum(axis=1)[:, np.newaxis] + return X, resp + + +def _calculate_precisions(X, resp, covariance_type): + """Calculate precision matrix of X and its Cholesky decomposition + for the given covariance type. + """ + reg_covar = 1e-6 + weights, means, covariances = _estimate_gaussian_parameters( + X, resp, reg_covar, covariance_type + ) + precisions_cholesky = _compute_precision_cholesky(covariances, covariance_type) + + _, n_components = resp.shape + # Instantiate a `GaussianMixture` model in order to use its + # `_set_parameters` method to return the `precisions_` and + # `precisions_cholesky_` from matching the `covariance_type` + # provided. 
+ gmm = GaussianMixture(n_components=n_components, covariance_type=covariance_type) + params = (weights, means, covariances, precisions_cholesky) + gmm._set_parameters(params) + return gmm.precisions_, gmm.precisions_cholesky_ + + +@pytest.mark.parametrize("covariance_type", COVARIANCE_TYPE) +def test_gaussian_mixture_precisions_init(covariance_type, global_random_seed): + """Non-regression test for #26415.""" + + X, resp = _generate_data( + seed=global_random_seed, + n_samples=100, + n_features=3, + n_components=4, + ) + + precisions_init, desired_precisions_cholesky = _calculate_precisions( + X, resp, covariance_type + ) + gmm = GaussianMixture( + covariance_type=covariance_type, precisions_init=precisions_init + ) + gmm._initialize(X, resp) + actual_precisions_cholesky = gmm.precisions_cholesky_ + assert_allclose(actual_precisions_cholesky, desired_precisions_cholesky) + + +def test_gaussian_mixture_single_component_stable(): + """ + Non-regression test for #23032 ensuring 1-component GM works on only a + few samples. + """ + rng = np.random.RandomState(0) + X = rng.multivariate_normal(np.zeros(2), np.identity(2), size=3) + gm = GaussianMixture(n_components=1) + gm.fit(X).sample() + + +def test_gaussian_mixture_all_init_does_not_estimate_gaussian_parameters( + monkeypatch, + global_random_seed, +): + """When all init parameters are provided, the Gaussian parameters + are not estimated. + + Non-regression test for gh-26015. + """ + + mock = Mock(side_effect=_estimate_gaussian_parameters) + monkeypatch.setattr( + sklearn.mixture._gaussian_mixture, "_estimate_gaussian_parameters", mock + ) + + rng = np.random.RandomState(global_random_seed) + rand_data = RandomData(rng) + + gm = GaussianMixture( + n_components=rand_data.n_components, + weights_init=rand_data.weights, + means_init=rand_data.means, + precisions_init=rand_data.precisions["full"], + random_state=rng, + ) + gm.fit(rand_data.X["full"]) + # The initial Gaussian parameters are not estimated; they are estimated once + # per M-step. + assert mock.call_count == gm.n_iter_ diff --git a/.venv/Lib/site-packages/sklearn/mixture/tests/test_mixture.py b/.venv/Lib/site-packages/sklearn/mixture/tests/test_mixture.py new file mode 100644 index 0000000000000000000000000000000000000000..d1a45ffef5e07f5eafacd704e69c1f0060c9bb93 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/mixture/tests/test_mixture.py @@ -0,0 +1,30 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +import pytest + +from sklearn.mixture import BayesianGaussianMixture, GaussianMixture + + +@pytest.mark.parametrize("estimator", [GaussianMixture(), BayesianGaussianMixture()]) +def test_gaussian_mixture_n_iter(estimator): + # Check that n_iter_ is the number of iterations performed.
+ rng = np.random.RandomState(0) + X = rng.rand(10, 5) + max_iter = 1 + estimator.set_params(max_iter=max_iter) + estimator.fit(X) + assert estimator.n_iter_ == max_iter + + +@pytest.mark.parametrize("estimator", [GaussianMixture(), BayesianGaussianMixture()]) +def test_mixture_n_components_greater_than_n_samples_error(estimator): + """Check error when n_components > n_samples.""" + rng = np.random.RandomState(0) + X = rng.rand(10, 5) + estimator.set_params(n_components=12) + + msg = "Expected n_samples >= n_components" + with pytest.raises(ValueError, match=msg): + estimator.fit(X) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/__init__.py b/.venv/Lib/site-packages/sklearn/model_selection/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3009b9310d81166691ef1d4db3d41268ee906af5 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/__init__.py @@ -0,0 +1,99 @@ +"""Tools for model selection, such as cross validation and hyper-parameter tuning.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import typing + +from ._classification_threshold import ( + FixedThresholdClassifier, + TunedThresholdClassifierCV, +) +from ._plot import LearningCurveDisplay, ValidationCurveDisplay +from ._search import GridSearchCV, ParameterGrid, ParameterSampler, RandomizedSearchCV +from ._split import ( + BaseCrossValidator, + BaseShuffleSplit, + GroupKFold, + GroupShuffleSplit, + KFold, + LeaveOneGroupOut, + LeaveOneOut, + LeavePGroupsOut, + LeavePOut, + PredefinedSplit, + RepeatedKFold, + RepeatedStratifiedKFold, + ShuffleSplit, + StratifiedGroupKFold, + StratifiedKFold, + StratifiedShuffleSplit, + TimeSeriesSplit, + check_cv, + train_test_split, +) +from ._validation import ( + cross_val_predict, + cross_val_score, + cross_validate, + learning_curve, + permutation_test_score, + validation_curve, +) + +if typing.TYPE_CHECKING: + # Avoid errors in type checkers (e.g. mypy) for experimental estimators. + # TODO: remove this check once the estimator is no longer experimental. + from ._search_successive_halving import ( # noqa + HalvingGridSearchCV, + HalvingRandomSearchCV, + ) + + +__all__ = [ + "BaseCrossValidator", + "BaseShuffleSplit", + "GridSearchCV", + "TimeSeriesSplit", + "KFold", + "GroupKFold", + "GroupShuffleSplit", + "LeaveOneGroupOut", + "LeaveOneOut", + "LeavePGroupsOut", + "LeavePOut", + "RepeatedKFold", + "RepeatedStratifiedKFold", + "ParameterGrid", + "ParameterSampler", + "PredefinedSplit", + "RandomizedSearchCV", + "ShuffleSplit", + "StratifiedKFold", + "StratifiedGroupKFold", + "StratifiedShuffleSplit", + "FixedThresholdClassifier", + "TunedThresholdClassifierCV", + "check_cv", + "cross_val_predict", + "cross_val_score", + "cross_validate", + "learning_curve", + "LearningCurveDisplay", + "permutation_test_score", + "train_test_split", + "validation_curve", + "ValidationCurveDisplay", +] + + +# TODO: remove this check once the estimator is no longer experimental. +def __getattr__(name): + if name in {"HalvingGridSearchCV", "HalvingRandomSearchCV"}: + raise ImportError( + f"{name} is experimental and the API might change without any " + "deprecation cycle. 
To use it, you need to explicitly import " + "enable_halving_search_cv:\n" + "from sklearn.experimental import enable_halving_search_cv" + ) + raise AttributeError(f"module {__name__} has no attribute {name}") diff --git a/.venv/Lib/site-packages/sklearn/model_selection/_classification_threshold.py b/.venv/Lib/site-packages/sklearn/model_selection/_classification_threshold.py new file mode 100644 index 0000000000000000000000000000000000000000..2bff98d232492e00d5aea424331f32f7090d0bf6 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/_classification_threshold.py @@ -0,0 +1,892 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from collections.abc import MutableMapping +from numbers import Integral, Real + +import numpy as np + +from ..base import ( + BaseEstimator, + ClassifierMixin, + MetaEstimatorMixin, + _fit_context, + clone, +) +from ..exceptions import NotFittedError +from ..metrics import ( + check_scoring, + get_scorer_names, +) +from ..metrics._scorer import ( + _CurveScorer, + _threshold_scores_to_class_labels, +) +from ..utils import _safe_indexing +from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions +from ..utils._response import _get_response_values_binary +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.multiclass import type_of_target +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_method_params, + _estimator_has, + _num_samples, + check_is_fitted, + indexable, +) +from ._split import StratifiedShuffleSplit, check_cv + + +def _check_is_fitted(estimator): + try: + check_is_fitted(estimator.estimator) + except NotFittedError: + check_is_fitted(estimator, "estimator_") + + +class BaseThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): + """Base class for binary classifiers that set a non-default decision threshold. + + In this base class, we define the following interface: + + - the validation of common parameters in `fit`; + - the different prediction methods that can be used with the classifier. + + .. versionadded:: 1.5 + + Parameters + ---------- + estimator : estimator instance + The binary classifier, fitted or not, for which we want to optimize + the decision threshold used during `predict`. + + response_method : {"auto", "decision_function", "predict_proba"}, default="auto" + Methods by the classifier `estimator` corresponding to the + decision function for which we want to find a threshold. It can be: + + * if `"auto"`, it will try to invoke, for each classifier, + `"predict_proba"` or `"decision_function"` in that order. + * otherwise, one of `"predict_proba"` or `"decision_function"`. + If the method is not implemented by the classifier, it will raise an + error. 
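A minimal sketch of the `"auto"` resolution just described, mirroring the `_get_response_method` helper defined on this class below; `resolve_response_method` is a hypothetical standalone name used only for illustration, not part of the patch:

```python
def resolve_response_method(response_method):
    # "auto" expands to an ordered preference list: `predict_proba` is
    # tried first, then `decision_function`; a concrete method name is
    # passed through unchanged.
    if response_method == "auto":
        return ["predict_proba", "decision_function"]
    return response_method


assert resolve_response_method("auto") == ["predict_proba", "decision_function"]
assert resolve_response_method("decision_function") == "decision_function"
```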
+ """ + + _parameter_constraints: dict = { + "estimator": [ + HasMethods(["fit", "predict_proba"]), + HasMethods(["fit", "decision_function"]), + ], + "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], + } + + def __init__(self, estimator, *, response_method="auto"): + self.estimator = estimator + self.response_method = response_method + + def _get_response_method(self): + """Define the response method.""" + if self.response_method == "auto": + response_method = ["predict_proba", "decision_function"] + else: + response_method = self.response_method + return response_method + + @_fit_context( + # *ThresholdClassifier*.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **params): + """Fit the classifier. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + **params : dict + Parameters to pass to the `fit` method of the underlying + classifier. + + Returns + ------- + self : object + Returns an instance of self. + """ + _raise_for_params(params, self, None) + + X, y = indexable(X, y) + + y_type = type_of_target(y, input_name="y") + if y_type != "binary": + raise ValueError( + f"Only binary classification is supported. Unknown label type: {y_type}" + ) + + self._fit(X, y, **params) + + if hasattr(self.estimator_, "n_features_in_"): + self.n_features_in_ = self.estimator_.n_features_in_ + if hasattr(self.estimator_, "feature_names_in_"): + self.feature_names_in_ = self.estimator_.feature_names_in_ + + return self + + @property + def classes_(self): + """Classes labels.""" + return self.estimator_.classes_ + + @available_if(_estimator_has("predict_proba")) + def predict_proba(self, X): + """Predict class probabilities for `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + probabilities : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. + """ + _check_is_fitted(self) + estimator = getattr(self, "estimator_", self.estimator) + return estimator.predict_proba(X) + + @available_if(_estimator_has("predict_log_proba")) + def predict_log_proba(self, X): + """Predict logarithm class probabilities for `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + log_probabilities : ndarray of shape (n_samples, n_classes) + The logarithm class probabilities of the input samples. + """ + _check_is_fitted(self) + estimator = getattr(self, "estimator_", self.estimator) + return estimator.predict_log_proba(X) + + @available_if(_estimator_has("decision_function")) + def decision_function(self, X): + """Decision function for samples in `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + decisions : ndarray of shape (n_samples,) + The decision function computed the fitted estimator. 
+ """ + _check_is_fitted(self) + estimator = getattr(self, "estimator_", self.estimator) + return estimator.decision_function(X) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_class = False + return tags + + +class FixedThresholdClassifier(BaseThresholdClassifier): + """Binary classifier that manually sets the decision threshold. + + This classifier allows to change the default decision threshold used for + converting posterior probability estimates (i.e. output of `predict_proba`) or + decision scores (i.e. output of `decision_function`) into a class label. + + Here, the threshold is not optimized and is set to a constant value. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.5 + + Parameters + ---------- + estimator : estimator instance + The binary classifier, fitted or not, for which we want to optimize + the decision threshold used during `predict`. + + threshold : {"auto"} or float, default="auto" + The decision threshold to use when converting posterior probability estimates + (i.e. output of `predict_proba`) or decision scores (i.e. output of + `decision_function`) into a class label. When `"auto"`, the threshold is set + to 0.5 if `predict_proba` is used as `response_method`, otherwise it is set to + 0 (i.e. the default threshold for `decision_function`). + + pos_label : int, float, bool or str, default=None + The label of the positive class. Used to process the output of the + `response_method` method. When `pos_label=None`, if `y_true` is in `{-1, 1}` or + `{0, 1}`, `pos_label` is set to 1, otherwise an error will be raised. + + response_method : {"auto", "decision_function", "predict_proba"}, default="auto" + Methods by the classifier `estimator` corresponding to the + decision function for which we want to find a threshold. It can be: + + * if `"auto"`, it will try to invoke `"predict_proba"` or `"decision_function"` + in that order. + * otherwise, one of `"predict_proba"` or `"decision_function"`. + If the method is not implemented by the classifier, it will raise an + error. + + Attributes + ---------- + estimator_ : estimator instance + The fitted classifier used when predicting. + + classes_ : ndarray of shape (n_classes,) + The class labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + See Also + -------- + sklearn.model_selection.TunedThresholdClassifierCV : Classifier that post-tunes + the decision threshold based on some metrics and using cross-validation. + sklearn.calibration.CalibratedClassifierCV : Estimator that calibrates + probabilities. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.metrics import confusion_matrix + >>> from sklearn.model_selection import FixedThresholdClassifier, train_test_split + >>> X, y = make_classification( + ... n_samples=1_000, weights=[0.9, 0.1], class_sep=0.8, random_state=42 + ... ) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, stratify=y, random_state=42 + ... 
) + >>> classifier = LogisticRegression(random_state=0).fit(X_train, y_train) + >>> print(confusion_matrix(y_test, classifier.predict(X_test))) + [[217 7] + [ 19 7]] + >>> classifier_other_threshold = FixedThresholdClassifier( + ... classifier, threshold=0.1, response_method="predict_proba" + ... ).fit(X_train, y_train) + >>> print(confusion_matrix(y_test, classifier_other_threshold.predict(X_test))) + [[184 40] + [ 6 20]] + """ + + _parameter_constraints: dict = { + **BaseThresholdClassifier._parameter_constraints, + "threshold": [StrOptions({"auto"}), Real], + "pos_label": [Real, str, "boolean", None], + } + + def __init__( + self, + estimator, + *, + threshold="auto", + pos_label=None, + response_method="auto", + ): + super().__init__(estimator=estimator, response_method=response_method) + self.pos_label = pos_label + self.threshold = threshold + + @property + def classes_(self): + if estimator := getattr(self, "estimator_", None): + return estimator.classes_ + try: + check_is_fitted(self.estimator) + return self.estimator.classes_ + except NotFittedError: + raise AttributeError( + "The underlying estimator is not fitted yet." + ) from NotFittedError + + def _fit(self, X, y, **params): + """Fit the classifier. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + **params : dict + Parameters to pass to the `fit` method of the underlying + classifier. + + Returns + ------- + self : object + Returns an instance of self. + """ + routed_params = process_routing(self, "fit", **params) + self.estimator_ = clone(self.estimator).fit(X, y, **routed_params.estimator.fit) + return self + + def predict(self, X): + """Predict the target of new samples. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The samples, as accepted by `estimator.predict`. + + Returns + ------- + class_labels : ndarray of shape (n_samples,) + The predicted class. + """ + _check_is_fitted(self) + + estimator = getattr(self, "estimator_", self.estimator) + + y_score, _, response_method_used = _get_response_values_binary( + estimator, + X, + self._get_response_method(), + pos_label=self.pos_label, + return_response_method_used=True, + ) + + if self.threshold == "auto": + decision_threshold = 0.5 if response_method_used == "predict_proba" else 0.0 + else: + decision_threshold = self.threshold + + return _threshold_scores_to_class_labels( + y_score, decision_threshold, self.classes_, self.pos_label + ) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + return router + + +def _fit_and_score_over_thresholds( + classifier, + X, + y, + *, + fit_params, + train_idx, + val_idx, + curve_scorer, + score_params, +): + """Fit a classifier and compute the scores for different decision thresholds. + + Parameters + ---------- + classifier : estimator instance + The classifier to fit and use for scoring. If `classifier` is already fitted, + it will be used as is. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The entire dataset. 
+ + y : array-like of shape (n_samples,) + The entire target vector. + + fit_params : dict + Parameters to pass to the `fit` method of the underlying classifier. + + train_idx : ndarray of shape (n_train_samples,) or None + The indices of the training set. If `None`, `classifier` is expected to be + already fitted. + + val_idx : ndarray of shape (n_val_samples,) + The indices of the validation set used to score `classifier`. If `train_idx` + is `None`, the entire set will be used. + + curve_scorer : scorer instance + The scorer taking `classifier` and the validation set as input and outputting + decision thresholds and scores as a curve. Note that this is different from + the usual scorer that outputs a single score value: + + * when `score_method` is one of the four constraint metrics, the curve scorer + will output a curve of two scores parametrized by the decision threshold, e.g. + TPR/TNR or precision/recall curves for each threshold; + * otherwise, the curve scorer will output a single score value for each + threshold. + + score_params : dict + Parameters to pass to the `score` method of the underlying scorer. + + Returns + ------- + scores : ndarray of shape (thresholds,) or tuple of such arrays + The scores computed for each decision threshold. When TPR/TNR or precision/ + recall are computed, `scores` is a tuple of two arrays. + + potential_thresholds : ndarray of shape (thresholds,) + The decision thresholds used to compute the scores. They are returned in + ascending order. + """ + + if train_idx is not None: + X_train, X_val = _safe_indexing(X, train_idx), _safe_indexing(X, val_idx) + y_train, y_val = _safe_indexing(y, train_idx), _safe_indexing(y, val_idx) + fit_params_train = _check_method_params(X, fit_params, indices=train_idx) + score_params_val = _check_method_params(X, score_params, indices=val_idx) + classifier.fit(X_train, y_train, **fit_params_train) + else: # prefit estimator, only a validation set is provided + X_val, y_val, score_params_val = X, y, score_params + + return curve_scorer(classifier, X_val, y_val, **score_params_val) + + +def _mean_interpolated_score(target_thresholds, cv_thresholds, cv_scores): + """Compute the mean interpolated score across folds by defining common thresholds. + + Parameters + ---------- + target_thresholds : ndarray of shape (thresholds,) + The thresholds to use to compute the mean score. + + cv_thresholds : ndarray of shape (n_folds, thresholds_fold) + The thresholds used to compute the scores for each fold. + + cv_scores : ndarray of shape (n_folds, thresholds_fold) + The scores computed for each threshold for each fold. + + Returns + ------- + mean_score : ndarray of shape (thresholds,) + The mean score across all folds for each target threshold. + """ + return np.mean( + [ + np.interp(target_thresholds, split_thresholds, split_score) + for split_thresholds, split_score in zip(cv_thresholds, cv_scores) + ], + axis=0, + ) + + +class TunedThresholdClassifierCV(BaseThresholdClassifier): + """Classifier that post-tunes the decision threshold using cross-validation. + + This estimator post-tunes the decision threshold (cut-off point) that is + used for converting posterior probability estimates (i.e. output of + `predict_proba`) or decision scores (i.e. output of `decision_function`) + into a class label. The tuning is done by optimizing a binary metric, + potentially constrained by another metric. + + Read more in the :ref:`User Guide `. + + ..
versionadded:: 1.5 + + Parameters + ---------- + estimator : estimator instance + The classifier, fitted or not, for which we want to optimize + the decision threshold used during `predict`. + + scoring : str or callable, default="balanced_accuracy" + The objective metric to be optimized. Can be one of: + + * a string associated to a scoring function for binary classification + (see :ref:`scoring_parameter`); + * a scorer callable object created with :func:`~sklearn.metrics.make_scorer`; + + response_method : {"auto", "decision_function", "predict_proba"}, default="auto" + Methods by the classifier `estimator` corresponding to the + decision function for which we want to find a threshold. It can be: + + * if `"auto"`, it will try to invoke, for each classifier, + `"predict_proba"` or `"decision_function"` in that order. + * otherwise, one of `"predict_proba"` or `"decision_function"`. + If the method is not implemented by the classifier, it will raise an + error. + + thresholds : int or array-like, default=100 + The number of decision thresholds to use when discretizing the output of the + classifier's `response_method`. Pass an array-like to manually specify the + thresholds to use. + + cv : int, float, cross-validation generator, iterable or "prefit", default=None + Determines the cross-validation splitting strategy to train the classifier. + Possible inputs for cv are: + + * `None`, to use the default 5-fold stratified K-fold cross validation; + * An integer, to specify the number of folds in a stratified k-fold; + * A float, to specify a single shuffle split. The float should be in (0, 1) + and represents the size of the validation set; + * An object to be used as a cross-validation generator; + * An iterable yielding train, test splits; + * `"prefit"`, to bypass the cross-validation. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. warning:: + Using `cv="prefit"` and passing the same dataset for fitting `estimator` + and tuning the cut-off point is subject to undesired overfitting. You can + refer to :ref:`TunedThresholdClassifierCV_no_cv` for an example. + + This option should only be used when the set used to fit `estimator` is + different from the one used to tune the cut-off point (by calling + :meth:`TunedThresholdClassifierCV.fit`). + + refit : bool, default=True + Whether or not to refit the classifier on the entire training set once + the decision threshold has been found. + Note that forcing `refit=False` on a cross-validation with more + than a single split will raise an error. Similarly, `refit=True` in + conjunction with `cv="prefit"` will raise an error. + + n_jobs : int, default=None + The number of jobs to run in parallel. When `cv` represents a + cross-validation strategy, the fitting and scoring on each data split + is done in parallel. ``None`` means 1 unless in a + :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. See :term:`Glossary ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of cross-validation when `cv` is a float. + See :term:`Glossary `. + + store_cv_results : bool, default=False + Whether to store all scores and thresholds computed during the cross-validation + process. + + Attributes + ---------- + estimator_ : estimator instance + The fitted classifier used when predicting. + + best_threshold_ : float + The new decision threshold.
+ + best_score_ : float or None + The optimal score of the objective metric, evaluated at `best_threshold_`. + + cv_results_ : dict or None + A dictionary containing the scores and thresholds computed during the + cross-validation process. Only exist if `store_cv_results=True`. The + keys are `"thresholds"` and `"scores"`. + + classes_ : ndarray of shape (n_classes,) + The class labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + See Also + -------- + sklearn.model_selection.FixedThresholdClassifier : Classifier that uses a + constant threshold. + sklearn.calibration.CalibratedClassifierCV : Estimator that calibrates + probabilities. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.metrics import classification_report + >>> from sklearn.model_selection import TunedThresholdClassifierCV, train_test_split + >>> X, y = make_classification( + ... n_samples=1_000, weights=[0.9, 0.1], class_sep=0.8, random_state=42 + ... ) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, stratify=y, random_state=42 + ... ) + >>> classifier = RandomForestClassifier(random_state=0).fit(X_train, y_train) + >>> print(classification_report(y_test, classifier.predict(X_test))) + precision recall f1-score support + + 0 0.94 0.99 0.96 224 + 1 0.80 0.46 0.59 26 + + accuracy 0.93 250 + macro avg 0.87 0.72 0.77 250 + weighted avg 0.93 0.93 0.92 250 + + >>> classifier_tuned = TunedThresholdClassifierCV( + ... classifier, scoring="balanced_accuracy" + ... ).fit(X_train, y_train) + >>> print( + ... f"Cut-off point found at {classifier_tuned.best_threshold_:.3f}" + ... ) + Cut-off point found at 0.342 + >>> print(classification_report(y_test, classifier_tuned.predict(X_test))) + precision recall f1-score support + + 0 0.96 0.95 0.96 224 + 1 0.61 0.65 0.63 26 + + accuracy 0.92 250 + macro avg 0.78 0.80 0.79 250 + weighted avg 0.92 0.92 0.92 250 + + """ + + _parameter_constraints: dict = { + **BaseThresholdClassifier._parameter_constraints, + "scoring": [ + StrOptions(set(get_scorer_names())), + callable, + MutableMapping, + ], + "thresholds": [Interval(Integral, 1, None, closed="left"), "array-like"], + "cv": [ + "cv_object", + StrOptions({"prefit"}), + Interval(RealNotInt, 0.0, 1.0, closed="neither"), + ], + "refit": ["boolean"], + "n_jobs": [Integral, None], + "random_state": ["random_state"], + "store_cv_results": ["boolean"], + } + + def __init__( + self, + estimator, + *, + scoring="balanced_accuracy", + response_method="auto", + thresholds=100, + cv=None, + refit=True, + n_jobs=None, + random_state=None, + store_cv_results=False, + ): + super().__init__(estimator=estimator, response_method=response_method) + self.scoring = scoring + self.thresholds = thresholds + self.cv = cv + self.refit = refit + self.n_jobs = n_jobs + self.random_state = random_state + self.store_cv_results = store_cv_results + + def _fit(self, X, y, **params): + """Fit the classifier and post-tune the decision threshold. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. 
+ + **params : dict + Parameters to pass to the `fit` method of the underlying + classifier and to the `scoring` scorer. + + Returns + ------- + self : object + Returns an instance of self. + """ + if isinstance(self.cv, Real) and 0 < self.cv < 1: + cv = StratifiedShuffleSplit( + n_splits=1, test_size=self.cv, random_state=self.random_state + ) + elif self.cv == "prefit": + if self.refit is True: + raise ValueError("When cv='prefit', refit cannot be True.") + try: + check_is_fitted(self.estimator, "classes_") + except NotFittedError as exc: + raise NotFittedError( + """When cv='prefit', `estimator` must be fitted.""" + ) from exc + cv = self.cv + else: + cv = check_cv(self.cv, y=y, classifier=True) + if self.refit is False and cv.get_n_splits() > 1: + raise ValueError("When cv has several folds, refit cannot be False.") + + routed_params = process_routing(self, "fit", **params) + self._curve_scorer = self._get_curve_scorer() + + # in the following block, we: + # - define the final classifier `self.estimator_` and train it if necessary + # - define `classifier` to be used to post-tune the decision threshold + # - define `split` to be used to fit/score `classifier` + if cv == "prefit": + self.estimator_ = self.estimator + classifier = self.estimator_ + splits = [(None, range(_num_samples(X)))] + else: + self.estimator_ = clone(self.estimator) + classifier = clone(self.estimator) + splits = cv.split(X, y, **routed_params.splitter.split) + + if self.refit: + # train on the whole dataset + X_train, y_train, fit_params_train = X, y, routed_params.estimator.fit + else: + # single split cross-validation + train_idx, _ = next(cv.split(X, y, **routed_params.splitter.split)) + X_train = _safe_indexing(X, train_idx) + y_train = _safe_indexing(y, train_idx) + fit_params_train = _check_method_params( + X, routed_params.estimator.fit, indices=train_idx + ) + + self.estimator_.fit(X_train, y_train, **fit_params_train) + + cv_scores, cv_thresholds = zip( + *Parallel(n_jobs=self.n_jobs)( + delayed(_fit_and_score_over_thresholds)( + clone(classifier) if cv != "prefit" else classifier, + X, + y, + fit_params=routed_params.estimator.fit, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=self._curve_scorer, + score_params=routed_params.scorer.score, + ) + for train_idx, val_idx in splits + ) + ) + + if any(np.isclose(th[0], th[-1]) for th in cv_thresholds): + raise ValueError( + "The provided estimator makes constant predictions. Therefore, it is " + "impossible to optimize the decision threshold." + ) + + # find the global min and max thresholds across all folds + min_threshold = min( + split_thresholds.min() for split_thresholds in cv_thresholds + ) + max_threshold = max( + split_thresholds.max() for split_thresholds in cv_thresholds + ) + if isinstance(self.thresholds, Integral): + decision_thresholds = np.linspace( + min_threshold, max_threshold, num=self.thresholds + ) + else: + decision_thresholds = np.asarray(self.thresholds) + + objective_scores = _mean_interpolated_score( + decision_thresholds, cv_thresholds, cv_scores + ) + best_idx = objective_scores.argmax() + self.best_score_ = objective_scores[best_idx] + self.best_threshold_ = decision_thresholds[best_idx] + if self.store_cv_results: + self.cv_results_ = { + "thresholds": decision_thresholds, + "scores": objective_scores, + } + + return self + + def predict(self, X): + """Predict the target of new samples. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The samples, as accepted by `estimator.predict`. + + Returns + ------- + class_labels : ndarray of shape (n_samples,) + The predicted class. + """ + check_is_fitted(self, "estimator_") + pos_label = self._curve_scorer._get_pos_label() + y_score, _ = _get_response_values_binary( + self.estimator_, + X, + self._get_response_method(), + pos_label=pos_label, + ) + + return _threshold_scores_to_class_labels( + y_score, self.best_threshold_, self.classes_, pos_label + ) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add( + estimator=self.estimator, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + .add( + splitter=self.cv, + method_mapping=MethodMapping().add(callee="split", caller="fit"), + ) + .add( + scorer=self._get_curve_scorer(), + method_mapping=MethodMapping().add(callee="score", caller="fit"), + ) + ) + return router + + def _get_curve_scorer(self): + """Get the curve scorer based on the objective metric used.""" + scoring = check_scoring(self.estimator, scoring=self.scoring) + curve_scorer = _CurveScorer.from_scorer( + scoring, self._get_response_method(), self.thresholds + ) + return curve_scorer diff --git a/.venv/Lib/site-packages/sklearn/model_selection/_plot.py b/.venv/Lib/site-packages/sklearn/model_selection/_plot.py new file mode 100644 index 0000000000000000000000000000000000000000..241be09dfb9b483ac441dd46e7f757feabfe25e4 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/_plot.py @@ -0,0 +1,877 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np + +from ..utils._optional_dependencies import check_matplotlib_support +from ..utils._plotting import _interval_max_min_ratio, _validate_score_name +from ._validation import learning_curve, validation_curve + + +class _BaseCurveDisplay: + def _plot_curve( + self, + x_data, + *, + ax=None, + negate_score=False, + score_name=None, + score_type="test", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + check_matplotlib_support(f"{self.__class__.__name__}.plot") + + import matplotlib.pyplot as plt + + if ax is None: + _, ax = plt.subplots() + + if negate_score: + train_scores, test_scores = -self.train_scores, -self.test_scores + else: + train_scores, test_scores = self.train_scores, self.test_scores + + if std_display_style not in ("errorbar", "fill_between", None): + raise ValueError( + f"Unknown std_display_style: {std_display_style}. Should be one of" + " 'errorbar', 'fill_between', or None." + ) + + if score_type not in ("test", "train", "both"): + raise ValueError( + f"Unknown score_type: {score_type}. Should be one of 'test', " + "'train', or 'both'." 
+ ) + + if score_type == "train": + scores = {"Train": train_scores} + elif score_type == "test": + scores = {"Test": test_scores} + else: # score_type == "both" + scores = {"Train": train_scores, "Test": test_scores} + + if std_display_style in ("fill_between", None): + # plot the mean score + if line_kw is None: + line_kw = {} + + self.lines_ = [] + for line_label, score in scores.items(): + self.lines_.append( + *ax.plot( + x_data, + score.mean(axis=1), + label=line_label, + **line_kw, + ) + ) + self.errorbar_ = None + self.fill_between_ = None # overwritten below by fill_between + + if std_display_style == "errorbar": + if errorbar_kw is None: + errorbar_kw = {} + + self.errorbar_ = [] + for line_label, score in scores.items(): + self.errorbar_.append( + ax.errorbar( + x_data, + score.mean(axis=1), + score.std(axis=1), + label=line_label, + **errorbar_kw, + ) + ) + self.lines_, self.fill_between_ = None, None + elif std_display_style == "fill_between": + if fill_between_kw is None: + fill_between_kw = {} + default_fill_between_kw = {"alpha": 0.5} + fill_between_kw = {**default_fill_between_kw, **fill_between_kw} + + self.fill_between_ = [] + for line_label, score in scores.items(): + self.fill_between_.append( + ax.fill_between( + x_data, + score.mean(axis=1) - score.std(axis=1), + score.mean(axis=1) + score.std(axis=1), + **fill_between_kw, + ) + ) + + score_name = self.score_name if score_name is None else score_name + + ax.legend() + + # We found that a ratio, smaller or bigger than 5, between the largest and + # smallest gap of the x values is a good indicator to choose between linear + # and log scale. + if _interval_max_min_ratio(x_data) > 5: + xscale = "symlog" if x_data.min() <= 0 else "log" + else: + xscale = "linear" + + ax.set_xscale(xscale) + ax.set_ylabel(f"{score_name}") + + self.ax_ = ax + self.figure_ = ax.figure + + +class LearningCurveDisplay(_BaseCurveDisplay): + """Learning Curve visualization. + + It is recommended to use + :meth:`~sklearn.model_selection.LearningCurveDisplay.from_estimator` to + create a :class:`~sklearn.model_selection.LearningCurveDisplay` instance. + All parameters are stored as attributes. + + Read more in the :ref:`User Guide ` for general information + about the visualization API and + :ref:`detailed documentation ` regarding the learning + curve visualization. + + .. versionadded:: 1.2 + + Parameters + ---------- + train_sizes : ndarray of shape (n_unique_ticks,) + Numbers of training examples that has been used to generate the + learning curve. + + train_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on test set. + + score_name : str, default=None + The name of the score used in `learning_curve`. It will override the name + inferred from the `scoring` parameter. If `score` is `None`, we use `"Score"` if + `negate_score` is `False` and `"Negative score"` otherwise. If `scoring` is a + string or a callable, we infer the name. We replace `_` by spaces and capitalize + the first letter. We remove `neg_` and replace it by `"Negative"` if + `negate_score` is `False` or just remove it otherwise. + + Attributes + ---------- + ax_ : matplotlib Axes + Axes with the learning curve. + + figure_ : matplotlib Figure + Figure containing the learning curve. + + errorbar_ : list of matplotlib Artist or None + When the `std_display_style` is `"errorbar"`, this is a list of + `matplotlib.container.ErrorbarContainer` objects. 
If another style is + used, `errorbar_` is `None`. + + lines_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.lines.Line2D` objects corresponding to the mean train and + test scores. If another style is used, `line_` is `None`. + + fill_between_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.collections.PolyCollection` objects. If another style is + used, `fill_between_` is `None`. + + See Also + -------- + sklearn.model_selection.learning_curve : Compute the learning curve. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import LearningCurveDisplay, learning_curve + >>> from sklearn.tree import DecisionTreeClassifier + >>> X, y = load_iris(return_X_y=True) + >>> tree = DecisionTreeClassifier(random_state=0) + >>> train_sizes, train_scores, test_scores = learning_curve( + ... tree, X, y) + >>> display = LearningCurveDisplay(train_sizes=train_sizes, + ... train_scores=train_scores, test_scores=test_scores, score_name="Score") + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__(self, *, train_sizes, train_scores, test_scores, score_name=None): + self.train_sizes = train_sizes + self.train_scores = train_scores + self.test_scores = test_scores + self.score_name = score_name + + def plot( + self, + ax=None, + *, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Plot visualization. + + Parameters + ---------- + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.learning_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If None, no standard deviation representation is + displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.LearningCurveDisplay` + Object that stores computed values. 
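A hedged end-to-end sketch of the `plot` options documented above, using only parameters listed in this docstring:

```python
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import LearningCurveDisplay
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
display = LearningCurveDisplay.from_estimator(
    DecisionTreeClassifier(random_state=0), X, y
)
# Re-draw the stored curves: validation scores only, spread shown as error bars.
display.plot(score_type="test", std_display_style="errorbar")
plt.show()
```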
+ """ + self._plot_curve( + self.train_sizes, + ax=ax, + negate_score=negate_score, + score_name=score_name, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + self.ax_.set_xlabel("Number of samples in the training set") + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + groups=None, + train_sizes=np.linspace(0.1, 1.0, 5), + cv=None, + scoring=None, + exploit_incremental_learning=False, + n_jobs=None, + pre_dispatch="all", + verbose=0, + shuffle=False, + random_state=None, + error_score=np.nan, + fit_params=None, + ax=None, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Create a learning curve display from an estimator. + + Read more in the :ref:`User Guide ` for general + information about the visualization API and :ref:`detailed + documentation ` regarding the learning curve + visualization. + + Parameters + ---------- + estimator : object type that implements the "fit" and "predict" methods + An object of that type which is cloned for each validation. + + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + train_sizes : array-like of shape (n_ticks,), \ + default=np.linspace(0.1, 1.0, 5) + Relative or absolute numbers of training examples that will be used + to generate the learning curve. If the dtype is float, it is + regarded as a fraction of the maximum size of the training set + (that is determined by the selected validation method), i.e. it has + to be within (0, 1]. Otherwise it is interpreted as absolute sizes + of the training sets. Note that for classification the number of + samples usually have to be big enough to contain at least one + sample from each class. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and `y` is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. In all + other cases, :class:`~sklearn.model_selection.KFold` is used. These + splitters are instantiated with `shuffle=False` so the splits will + be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + scoring : str or callable, default=None + A string (see :ref:`scoring_parameter`) or + a scorer callable object / function with signature + `scorer(estimator, X, y)` (see :ref:`scoring_callable`). + + exploit_incremental_learning : bool, default=False + If the estimator supports incremental learning, this will be + used to speed up fitting for different training set sizes. 
+ + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and + computing the score are parallelized over the different training + and test sets. `None` means 1 unless in a + :obj:`joblib.parallel_backend` context. `-1` means using all + processors. See :term:`Glossary ` for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. + + shuffle : bool, default=False + Whether to shuffle training data before taking prefixes of it + based on`train_sizes`. + + random_state : int, RandomState instance or None, default=None + Used when `shuffle` is True. Pass an int for reproducible + output across multiple function calls. + See :term:`Glossary `. + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator + fitting. If set to 'raise', the error is raised. If a numeric value + is given, FitFailedWarning is raised. + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.learning_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If `None`, no representation of the standard deviation + is displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.LearningCurveDisplay` + Object that stores computed values. 
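A short sketch of the `train_sizes` semantics documented above: floats in (0, 1] are read as fractions of the maximum training-set size, integers as absolute sample counts. The variable names and the sample count of 120 are illustrative only:

```python
import numpy as np

train_sizes_fractional = np.linspace(0.1, 1.0, 5)  # 10%..100% of the training folds
train_sizes_absolute = np.array([20, 50, 80])      # exact numbers of samples
# With e.g. 120 available training samples, the fractions resolve to roughly:
print(np.round(train_sizes_fractional * 120).astype(int))  # [ 12  39  66  93 120]
```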
+ + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import LearningCurveDisplay + >>> from sklearn.tree import DecisionTreeClassifier + >>> X, y = load_iris(return_X_y=True) + >>> tree = DecisionTreeClassifier(random_state=0) + >>> LearningCurveDisplay.from_estimator(tree, X, y) + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + + score_name = _validate_score_name(score_name, scoring, negate_score) + + train_sizes, train_scores, test_scores = learning_curve( + estimator, + X, + y, + groups=groups, + train_sizes=train_sizes, + cv=cv, + scoring=scoring, + exploit_incremental_learning=exploit_incremental_learning, + n_jobs=n_jobs, + pre_dispatch=pre_dispatch, + verbose=verbose, + shuffle=shuffle, + random_state=random_state, + error_score=error_score, + return_times=False, + fit_params=fit_params, + ) + + viz = cls( + train_sizes=train_sizes, + train_scores=train_scores, + test_scores=test_scores, + score_name=score_name, + ) + return viz.plot( + ax=ax, + negate_score=negate_score, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + + +class ValidationCurveDisplay(_BaseCurveDisplay): + """Validation Curve visualization. + + It is recommended to use + :meth:`~sklearn.model_selection.ValidationCurveDisplay.from_estimator` to + create a :class:`~sklearn.model_selection.ValidationCurveDisplay` instance. + All parameters are stored as attributes. + + Read more in the :ref:`User Guide ` for general information + about the visualization API and :ref:`detailed documentation + ` regarding the validation curve visualization. + + .. versionadded:: 1.3 + + Parameters + ---------- + param_name : str + Name of the parameter that has been varied. + + param_range : array-like of shape (n_ticks,) + The values of the parameter that have been evaluated. + + train_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on test set. + + score_name : str, default=None + The name of the score used in `validation_curve`. It will override the name + inferred from the `scoring` parameter. If `score` is `None`, we use `"Score"` if + `negate_score` is `False` and `"Negative score"` otherwise. If `scoring` is a + string or a callable, we infer the name. We replace `_` by spaces and capitalize + the first letter. We remove `neg_` and replace it by `"Negative"` if + `negate_score` is `False` or just remove it otherwise. + + Attributes + ---------- + ax_ : matplotlib Axes + Axes with the validation curve. + + figure_ : matplotlib Figure + Figure containing the validation curve. + + errorbar_ : list of matplotlib Artist or None + When the `std_display_style` is `"errorbar"`, this is a list of + `matplotlib.container.ErrorbarContainer` objects. If another style is + used, `errorbar_` is `None`. + + lines_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.lines.Line2D` objects corresponding to the mean train and + test scores. If another style is used, `line_` is `None`. + + fill_between_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.collections.PolyCollection` objects. If another style is + used, `fill_between_` is `None`. 
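A hedged illustration of the attributes listed above: with the default `std_display_style="fill_between"`, `lines_` and `fill_between_` are populated while `errorbar_` stays `None`:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ValidationCurveDisplay

X, y = make_classification(n_samples=200, random_state=0)
display = ValidationCurveDisplay.from_estimator(
    LogisticRegression(), X, y, param_name="C", param_range=np.logspace(-3, 3, 5)
)
# Default style: shaded bands plus mean lines, no error-bar containers.
assert display.errorbar_ is None
assert display.lines_ is not None and display.fill_between_ is not None
```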
+ + See Also + -------- + sklearn.model_selection.validation_curve : Compute the validation curve. + + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import ValidationCurveDisplay, validation_curve + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> train_scores, test_scores = validation_curve( + ... logistic_regression, X, y, param_name=param_name, param_range=param_range + ... ) + >>> display = ValidationCurveDisplay( + ... param_name=param_name, param_range=param_range, + ... train_scores=train_scores, test_scores=test_scores, score_name="Score" + ... ) + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__( + self, *, param_name, param_range, train_scores, test_scores, score_name=None + ): + self.param_name = param_name + self.param_range = param_range + self.train_scores = train_scores + self.test_scores = test_scores + self.score_name = score_name + + def plot( + self, + ax=None, + *, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Plot visualization. + + Parameters + ---------- + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.validation_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If None, no standard deviation representation is + displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.ValidationCurveDisplay` + Object that stores computed values. 
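A hedged sketch of `negate_score` as documented above: paired with a `neg_*` scorer, negating flips the sign back so the y-axis shows the error itself rather than its negation:

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import ValidationCurveDisplay

X, y = make_regression(n_samples=200, random_state=0)
# "neg_mean_squared_error" yields negative scores; negate_score=True
# plots the positive MSE instead.
ValidationCurveDisplay.from_estimator(
    Ridge(), X, y, param_name="alpha", param_range=np.logspace(-3, 3, 5),
    scoring="neg_mean_squared_error", negate_score=True,
)
plt.show()
```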
+ """ + self._plot_curve( + self.param_range, + ax=ax, + negate_score=negate_score, + score_name=score_name, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + self.ax_.set_xlabel(f"{self.param_name}") + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + param_name, + param_range, + groups=None, + cv=None, + scoring=None, + n_jobs=None, + pre_dispatch="all", + verbose=0, + error_score=np.nan, + fit_params=None, + ax=None, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Create a validation curve display from an estimator. + + Read more in the :ref:`User Guide ` for general + information about the visualization API and :ref:`detailed + documentation ` regarding the validation curve + visualization. + + Parameters + ---------- + estimator : object type that implements the "fit" and "predict" methods + An object of that type which is cloned for each validation. + + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + param_name : str + Name of the parameter that will be varied. + + param_range : array-like of shape (n_values,) + The values of the parameter that will be evaluated. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and `y` is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. In all + other cases, :class:`~sklearn.model_selection.KFold` is used. These + splitters are instantiated with `shuffle=False` so the splits will + be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + scoring : str or callable, default=None + A string (see :ref:`scoring_parameter`) or + a scorer callable object / function with signature + `scorer(estimator, X, y)` (see :ref:`scoring_callable`). + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and + computing the score are parallelized over the different training + and test sets. `None` means 1 unless in a + :obj:`joblib.parallel_backend` context. `-1` means using all + processors. See :term:`Glossary ` for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. 
+ + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator + fitting. If set to 'raise', the error is raised. If a numeric value + is given, FitFailedWarning is raised. + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.validation_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If `None`, no representation of the standard deviation + is displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.ValidationCurveDisplay` + Object that stores computed values. + + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import ValidationCurveDisplay + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> ValidationCurveDisplay.from_estimator( + ... logistic_regression, X, y, param_name=param_name, + ... param_range=param_range, + ... 
) + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + + score_name = _validate_score_name(score_name, scoring, negate_score) + + train_scores, test_scores = validation_curve( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + groups=groups, + cv=cv, + scoring=scoring, + n_jobs=n_jobs, + pre_dispatch=pre_dispatch, + verbose=verbose, + error_score=error_score, + fit_params=fit_params, + ) + + viz = cls( + param_name=param_name, + param_range=np.asarray(param_range), + train_scores=train_scores, + test_scores=test_scores, + score_name=score_name, + ) + return viz.plot( + ax=ax, + negate_score=negate_score, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/_search.py b/.venv/Lib/site-packages/sklearn/model_selection/_search.py new file mode 100644 index 0000000000000000000000000000000000000000..75266a41ae71d7f049d240db1a84dca14d477493 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/_search.py @@ -0,0 +1,1954 @@ +""" +The :mod:`sklearn.model_selection._search` includes utilities to fine-tune the +parameters of an estimator. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import operator +import time +import warnings +from abc import ABCMeta, abstractmethod +from collections import defaultdict +from collections.abc import Iterable, Mapping, Sequence +from copy import deepcopy +from functools import partial, reduce +from itertools import product + +import numpy as np +from numpy.ma import MaskedArray +from scipy.stats import rankdata + +from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier +from ..exceptions import NotFittedError +from ..metrics import check_scoring +from ..metrics._scorer import ( + _check_multimetric_scoring, + _MultimetricScorer, + get_scorer_names, +) +from ..utils import Bunch, check_random_state +from ..utils._estimator_html_repr import _VisualBlock +from ..utils._param_validation import HasMethods, Interval, StrOptions +from ..utils._tags import get_tags +from ..utils.deprecation import _deprecate_Xt_in_inverse_transform +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.parallel import Parallel, delayed +from ..utils.random import sample_without_replacement +from ..utils.validation import _check_method_params, check_is_fitted, indexable +from ._split import check_cv +from ._validation import ( + _aggregate_score_dicts, + _fit_and_score, + _insert_error_scores, + _normalize_score_results, + _warn_or_raise_about_fit_failures, +) + +__all__ = ["GridSearchCV", "ParameterGrid", "ParameterSampler", "RandomizedSearchCV"] + + +class ParameterGrid: + """Grid of parameters with a discrete number of values for each. + + Can be used to iterate over parameter value combinations with the + Python built-in function iter. + The order of the generated parameter combinations is deterministic. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + param_grid : dict of str to sequence, or sequence of such + The parameter grid to explore, as a dictionary mapping estimator + parameters to sequences of allowed values. + + An empty dict signifies default parameters. 
+ + A sequence of dicts signifies a sequence of grids to search, and is + useful to avoid exploring parameter combinations that make no sense + or have no effect. See the examples below. + + Examples + -------- + >>> from sklearn.model_selection import ParameterGrid + >>> param_grid = {'a': [1, 2], 'b': [True, False]} + >>> list(ParameterGrid(param_grid)) == ( + ... [{'a': 1, 'b': True}, {'a': 1, 'b': False}, + ... {'a': 2, 'b': True}, {'a': 2, 'b': False}]) + True + + >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}] + >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'}, + ... {'kernel': 'rbf', 'gamma': 1}, + ... {'kernel': 'rbf', 'gamma': 10}] + True + >>> ParameterGrid(grid)[1] == {'kernel': 'rbf', 'gamma': 1} + True + + See Also + -------- + GridSearchCV : Uses :class:`ParameterGrid` to perform a full parallelized + parameter search. + """ + + def __init__(self, param_grid): + if not isinstance(param_grid, (Mapping, Iterable)): + raise TypeError( + f"Parameter grid should be a dict or a list, got: {param_grid!r} of" + f" type {type(param_grid).__name__}" + ) + + if isinstance(param_grid, Mapping): + # wrap dictionary in a singleton list to support either dict + # or list of dicts + param_grid = [param_grid] + + # check if all entries are dictionaries of lists + for grid in param_grid: + if not isinstance(grid, dict): + raise TypeError(f"Parameter grid is not a dict ({grid!r})") + for key, value in grid.items(): + if isinstance(value, np.ndarray) and value.ndim > 1: + raise ValueError( + f"Parameter array for {key!r} should be one-dimensional, got:" + f" {value!r} with shape {value.shape}" + ) + if isinstance(value, str) or not isinstance( + value, (np.ndarray, Sequence) + ): + raise TypeError( + f"Parameter grid for parameter {key!r} needs to be a list or a" + f" numpy array, but got {value!r} (of type " + f"{type(value).__name__}) instead. Single values " + "need to be wrapped in a list with one element." + ) + if len(value) == 0: + raise ValueError( + f"Parameter grid for parameter {key!r} need " + f"to be a non-empty sequence, got: {value!r}" + ) + + self.param_grid = param_grid + + def __iter__(self): + """Iterate over the points in the grid. + + Returns + ------- + params : iterator over dict of str to any + Yields dictionaries mapping each estimator parameter to one of its + allowed values. + """ + for p in self.param_grid: + # Always sort the keys of a dictionary, for reproducibility + items = sorted(p.items()) + if not items: + yield {} + else: + keys, values = zip(*items) + for v in product(*values): + params = dict(zip(keys, v)) + yield params + + def __len__(self): + """Number of points on the grid.""" + # Product function that can handle iterables (np.prod can't). + product = partial(reduce, operator.mul) + return sum( + product(len(v) for v in p.values()) if p else 1 for p in self.param_grid + ) + + def __getitem__(self, ind): + """Get the parameters that would be ``ind``th in iteration + + Parameters + ---------- + ind : int + The iteration index + + Returns + ------- + params : dict of str to any + Equal to list(self)[ind] + """ + # This is used to make discrete sampling without replacement memory + # efficient. 
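+        # Decode `ind` as a mixed-radix number over each sub-grid. E.g. for
+        # {'a': [1, 2], 'b': [True, False]} and ind=2: keys are sorted then
+        # reversed to ('b', 'a') with sizes (2, 2); divmod(2, 2) -> (1, 0)
+        # selects b=True, then divmod(1, 2) -> (0, 1) selects a=2, matching
+        # list(grid)[2] == {'a': 2, 'b': True}.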
+        for sub_grid in self.param_grid:
+            # XXX: could memoize information used here
+            if not sub_grid:
+                if ind == 0:
+                    return {}
+                else:
+                    ind -= 1
+                    continue
+
+            # Reverse so most frequent cycling parameter comes first
+            keys, values_lists = zip(*sorted(sub_grid.items())[::-1])
+            sizes = [len(v_list) for v_list in values_lists]
+            total = np.prod(sizes)
+
+            if ind >= total:
+                # Try the next grid
+                ind -= total
+            else:
+                out = {}
+                for key, v_list, n in zip(keys, values_lists, sizes):
+                    ind, offset = divmod(ind, n)
+                    out[key] = v_list[offset]
+                return out
+
+        raise IndexError("ParameterGrid index out of range")
+
+
+class ParameterSampler:
+    """Generator on parameters sampled from given distributions.
+
+    Non-deterministic iterable over random candidate combinations for hyper-
+    parameter search. If all parameters are presented as a list,
+    sampling without replacement is performed. If at least one parameter
+    is given as a distribution, sampling with replacement is used.
+    It is highly recommended to use continuous distributions for continuous
+    parameters.
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    param_distributions : dict
+        Dictionary with parameters names (`str`) as keys and distributions
+        or lists of parameters to try. Distributions must provide a ``rvs``
+        method for sampling (such as those from scipy.stats.distributions).
+        If a list is given, it is sampled uniformly.
+        If a list of dicts is given, first a dict is sampled uniformly, and
+        then a parameter is sampled using that dict as above.
+
+    n_iter : int
+        Number of parameter settings that are produced.
+
+    random_state : int, RandomState instance or None, default=None
+        Pseudo random number generator state used for random uniform sampling
+        from lists of possible values instead of scipy.stats distributions.
+        Pass an int for reproducible output across multiple
+        function calls.
+        See :term:`Glossary `.
+
+    Returns
+    -------
+    params : dict of str to any
+        **Yields** dictionaries mapping each estimator parameter to
+        a sampled value.
+
+    Examples
+    --------
+    >>> from sklearn.model_selection import ParameterSampler
+    >>> from scipy.stats.distributions import expon
+    >>> import numpy as np
+    >>> rng = np.random.RandomState(0)
+    >>> param_grid = {'a':[1, 2], 'b': expon()}
+    >>> param_list = list(ParameterSampler(param_grid, n_iter=4,
+    ...                                    random_state=rng))
+    >>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items())
+    ...                 for d in param_list]
+    >>> rounded_list == [{'b': 0.89856, 'a': 1},
+    ...                  {'b': 0.923223, 'a': 1},
+    ...                  {'b': 1.878964, 'a': 2},
+    ...
{'b': 1.038159, 'a': 2}] + True + """ + + def __init__(self, param_distributions, n_iter, *, random_state=None): + if not isinstance(param_distributions, (Mapping, Iterable)): + raise TypeError( + "Parameter distribution is not a dict or a list," + f" got: {param_distributions!r} of type " + f"{type(param_distributions).__name__}" + ) + + if isinstance(param_distributions, Mapping): + # wrap dictionary in a singleton list to support either dict + # or list of dicts + param_distributions = [param_distributions] + + for dist in param_distributions: + if not isinstance(dist, dict): + raise TypeError( + "Parameter distribution is not a dict ({!r})".format(dist) + ) + for key in dist: + if not isinstance(dist[key], Iterable) and not hasattr( + dist[key], "rvs" + ): + raise TypeError( + f"Parameter grid for parameter {key!r} is not iterable " + f"or a distribution (value={dist[key]})" + ) + self.n_iter = n_iter + self.random_state = random_state + self.param_distributions = param_distributions + + def _is_all_lists(self): + return all( + all(not hasattr(v, "rvs") for v in dist.values()) + for dist in self.param_distributions + ) + + def __iter__(self): + rng = check_random_state(self.random_state) + + # if all distributions are given as lists, we want to sample without + # replacement + if self._is_all_lists(): + # look up sampled parameter settings in parameter grid + param_grid = ParameterGrid(self.param_distributions) + grid_size = len(param_grid) + n_iter = self.n_iter + + if grid_size < n_iter: + warnings.warn( + "The total space of parameters %d is smaller " + "than n_iter=%d. Running %d iterations. For exhaustive " + "searches, use GridSearchCV." % (grid_size, self.n_iter, grid_size), + UserWarning, + ) + n_iter = grid_size + for i in sample_without_replacement(grid_size, n_iter, random_state=rng): + yield param_grid[i] + + else: + for _ in range(self.n_iter): + dist = rng.choice(self.param_distributions) + # Always sort the keys of a dictionary, for reproducibility + items = sorted(dist.items()) + params = dict() + for k, v in items: + if hasattr(v, "rvs"): + params[k] = v.rvs(random_state=rng) + else: + params[k] = v[rng.randint(len(v))] + yield params + + def __len__(self): + """Number of points that will be sampled.""" + if self._is_all_lists(): + grid_size = len(ParameterGrid(self.param_distributions)) + return min(self.n_iter, grid_size) + else: + return self.n_iter + + +def _check_refit(search_cv, attr): + if not search_cv.refit: + raise AttributeError( + f"This {type(search_cv).__name__} instance was initialized with " + f"`refit=False`. {attr} is available only after refitting on the best " + "parameters. You can refit an estimator manually using the " + "`best_params_` attribute" + ) + + +def _search_estimator_has(attr): + """Check if we can delegate a method to the underlying estimator. + + Calling a prediction method will only be available if `refit=True`. In + such case, we check first the fitted best estimator. If it is not + fitted, we check the unfitted estimator. + + Checking the unfitted estimator allows to use `hasattr` on the `SearchCV` + instance even before calling `fit`. 
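+
+    For instance (illustrative), `hasattr(search, "predict_proba")` is True
+    for an unfitted `GridSearchCV(LogisticRegression(), param_grid)` because
+    the unfitted sub-estimator already exposes `predict_proba`.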
+ """ + + def check(self): + _check_refit(self, attr) + if hasattr(self, "best_estimator_"): + # raise an AttributeError if `attr` does not exist + getattr(self.best_estimator_, attr) + return True + # raise an AttributeError if `attr` does not exist + getattr(self.estimator, attr) + return True + + return check + + +def _yield_masked_array_for_each_param(candidate_params): + """ + Yield a masked array for each candidate param. + + `candidate_params` is a sequence of params which were used in + a `GridSearchCV`. We use masked arrays for the results, as not + all params are necessarily present in each element of + `candidate_params`. For example, if using `GridSearchCV` with + a `SVC` model, then one might search over params like: + + - kernel=["rbf"], gamma=[0.1, 1] + - kernel=["poly"], degree=[1, 2] + + and then param `'gamma'` would not be present in entries of + `candidate_params` corresponding to `kernel='poly'`. + """ + n_candidates = len(candidate_params) + param_results = defaultdict(dict) + + for cand_idx, params in enumerate(candidate_params): + for name, value in params.items(): + param_results["param_%s" % name][cand_idx] = value + + for key, param_result in param_results.items(): + param_list = list(param_result.values()) + try: + arr = np.array(param_list) + except ValueError: + # This can happen when param_list contains lists of different + # lengths, for example: + # param_list=[[1], [2, 3]] + arr_dtype = np.dtype(object) + else: + # There are two cases when we don't use the automatically inferred + # dtype when creating the array and we use object instead: + # - string dtype + # - when array.ndim > 1, that means that param_list was something + # like a list of same-size sequences, which gets turned into a + # multi-dimensional array but we want a 1d array + arr_dtype = arr.dtype if arr.dtype.kind != "U" and arr.ndim == 1 else object + + # Use one MaskedArray and mask all the places where the param is not + # applicable for that candidate (which may not contain all the params). 
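+        # Illustrative example: for candidate_params
+        # [{'kernel': 'rbf', 'gamma': 0.1}, {'kernel': 'poly', 'degree': 2}],
+        # the 'param_gamma' array holds 0.1 unmasked at index 0 and stays
+        # masked at index 1, where gamma does not apply.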
+ ma = MaskedArray(np.empty(n_candidates, dtype=arr_dtype), mask=True) + for index, value in param_result.items(): + # Setting the value at an index unmasks that index + ma[index] = value + yield (key, ma) + + +class BaseSearchCV(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): + """Abstract base class for hyper parameter search with cross-validation.""" + + _parameter_constraints: dict = { + "estimator": [HasMethods(["fit"])], + "scoring": [ + StrOptions(set(get_scorer_names())), + callable, + list, + tuple, + dict, + None, + ], + "n_jobs": [numbers.Integral, None], + "refit": ["boolean", str, callable], + "cv": ["cv_object"], + "verbose": ["verbose"], + "pre_dispatch": [numbers.Integral, str], + "error_score": [StrOptions({"raise"}), numbers.Real], + "return_train_score": ["boolean"], + } + + @abstractmethod + def __init__( + self, + estimator, + *, + scoring=None, + n_jobs=None, + refit=True, + cv=None, + verbose=0, + pre_dispatch="2*n_jobs", + error_score=np.nan, + return_train_score=True, + ): + self.scoring = scoring + self.estimator = estimator + self.n_jobs = n_jobs + self.refit = refit + self.cv = cv + self.verbose = verbose + self.pre_dispatch = pre_dispatch + self.error_score = error_score + self.return_train_score = return_train_score + + @property + # TODO(1.8) remove this property + def _estimator_type(self): + return self.estimator._estimator_type + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + sub_estimator_tags = get_tags(self.estimator) + tags.estimator_type = sub_estimator_tags.estimator_type + tags.classifier_tags = deepcopy(sub_estimator_tags.classifier_tags) + tags.regressor_tags = deepcopy(sub_estimator_tags.regressor_tags) + # allows cross-validation to see 'precomputed' metrics + tags.input_tags.pairwise = get_tags(self.estimator).input_tags.pairwise + tags.array_api_support = get_tags(self.estimator).array_api_support + return tags + + def score(self, X, y=None, **params): + """Return the score on the given data, if the estimator has been refit. + + This uses the score defined by ``scoring`` where provided, and the + ``best_estimator_.score`` method otherwise. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples, n_output) \ + or (n_samples,), default=None + Target relative to X for classification or regression; + None for unsupervised learning. + + **params : dict + Parameters to be passed to the underlying scorer(s). + + .. versionadded:: 1.4 + Only available if `enable_metadata_routing=True`. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + score : float + The score defined by ``scoring`` if provided, and the + ``best_estimator_.score`` method otherwise. 
+ """ + _check_refit(self, "score") + check_is_fitted(self) + + _raise_for_params(params, self, "score") + + if _routing_enabled(): + score_params = process_routing(self, "score", **params).scorer["score"] + else: + score_params = dict() + + if self.scorer_ is None: + raise ValueError( + "No score function explicitly defined, " + "and the estimator doesn't provide one %s" % self.best_estimator_ + ) + if isinstance(self.scorer_, dict): + if self.multimetric_: + scorer = self.scorer_[self.refit] + else: + scorer = self.scorer_ + return scorer(self.best_estimator_, X, y, **score_params) + + # callable + score = self.scorer_(self.best_estimator_, X, y, **score_params) + if self.multimetric_: + score = score[self.refit] + return score + + @available_if(_search_estimator_has("score_samples")) + def score_samples(self, X): + """Call score_samples on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``score_samples``. + + .. versionadded:: 0.24 + + Parameters + ---------- + X : iterable + Data to predict on. Must fulfill input requirements + of the underlying estimator. + + Returns + ------- + y_score : ndarray of shape (n_samples,) + The ``best_estimator_.score_samples`` method. + """ + check_is_fitted(self) + return self.best_estimator_.score_samples(X) + + @available_if(_search_estimator_has("predict")) + def predict(self, X): + """Call predict on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``predict``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + The predicted labels or values for `X` based on the estimator with + the best found parameters. + """ + check_is_fitted(self) + return self.best_estimator_.predict(X) + + @available_if(_search_estimator_has("predict_proba")) + def predict_proba(self, X): + """Call predict_proba on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``predict_proba``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_classes) + Predicted class probabilities for `X` based on the estimator with + the best found parameters. The order of the classes corresponds + to that in the fitted attribute :term:`classes_`. + """ + check_is_fitted(self) + return self.best_estimator_.predict_proba(X) + + @available_if(_search_estimator_has("predict_log_proba")) + def predict_log_proba(self, X): + """Call predict_log_proba on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``predict_log_proba``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_classes) + Predicted class log-probabilities for `X` based on the estimator + with the best found parameters. The order of the classes + corresponds to that in the fitted attribute :term:`classes_`. 
+ """ + check_is_fitted(self) + return self.best_estimator_.predict_log_proba(X) + + @available_if(_search_estimator_has("decision_function")) + def decision_function(self, X): + """Call decision_function on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``decision_function``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + y_score : ndarray of shape (n_samples,) or (n_samples, n_classes) \ + or (n_samples, n_classes * (n_classes-1) / 2) + Result of the decision function for `X` based on the estimator with + the best found parameters. + """ + check_is_fitted(self) + return self.best_estimator_.decision_function(X) + + @available_if(_search_estimator_has("transform")) + def transform(self, X): + """Call transform on the estimator with the best found parameters. + + Only available if the underlying estimator supports ``transform`` and + ``refit=True``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) + `X` transformed in the new space based on the estimator with + the best found parameters. + """ + check_is_fitted(self) + return self.best_estimator_.transform(X) + + @available_if(_search_estimator_has("inverse_transform")) + def inverse_transform(self, X=None, Xt=None): + """Call inverse_transform on the estimator with the best found params. + + Only available if the underlying estimator implements + ``inverse_transform`` and ``refit=True``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Xt : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + .. deprecated:: 1.5 + `Xt` was deprecated in 1.5 and will be removed in 1.7. Use `X` instead. + + Returns + ------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Result of the `inverse_transform` function for `Xt` based on the + estimator with the best found parameters. + """ + X = _deprecate_Xt_in_inverse_transform(X, Xt) + check_is_fitted(self) + return self.best_estimator_.inverse_transform(X) + + @property + def n_features_in_(self): + """Number of features seen during :term:`fit`. + + Only available when `refit=True`. + """ + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the search estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute.".format( + self.__class__.__name__ + ) + ) from nfe + + return self.best_estimator_.n_features_in_ + + @property + def classes_(self): + """Class labels. + + Only available when `refit=True` and the estimator is a classifier. + """ + _search_estimator_has("classes_")(self) + return self.best_estimator_.classes_ + + def _run_search(self, evaluate_candidates): + """Repeatedly calls `evaluate_candidates` to conduct a search. 
+
+        This method, implemented in sub-classes, makes it possible to
+        customize the scheduling of evaluations: GridSearchCV and
+        RandomizedSearchCV schedule evaluations for their whole parameter
+        search space at once, but other more sequential approaches are also
+        possible: for instance, it is possible to iteratively schedule
+        evaluations for new regions of the parameter search space based on
+        previously collected evaluation results. This makes it possible to
+        implement Bayesian optimization or more generally sequential
+        model-based optimization by deriving from the BaseSearchCV abstract
+        base class. For example, Successive Halving is implemented by calling
+        `evaluate_candidates` multiple times (once per iteration of the SH
+        process), each time passing a different set of candidates with `X`
+        and `y` of increasing sizes.
+
+        Parameters
+        ----------
+        evaluate_candidates : callable
+            This callback accepts:
+                - a list of candidates, where each candidate is a dict of
+                  parameter settings.
+                - an optional `cv` parameter which can be used to e.g.
+                  evaluate candidates on different dataset splits, or
+                  evaluate candidates on subsampled data (as done in the
+                  SuccessiveHalving estimators). By default, the original `cv`
+                  parameter is used, and it is available as a private
+                  `_checked_cv_orig` attribute.
+                - an optional `more_results` dict. Each key will be added to
+                  the `cv_results_` attribute. Values should be lists of
+                  length `n_candidates`.
+
+            It returns a dict of all results so far, formatted like
+            ``cv_results_``.
+
+            Important note (relevant whether the default cv is used or not):
+            in randomized splitters, and unless the random_state parameter of
+            cv was set to an int, calling cv.split() multiple times will
+            yield different splits. Since cv.split() is called in
+            evaluate_candidates, this means that candidates will be evaluated
+            on different splits each time evaluate_candidates is called. This
+            might be a methodological issue depending on the search strategy
+            that you're implementing. To prevent randomized splitters from
+            being used, you may use _split._yields_constant_splits().
+
+        Examples
+        --------
+
+        ::
+
+            def _run_search(self, evaluate_candidates):
+                'Try C=0.1 only if C=1 is better than C=10'
+                all_results = evaluate_candidates([{'C': 1}, {'C': 10}])
+                score = all_results['mean_test_score']
+                if score[0] < score[1]:
+                    evaluate_candidates([{'C': 0.1}])
+        """
+        raise NotImplementedError("_run_search not implemented.")
+
+    def _check_refit_for_multimetric(self, scores):
+        """Check that `refit` is compatible with the computed `scores`."""
+        multimetric_refit_msg = (
+            "For multi-metric scoring, the parameter refit must be set to a "
+            "scorer key or a callable to refit an estimator with the best "
+            "parameter setting on the whole data and make the best_* "
+            "attributes available for that metric. If this is not needed, "
+            f"refit should be set to False explicitly. {self.refit!r} was "
+            "passed."
+        )
+
+        valid_refit_dict = isinstance(self.refit, str) and self.refit in scores
+
+        if (
+            self.refit is not False
+            and not valid_refit_dict
+            and not callable(self.refit)
+        ):
+            raise ValueError(multimetric_refit_msg)
+
+    @staticmethod
+    def _select_best_index(refit, refit_metric, results):
+        """Select the index of the best combination of hyperparameters."""
+        if callable(refit):
+            # If callable, refit is expected to return the index of the best
+            # parameter set.
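+            # A hypothetical example of such a callable, roughly equivalent
+            # to the default rank-based selection below:
+            #   refit=lambda results: int(np.argmax(results["mean_test_score"]))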
+            best_index = refit(results)
+            if not isinstance(best_index, numbers.Integral):
+                raise TypeError("best_index_ returned is not an integer")
+            if best_index < 0 or best_index >= len(results["params"]):
+                raise IndexError("best_index_ index out of range")
+        else:
+            best_index = results[f"rank_test_{refit_metric}"].argmin()
+        return best_index
+
+    def _get_scorers(self):
+        """Get the scorer(s) to be used.
+
+        This is used in ``fit`` and ``get_metadata_routing``.
+
+        Returns
+        -------
+        scorers, refit_metric
+        """
+        refit_metric = "score"
+
+        if callable(self.scoring):
+            scorers = self.scoring
+        elif self.scoring is None or isinstance(self.scoring, str):
+            scorers = check_scoring(self.estimator, self.scoring)
+        else:
+            scorers = _check_multimetric_scoring(self.estimator, self.scoring)
+            self._check_refit_for_multimetric(scorers)
+            refit_metric = self.refit
+            scorers = _MultimetricScorer(
+                scorers=scorers, raise_exc=(self.error_score == "raise")
+            )
+
+        return scorers, refit_metric
+
+    def _get_routed_params_for_fit(self, params):
+        """Get the parameters to be used for routing.
+
+        This is a method instead of a snippet in ``fit`` since it's used twice,
+        here in ``fit``, and in ``HalvingRandomSearchCV.fit``.
+        """
+        if _routing_enabled():
+            routed_params = process_routing(self, "fit", **params)
+        else:
+            params = params.copy()
+            groups = params.pop("groups", None)
+            routed_params = Bunch(
+                estimator=Bunch(fit=params),
+                splitter=Bunch(split={"groups": groups}),
+                scorer=Bunch(score={}),
+            )
+        return routed_params
+
+    @_fit_context(
+        # *SearchCV.estimator is not validated yet
+        prefer_skip_nested_validation=False
+    )
+    def fit(self, X, y=None, **params):
+        """Run fit with all sets of parameters.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features) or (n_samples, n_samples)
+            Training vectors, where `n_samples` is the number of samples and
+            `n_features` is the number of features. For precomputed kernel or
+            distance matrix, the expected shape of X is (n_samples, n_samples).
+
+        y : array-like of shape (n_samples, n_output) \
+                or (n_samples,), default=None
+            Target relative to X for classification or regression;
+            None for unsupervised learning.
+
+        **params : dict of str -> object
+            Parameters passed to the ``fit`` method of the estimator, the scorer,
+            and the CV splitter.
+
+            If a fit parameter is an array-like whose length is equal to
+            `n_samples`, then it will be split by cross-validation along with
+            `X` and `y`. For example, the :term:`sample_weight` parameter is
+            split because `len(sample_weight) = len(X)`. However, this behavior
+            does not apply to `groups`, which is passed to the splitter configured
+            via the `cv` parameter of the constructor. Thus, `groups` is used
+            *to perform the split* and determines which samples are
+            assigned to each side of a split.
+
+        Returns
+        -------
+        self : object
+            Instance of fitted estimator.
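+
+        Examples
+        --------
+        An illustrative sketch of a sample-aligned fit parameter (``w`` is a
+        hypothetical weight array with ``len(w) == len(X)``)::
+
+            search = GridSearchCV(SVC(), {"C": [1, 10]})
+            search.fit(X, y, sample_weight=w)  # w is split along with X and y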
+        """
+        estimator = self.estimator
+        scorers, refit_metric = self._get_scorers()
+
+        X, y = indexable(X, y)
+        params = _check_method_params(X, params=params)
+
+        routed_params = self._get_routed_params_for_fit(params)
+
+        cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator))
+        n_splits = cv_orig.get_n_splits(X, y, **routed_params.splitter.split)
+
+        base_estimator = clone(self.estimator)
+
+        parallel = Parallel(n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch)
+
+        fit_and_score_kwargs = dict(
+            scorer=scorers,
+            fit_params=routed_params.estimator.fit,
+            score_params=routed_params.scorer.score,
+            return_train_score=self.return_train_score,
+            return_n_test_samples=True,
+            return_times=True,
+            return_parameters=False,
+            error_score=self.error_score,
+            verbose=self.verbose,
+        )
+        results = {}
+        with parallel:
+            all_candidate_params = []
+            all_out = []
+            all_more_results = defaultdict(list)
+
+            def evaluate_candidates(candidate_params, cv=None, more_results=None):
+                cv = cv or cv_orig
+                candidate_params = list(candidate_params)
+                n_candidates = len(candidate_params)
+
+                if self.verbose > 0:
+                    print(
+                        "Fitting {0} folds for each of {1} candidates,"
+                        " totalling {2} fits".format(
+                            n_splits, n_candidates, n_candidates * n_splits
+                        )
+                    )
+
+                out = parallel(
+                    delayed(_fit_and_score)(
+                        clone(base_estimator),
+                        X,
+                        y,
+                        train=train,
+                        test=test,
+                        parameters=parameters,
+                        split_progress=(split_idx, n_splits),
+                        candidate_progress=(cand_idx, n_candidates),
+                        **fit_and_score_kwargs,
+                    )
+                    for (cand_idx, parameters), (split_idx, (train, test)) in product(
+                        enumerate(candidate_params),
+                        enumerate(cv.split(X, y, **routed_params.splitter.split)),
+                    )
+                )
+
+                if len(out) < 1:
+                    raise ValueError(
+                        "No fits were performed. "
+                        "Was the CV iterator empty? "
+                        "Were there no candidates?"
+                    )
+                elif len(out) != n_candidates * n_splits:
+                    raise ValueError(
+                        "cv.split and cv.get_n_splits returned "
+                        "inconsistent results. Expected {} "
+                        "splits, got {}".format(n_splits, len(out) // n_candidates)
+                    )
+
+                _warn_or_raise_about_fit_failures(out, self.error_score)
+
+                # For callable self.scoring, the return type is only known after
+                # calling. If the return type is a dictionary, the error scores
+                # can now be inserted with the correct key. The type checking
+                # of out will be done in `_insert_error_scores`.
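+                # E.g. (illustrative) a custom callable such as
+                #   def scoring(estimator, X, y):
+                #       return {"accuracy": ..., "f1": ...}
+                # only reveals its dict keys once called, so dict-shaped error
+                # scores for failed fits are patched in afterwards.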
+ if callable(self.scoring): + _insert_error_scores(out, self.error_score) + + all_candidate_params.extend(candidate_params) + all_out.extend(out) + + if more_results is not None: + for key, value in more_results.items(): + all_more_results[key].extend(value) + + nonlocal results + results = self._format_results( + all_candidate_params, n_splits, all_out, all_more_results + ) + + return results + + self._run_search(evaluate_candidates) + + # multimetric is determined here because in the case of a callable + # self.scoring the return type is only known after calling + first_test_score = all_out[0]["test_scores"] + self.multimetric_ = isinstance(first_test_score, dict) + + # check refit_metric now for a callable scorer that is multimetric + if callable(self.scoring) and self.multimetric_: + self._check_refit_for_multimetric(first_test_score) + refit_metric = self.refit + + # For multi-metric evaluation, store the best_index_, best_params_ and + # best_score_ iff refit is one of the scorer names + # In single metric evaluation, refit_metric is "score" + if self.refit or not self.multimetric_: + self.best_index_ = self._select_best_index( + self.refit, refit_metric, results + ) + if not callable(self.refit): + # With a non-custom callable, we can select the best score + # based on the best index + self.best_score_ = results[f"mean_test_{refit_metric}"][ + self.best_index_ + ] + self.best_params_ = results["params"][self.best_index_] + + if self.refit: + # here we clone the estimator as well as the parameters, since + # sometimes the parameters themselves might be estimators, e.g. + # when we search over different estimators in a pipeline. + # ref: https://github.com/scikit-learn/scikit-learn/pull/26786 + self.best_estimator_ = clone(base_estimator).set_params( + **clone(self.best_params_, safe=False) + ) + + refit_start_time = time.time() + if y is not None: + self.best_estimator_.fit(X, y, **routed_params.estimator.fit) + else: + self.best_estimator_.fit(X, **routed_params.estimator.fit) + refit_end_time = time.time() + self.refit_time_ = refit_end_time - refit_start_time + + if hasattr(self.best_estimator_, "feature_names_in_"): + self.feature_names_in_ = self.best_estimator_.feature_names_in_ + + # Store the only scorer not as a dict for single metric evaluation + if isinstance(scorers, _MultimetricScorer): + self.scorer_ = scorers._scorers + else: + self.scorer_ = scorers + + self.cv_results_ = results + self.n_splits_ = n_splits + + return self + + def _format_results(self, candidate_params, n_splits, out, more_results=None): + n_candidates = len(candidate_params) + out = _aggregate_score_dicts(out) + + results = dict(more_results or {}) + for key, val in results.items(): + # each value is a list (as per evaluate_candidate's convention) + # we convert it to an array for consistency with the other keys + results[key] = np.asarray(val) + + def _store(key_name, array, weights=None, splits=False, rank=False): + """A small helper to store the scores/times to the cv_results_""" + # When iterated first by splits, then by parameters + # We want `array` to have `n_candidates` rows and `n_splits` cols. 
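+            # `out` arrives candidate-major (candidates are the outer loop of
+            # the product in `evaluate_candidates`), e.g. with 2 candidates
+            # and 3 splits: [c0s0, c0s1, c0s2, c1s0, c1s1, c1s2], so the
+            # row-major reshape below yields one row per candidate.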
+ array = np.array(array, dtype=np.float64).reshape(n_candidates, n_splits) + if splits: + for split_idx in range(n_splits): + # Uses closure to alter the results + results["split%d_%s" % (split_idx, key_name)] = array[:, split_idx] + + array_means = np.average(array, axis=1, weights=weights) + results["mean_%s" % key_name] = array_means + + if key_name.startswith(("train_", "test_")) and np.any( + ~np.isfinite(array_means) + ): + warnings.warn( + ( + f"One or more of the {key_name.split('_')[0]} scores " + f"are non-finite: {array_means}" + ), + category=UserWarning, + ) + + # Weighted std is not directly available in numpy + array_stds = np.sqrt( + np.average( + (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights + ) + ) + results["std_%s" % key_name] = array_stds + + if rank: + # When the fit/scoring fails `array_means` contains NaNs, we + # will exclude them from the ranking process and consider them + # as tied with the worst performers. + if np.isnan(array_means).all(): + # All fit/scoring routines failed. + rank_result = np.ones_like(array_means, dtype=np.int32) + else: + min_array_means = np.nanmin(array_means) - 1 + array_means = np.nan_to_num(array_means, nan=min_array_means) + rank_result = rankdata(-array_means, method="min").astype( + np.int32, copy=False + ) + results["rank_%s" % key_name] = rank_result + + _store("fit_time", out["fit_time"]) + _store("score_time", out["score_time"]) + # Store a list of param dicts at the key 'params' + for param, ma in _yield_masked_array_for_each_param(candidate_params): + results[param] = ma + results["params"] = candidate_params + + test_scores_dict = _normalize_score_results(out["test_scores"]) + if self.return_train_score: + train_scores_dict = _normalize_score_results(out["train_scores"]) + + for scorer_name in test_scores_dict: + # Computed the (weighted) mean and std for test scores alone + _store( + "test_%s" % scorer_name, + test_scores_dict[scorer_name], + splits=True, + rank=True, + weights=None, + ) + if self.return_train_score: + _store( + "train_%s" % scorer_name, + train_scores_dict[scorer_name], + splits=True, + ) + + return results + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.4 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__) + router.add( + estimator=self.estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + + scorer, _ = self._get_scorers() + router.add( + scorer=scorer, + method_mapping=MethodMapping() + .add(caller="score", callee="score") + .add(caller="fit", callee="score"), + ) + router.add( + splitter=self.cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + return router + + def _sk_visual_block_(self): + if hasattr(self, "best_estimator_"): + key, estimator = "best_estimator_", self.best_estimator_ + else: + key, estimator = "estimator", self.estimator + + return _VisualBlock( + "parallel", + [estimator], + names=[f"{key}: {estimator.__class__.__name__}"], + name_details=[str(estimator)], + ) + + +class GridSearchCV(BaseSearchCV): + """Exhaustive search over specified parameter values for an estimator. + + Important members are fit, predict. + + GridSearchCV implements a "fit" and a "score" method. 
+ It also implements "score_samples", "predict", "predict_proba", + "decision_function", "transform" and "inverse_transform" if they are + implemented in the estimator used. + + The parameters of the estimator used to apply these methods are optimized + by cross-validated grid-search over a parameter grid. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``scoring`` must be passed. + + param_grid : dict or list of dictionaries + Dictionary with parameters names (`str`) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + scoring : str, callable, list, tuple or dict, default=None + Strategy to evaluate the performance of the cross-validated model on + the test set. + + If `scoring` represents a single score, one can use: + + - a single string (see :ref:`scoring_parameter`); + - a callable (see :ref:`scoring_callable`) that returns a single value. + + If `scoring` represents multiple scores, one can use: + + - a list or tuple of unique strings; + - a callable returning a dictionary where the keys are the metric + names and the values are the metric scores; + - a dictionary with metric names as keys and callables as values. + + See :ref:`multimetric_grid_search` for an example. + + n_jobs : int, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + + refit : bool, str, or callable, default=True + Refit an estimator using the best found parameters on the whole + dataset. + + For multiple metric evaluation, this needs to be a `str` denoting the + scorer that would be used to find the best parameters for refitting + the estimator at the end. + + Where there are considerations other than maximum score in + choosing a best estimator, ``refit`` can be set to a function which + returns the selected ``best_index_`` given ``cv_results_``. In that + case, the ``best_estimator_`` and ``best_params_`` will be set + according to the returned ``best_index_`` while the ``best_score_`` + attribute will not be available. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``GridSearchCV`` instance. + + Also for multiple metric evaluation, the attributes ``best_index_``, + ``best_score_`` and ``best_params_`` will only be available if + ``refit`` is set and all of them will be determined w.r.t this specific + scorer. + + See ``scoring`` parameter to know more about multiple metric + evaluation. + + See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_digits.py` + to see how to design a custom selection strategy using a callable + via `refit`. + + .. versionchanged:: 0.20 + Support for callable added. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. 
+ Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + verbose : int + Controls the verbosity: the higher, the more messages. + + - >1 : the computation time for each fold and parameter candidate is + displayed; + - >2 : the score is also displayed; + - >3 : the fold and candidate parameter indexes are also displayed + together with the starting time of the computation. + + pre_dispatch : int, or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately created and spawned. Use + this for lightweight and fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. If a numeric value is given, + FitFailedWarning is raised. This parameter does not affect the refit + step, which will always raise the error. + + return_train_score : bool, default=False + If ``False``, the ``cv_results_`` attribute will not include training + scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + + Attributes + ---------- + cv_results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas ``DataFrame``. 
+ + For instance the below given table + + +------------+-----------+------------+-----------------+---+---------+ + |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_t...| + +============+===========+============+=================+===+=========+ + | 'poly' | -- | 2 | 0.80 |...| 2 | + +------------+-----------+------------+-----------------+---+---------+ + | 'poly' | -- | 3 | 0.70 |...| 4 | + +------------+-----------+------------+-----------------+---+---------+ + | 'rbf' | 0.1 | -- | 0.80 |...| 3 | + +------------+-----------+------------+-----------------+---+---------+ + | 'rbf' | 0.2 | -- | 0.93 |...| 1 | + +------------+-----------+------------+-----------------+---+---------+ + + will be represented by a ``cv_results_`` dict of:: + + { + 'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'], + mask = [False False False False]...) + 'param_gamma': masked_array(data = [-- -- 0.1 0.2], + mask = [ True True False False]...), + 'param_degree': masked_array(data = [2.0 3.0 -- --], + mask = [False False True True]...), + 'split0_test_score' : [0.80, 0.70, 0.80, 0.93], + 'split1_test_score' : [0.82, 0.50, 0.70, 0.78], + 'mean_test_score' : [0.81, 0.60, 0.75, 0.85], + 'std_test_score' : [0.01, 0.10, 0.05, 0.08], + 'rank_test_score' : [2, 4, 3, 1], + 'split0_train_score' : [0.80, 0.92, 0.70, 0.93], + 'split1_train_score' : [0.82, 0.55, 0.70, 0.87], + 'mean_train_score' : [0.81, 0.74, 0.70, 0.90], + 'std_train_score' : [0.01, 0.19, 0.00, 0.03], + 'mean_fit_time' : [0.73, 0.63, 0.43, 0.49], + 'std_fit_time' : [0.01, 0.02, 0.01, 0.01], + 'mean_score_time' : [0.01, 0.06, 0.04, 0.04], + 'std_score_time' : [0.00, 0.00, 0.00, 0.01], + 'params' : [{'kernel': 'poly', 'degree': 2}, ...], + } + + NOTE + + The key ``'params'`` is used to store a list of parameter + settings dicts for all the parameter candidates. + + The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and + ``std_score_time`` are all in seconds. + + For multi-metric evaluation, the scores for all the scorers are + available in the ``cv_results_`` dict at the keys ending with that + scorer's name (``'_'``) instead of ``'_score'`` shown + above. ('split0_test_precision', 'mean_train_precision' etc.) + + best_estimator_ : estimator + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if ``refit=False``. + + See ``refit`` parameter for more information on allowed values. + + best_score_ : float + Mean cross-validated score of the best_estimator + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + This attribute is not available if ``refit`` is a function. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + best_index_ : int + The index (of the ``cv_results_`` arrays) which corresponds to the best + candidate parameter setting. + + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest + mean score (``search.best_score_``). + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + scorer_ : function or a dict + Scorer function used on the held out data to choose the best + parameters for the model. + + For multi-metric evaluation, this attribute holds the validated + ``scoring`` dict which maps the scorer key to the scorer callable. 
+ + n_splits_ : int + The number of cross-validation splits (folds/iterations). + + refit_time_ : float + Seconds used for refitting the best model on the whole dataset. + + This is present only if ``refit`` is not False. + + .. versionadded:: 0.20 + + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `n_features_in_` when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `feature_names_in_` when fit. + + .. versionadded:: 1.0 + + See Also + -------- + ParameterGrid : Generates all the combinations of a hyperparameter grid. + train_test_split : Utility function to split the data into a development + set usable for fitting a GridSearchCV instance and an evaluation set + for its final evaluation. + sklearn.metrics.make_scorer : Make a scorer from a performance metric or + loss function. + + Notes + ----- + The parameters selected are those that maximize the score of the left out + data, unless an explicit score is passed in which case it is used instead. + + If `n_jobs` was set to a value higher than one, the data is copied for each + point in the grid (and not `n_jobs` times). This is done for efficiency + reasons if individual jobs take very little time, but may raise errors if + the dataset is large and not enough memory is available. A workaround in + this case is to set `pre_dispatch`. Then, the memory is copied only + `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * + n_jobs`. + + Examples + -------- + >>> from sklearn import svm, datasets + >>> from sklearn.model_selection import GridSearchCV + >>> iris = datasets.load_iris() + >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} + >>> svc = svm.SVC() + >>> clf = GridSearchCV(svc, parameters) + >>> clf.fit(iris.data, iris.target) + GridSearchCV(estimator=SVC(), + param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')}) + >>> sorted(clf.cv_results_.keys()) + ['mean_fit_time', 'mean_score_time', 'mean_test_score',... + 'param_C', 'param_kernel', 'params',... + 'rank_test_score', 'split0_test_score',... + 'split2_test_score', ... 
+ 'std_fit_time', 'std_score_time', 'std_test_score'] + """ + + _parameter_constraints: dict = { + **BaseSearchCV._parameter_constraints, + "param_grid": [dict, list], + } + + def __init__( + self, + estimator, + param_grid, + *, + scoring=None, + n_jobs=None, + refit=True, + cv=None, + verbose=0, + pre_dispatch="2*n_jobs", + error_score=np.nan, + return_train_score=False, + ): + super().__init__( + estimator=estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + cv=cv, + verbose=verbose, + pre_dispatch=pre_dispatch, + error_score=error_score, + return_train_score=return_train_score, + ) + self.param_grid = param_grid + + def _run_search(self, evaluate_candidates): + """Search all candidates in param_grid""" + evaluate_candidates(ParameterGrid(self.param_grid)) + + +class RandomizedSearchCV(BaseSearchCV): + """Randomized search on hyper parameters. + + RandomizedSearchCV implements a "fit" and a "score" method. + It also implements "score_samples", "predict", "predict_proba", + "decision_function", "transform" and "inverse_transform" if they are + implemented in the estimator used. + + The parameters of the estimator used to apply these methods are optimized + by cross-validated search over parameter settings. + + In contrast to GridSearchCV, not all parameter values are tried out, but + rather a fixed number of parameter settings is sampled from the specified + distributions. The number of parameter settings that are tried is + given by n_iter. + + If all parameters are presented as a list, + sampling without replacement is performed. If at least one parameter + is given as a distribution, sampling with replacement is used. + It is highly recommended to use continuous distributions for continuous + parameters. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.14 + + Parameters + ---------- + estimator : estimator object + An object of that type is instantiated for each grid point. + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``scoring`` must be passed. + + param_distributions : dict or list of dicts + Dictionary with parameters names (`str`) as keys and distributions + or lists of parameters to try. Distributions must provide a ``rvs`` + method for sampling (such as those from scipy.stats.distributions). + If a list is given, it is sampled uniformly. + If a list of dicts is given, first a dict is sampled uniformly, and + then a parameter is sampled using that dict as above. + + n_iter : int, default=10 + Number of parameter settings that are sampled. n_iter trades + off runtime vs quality of the solution. + + scoring : str, callable, list, tuple or dict, default=None + Strategy to evaluate the performance of the cross-validated model on + the test set. + + If `scoring` represents a single score, one can use: + + - a single string (see :ref:`scoring_parameter`); + - a callable (see :ref:`scoring_callable`) that returns a single value. + + If `scoring` represents multiple scores, one can use: + + - a list or tuple of unique strings; + - a callable returning a dictionary where the keys are the metric + names and the values are the metric scores; + - a dictionary with metric names as keys and callables as values. + + See :ref:`multimetric_grid_search` for an example. + + If None, the estimator's score method is used. + + n_jobs : int, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 
+ ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + + refit : bool, str, or callable, default=True + Refit an estimator using the best found parameters on the whole + dataset. + + For multiple metric evaluation, this needs to be a `str` denoting the + scorer that would be used to find the best parameters for refitting + the estimator at the end. + + Where there are considerations other than maximum score in + choosing a best estimator, ``refit`` can be set to a function which + returns the selected ``best_index_`` given the ``cv_results_``. In that + case, the ``best_estimator_`` and ``best_params_`` will be set + according to the returned ``best_index_`` while the ``best_score_`` + attribute will not be available. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``RandomizedSearchCV`` instance. + + Also for multiple metric evaluation, the attributes ``best_index_``, + ``best_score_`` and ``best_params_`` will only be available if + ``refit`` is set and all of them will be determined w.r.t this specific + scorer. + + See ``scoring`` parameter to know more about multiple metric + evaluation. + + .. versionchanged:: 0.20 + Support for callable added. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + verbose : int + Controls the verbosity: the higher, the more messages. + + - >1 : the computation time for each fold and parameter candidate is + displayed; + - >2 : the score is also displayed; + - >3 : the fold and candidate parameter indexes are also displayed + together with the starting time of the computation. + + pre_dispatch : int, or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately created and spawned. Use + this for lightweight and fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for random uniform sampling + from lists of possible values instead of scipy.stats distributions. + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. 
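+
+        For instance, fixing ``random_state`` makes the sampled candidates
+        reproducible (an illustrative sketch, not part of the original
+        docstring; ``loguniform`` comes from ``scipy.stats``):
+
+        >>> from scipy.stats import loguniform
+        >>> from sklearn.model_selection import ParameterSampler
+        >>> space = {'C': loguniform(1e-3, 1e2)}
+        >>> first = list(ParameterSampler(space, n_iter=2, random_state=0))
+        >>> second = list(ParameterSampler(space, n_iter=2, random_state=0))
+        >>> first == second
+        True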
+ + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. If a numeric value is given, + FitFailedWarning is raised. This parameter does not affect the refit + step, which will always raise the error. + + return_train_score : bool, default=False + If ``False``, the ``cv_results_`` attribute will not include training + scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + + Attributes + ---------- + cv_results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas ``DataFrame``. + + For instance the below given table + + +--------------+-------------+-------------------+---+---------------+ + | param_kernel | param_gamma | split0_test_score |...|rank_test_score| + +==============+=============+===================+===+===============+ + | 'rbf' | 0.1 | 0.80 |...| 1 | + +--------------+-------------+-------------------+---+---------------+ + | 'rbf' | 0.2 | 0.84 |...| 3 | + +--------------+-------------+-------------------+---+---------------+ + | 'rbf' | 0.3 | 0.70 |...| 2 | + +--------------+-------------+-------------------+---+---------------+ + + will be represented by a ``cv_results_`` dict of:: + + { + 'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'], + mask = False), + 'param_gamma' : masked_array(data = [0.1 0.2 0.3], mask = False), + 'split0_test_score' : [0.80, 0.84, 0.70], + 'split1_test_score' : [0.82, 0.50, 0.70], + 'mean_test_score' : [0.81, 0.67, 0.70], + 'std_test_score' : [0.01, 0.24, 0.00], + 'rank_test_score' : [1, 3, 2], + 'split0_train_score' : [0.80, 0.92, 0.70], + 'split1_train_score' : [0.82, 0.55, 0.70], + 'mean_train_score' : [0.81, 0.74, 0.70], + 'std_train_score' : [0.01, 0.19, 0.00], + 'mean_fit_time' : [0.73, 0.63, 0.43], + 'std_fit_time' : [0.01, 0.02, 0.01], + 'mean_score_time' : [0.01, 0.06, 0.04], + 'std_score_time' : [0.00, 0.00, 0.00], + 'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...], + } + + NOTE + + The key ``'params'`` is used to store a list of parameter + settings dicts for all the parameter candidates. + + The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and + ``std_score_time`` are all in seconds. + + For multi-metric evaluation, the scores for all the scorers are + available in the ``cv_results_`` dict at the keys ending with that + scorer's name (``'_'``) instead of ``'_score'`` shown + above. ('split0_test_precision', 'mean_train_precision' etc.) + + best_estimator_ : estimator + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if ``refit=False``. + + For multi-metric evaluation, this attribute is present only if + ``refit`` is specified. + + See ``refit`` parameter for more information on allowed values. + + best_score_ : float + Mean cross-validated score of the best_estimator. + + For multi-metric evaluation, this is not available if ``refit`` is + ``False``. See ``refit`` parameter for more information. 
+ + This attribute is not available if ``refit`` is a function. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + + For multi-metric evaluation, this is not available if ``refit`` is + ``False``. See ``refit`` parameter for more information. + + best_index_ : int + The index (of the ``cv_results_`` arrays) which corresponds to the best + candidate parameter setting. + + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest + mean score (``search.best_score_``). + + For multi-metric evaluation, this is not available if ``refit`` is + ``False``. See ``refit`` parameter for more information. + + scorer_ : function or a dict + Scorer function used on the held out data to choose the best + parameters for the model. + + For multi-metric evaluation, this attribute holds the validated + ``scoring`` dict which maps the scorer key to the scorer callable. + + n_splits_ : int + The number of cross-validation splits (folds/iterations). + + refit_time_ : float + Seconds used for refitting the best model on the whole dataset. + + This is present only if ``refit`` is not False. + + .. versionadded:: 0.20 + + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `n_features_in_` when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `feature_names_in_` when fit. + + .. versionadded:: 1.0 + + See Also + -------- + GridSearchCV : Does exhaustive search over a grid of parameters. + ParameterSampler : A generator over parameter settings, constructed from + param_distributions. + + Notes + ----- + The parameters selected are those that maximize the score of the held-out + data, according to the scoring parameter. + + If `n_jobs` was set to a value higher than one, the data is copied for each + parameter setting(and not `n_jobs` times). This is done for efficiency + reasons if individual jobs take very little time, but may raise errors if + the dataset is large and not enough memory is available. A workaround in + this case is to set `pre_dispatch`. Then, the memory is copied only + `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * + n_jobs`. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import RandomizedSearchCV + >>> from scipy.stats import uniform + >>> iris = load_iris() + >>> logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200, + ... random_state=0) + >>> distributions = dict(C=uniform(loc=0, scale=4), + ... 
penalty=['l2', 'l1']) + >>> clf = RandomizedSearchCV(logistic, distributions, random_state=0) + >>> search = clf.fit(iris.data, iris.target) + >>> search.best_params_ + {'C': np.float64(2...), 'penalty': 'l1'} + """ + + _parameter_constraints: dict = { + **BaseSearchCV._parameter_constraints, + "param_distributions": [dict, list], + "n_iter": [Interval(numbers.Integral, 1, None, closed="left")], + "random_state": ["random_state"], + } + + def __init__( + self, + estimator, + param_distributions, + *, + n_iter=10, + scoring=None, + n_jobs=None, + refit=True, + cv=None, + verbose=0, + pre_dispatch="2*n_jobs", + random_state=None, + error_score=np.nan, + return_train_score=False, + ): + self.param_distributions = param_distributions + self.n_iter = n_iter + self.random_state = random_state + super().__init__( + estimator=estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + cv=cv, + verbose=verbose, + pre_dispatch=pre_dispatch, + error_score=error_score, + return_train_score=return_train_score, + ) + + def _run_search(self, evaluate_candidates): + """Search n_iter candidates from param_distributions""" + evaluate_candidates( + ParameterSampler( + self.param_distributions, self.n_iter, random_state=self.random_state + ) + ) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/_search_successive_halving.py b/.venv/Lib/site-packages/sklearn/model_selection/_search_successive_halving.py new file mode 100644 index 0000000000000000000000000000000000000000..662335d7140bb153e234d60c596997861873e877 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/_search_successive_halving.py @@ -0,0 +1,1063 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import abstractmethod +from math import ceil, floor, log +from numbers import Integral, Real + +import numpy as np + +from ..base import _fit_context, is_classifier +from ..metrics._scorer import get_scorer_names +from ..utils import resample +from ..utils._param_validation import Interval, StrOptions +from ..utils.multiclass import check_classification_targets +from ..utils.validation import _num_samples, validate_data +from . 
import ParameterGrid, ParameterSampler +from ._search import BaseSearchCV +from ._split import _yields_constant_splits, check_cv + +__all__ = ["HalvingGridSearchCV", "HalvingRandomSearchCV"] + + +class _SubsampleMetaSplitter: + """Splitter that subsamples a given fraction of the dataset""" + + def __init__(self, *, base_cv, fraction, subsample_test, random_state): + self.base_cv = base_cv + self.fraction = fraction + self.subsample_test = subsample_test + self.random_state = random_state + + def split(self, X, y, **kwargs): + for train_idx, test_idx in self.base_cv.split(X, y, **kwargs): + train_idx = resample( + train_idx, + replace=False, + random_state=self.random_state, + n_samples=int(self.fraction * len(train_idx)), + ) + if self.subsample_test: + test_idx = resample( + test_idx, + replace=False, + random_state=self.random_state, + n_samples=int(self.fraction * len(test_idx)), + ) + yield train_idx, test_idx + + +def _top_k(results, k, itr): + # Return the best candidates of a given iteration + iteration, mean_test_score, params = ( + np.asarray(a) + for a in (results["iter"], results["mean_test_score"], results["params"]) + ) + iter_indices = np.flatnonzero(iteration == itr) + scores = mean_test_score[iter_indices] + # argsort() places NaNs at the end of the array so we move NaNs to the + # front of the array so the last `k` items are the those with the + # highest scores. + sorted_indices = np.roll(np.argsort(scores), np.count_nonzero(np.isnan(scores))) + return np.array(params[iter_indices][sorted_indices[-k:]]) + + +class BaseSuccessiveHalving(BaseSearchCV): + """Implements successive halving. + + Ref: + Almost optimal exploration in multi-armed bandits, ICML 13 + Zohar Karnin, Tomer Koren, Oren Somekh + """ + + _parameter_constraints: dict = { + **BaseSearchCV._parameter_constraints, + # overwrite `scoring` since multi-metrics are not supported + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "random_state": ["random_state"], + "max_resources": [ + Interval(Integral, 0, None, closed="neither"), + StrOptions({"auto"}), + ], + "min_resources": [ + Interval(Integral, 0, None, closed="neither"), + StrOptions({"exhaust", "smallest"}), + ], + "resource": [str], + "factor": [Interval(Real, 0, None, closed="neither")], + "aggressive_elimination": ["boolean"], + } + _parameter_constraints.pop("pre_dispatch") # not used in this class + + def __init__( + self, + estimator, + *, + scoring=None, + n_jobs=None, + refit=True, + cv=5, + verbose=0, + random_state=None, + error_score=np.nan, + return_train_score=True, + max_resources="auto", + min_resources="exhaust", + resource="n_samples", + factor=3, + aggressive_elimination=False, + ): + super().__init__( + estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + cv=cv, + verbose=verbose, + error_score=error_score, + return_train_score=return_train_score, + ) + + self.random_state = random_state + self.max_resources = max_resources + self.resource = resource + self.factor = factor + self.min_resources = min_resources + self.aggressive_elimination = aggressive_elimination + + def _check_input_parameters(self, X, y, split_params): + # We need to enforce that successive calls to cv.split() yield the same + # splits: see https://github.com/scikit-learn/scikit-learn/issues/15149 + if not _yields_constant_splits(self._checked_cv_orig): + raise ValueError( + "The cv parameter must yield consistent folds across " + "calls to split(). Set its random_state to an int, or set " + "shuffle=False." 
+ ) + + if ( + self.resource != "n_samples" + and self.resource not in self.estimator.get_params() + ): + raise ValueError( + f"Cannot use resource={self.resource} which is not supported " + f"by estimator {self.estimator.__class__.__name__}" + ) + + if isinstance(self, HalvingRandomSearchCV): + if self.min_resources == self.n_candidates == "exhaust": + # for n_candidates=exhaust to work, we need to know what + # min_resources is. Similarly min_resources=exhaust needs to + # know the actual number of candidates. + raise ValueError( + "n_candidates and min_resources cannot be both set to 'exhaust'." + ) + + self.min_resources_ = self.min_resources + if self.min_resources_ in ("smallest", "exhaust"): + if self.resource == "n_samples": + n_splits = self._checked_cv_orig.get_n_splits(X, y, **split_params) + # please see https://gph.is/1KjihQe for a justification + magic_factor = 2 + self.min_resources_ = n_splits * magic_factor + if is_classifier(self.estimator): + y = validate_data(self, X="no_validation", y=y) + check_classification_targets(y) + n_classes = np.unique(y).shape[0] + self.min_resources_ *= n_classes + else: + self.min_resources_ = 1 + # if 'exhaust', min_resources_ might be set to a higher value later + # in _run_search + + self.max_resources_ = self.max_resources + if self.max_resources_ == "auto": + if not self.resource == "n_samples": + raise ValueError( + "resource can only be 'n_samples' when max_resources='auto'" + ) + self.max_resources_ = _num_samples(X) + + if self.min_resources_ > self.max_resources_: + raise ValueError( + f"min_resources_={self.min_resources_} is greater " + f"than max_resources_={self.max_resources_}." + ) + + if self.min_resources_ == 0: + raise ValueError( + f"min_resources_={self.min_resources_}: you might have passed " + "an empty dataset X." + ) + + @staticmethod + def _select_best_index(refit, refit_metric, results): + """Custom refit callable to return the index of the best candidate. + + We want the best candidate out of the last iteration. By default + BaseSearchCV would return the best candidate out of all iterations. + + Currently, we only support for a single metric thus `refit` and + `refit_metric` are not required. + """ + last_iter = np.max(results["iter"]) + last_iter_indices = np.flatnonzero(results["iter"] == last_iter) + + test_scores = results["mean_test_score"][last_iter_indices] + # If all scores are NaNs there is no way to pick between them, + # so we (arbitrarily) declare the zero'th entry the best one + if np.isnan(test_scores).all(): + best_idx = 0 + else: + best_idx = np.nanargmax(test_scores) + + return last_iter_indices[best_idx] + + @_fit_context( + # Halving*SearchCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None, **params): + """Run fit with all sets of parameters. + + Parameters + ---------- + + X : array-like, shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like, shape (n_samples,) or (n_samples, n_output), optional + Target relative to X for classification or regression; + None for unsupervised learning. + + **params : dict of string -> object + Parameters passed to the ``fit`` method of the estimator. + + Returns + ------- + self : object + Instance of fitted estimator. 
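+
+        For example (a minimal sketch, not from the original docstring; any
+        estimator and small grid would do, and the experimental flag must be
+        imported first):
+
+        >>> from sklearn.experimental import enable_halving_search_cv  # noqa
+        >>> from sklearn.model_selection import HalvingGridSearchCV
+        >>> from sklearn.datasets import load_iris
+        >>> from sklearn.svm import SVC
+        >>> X, y = load_iris(return_X_y=True)
+        >>> search = HalvingGridSearchCV(SVC(), {"C": [1, 10]}).fit(X, y)
+        >>> search.n_iterations_
+        1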
+ """ + self._checked_cv_orig = check_cv( + self.cv, y, classifier=is_classifier(self.estimator) + ) + + routed_params = self._get_routed_params_for_fit(params) + self._check_input_parameters( + X=X, y=y, split_params=routed_params.splitter.split + ) + + self._n_samples_orig = _num_samples(X) + + super().fit(X, y=y, **params) + + # Set best_score_: BaseSearchCV does not set it, as refit is a callable + self.best_score_ = self.cv_results_["mean_test_score"][self.best_index_] + + return self + + def _run_search(self, evaluate_candidates): + candidate_params = self._generate_candidate_params() + + if self.resource != "n_samples" and any( + self.resource in candidate for candidate in candidate_params + ): + # Can only check this now since we need the candidates list + raise ValueError( + f"Cannot use parameter {self.resource} as the resource since " + "it is part of the searched parameters." + ) + + # n_required_iterations is the number of iterations needed so that the + # last iterations evaluates less than `factor` candidates. + n_required_iterations = 1 + floor(log(len(candidate_params), self.factor)) + + if self.min_resources == "exhaust": + # To exhaust the resources, we want to start with the biggest + # min_resources possible so that the last (required) iteration + # uses as many resources as possible + last_iteration = n_required_iterations - 1 + self.min_resources_ = max( + self.min_resources_, + self.max_resources_ // self.factor**last_iteration, + ) + + # n_possible_iterations is the number of iterations that we can + # actually do starting from min_resources and without exceeding + # max_resources. Depending on max_resources and the number of + # candidates, this may be higher or smaller than + # n_required_iterations. + n_possible_iterations = 1 + floor( + log(self.max_resources_ // self.min_resources_, self.factor) + ) + + if self.aggressive_elimination: + n_iterations = n_required_iterations + else: + n_iterations = min(n_possible_iterations, n_required_iterations) + + if self.verbose: + print(f"n_iterations: {n_iterations}") + print(f"n_required_iterations: {n_required_iterations}") + print(f"n_possible_iterations: {n_possible_iterations}") + print(f"min_resources_: {self.min_resources_}") + print(f"max_resources_: {self.max_resources_}") + print(f"aggressive_elimination: {self.aggressive_elimination}") + print(f"factor: {self.factor}") + + self.n_resources_ = [] + self.n_candidates_ = [] + + for itr in range(n_iterations): + power = itr # default + if self.aggressive_elimination: + # this will set n_resources to the initial value (i.e. the + # value of n_resources at the first iteration) for as many + # iterations as needed (while candidates are being + # eliminated), and then go on as usual. 
+ power = max(0, itr - n_required_iterations + n_possible_iterations) + + n_resources = int(self.factor**power * self.min_resources_) + # guard, probably not needed + n_resources = min(n_resources, self.max_resources_) + self.n_resources_.append(n_resources) + + n_candidates = len(candidate_params) + self.n_candidates_.append(n_candidates) + + if self.verbose: + print("-" * 10) + print(f"iter: {itr}") + print(f"n_candidates: {n_candidates}") + print(f"n_resources: {n_resources}") + + if self.resource == "n_samples": + # subsampling will be done in cv.split() + cv = _SubsampleMetaSplitter( + base_cv=self._checked_cv_orig, + fraction=n_resources / self._n_samples_orig, + subsample_test=True, + random_state=self.random_state, + ) + + else: + # Need copy so that the n_resources of next iteration does + # not overwrite + candidate_params = [c.copy() for c in candidate_params] + for candidate in candidate_params: + candidate[self.resource] = n_resources + cv = self._checked_cv_orig + + more_results = { + "iter": [itr] * n_candidates, + "n_resources": [n_resources] * n_candidates, + } + + results = evaluate_candidates( + candidate_params, cv, more_results=more_results + ) + + n_candidates_to_keep = ceil(n_candidates / self.factor) + candidate_params = _top_k(results, n_candidates_to_keep, itr) + + self.n_remaining_candidates_ = len(candidate_params) + self.n_required_iterations_ = n_required_iterations + self.n_possible_iterations_ = n_possible_iterations + self.n_iterations_ = n_iterations + + @abstractmethod + def _generate_candidate_params(self): + pass + + +class HalvingGridSearchCV(BaseSuccessiveHalving): + """Search over specified parameter values with successive halving. + + The search strategy starts evaluating all the candidates with a small + amount of resources and iteratively selects the best candidates, using + more and more resources. + + Read more in the :ref:`User guide `. + + .. note:: + + This estimator is still **experimental** for now: the predictions + and the API might change without any deprecation cycle. To use it, + you need to explicitly import ``enable_halving_search_cv``:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> # now you can import normally from model_selection + >>> from sklearn.model_selection import HalvingGridSearchCV + + Parameters + ---------- + estimator : estimator object + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``scoring`` must be passed. + + param_grid : dict or list of dictionaries + Dictionary with parameters names (string) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + factor : int or float, default=3 + The 'halving' parameter, which determines the proportion of candidates + that are selected for each subsequent iteration. For example, + ``factor=3`` means that only one third of the candidates are selected. + + resource : ``'n_samples'`` or str, default='n_samples' + Defines the resource that increases with each iteration. By default, + the resource is the number of samples. It can also be set to any + parameter of the base estimator that accepts positive integer + values, e.g. 'n_iterations' or 'n_estimators' for a gradient + boosting estimator. 
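+        For instance (an illustrative sketch; ``clf`` and ``param_grid`` are
+        hypothetical names assumed to be defined elsewhere)::
+
+            HalvingGridSearchCV(clf, param_grid, resource='n_estimators',
+                                max_resources=100)
+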
In this case ``max_resources`` cannot be 'auto' + and must be set explicitly. + + max_resources : int, default='auto' + The maximum amount of resource that any candidate is allowed to use + for a given iteration. By default, this is set to ``n_samples`` when + ``resource='n_samples'`` (default), else an error is raised. + + min_resources : {'exhaust', 'smallest'} or int, default='exhaust' + The minimum amount of resource that any candidate is allowed to use + for a given iteration. Equivalently, this defines the amount of + resources `r0` that are allocated for each candidate at the first + iteration. + + - 'smallest' is a heuristic that sets `r0` to a small value: + + - ``n_splits * 2`` when ``resource='n_samples'`` for a regression problem + - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a + classification problem + - ``1`` when ``resource != 'n_samples'`` + + - 'exhaust' will set `r0` such that the **last** iteration uses as + much resources as possible. Namely, the last iteration will use the + highest value smaller than ``max_resources`` that is a multiple of + both ``min_resources`` and ``factor``. In general, using 'exhaust' + leads to a more accurate estimator, but is slightly more time + consuming. + + Note that the amount of resources used at each iteration is always a + multiple of ``min_resources``. + + aggressive_elimination : bool, default=False + This is only relevant in cases where there isn't enough resources to + reduce the remaining candidates to at most `factor` after the last + iteration. If ``True``, then the search process will 'replay' the + first iteration for as long as needed until the number of candidates + is small enough. This is ``False`` by default, which means that the + last iteration may evaluate more than ``factor`` candidates. See + :ref:`aggressive_elimination` for more details. + + cv : int, cross-validation generator or iterable, default=5 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. note:: + Due to implementation details, the folds produced by `cv` must be + the same across multiple calls to `cv.split()`. For + built-in `scikit-learn` iterators, this can be achieved by + deactivating shuffling (`shuffle=False`), or by setting the + `cv`'s `random_state` parameter to an integer. + + scoring : str, callable, or None, default=None + A single string (see :ref:`scoring_parameter`) or a callable + (see :ref:`scoring_callable`) to evaluate the predictions on the test set. + If None, the estimator's score method is used. + + refit : bool, default=True + If True, refit an estimator using the best found parameters on the + whole dataset. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``HalvingGridSearchCV`` instance. + + error_score : 'raise' or numeric + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. 
If a numeric value is given,
+        FitFailedWarning is raised. This parameter does not affect the refit
+        step, which will always raise the error. Default is ``np.nan``.
+
+    return_train_score : bool, default=False
+        If ``False``, the ``cv_results_`` attribute will not include training
+        scores.
+        Computing training scores is used to get insights on how different
+        parameter settings impact the overfitting/underfitting trade-off.
+        However computing the scores on the training set can be computationally
+        expensive and is not strictly required to select the parameters that
+        yield the best generalization performance.
+
+    random_state : int, RandomState instance or None, default=None
+        Pseudo random number generator state used for subsampling the dataset
+        when `resource != 'n_samples'`. Ignored otherwise.
+        Pass an int for reproducible output across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    n_jobs : int or None, default=None
+        Number of jobs to run in parallel.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    verbose : int
+        Controls the verbosity: the higher, the more messages.
+
+    Attributes
+    ----------
+    n_resources_ : list of int
+        The amount of resources used at each iteration.
+
+    n_candidates_ : list of int
+        The number of candidate parameters that were evaluated at each
+        iteration.
+
+    n_remaining_candidates_ : int
+        The number of candidate parameters that are left after the last
+        iteration. It corresponds to `ceil(n_candidates[-1] / factor)`.
+
+    max_resources_ : int
+        The maximum number of resources that any candidate is allowed to use
+        for a given iteration. Note that since the number of resources used
+        at each iteration must be a multiple of ``min_resources_``, the
+        actual number of resources used at the last iteration may be smaller
+        than ``max_resources_``.
+
+    min_resources_ : int
+        The amount of resources that are allocated for each candidate at the
+        first iteration.
+
+    n_iterations_ : int
+        The actual number of iterations that were run. This is equal to
+        ``n_required_iterations_`` if ``aggressive_elimination`` is ``True``.
+        Else, this is equal to ``min(n_possible_iterations_,
+        n_required_iterations_)``.
+
+    n_possible_iterations_ : int
+        The number of iterations that are possible starting with
+        ``min_resources_`` resources and without exceeding
+        ``max_resources_``.
+
+    n_required_iterations_ : int
+        The number of iterations that are required to end up with less than
+        ``factor`` candidates at the last iteration, starting with
+        ``min_resources_`` resources. This will be greater than
+        ``n_possible_iterations_`` when there aren't enough resources.
+
+    cv_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns, that can be
+        imported into a pandas ``DataFrame``. It contains lots of information
+        for analysing the results of a search.
+        Please refer to the :ref:`User guide <successive_halving_cv_results>`
+        for details.
+
+    best_estimator_ : estimator or dict
+        Estimator that was chosen by the search, i.e. estimator
+        which gave highest score (or smallest loss if specified)
+        on the left out data. Not available if ``refit=False``.
+
+    best_score_ : float
+        Mean cross-validated score of the best_estimator.
+
+    best_params_ : dict
+        Parameter setting that gave the best results on the hold out data.
+
+    best_index_ : int
+        The index (of the ``cv_results_`` arrays) which corresponds to the best
+        candidate parameter setting.
+ + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest + mean score (``search.best_score_``). + + scorer_ : function or a dict + Scorer function used on the held out data to choose the best + parameters for the model. + + n_splits_ : int + The number of cross-validation splits (folds/iterations). + + refit_time_ : float + Seconds used for refitting the best model on the whole dataset. + + This is present only if ``refit`` is not False. + + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `n_features_in_` when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `feature_names_in_` when fit. + + .. versionadded:: 1.0 + + See Also + -------- + :class:`HalvingRandomSearchCV`: + Random search over a set of parameters using successive halving. + + Notes + ----- + The parameters selected are those that maximize the score of the held-out + data, according to the scoring parameter. + + All parameter combinations scored with a NaN will share the lowest rank. + + Examples + -------- + + >>> from sklearn.datasets import load_iris + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> from sklearn.model_selection import HalvingGridSearchCV + ... + >>> X, y = load_iris(return_X_y=True) + >>> clf = RandomForestClassifier(random_state=0) + ... + >>> param_grid = {"max_depth": [3, None], + ... "min_samples_split": [5, 10]} + >>> search = HalvingGridSearchCV(clf, param_grid, resource='n_estimators', + ... max_resources=10, + ... random_state=0).fit(X, y) + >>> search.best_params_ # doctest: +SKIP + {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9} + """ + + _parameter_constraints: dict = { + **BaseSuccessiveHalving._parameter_constraints, + "param_grid": [dict, list], + } + + def __init__( + self, + estimator, + param_grid, + *, + factor=3, + resource="n_samples", + max_resources="auto", + min_resources="exhaust", + aggressive_elimination=False, + cv=5, + scoring=None, + refit=True, + error_score=np.nan, + return_train_score=True, + random_state=None, + n_jobs=None, + verbose=0, + ): + super().__init__( + estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + verbose=verbose, + cv=cv, + random_state=random_state, + error_score=error_score, + return_train_score=return_train_score, + max_resources=max_resources, + resource=resource, + factor=factor, + min_resources=min_resources, + aggressive_elimination=aggressive_elimination, + ) + self.param_grid = param_grid + + def _generate_candidate_params(self): + return ParameterGrid(self.param_grid) + + +class HalvingRandomSearchCV(BaseSuccessiveHalving): + """Randomized search on hyper parameters. 
+
+    The search strategy starts evaluating all the candidates with a small
+    amount of resources and iteratively selects the best candidates, using more
+    and more resources.
+
+    The candidates are sampled at random from the parameter space and the
+    number of sampled candidates is determined by ``n_candidates``.
+
+    Read more in the :ref:`User guide <successive_halving_user_guide>`.
+
+    .. note::
+
+        This estimator is still **experimental** for now: the predictions
+        and the API might change without any deprecation cycle. To use it,
+        you need to explicitly import ``enable_halving_search_cv``::
+
+            >>> # explicitly require this experimental feature
+            >>> from sklearn.experimental import enable_halving_search_cv  # noqa
+            >>> # now you can import normally from model_selection
+            >>> from sklearn.model_selection import HalvingRandomSearchCV
+
+    Parameters
+    ----------
+    estimator : estimator object
+        This is assumed to implement the scikit-learn estimator interface.
+        Either estimator needs to provide a ``score`` function,
+        or ``scoring`` must be passed.
+
+    param_distributions : dict or list of dicts
+        Dictionary with parameter names (`str`) as keys and distributions
+        or lists of parameters to try. Distributions must provide a ``rvs``
+        method for sampling (such as those from scipy.stats.distributions).
+        If a list is given, it is sampled uniformly.
+        If a list of dicts is given, first a dict is sampled uniformly, and
+        then a parameter is sampled using that dict as above.
+
+    n_candidates : "exhaust" or int, default="exhaust"
+        The number of candidate parameters to sample, at the first
+        iteration. Using 'exhaust' will sample enough candidates so that the
+        last iteration uses as many resources as possible, based on
+        `min_resources`, `max_resources` and `factor`. In this case,
+        `min_resources` cannot be 'exhaust'.
+
+    factor : int or float, default=3
+        The 'halving' parameter, which determines the proportion of candidates
+        that are selected for each subsequent iteration. For example,
+        ``factor=3`` means that only one third of the candidates are selected.
+
+    resource : ``'n_samples'`` or str, default='n_samples'
+        Defines the resource that increases with each iteration. By default,
+        the resource is the number of samples. It can also be set to any
+        parameter of the base estimator that accepts positive integer
+        values, e.g. 'n_iterations' or 'n_estimators' for a gradient
+        boosting estimator. In this case ``max_resources`` cannot be 'auto'
+        and must be set explicitly.
+
+    max_resources : int, default='auto'
+        The maximum number of resources that any candidate is allowed to use
+        for a given iteration. By default, this is set to ``n_samples`` when
+        ``resource='n_samples'`` (default), else an error is raised.
+
+    min_resources : {'exhaust', 'smallest'} or int, default='smallest'
+        The minimum amount of resource that any candidate is allowed to use
+        for a given iteration. Equivalently, this defines the amount of
+        resources `r0` that are allocated for each candidate at the first
+        iteration.
+
+        - 'smallest' is a heuristic that sets `r0` to a small value:
+
+          - ``n_splits * 2`` when ``resource='n_samples'`` for a regression problem
+          - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a
+            classification problem
+          - ``1`` when ``resource != 'n_samples'``
+
+        - 'exhaust' will set `r0` such that the **last** iteration uses as
+          many resources as possible. Namely, the last iteration will use the
+          highest value smaller than ``max_resources`` that is a multiple of
+          both ``min_resources`` and ``factor``.
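+          For instance (an illustrative sketch): with 9 candidates,
+          ``factor=3`` and ``max_resources=1000``, 'exhaust' would set
+          ``r0 = 1000 // 3**2 = 111``, so the third and last iteration
+          can use ``111 * 9 = 999`` samples.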
In general, using 'exhaust' + leads to a more accurate estimator, but is slightly more time + consuming. 'exhaust' isn't available when `n_candidates='exhaust'`. + + Note that the amount of resources used at each iteration is always a + multiple of ``min_resources``. + + aggressive_elimination : bool, default=False + This is only relevant in cases where there isn't enough resources to + reduce the remaining candidates to at most `factor` after the last + iteration. If ``True``, then the search process will 'replay' the + first iteration for as long as needed until the number of candidates + is small enough. This is ``False`` by default, which means that the + last iteration may evaluate more than ``factor`` candidates. See + :ref:`aggressive_elimination` for more details. + + cv : int, cross-validation generator or an iterable, default=5 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. note:: + Due to implementation details, the folds produced by `cv` must be + the same across multiple calls to `cv.split()`. For + built-in `scikit-learn` iterators, this can be achieved by + deactivating shuffling (`shuffle=False`), or by setting the + `cv`'s `random_state` parameter to an integer. + + scoring : str, callable, or None, default=None + A single string (see :ref:`scoring_parameter`) or a callable + (see :ref:`scoring_callable`) to evaluate the predictions on the test set. + If None, the estimator's score method is used. + + refit : bool, default=True + If True, refit an estimator using the best found parameters on the + whole dataset. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``HalvingRandomSearchCV`` instance. + + error_score : 'raise' or numeric + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. If a numeric value is given, + FitFailedWarning is raised. This parameter does not affect the refit + step, which will always raise the error. Default is ``np.nan``. + + return_train_score : bool, default=False + If ``False``, the ``cv_results_`` attribute will not include training + scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for subsampling the dataset + when `resources != 'n_samples'`. Also used for random uniform + sampling from lists of possible values instead of scipy.stats + distributions. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + n_jobs : int or None, default=None + Number of jobs to run in parallel. 
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    verbose : int
+        Controls the verbosity: the higher, the more messages.
+
+    Attributes
+    ----------
+    n_resources_ : list of int
+        The amount of resources used at each iteration.
+
+    n_candidates_ : list of int
+        The number of candidate parameters that were evaluated at each
+        iteration.
+
+    n_remaining_candidates_ : int
+        The number of candidate parameters that are left after the last
+        iteration. It corresponds to `ceil(n_candidates[-1] / factor)`.
+
+    max_resources_ : int
+        The maximum number of resources that any candidate is allowed to use
+        for a given iteration. Note that since the number of resources used at
+        each iteration must be a multiple of ``min_resources_``, the actual
+        number of resources used at the last iteration may be smaller than
+        ``max_resources_``.
+
+    min_resources_ : int
+        The amount of resources that are allocated for each candidate at the
+        first iteration.
+
+    n_iterations_ : int
+        The actual number of iterations that were run. This is equal to
+        ``n_required_iterations_`` if ``aggressive_elimination`` is ``True``.
+        Else, this is equal to ``min(n_possible_iterations_,
+        n_required_iterations_)``.
+
+    n_possible_iterations_ : int
+        The number of iterations that are possible starting with
+        ``min_resources_`` resources and without exceeding
+        ``max_resources_``.
+
+    n_required_iterations_ : int
+        The number of iterations that are required to end up with less than
+        ``factor`` candidates at the last iteration, starting with
+        ``min_resources_`` resources. This will be greater than
+        ``n_possible_iterations_`` when there aren't enough resources.
+
+    cv_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns, that can be
+        imported into a pandas ``DataFrame``. It contains lots of information
+        for analysing the results of a search.
+        Please refer to the :ref:`User guide <successive_halving_cv_results>`
+        for details.
+
+    best_estimator_ : estimator or dict
+        Estimator that was chosen by the search, i.e. estimator
+        which gave highest score (or smallest loss if specified)
+        on the left out data. Not available if ``refit=False``.
+
+    best_score_ : float
+        Mean cross-validated score of the best_estimator.
+
+    best_params_ : dict
+        Parameter setting that gave the best results on the hold out data.
+
+    best_index_ : int
+        The index (of the ``cv_results_`` arrays) which corresponds to the best
+        candidate parameter setting.
+
+        The dict at ``search.cv_results_['params'][search.best_index_]`` gives
+        the parameter setting for the best model, that gives the highest
+        mean score (``search.best_score_``).
+
+    scorer_ : function or a dict
+        Scorer function used on the held out data to choose the best
+        parameters for the model.
+
+    n_splits_ : int
+        The number of cross-validation splits (folds/iterations).
+
+    refit_time_ : float
+        Seconds used for refitting the best model on the whole dataset.
+
+        This is present only if ``refit`` is not False.
+
+    multimetric_ : bool
+        Whether or not the scorers compute several metrics.
+
+    classes_ : ndarray of shape (n_classes,)
+        The class labels. This is present only if ``refit`` is specified and
+        the underlying estimator is a classifier.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `n_features_in_` when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `feature_names_in_` when fit. + + .. versionadded:: 1.0 + + See Also + -------- + :class:`HalvingGridSearchCV`: + Search over a grid of parameters using successive halving. + + Notes + ----- + The parameters selected are those that maximize the score of the held-out + data, according to the scoring parameter. + + All parameter combinations scored with a NaN will share the lowest rank. + + Examples + -------- + + >>> from sklearn.datasets import load_iris + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> from sklearn.model_selection import HalvingRandomSearchCV + >>> from scipy.stats import randint + >>> import numpy as np + ... + >>> X, y = load_iris(return_X_y=True) + >>> clf = RandomForestClassifier(random_state=0) + >>> np.random.seed(0) + ... + >>> param_distributions = {"max_depth": [3, None], + ... "min_samples_split": randint(2, 11)} + >>> search = HalvingRandomSearchCV(clf, param_distributions, + ... resource='n_estimators', + ... max_resources=10, + ... random_state=0).fit(X, y) + >>> search.best_params_ # doctest: +SKIP + {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9} + """ + + _parameter_constraints: dict = { + **BaseSuccessiveHalving._parameter_constraints, + "param_distributions": [dict, list], + "n_candidates": [ + Interval(Integral, 0, None, closed="neither"), + StrOptions({"exhaust"}), + ], + } + + def __init__( + self, + estimator, + param_distributions, + *, + n_candidates="exhaust", + factor=3, + resource="n_samples", + max_resources="auto", + min_resources="smallest", + aggressive_elimination=False, + cv=5, + scoring=None, + refit=True, + error_score=np.nan, + return_train_score=True, + random_state=None, + n_jobs=None, + verbose=0, + ): + super().__init__( + estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + verbose=verbose, + cv=cv, + random_state=random_state, + error_score=error_score, + return_train_score=return_train_score, + max_resources=max_resources, + resource=resource, + factor=factor, + min_resources=min_resources, + aggressive_elimination=aggressive_elimination, + ) + self.param_distributions = param_distributions + self.n_candidates = n_candidates + + def _generate_candidate_params(self): + n_candidates_first_iter = self.n_candidates + if n_candidates_first_iter == "exhaust": + # This will generate enough candidate so that the last iteration + # uses as much resources as possible + n_candidates_first_iter = self.max_resources_ // self.min_resources_ + return ParameterSampler( + self.param_distributions, + n_candidates_first_iter, + random_state=self.random_state, + ) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/_split.py b/.venv/Lib/site-packages/sklearn/model_selection/_split.py new file mode 100644 index 0000000000000000000000000000000000000000..2acbef99007e91e1724bc8712c9e6aa59aa956b1 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/_split.py @@ -0,0 +1,2987 @@ +""" +The :mod:`sklearn.model_selection._split` module includes classes and +functions 
to split the data based on a preset strategy. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import warnings +from abc import ABCMeta, abstractmethod +from collections import defaultdict +from collections.abc import Iterable +from inspect import signature +from itertools import chain, combinations +from math import ceil, floor + +import numpy as np +from scipy.special import comb + +from ..utils import ( + _safe_indexing, + check_random_state, + indexable, + metadata_routing, +) +from ..utils._array_api import ( + _convert_to_numpy, + ensure_common_namespace_device, + get_namespace, +) +from ..utils._param_validation import Interval, RealNotInt, validate_params +from ..utils.extmath import _approximate_mode +from ..utils.metadata_routing import _MetadataRequester +from ..utils.multiclass import type_of_target +from ..utils.validation import _num_samples, check_array, column_or_1d + +__all__ = [ + "BaseCrossValidator", + "KFold", + "GroupKFold", + "LeaveOneGroupOut", + "LeaveOneOut", + "LeavePGroupsOut", + "LeavePOut", + "RepeatedStratifiedKFold", + "RepeatedKFold", + "ShuffleSplit", + "GroupShuffleSplit", + "StratifiedKFold", + "StratifiedGroupKFold", + "StratifiedShuffleSplit", + "PredefinedSplit", + "train_test_split", + "check_cv", +] + + +class _UnsupportedGroupCVMixin: + """Mixin for splitters that do not support Groups.""" + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + return super().split(X, y, groups=groups) + + +class GroupsConsumerMixin(_MetadataRequester): + """A Mixin to ``groups`` by default. + + This Mixin makes the object to request ``groups`` by default as ``True``. + + .. versionadded:: 1.3 + """ + + __metadata_request__split = {"groups": True} + + +class BaseCrossValidator(_MetadataRequester, metaclass=ABCMeta): + """Base class for all cross-validators. + + Implementations must define `_iter_test_masks` or `_iter_test_indices`. + """ + + # This indicates that by default CV splitters don't have a "groups" kwarg, + # unless indicated by inheriting from ``GroupsConsumerMixin``. + # This also prevents ``set_split_request`` to be generated for splitters + # which don't support ``groups``. + __metadata_request__split = {"groups": metadata_routing.UNUSED} + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. 
+ + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + X, y, groups = indexable(X, y, groups) + indices = np.arange(_num_samples(X)) + for test_index in self._iter_test_masks(X, y, groups): + train_index = indices[np.logical_not(test_index)] + test_index = indices[test_index] + yield train_index, test_index + + # Since subclasses must implement either _iter_test_masks or + # _iter_test_indices, neither can be abstract. + def _iter_test_masks(self, X=None, y=None, groups=None): + """Generates boolean masks corresponding to test sets. + + By default, delegates to _iter_test_indices(X, y, groups) + """ + for test_index in self._iter_test_indices(X, y, groups): + test_mask = np.zeros(_num_samples(X), dtype=bool) + test_mask[test_index] = True + yield test_mask + + def _iter_test_indices(self, X=None, y=None, groups=None): + """Generates integer indices corresponding to test sets.""" + raise NotImplementedError + + @abstractmethod + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator.""" + + def __repr__(self): + return _build_repr(self) + + +class LeaveOneOut(_UnsupportedGroupCVMixin, BaseCrossValidator): + """Leave-One-Out cross-validator. + + Provides train/test indices to split data in train/test sets. Each + sample is used once as a test set (singleton) while the remaining + samples form the training set. + + Note: ``LeaveOneOut()`` is equivalent to ``KFold(n_splits=n)`` and + ``LeavePOut(p=1)`` where ``n`` is the number of samples. + + Due to the high number of test sets (which is the same as the + number of samples) this cross-validation method can be very costly. + For large datasets one should favor :class:`KFold`, :class:`ShuffleSplit` + or :class:`StratifiedKFold`. + + Read more in the :ref:`User Guide `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import LeaveOneOut + >>> X = np.array([[1, 2], [3, 4]]) + >>> y = np.array([1, 2]) + >>> loo = LeaveOneOut() + >>> loo.get_n_splits(X) + 2 + >>> print(loo) + LeaveOneOut() + >>> for i, (train_index, test_index) in enumerate(loo.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[1] + Test: index=[0] + Fold 1: + Train: index=[0] + Test: index=[1] + + See Also + -------- + LeaveOneGroupOut : For splitting the data according to explicit, + domain-specific stratification of the dataset. + GroupKFold : K-fold iterator variant with non-overlapping groups. + """ + + def _iter_test_indices(self, X, y=None, groups=None): + n_samples = _num_samples(X) + if n_samples <= 1: + raise ValueError( + "Cannot perform LeaveOneOut with n_samples={}.".format(n_samples) + ) + return range(n_samples) + + def get_n_splits(self, X, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. 
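+
+        For example (a small sketch):
+
+        >>> from sklearn.model_selection import LeaveOneOut
+        >>> LeaveOneOut().get_n_splits([[1], [2], [3]])
+        3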
+ """ + if X is None: + raise ValueError("The 'X' parameter should not be None.") + return _num_samples(X) + + +class LeavePOut(_UnsupportedGroupCVMixin, BaseCrossValidator): + """Leave-P-Out cross-validator. + + Provides train/test indices to split data in train/test sets. This results + in testing on all distinct samples of size p, while the remaining n - p + samples form the training set in each iteration. + + Note: ``LeavePOut(p)`` is NOT equivalent to + ``KFold(n_splits=n_samples // p)`` which creates non-overlapping test sets. + + Due to the high number of iterations which grows combinatorically with the + number of samples this cross-validation method can be very costly. For + large datasets one should favor :class:`KFold`, :class:`StratifiedKFold` + or :class:`ShuffleSplit`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + p : int + Size of the test sets. Must be strictly less than the number of + samples. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import LeavePOut + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + >>> y = np.array([1, 2, 3, 4]) + >>> lpo = LeavePOut(2) + >>> lpo.get_n_splits(X) + 6 + >>> print(lpo) + LeavePOut(p=2) + >>> for i, (train_index, test_index) in enumerate(lpo.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[2 3] + Test: index=[0 1] + Fold 1: + Train: index=[1 3] + Test: index=[0 2] + Fold 2: + Train: index=[1 2] + Test: index=[0 3] + Fold 3: + Train: index=[0 3] + Test: index=[1 2] + Fold 4: + Train: index=[0 2] + Test: index=[1 3] + Fold 5: + Train: index=[0 1] + Test: index=[2 3] + """ + + def __init__(self, p): + self.p = p + + def _iter_test_indices(self, X, y=None, groups=None): + n_samples = _num_samples(X) + if n_samples <= self.p: + raise ValueError( + "p={} must be strictly less than the number of samples={}".format( + self.p, n_samples + ) + ) + for combination in combinations(range(n_samples), self.p): + yield np.array(combination) + + def get_n_splits(self, X, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + """ + if X is None: + raise ValueError("The 'X' parameter should not be None.") + return int(comb(_num_samples(X), self.p, exact=True)) + + +class _BaseKFold(BaseCrossValidator, metaclass=ABCMeta): + """Base class for K-Fold cross-validators and TimeSeriesSplit.""" + + @abstractmethod + def __init__(self, n_splits, *, shuffle, random_state): + if not isinstance(n_splits, numbers.Integral): + raise ValueError( + "The number of folds must be of Integral type. " + "%s of type %s was passed." % (n_splits, type(n_splits)) + ) + n_splits = int(n_splits) + + if n_splits <= 1: + raise ValueError( + "k-fold cross-validation requires at least one" + " train/test split by setting n_splits=2 or more," + " got n_splits={0}.".format(n_splits) + ) + + if not isinstance(shuffle, bool): + raise TypeError("shuffle must be True or False; got {0}".format(shuffle)) + + if not shuffle and random_state is not None: # None is the default + raise ValueError( + ( + "Setting a random_state has no effect since shuffle is " + "False. 
You should leave " + "random_state to its default (None), or set shuffle=True." + ), + ) + + self.n_splits = n_splits + self.shuffle = shuffle + self.random_state = random_state + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + X, y, groups = indexable(X, y, groups) + n_samples = _num_samples(X) + if self.n_splits > n_samples: + raise ValueError( + ( + "Cannot have number of splits n_splits={0} greater" + " than the number of samples: n_samples={1}." + ).format(self.n_splits, n_samples) + ) + + for train, test in super().split(X, y, groups): + yield train, test + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return self.n_splits + + +class KFold(_UnsupportedGroupCVMixin, _BaseKFold): + """K-Fold cross-validator. + + Provides train/test indices to split data in train/test sets. Split + dataset into k consecutive folds (without shuffling by default). + + Each fold is then used once as a validation while the k - 1 remaining + folds form the training set. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + shuffle : bool, default=False + Whether to shuffle the data before splitting into batches. + Note that the samples within each split will not be shuffled. + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold. Otherwise, this + parameter has no effect. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import KFold + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([1, 2, 3, 4]) + >>> kf = KFold(n_splits=2) + >>> kf.get_n_splits(X) + 2 + >>> print(kf) + KFold(n_splits=2, random_state=None, shuffle=False) + >>> for i, (train_index, test_index) in enumerate(kf.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... 
print(f" Test: index={test_index}") + Fold 0: + Train: index=[2 3] + Test: index=[0 1] + Fold 1: + Train: index=[0 1] + Test: index=[2 3] + + Notes + ----- + The first ``n_samples % n_splits`` folds have size + ``n_samples // n_splits + 1``, other folds have size + ``n_samples // n_splits``, where ``n_samples`` is the number of samples. + + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + + See Also + -------- + StratifiedKFold : Takes class information into account to avoid building + folds with imbalanced class distributions (for binary or multiclass + classification tasks). + + GroupKFold : K-fold iterator variant with non-overlapping groups. + + RepeatedKFold : Repeats K-Fold n times. + """ + + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + + def _iter_test_indices(self, X, y=None, groups=None): + n_samples = _num_samples(X) + indices = np.arange(n_samples) + if self.shuffle: + check_random_state(self.random_state).shuffle(indices) + + n_splits = self.n_splits + fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int) + fold_sizes[: n_samples % n_splits] += 1 + current = 0 + for fold_size in fold_sizes: + start, stop = current, current + fold_size + yield indices[start:stop] + current = stop + + +class GroupKFold(GroupsConsumerMixin, _BaseKFold): + """K-fold iterator variant with non-overlapping groups. + + Each group will appear exactly once in the test set across all folds (the + number of distinct groups has to be at least equal to the number of folds). + + The folds are approximately balanced in the sense that the number of + samples is approximately the same in each test fold when `shuffle` is True. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + shuffle : bool, default=False + Whether to shuffle the groups before splitting into batches. + Note that the samples within each split will not be shuffled. + + .. versionadded:: 1.6 + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold. Otherwise, this + parameter has no effect. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + .. versionadded:: 1.6 + + Notes + ----- + Groups appear in an arbitrary order throughout the folds. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import GroupKFold + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]) + >>> y = np.array([1, 2, 3, 4, 5, 6]) + >>> groups = np.array([0, 0, 2, 2, 3, 3]) + >>> group_kfold = GroupKFold(n_splits=2) + >>> group_kfold.get_n_splits(X, y, groups) + 2 + >>> print(group_kfold) + GroupKFold(n_splits=2, random_state=None, shuffle=False) + >>> for i, (train_index, test_index) in enumerate(group_kfold.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}, group={groups[train_index]}") + ... 
print(f" Test: index={test_index}, group={groups[test_index]}") + Fold 0: + Train: index=[2 3], group=[2 2] + Test: index=[0 1 4 5], group=[0 0 3 3] + Fold 1: + Train: index=[0 1 4 5], group=[0 0 3 3] + Test: index=[2 3], group=[2 2] + + See Also + -------- + LeaveOneGroupOut : For splitting the data according to explicit + domain-specific stratification of the dataset. + + StratifiedKFold : Takes class information into account to avoid building + folds with imbalanced class proportions (for binary or multiclass + classification tasks). + """ + + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): + super().__init__(n_splits, shuffle=shuffle, random_state=random_state) + + def _iter_test_indices(self, X, y, groups): + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) + + unique_groups, group_idx = np.unique(groups, return_inverse=True) + n_groups = len(unique_groups) + + if self.n_splits > n_groups: + raise ValueError( + "Cannot have number of splits n_splits=%d greater" + " than the number of groups: %d." % (self.n_splits, n_groups) + ) + + if self.shuffle: + # Split and shuffle unique groups across n_splits + rng = check_random_state(self.random_state) + unique_groups = rng.permutation(unique_groups) + split_groups = np.array_split(unique_groups, self.n_splits) + + for test_group_ids in split_groups: + test_mask = np.isin(groups, test_group_ids) + yield np.where(test_mask)[0] + + else: + # Weight groups by their number of occurrences + n_samples_per_group = np.bincount(group_idx) + + # Distribute the most frequent groups first + indices = np.argsort(n_samples_per_group)[::-1] + n_samples_per_group = n_samples_per_group[indices] + + # Total weight of each fold + n_samples_per_fold = np.zeros(self.n_splits) + + # Mapping from group index to fold index + group_to_fold = np.zeros(len(unique_groups)) + + # Distribute samples by adding the largest weight to the lightest fold + for group_index, weight in enumerate(n_samples_per_group): + lightest_fold = np.argmin(n_samples_per_fold) + n_samples_per_fold[lightest_fold] += weight + group_to_fold[indices[group_index]] = lightest_fold + + indices = group_to_fold[group_idx] + + for f in range(self.n_splits): + yield np.where(indices == f)[0] + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + return super().split(X, y, groups) + + +class StratifiedKFold(_BaseKFold): + """Stratified K-Fold cross-validator. + + Provides train/test indices to split data in train/test sets. + + This cross-validation object is a variation of KFold that returns + stratified folds. The folds are made by preserving the percentage of + samples for each class. + + Read more in the :ref:`User Guide `. 
+ + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + shuffle : bool, default=False + Whether to shuffle each class's samples before splitting into batches. + Note that the samples within each split will not be shuffled. + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold for each class. + Otherwise, leave `random_state` as `None`. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import StratifiedKFold + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> skf = StratifiedKFold(n_splits=2) + >>> skf.get_n_splits(X, y) + 2 + >>> print(skf) + StratifiedKFold(n_splits=2, random_state=None, shuffle=False) + >>> for i, (train_index, test_index) in enumerate(skf.split(X, y)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[1 3] + Test: index=[0 2] + Fold 1: + Train: index=[0 2] + Test: index=[1 3] + + Notes + ----- + The implementation is designed to: + + * Generate test sets such that all contain the same distribution of + classes, or as close as possible. + * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to + ``y = [1, 0]`` should not change the indices generated. + * Preserve order dependencies in the dataset ordering, when + ``shuffle=False``: all samples from class k in some test set were + contiguous in y, or separated in y by samples from classes other than k. + * Generate test sets where the smallest and largest differ by at most one + sample. + + .. versionchanged:: 0.22 + The previous implementation did not follow the last constraint. + + See Also + -------- + RepeatedStratifiedKFold : Repeats Stratified K-Fold n times. + """ + + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + + def _make_test_folds(self, X, y=None): + rng = check_random_state(self.random_state) + # XXX: as of now, cross-validation splitters only operate in NumPy-land + # without attempting to leverage array API namespace features. However + # they might be fed by array API inputs, e.g. in CV-enabled estimators so + # we need the following explicit conversion: + xp, is_array_api = get_namespace(y) + if is_array_api: + y = _convert_to_numpy(y, xp) + else: + y = np.asarray(y) + type_of_target_y = type_of_target(y) + allowed_target_types = ("binary", "multiclass") + if type_of_target_y not in allowed_target_types: + raise ValueError( + "Supported target types are: {}. Got {!r} instead.".format( + allowed_target_types, type_of_target_y + ) + ) + + y = column_or_1d(y) + + _, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True) + # y_inv encodes y according to lexicographic order. We invert y_idx to + # map the classes so that they are encoded by order of appearance: + # 0 represents the first label appearing in y, 1 the second, etc. 
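+        # For example, with y = np.array(["b", "a", "b", "c"]):
+        #   y_idx == [1, 0, 3]     (first occurrence of "a", "b", "c" in y)
+        #   y_inv == [1, 0, 1, 2]  (y encoded in lexicographic order)
+        # so that below class_perm == [1, 0, 2] and y_encoded == [0, 1, 0, 2],
+        # i.e. "b" -> 0 because it appears first in y.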
+        _, class_perm = np.unique(y_idx, return_inverse=True)
+        y_encoded = class_perm[y_inv]
+
+        n_classes = len(y_idx)
+        y_counts = np.bincount(y_encoded)
+        min_groups = np.min(y_counts)
+        if np.all(self.n_splits > y_counts):
+            raise ValueError(
+                "n_splits=%d cannot be greater than the"
+                " number of members in each class." % (self.n_splits)
+            )
+        if self.n_splits > min_groups:
+            warnings.warn(
+                "The least populated class in y has only %d"
+                " members, which is less than n_splits=%d."
+                % (min_groups, self.n_splits),
+                UserWarning,
+            )
+
+        # Determine the optimal number of samples from each class in each fold,
+        # using round robin over the sorted y. (This can be done directly from
+        # counts, but that code is unreadable.)
+        y_order = np.sort(y_encoded)
+        allocation = np.asarray(
+            [
+                np.bincount(y_order[i :: self.n_splits], minlength=n_classes)
+                for i in range(self.n_splits)
+            ]
+        )
+
+        # To maintain the data order dependencies as best as possible within
+        # the stratification constraint, we assign samples from each class in
+        # blocks (and then mess that up when shuffle=True).
+        test_folds = np.empty(len(y), dtype="i")
+        for k in range(n_classes):
+            # since the kth column of allocation stores the number of samples
+            # of class k in each test set, this generates blocks of fold
+            # indices corresponding to the allocation for class k.
+            folds_for_class = np.arange(self.n_splits).repeat(allocation[:, k])
+            if self.shuffle:
+                rng.shuffle(folds_for_class)
+            test_folds[y_encoded == k] = folds_for_class
+        return test_folds
+
+    def _iter_test_masks(self, X, y=None, groups=None):
+        test_folds = self._make_test_folds(X, y)
+        for i in range(self.n_splits):
+            yield test_folds == i
+
+    def split(self, X, y, groups=None):
+        """Generate indices to split data into training and test set.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data, where `n_samples` is the number of samples
+            and `n_features` is the number of features.
+
+            Note that providing ``y`` is sufficient to generate the splits and
+            hence ``np.zeros(n_samples)`` may be used as a placeholder for
+            ``X`` instead of actual training data.
+
+        y : array-like of shape (n_samples,)
+            The target variable for supervised learning problems.
+            Stratification is done based on the y labels.
+
+        groups : object
+            Always ignored, exists for compatibility.
+
+        Yields
+        ------
+        train : ndarray
+            The training set indices for that split.
+
+        test : ndarray
+            The testing set indices for that split.
+
+        Notes
+        -----
+        Randomized CV splitters may return different results for each call of
+        split. You can make the results identical by setting `random_state`
+        to an integer.
+        """
+        if groups is not None:
+            warnings.warn(
+                f"The groups parameter is ignored by {self.__class__.__name__}",
+                UserWarning,
+            )
+        y = check_array(y, input_name="y", ensure_2d=False, dtype=None)
+        return super().split(X, y, groups)
+
+
+class StratifiedGroupKFold(GroupsConsumerMixin, _BaseKFold):
+    """Stratified K-Fold iterator variant with non-overlapping groups.
+
+    This cross-validation object is a variation of StratifiedKFold that
+    attempts to return stratified folds with non-overlapping groups. The folds
+    are made by preserving the percentage of samples for each class.
+
+    Each group will appear exactly once in the test set across all folds (the
+    number of distinct groups has to be at least equal to the number of folds).
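The round-robin allocation computed in ``_make_test_folds`` above can be replayed standalone; a minimal sketch with hypothetical class counts (NumPy only):

import numpy as np

n_splits, n_classes = 3, 2
y_encoded = np.array([0, 0, 0, 0, 0, 1, 1, 1])  # 5 samples of class 0, 3 of class 1

y_order = np.sort(y_encoded)
allocation = np.asarray(
    [np.bincount(y_order[i::n_splits], minlength=n_classes) for i in range(n_splits)]
)
print(allocation)
# [[2 1]
#  [2 1]
#  [1 1]]  -> test folds of sizes 3, 3, 2; per-class counts differ by at most one.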
+ + The difference between :class:`GroupKFold` + and `StratifiedGroupKFold` is that + the former attempts to create balanced folds such that the number of + distinct groups is approximately the same in each fold, whereas + `StratifiedGroupKFold` attempts to create folds which preserve the + percentage of samples for each class as much as possible given the + constraint of non-overlapping groups between splits. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + shuffle : bool, default=False + Whether to shuffle each class's samples before splitting into batches. + Note that the samples within each split will not be shuffled. + This implementation can only shuffle groups that have approximately the + same y distribution, no global shuffle will be performed. + + random_state : int or RandomState instance, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold for each class. + Otherwise, leave `random_state` as `None`. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import StratifiedGroupKFold + >>> X = np.ones((17, 2)) + >>> y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + >>> groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8]) + >>> sgkf = StratifiedGroupKFold(n_splits=3) + >>> sgkf.get_n_splits(X, y) + 3 + >>> print(sgkf) + StratifiedGroupKFold(n_splits=3, random_state=None, shuffle=False) + >>> for i, (train_index, test_index) in enumerate(sgkf.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" group={groups[train_index]}") + ... print(f" Test: index={test_index}") + ... print(f" group={groups[test_index]}") + Fold 0: + Train: index=[ 0 1 2 3 7 8 9 10 11 15 16] + group=[1 1 2 2 4 5 5 5 5 8 8] + Test: index=[ 4 5 6 12 13 14] + group=[3 3 3 6 6 7] + Fold 1: + Train: index=[ 4 5 6 7 8 9 10 11 12 13 14] + group=[3 3 3 4 5 5 5 5 6 6 7] + Test: index=[ 0 1 2 3 15 16] + group=[1 1 2 2 8 8] + Fold 2: + Train: index=[ 0 1 2 3 4 5 6 12 13 14 15 16] + group=[1 1 2 2 3 3 3 6 6 7 8 8] + Test: index=[ 7 8 9 10 11] + group=[4 5 5 5 5] + + Notes + ----- + The implementation is designed to: + + * Mimic the behavior of StratifiedKFold as much as possible for trivial + groups (e.g. when each group contains only one sample). + * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to + ``y = [1, 0]`` should not change the indices generated. + * Stratify based on samples as much as possible while keeping + non-overlapping groups constraint. That means that in some cases when + there is a small number of groups containing a large number of samples + the stratification will not be possible and the behavior will be close + to GroupKFold. + + See also + -------- + StratifiedKFold: Takes class information into account to build folds which + retain class distributions (for binary or multiclass classification + tasks). + + GroupKFold: K-fold iterator variant with non-overlapping groups. 
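A short sketch of the non-overlapping-groups guarantee described above, on hypothetical toy data (public API only):

import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

X = np.zeros((12, 1))
y = np.array([0] * 6 + [1] * 6)
groups = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])

for train_idx, test_idx in StratifiedGroupKFold(n_splits=3).split(X, y, groups):
    # A group never appears on both sides of a split.
    assert not set(groups[train_idx]) & set(groups[test_idx])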
+ """ + + def __init__(self, n_splits=5, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + + def _iter_test_indices(self, X, y, groups): + # Implementation is based on this kaggle kernel: + # https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation + # and is a subject to Apache 2.0 License. You may obtain a copy of the + # License at http://www.apache.org/licenses/LICENSE-2.0 + # Changelist: + # - Refactored function to a class following scikit-learn KFold + # interface. + # - Added heuristic for assigning group to the least populated fold in + # cases when all other criteria are equal + # - Swtch from using python ``Counter`` to ``np.unique`` to get class + # distribution + # - Added scikit-learn checks for input: checking that target is binary + # or multiclass, checking passed random state, checking that number + # of splits is less than number of members in each class, checking + # that least populated class has more members than there are splits. + rng = check_random_state(self.random_state) + y = np.asarray(y) + type_of_target_y = type_of_target(y) + allowed_target_types = ("binary", "multiclass") + if type_of_target_y not in allowed_target_types: + raise ValueError( + "Supported target types are: {}. Got {!r} instead.".format( + allowed_target_types, type_of_target_y + ) + ) + + y = column_or_1d(y) + _, y_inv, y_cnt = np.unique(y, return_inverse=True, return_counts=True) + if np.all(self.n_splits > y_cnt): + raise ValueError( + "n_splits=%d cannot be greater than the" + " number of members in each class." % (self.n_splits) + ) + n_smallest_class = np.min(y_cnt) + if self.n_splits > n_smallest_class: + warnings.warn( + "The least populated class in y has only %d" + " members, which is less than n_splits=%d." 
+ % (n_smallest_class, self.n_splits), + UserWarning, + ) + n_classes = len(y_cnt) + + _, groups_inv, groups_cnt = np.unique( + groups, return_inverse=True, return_counts=True + ) + y_counts_per_group = np.zeros((len(groups_cnt), n_classes)) + for class_idx, group_idx in zip(y_inv, groups_inv): + y_counts_per_group[group_idx, class_idx] += 1 + + y_counts_per_fold = np.zeros((self.n_splits, n_classes)) + groups_per_fold = defaultdict(set) + + if self.shuffle: + rng.shuffle(y_counts_per_group) + + # Stable sort to keep shuffled order for groups with the same + # class distribution variance + sorted_groups_idx = np.argsort( + -np.std(y_counts_per_group, axis=1), kind="mergesort" + ) + + for group_idx in sorted_groups_idx: + group_y_counts = y_counts_per_group[group_idx] + best_fold = self._find_best_fold( + y_counts_per_fold=y_counts_per_fold, + y_cnt=y_cnt, + group_y_counts=group_y_counts, + ) + y_counts_per_fold[best_fold] += group_y_counts + groups_per_fold[best_fold].add(group_idx) + + for i in range(self.n_splits): + test_indices = [ + idx + for idx, group_idx in enumerate(groups_inv) + if group_idx in groups_per_fold[i] + ] + yield test_indices + + def _find_best_fold(self, y_counts_per_fold, y_cnt, group_y_counts): + best_fold = None + min_eval = np.inf + min_samples_in_fold = np.inf + for i in range(self.n_splits): + y_counts_per_fold[i] += group_y_counts + # Summarise the distribution over classes in each proposed fold + std_per_class = np.std(y_counts_per_fold / y_cnt.reshape(1, -1), axis=0) + y_counts_per_fold[i] -= group_y_counts + fold_eval = np.mean(std_per_class) + samples_in_fold = np.sum(y_counts_per_fold[i]) + is_current_fold_better = ( + fold_eval < min_eval + or np.isclose(fold_eval, min_eval) + and samples_in_fold < min_samples_in_fold + ) + if is_current_fold_better: + min_eval = fold_eval + min_samples_in_fold = samples_in_fold + best_fold = i + return best_fold + + +class TimeSeriesSplit(_BaseKFold): + """Time Series cross-validator. + + Provides train/test indices to split time series data samples + that are observed at fixed time intervals, in train/test sets. + In each split, test indices must be higher than before, and thus shuffling + in cross validator is inappropriate. + + This cross-validation object is a variation of :class:`KFold`. + In the kth split, it returns first k folds as train set and the + (k+1)th fold as test set. + + Note that unlike standard cross-validation methods, successive + training sets are supersets of those that come before them. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + .. versionadded:: 0.18 + + Parameters + ---------- + n_splits : int, default=5 + Number of splits. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + max_train_size : int, default=None + Maximum size for a single training set. + + test_size : int, default=None + Used to limit the size of the test set. Defaults to + ``n_samples // (n_splits + 1)``, which is the maximum allowed value + with ``gap=0``. + + .. versionadded:: 0.24 + + gap : int, default=0 + Number of samples to exclude from the end of each train set before + the test set. + + .. 
versionadded:: 0.24 + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import TimeSeriesSplit + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([1, 2, 3, 4, 5, 6]) + >>> tscv = TimeSeriesSplit() + >>> print(tscv) + TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None) + >>> for i, (train_index, test_index) in enumerate(tscv.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[0] + Test: index=[1] + Fold 1: + Train: index=[0 1] + Test: index=[2] + Fold 2: + Train: index=[0 1 2] + Test: index=[3] + Fold 3: + Train: index=[0 1 2 3] + Test: index=[4] + Fold 4: + Train: index=[0 1 2 3 4] + Test: index=[5] + >>> # Fix test_size to 2 with 12 samples + >>> X = np.random.randn(12, 2) + >>> y = np.random.randint(0, 2, 12) + >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2) + >>> for i, (train_index, test_index) in enumerate(tscv.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[0 1 2 3 4 5] + Test: index=[6 7] + Fold 1: + Train: index=[0 1 2 3 4 5 6 7] + Test: index=[8 9] + Fold 2: + Train: index=[0 1 2 3 4 5 6 7 8 9] + Test: index=[10 11] + >>> # Add in a 2 period gap + >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap=2) + >>> for i, (train_index, test_index) in enumerate(tscv.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[0 1 2 3] + Test: index=[6 7] + Fold 1: + Train: index=[0 1 2 3 4 5] + Test: index=[8 9] + Fold 2: + Train: index=[0 1 2 3 4 5 6 7] + Test: index=[10 11] + + For a more extended example see + :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`. + + Notes + ----- + The training set has size ``i * n_samples // (n_splits + 1) + + n_samples % (n_splits + 1)`` in the ``i`` th split, + with a test set of size ``n_samples//(n_splits + 1)`` by default, + where ``n_samples`` is the number of samples. Note that this + formula is only valid when ``test_size`` and ``max_train_size`` are + left to their default values. + """ + + def __init__(self, n_splits=5, *, max_train_size=None, test_size=None, gap=0): + super().__init__(n_splits, shuffle=False, random_state=None) + self.max_train_size = max_train_size + self.test_size = test_size + self.gap = gap + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Always ignored, exists for compatibility. + + groups : array-like of shape (n_samples,) + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + return self._split(X) + + def _split(self, X): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. 
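A worked instance of the sizing formula quoted in the Notes above, for the first doctest (``n_samples=6``, ``n_splits=5``, default ``test_size`` and ``max_train_size``):

n_samples, n_splits = 6, 5
test_size = n_samples // (n_splits + 1)  # 6 // 6 == 1 sample per test set
train_sizes = [
    i * n_samples // (n_splits + 1) + n_samples % (n_splits + 1)
    for i in range(1, n_splits + 1)
]
print(test_size, train_sizes)  # 1 [1, 2, 3, 4, 5], matching the folds shown above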
+ + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + (X,) = indexable(X) + n_samples = _num_samples(X) + n_splits = self.n_splits + n_folds = n_splits + 1 + gap = self.gap + test_size = ( + self.test_size if self.test_size is not None else n_samples // n_folds + ) + + # Make sure we have enough samples for the given split parameters + if n_folds > n_samples: + raise ValueError( + f"Cannot have number of folds={n_folds} greater" + f" than the number of samples={n_samples}." + ) + if n_samples - gap - (test_size * n_splits) <= 0: + raise ValueError( + f"Too many splits={n_splits} for number of samples" + f"={n_samples} with test_size={test_size} and gap={gap}." + ) + + indices = np.arange(n_samples) + test_starts = range(n_samples - n_splits * test_size, n_samples, test_size) + + for test_start in test_starts: + train_end = test_start - gap + if self.max_train_size and self.max_train_size < train_end: + yield ( + indices[train_end - self.max_train_size : train_end], + indices[test_start : test_start + test_size], + ) + else: + yield ( + indices[:train_end], + indices[test_start : test_start + test_size], + ) + + +class LeaveOneGroupOut(GroupsConsumerMixin, BaseCrossValidator): + """Leave One Group Out cross-validator. + + Provides train/test indices to split data such that each training set is + comprised of all samples except ones belonging to one specific group. + Arbitrary domain specific group information is provided as an array of integers + that encodes the group of each sample. + + For instance the groups could be the year of collection of the samples + and thus allow for cross-validation against time-based splits. + + Read more in the :ref:`User Guide `. + + Notes + ----- + Splits are ordered according to the index of the group left out. The first + split has testing set consisting of the group whose index in `groups` is + lowest, and so on. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import LeaveOneGroupOut + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + >>> y = np.array([1, 2, 1, 2]) + >>> groups = np.array([1, 1, 2, 2]) + >>> logo = LeaveOneGroupOut() + >>> logo.get_n_splits(X, y, groups) + 2 + >>> logo.get_n_splits(groups=groups) # 'groups' is always required + 2 + >>> print(logo) + LeaveOneGroupOut() + >>> for i, (train_index, test_index) in enumerate(logo.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}, group={groups[train_index]}") + ... print(f" Test: index={test_index}, group={groups[test_index]}") + Fold 0: + Train: index=[2 3], group=[2 2] + Test: index=[0 1], group=[1 1] + Fold 1: + Train: index=[0 1], group=[1 1] + Test: index=[2 3], group=[2 2] + + See also + -------- + GroupKFold: K-fold iterator variant with non-overlapping groups. + """ + + def _iter_test_masks(self, X, y, groups): + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + # We make a copy of groups to avoid side-effects during iteration + groups = check_array( + groups, input_name="groups", copy=True, ensure_2d=False, dtype=None + ) + unique_groups = np.unique(groups) + if len(unique_groups) <= 1: + raise ValueError( + "The groups parameter contains fewer than 2 unique groups " + "(%s). LeaveOneGroupOut expects at least 2." 
% unique_groups + ) + for i in unique_groups: + yield groups == i + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. This 'groups' parameter must always be specified to + calculate the number of splits, though the other parameters can be + omitted. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) + return len(np.unique(groups)) + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + return super().split(X, y, groups) + + +class LeavePGroupsOut(GroupsConsumerMixin, BaseCrossValidator): + """Leave P Group(s) Out cross-validator. + + Provides train/test indices to split data according to a third-party + provided group. This group information can be used to encode arbitrary + domain specific stratifications of the samples as integers. + + For instance the groups could be the year of collection of the samples + and thus allow for cross-validation against time-based splits. + + The difference between LeavePGroupsOut and LeaveOneGroupOut is that + the former builds the test sets with all the samples assigned to + ``p`` different values of the groups while the latter uses samples + all assigned the same groups. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_groups : int + Number of groups (``p``) to leave out in the test split. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import LeavePGroupsOut + >>> X = np.array([[1, 2], [3, 4], [5, 6]]) + >>> y = np.array([1, 2, 1]) + >>> groups = np.array([1, 2, 3]) + >>> lpgo = LeavePGroupsOut(n_groups=2) + >>> lpgo.get_n_splits(X, y, groups) + 3 + >>> lpgo.get_n_splits(groups=groups) # 'groups' is always required + 3 + >>> print(lpgo) + LeavePGroupsOut(n_groups=2) + >>> for i, (train_index, test_index) in enumerate(lpgo.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}, group={groups[train_index]}") + ... print(f" Test: index={test_index}, group={groups[test_index]}") + Fold 0: + Train: index=[2], group=[3] + Test: index=[0 1], group=[1 2] + Fold 1: + Train: index=[1], group=[2] + Test: index=[0 2], group=[1 3] + Fold 2: + Train: index=[0], group=[1] + Test: index=[1 2], group=[2 3] + + See Also + -------- + GroupKFold : K-fold iterator variant with non-overlapping groups. 
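The number of splits above is the binomial coefficient over unique groups; a minimal check on toy groups, assuming only the public API plus ``math.comb``:

from math import comb

import numpy as np
from sklearn.model_selection import LeavePGroupsOut

X = np.zeros((6, 1))
groups = np.array([1, 1, 2, 2, 3, 4])  # 4 unique groups

lpgo = LeavePGroupsOut(n_groups=2)
n_observed = sum(1 for _ in lpgo.split(X, groups=groups))
assert n_observed == lpgo.get_n_splits(groups=groups) == comb(4, 2)  # 6 splits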
+ """ + + def __init__(self, n_groups): + self.n_groups = n_groups + + def _iter_test_masks(self, X, y, groups): + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array( + groups, input_name="groups", copy=True, ensure_2d=False, dtype=None + ) + unique_groups = np.unique(groups) + if self.n_groups >= len(unique_groups): + raise ValueError( + "The groups parameter contains fewer than (or equal to) " + "n_groups (%d) numbers of unique groups (%s). LeavePGroupsOut " + "expects that at least n_groups + 1 (%d) unique groups be " + "present" % (self.n_groups, unique_groups, self.n_groups + 1) + ) + combi = combinations(range(len(unique_groups)), self.n_groups) + for indices in combi: + test_index = np.zeros(_num_samples(X), dtype=bool) + for l in unique_groups[np.array(indices)]: + test_index[groups == l] = True + yield test_index + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. This 'groups' parameter must always be specified to + calculate the number of splits, though the other parameters can be + omitted. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) + return int(comb(len(np.unique(groups)), self.n_groups, exact=True)) + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + return super().split(X, y, groups) + + +class _RepeatedSplits(_MetadataRequester, metaclass=ABCMeta): + """Repeated splits for an arbitrary randomized CV splitter. + + Repeats splits for cross-validators n times with different randomization + in each repetition. + + Parameters + ---------- + cv : callable + Cross-validator class. + + n_repeats : int, default=10 + Number of times cross-validator needs to be repeated. + + random_state : int, RandomState instance or None, default=None + Passes `random_state` to the arbitrary repeating cross validator. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + **cvargs : additional params + Constructor parameters for cv. Must not contain random_state + and shuffle. + """ + + # This indicates that by default CV splitters don't have a "groups" kwarg, + # unless indicated by inheriting from ``GroupsConsumerMixin``. + # This also prevents ``set_split_request`` to be generated for splitters + # which don't support ``groups``. 
+ __metadata_request__split = {"groups": metadata_routing.UNUSED} + + def __init__(self, cv, *, n_repeats=10, random_state=None, **cvargs): + if not isinstance(n_repeats, numbers.Integral): + raise ValueError("Number of repetitions must be of Integral type.") + + if n_repeats <= 0: + raise ValueError("Number of repetitions must be greater than 0.") + + if any(key in cvargs for key in ("random_state", "shuffle")): + raise ValueError("cvargs must not contain random_state or shuffle.") + + self.cv = cv + self.n_repeats = n_repeats + self.random_state = random_state + self.cvargs = cvargs + + def split(self, X, y=None, groups=None): + """Generates indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + n_repeats = self.n_repeats + rng = check_random_state(self.random_state) + + for idx in range(n_repeats): + cv = self.cv(random_state=rng, shuffle=True, **self.cvargs) + for train_index, test_index in cv.split(X, y, groups): + yield train_index, test_index + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + ``np.zeros(n_samples)`` may be used as a placeholder. + + y : object + Always ignored, exists for compatibility. + ``np.zeros(n_samples)`` may be used as a placeholder. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + rng = check_random_state(self.random_state) + cv = self.cv(random_state=rng, shuffle=True, **self.cvargs) + return cv.get_n_splits(X, y, groups) * self.n_repeats + + def __repr__(self): + return _build_repr(self) + + +class RepeatedKFold(_UnsupportedGroupCVMixin, _RepeatedSplits): + """Repeated K-Fold cross validator. + + Repeats K-Fold n times with different randomization in each repetition. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + n_repeats : int, default=10 + Number of times cross-validator needs to be repeated. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of each repeated cross-validation instance. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import RepeatedKFold + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124) + >>> rkf.get_n_splits(X, y) + 4 + >>> print(rkf) + RepeatedKFold(n_repeats=2, n_splits=2, random_state=2652124) + >>> for i, (train_index, test_index) in enumerate(rkf.split(X)): + ... print(f"Fold {i}:") + ... 
print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + ... + Fold 0: + Train: index=[0 1] + Test: index=[2 3] + Fold 1: + Train: index=[2 3] + Test: index=[0 1] + Fold 2: + Train: index=[1 2] + Test: index=[0 3] + Fold 3: + Train: index=[0 3] + Test: index=[1 2] + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + + See Also + -------- + RepeatedStratifiedKFold : Repeats Stratified K-Fold n times. + """ + + def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): + super().__init__( + KFold, n_repeats=n_repeats, random_state=random_state, n_splits=n_splits + ) + + +class RepeatedStratifiedKFold(_UnsupportedGroupCVMixin, _RepeatedSplits): + """Repeated Stratified K-Fold cross validator. + + Repeats Stratified K-Fold n times with different randomization in each + repetition. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + n_repeats : int, default=10 + Number of times cross-validator needs to be repeated. + + random_state : int, RandomState instance or None, default=None + Controls the generation of the random states for each repetition. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import RepeatedStratifiedKFold + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2, + ... random_state=36851234) + >>> rskf.get_n_splits(X, y) + 4 + >>> print(rskf) + RepeatedStratifiedKFold(n_repeats=2, n_splits=2, random_state=36851234) + >>> for i, (train_index, test_index) in enumerate(rskf.split(X, y)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + ... + Fold 0: + Train: index=[1 2] + Test: index=[0 3] + Fold 1: + Train: index=[0 3] + Test: index=[1 2] + Fold 2: + Train: index=[1 3] + Test: index=[0 2] + Fold 3: + Train: index=[0 2] + Test: index=[1 3] + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + + See Also + -------- + RepeatedKFold : Repeats K-Fold n times. + """ + + def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): + super().__init__( + StratifiedKFold, + n_repeats=n_repeats, + random_state=random_state, + n_splits=n_splits, + ) + + def split(self, X, y, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Note that providing ``y`` is sufficient to generate the splits and + hence ``np.zeros(n_samples)`` may be used as a placeholder for + ``X`` instead of actual training data. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + Stratification is done based on the y labels. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. 
+ + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + """ + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) + return super().split(X, y, groups=groups) + + +class BaseShuffleSplit(_MetadataRequester, metaclass=ABCMeta): + """Base class for *ShuffleSplit. + + Parameters + ---------- + n_splits : int, default=10 + Number of re-shuffling & splitting iterations. + + test_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.1. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the training and testing indices produced. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + """ + + # This indicates that by default CV splitters don't have a "groups" kwarg, + # unless indicated by inheriting from ``GroupsConsumerMixin``. + # This also prevents ``set_split_request`` to be generated for splitters + # which don't support ``groups``. + __metadata_request__split = {"groups": metadata_routing.UNUSED} + + def __init__( + self, n_splits=10, *, test_size=None, train_size=None, random_state=None + ): + self.n_splits = n_splits + self.test_size = test_size + self.train_size = train_size + self.random_state = random_state + self._default_test_size = 0.1 + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. 
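The partitioning in ``_iter_indices`` below draws one permutation per split and slices it; a standalone sketch of that scheme with hypothetical sizes (NumPy only):

import numpy as np

rng = np.random.RandomState(0)
n_samples, n_test, n_train = 10, 3, 7

permutation = rng.permutation(n_samples)
test_idx = permutation[:n_test]                   # first n_test shuffled indices
train_idx = permutation[n_test:n_test + n_train]  # next n_train indices
assert np.intersect1d(test_idx, train_idx).size == 0  # disjoint within one split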
+ """ + X, y, groups = indexable(X, y, groups) + for train, test in self._iter_indices(X, y, groups): + yield train, test + + def _iter_indices(self, X, y=None, groups=None): + """Generate (train, test) indices""" + n_samples = _num_samples(X) + n_train, n_test = _validate_shuffle_split( + n_samples, + self.test_size, + self.train_size, + default_test_size=self._default_test_size, + ) + + rng = check_random_state(self.random_state) + for i in range(self.n_splits): + # random partition + permutation = rng.permutation(n_samples) + ind_test = permutation[:n_test] + ind_train = permutation[n_test : (n_test + n_train)] + yield ind_train, ind_test + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return self.n_splits + + def __repr__(self): + return _build_repr(self) + + +class ShuffleSplit(_UnsupportedGroupCVMixin, BaseShuffleSplit): + """Random permutation cross-validator. + + Yields indices to split data into training and test sets. + + Note: contrary to other cross-validation strategies, random splits + do not guarantee that test sets across all folds will be mutually exclusive, + and might include overlapping samples. However, this is still very likely for + sizeable datasets. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=10 + Number of re-shuffling & splitting iterations. + + test_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.1. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the training and testing indices produced. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import ShuffleSplit + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [3, 4], [5, 6]]) + >>> y = np.array([1, 2, 1, 2, 1, 2]) + >>> rs = ShuffleSplit(n_splits=5, test_size=.25, random_state=0) + >>> rs.get_n_splits(X) + 5 + >>> print(rs) + ShuffleSplit(n_splits=5, random_state=0, test_size=0.25, train_size=None) + >>> for i, (train_index, test_index) in enumerate(rs.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... 
print(f" Test: index={test_index}") + Fold 0: + Train: index=[1 3 0 4] + Test: index=[5 2] + Fold 1: + Train: index=[4 0 2 5] + Test: index=[1 3] + Fold 2: + Train: index=[1 2 4 0] + Test: index=[3 5] + Fold 3: + Train: index=[3 4 1 0] + Test: index=[5 2] + Fold 4: + Train: index=[3 5 1 0] + Test: index=[2 4] + >>> # Specify train and test size + >>> rs = ShuffleSplit(n_splits=5, train_size=0.5, test_size=.25, + ... random_state=0) + >>> for i, (train_index, test_index) in enumerate(rs.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[1 3 0] + Test: index=[5 2] + Fold 1: + Train: index=[4 0 2] + Test: index=[1 3] + Fold 2: + Train: index=[1 2 4] + Test: index=[3 5] + Fold 3: + Train: index=[3 4 1] + Test: index=[5 2] + Fold 4: + Train: index=[3 5 1] + Test: index=[2 4] + """ + + def __init__( + self, n_splits=10, *, test_size=None, train_size=None, random_state=None + ): + super().__init__( + n_splits=n_splits, + test_size=test_size, + train_size=train_size, + random_state=random_state, + ) + self._default_test_size = 0.1 + + +class GroupShuffleSplit(GroupsConsumerMixin, BaseShuffleSplit): + """Shuffle-Group(s)-Out cross-validation iterator. + + Provides randomized train/test indices to split data according to a + third-party provided group. This group information can be used to encode + arbitrary domain specific stratifications of the samples as integers. + + For instance the groups could be the year of collection of the samples + and thus allow for cross-validation against time-based splits. + + The difference between :class:`LeavePGroupsOut` and ``GroupShuffleSplit`` is that + the former generates splits using all subsets of size ``p`` unique groups, + whereas ``GroupShuffleSplit`` generates a user-determined number of random + test splits, each with a user-determined fraction of unique groups. + + For example, a less computationally intensive alternative to + ``LeavePGroupsOut(p=10)`` would be + ``GroupShuffleSplit(test_size=10, n_splits=100)``. + + Contrary to other cross-validation strategies, the random splits + do not guarantee that test sets across all folds will be mutually exclusive, + and might include overlapping samples. However, this is still very likely for + sizeable datasets. + + Note: The parameters ``test_size`` and ``train_size`` refer to groups, and + not to samples as in :class:`ShuffleSplit`. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=5 + Number of re-shuffling & splitting iterations. + + test_size : float, int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of groups to include in the test split (rounded up). If int, + represents the absolute number of test groups. If None, the value is + set to the complement of the train size. If ``train_size`` is also None, + it will be set to 0.2. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the groups to include in the train split. If + int, represents the absolute number of train groups. If None, + the value is automatically set to the complement of the test size. 
+ + random_state : int, RandomState instance or None, default=None + Controls the randomness of the training and testing indices produced. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import GroupShuffleSplit + >>> X = np.ones(shape=(8, 2)) + >>> y = np.ones(shape=(8, 1)) + >>> groups = np.array([1, 1, 2, 2, 2, 3, 3, 3]) + >>> print(groups.shape) + (8,) + >>> gss = GroupShuffleSplit(n_splits=2, train_size=.7, random_state=42) + >>> gss.get_n_splits() + 2 + >>> print(gss) + GroupShuffleSplit(n_splits=2, random_state=42, test_size=None, train_size=0.7) + >>> for i, (train_index, test_index) in enumerate(gss.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}, group={groups[train_index]}") + ... print(f" Test: index={test_index}, group={groups[test_index]}") + Fold 0: + Train: index=[2 3 4 5 6 7], group=[2 2 2 3 3 3] + Test: index=[0 1], group=[1 1] + Fold 1: + Train: index=[0 1 5 6 7], group=[1 1 3 3 3] + Test: index=[2 3 4], group=[2 2 2] + + See Also + -------- + ShuffleSplit : Shuffles samples to create independent test/train sets. + + LeavePGroupsOut : Train set leaves out all possible subsets of `p` groups. + """ + + def __init__( + self, n_splits=5, *, test_size=None, train_size=None, random_state=None + ): + super().__init__( + n_splits=n_splits, + test_size=test_size, + train_size=train_size, + random_state=random_state, + ) + self._default_test_size = 0.2 + + def _iter_indices(self, X, y, groups): + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) + classes, group_indices = np.unique(groups, return_inverse=True) + for group_train, group_test in super()._iter_indices(X=classes): + # these are the indices of classes in the partition + # invert them into data indices + + train = np.flatnonzero(np.isin(group_indices, group_train)) + test = np.flatnonzero(np.isin(group_indices, group_test)) + + yield train, test + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + """ + return super().split(X, y, groups) + + +class StratifiedShuffleSplit(BaseShuffleSplit): + """Stratified ShuffleSplit cross-validator. + + Provides train/test indices to split data in train/test sets. + + This cross-validation object is a merge of :class:`StratifiedKFold` and + :class:`ShuffleSplit`, which returns stratified randomized folds. The folds + are made by preserving the percentage of samples for each class. 
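+
+    For instance, with four samples of class 0 and two of class 1, every
+    test set of size three keeps the 2:1 class ratio exactly (an
+    illustrative sketch; which indices are drawn still depends on
+    ``random_state``):
+
+    >>> import numpy as np
+    >>> from sklearn.model_selection import StratifiedShuffleSplit
+    >>> y = np.array([0, 0, 0, 0, 1, 1])
+    >>> sss = StratifiedShuffleSplit(n_splits=3, test_size=3, random_state=0)
+    >>> for _, test_index in sss.split(np.zeros((6, 2)), y):
+    ...     print(np.bincount(y[test_index]))
+    [2 1]
+    [2 1]
+    [2 1]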
+ + Note: like the :class:`ShuffleSplit` strategy, stratified random splits + do not guarantee that test sets across all folds will be mutually exclusive, + and might include overlapping samples. However, this is still very likely for + sizeable datasets. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=10 + Number of re-shuffling & splitting iterations. + + test_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.1. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the training and testing indices produced. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import StratifiedShuffleSplit + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 0, 1, 1, 1]) + >>> sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0) + >>> sss.get_n_splits(X, y) + 5 + >>> print(sss) + StratifiedShuffleSplit(n_splits=5, random_state=0, ...) + >>> for i, (train_index, test_index) in enumerate(sss.split(X, y)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[5 2 3] + Test: index=[4 1 0] + Fold 1: + Train: index=[5 1 4] + Test: index=[0 2 3] + Fold 2: + Train: index=[5 0 2] + Test: index=[4 3 1] + Fold 3: + Train: index=[4 1 0] + Test: index=[2 3 5] + Fold 4: + Train: index=[0 5 1] + Test: index=[3 4 2] + """ + + def __init__( + self, n_splits=10, *, test_size=None, train_size=None, random_state=None + ): + super().__init__( + n_splits=n_splits, + test_size=test_size, + train_size=train_size, + random_state=random_state, + ) + self._default_test_size = 0.1 + + def _iter_indices(self, X, y, groups=None): + n_samples = _num_samples(X) + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) + n_train, n_test = _validate_shuffle_split( + n_samples, + self.test_size, + self.train_size, + default_test_size=self._default_test_size, + ) + + # Convert to numpy as not all operations are supported by the Array API. 
+ # `y` is probably never a very large array, which means that converting it + # should be cheap + xp, _ = get_namespace(y) + y = _convert_to_numpy(y, xp=xp) + + if y.ndim == 2: + # for multi-label y, map each distinct row to a string repr + # using join because str(row) uses an ellipsis if len(row) > 1000 + y = np.array([" ".join(row.astype("str")) for row in y]) + + classes, y_indices = np.unique(y, return_inverse=True) + n_classes = classes.shape[0] + + class_counts = np.bincount(y_indices) + if np.min(class_counts) < 2: + raise ValueError( + "The least populated class in y has only 1" + " member, which is too few. The minimum" + " number of groups for any class cannot" + " be less than 2." + ) + + if n_train < n_classes: + raise ValueError( + "The train_size = %d should be greater or " + "equal to the number of classes = %d" % (n_train, n_classes) + ) + if n_test < n_classes: + raise ValueError( + "The test_size = %d should be greater or " + "equal to the number of classes = %d" % (n_test, n_classes) + ) + + # Find the sorted list of instances for each class: + # (np.unique above performs a sort, so code is O(n logn) already) + class_indices = np.split( + np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1] + ) + + rng = check_random_state(self.random_state) + + for _ in range(self.n_splits): + # if there are ties in the class-counts, we want + # to make sure to break them anew in each iteration + n_i = _approximate_mode(class_counts, n_train, rng) + class_counts_remaining = class_counts - n_i + t_i = _approximate_mode(class_counts_remaining, n_test, rng) + + train = [] + test = [] + + for i in range(n_classes): + permutation = rng.permutation(class_counts[i]) + perm_indices_class_i = class_indices[i].take(permutation, mode="clip") + + train.extend(perm_indices_class_i[: n_i[i]]) + test.extend(perm_indices_class_i[n_i[i] : n_i[i] + t_i[i]]) + + train = rng.permutation(train) + test = rng.permutation(test) + + yield train, test + + def split(self, X, y, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Note that providing ``y`` is sufficient to generate the splits and + hence ``np.zeros(n_samples)`` may be used as a placeholder for + ``X`` instead of actual training data. + + y : array-like of shape (n_samples,) or (n_samples, n_labels) + The target variable for supervised learning problems. + Stratification is done based on the y labels. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) + return super().split(X, y, groups) + + +def _validate_shuffle_split(n_samples, test_size, train_size, default_test_size=None): + """ + Validation helper to check if the train/test sizes are meaningful w.r.t. the + size of the data (n_samples). 
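+
+    For example (an illustrative sketch of the resolution rules: a float
+    ``test_size`` is ceiled, a float ``train_size`` is floored, and a
+    missing size defaults to the complement of the other):
+
+    >>> _validate_shuffle_split(10, None, None, default_test_size=0.25)
+    (7, 3)
+    >>> _validate_shuffle_split(10, 0.25, 0.5)
+    (5, 3)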
+ """ + if test_size is None and train_size is None: + test_size = default_test_size + + test_size_type = np.asarray(test_size).dtype.kind + train_size_type = np.asarray(train_size).dtype.kind + + if ( + test_size_type == "i" + and (test_size >= n_samples or test_size <= 0) + or test_size_type == "f" + and (test_size <= 0 or test_size >= 1) + ): + raise ValueError( + "test_size={0} should be either positive and smaller" + " than the number of samples {1} or a float in the " + "(0, 1) range".format(test_size, n_samples) + ) + + if ( + train_size_type == "i" + and (train_size >= n_samples or train_size <= 0) + or train_size_type == "f" + and (train_size <= 0 or train_size >= 1) + ): + raise ValueError( + "train_size={0} should be either positive and smaller" + " than the number of samples {1} or a float in the " + "(0, 1) range".format(train_size, n_samples) + ) + + if train_size is not None and train_size_type not in ("i", "f"): + raise ValueError("Invalid value for train_size: {}".format(train_size)) + if test_size is not None and test_size_type not in ("i", "f"): + raise ValueError("Invalid value for test_size: {}".format(test_size)) + + if train_size_type == "f" and test_size_type == "f" and train_size + test_size > 1: + raise ValueError( + "The sum of test_size and train_size = {}, should be in the (0, 1)" + " range. Reduce test_size and/or train_size.".format(train_size + test_size) + ) + + if test_size_type == "f": + n_test = ceil(test_size * n_samples) + elif test_size_type == "i": + n_test = float(test_size) + + if train_size_type == "f": + n_train = floor(train_size * n_samples) + elif train_size_type == "i": + n_train = float(train_size) + + if train_size is None: + n_train = n_samples - n_test + elif test_size is None: + n_test = n_samples - n_train + + if n_train + n_test > n_samples: + raise ValueError( + "The sum of train_size and test_size = %d, " + "should be smaller than the number of " + "samples %d. Reduce test_size and/or " + "train_size." % (n_train + n_test, n_samples) + ) + + n_train, n_test = int(n_train), int(n_test) + + if n_train == 0: + raise ValueError( + "With n_samples={}, test_size={} and train_size={}, the " + "resulting train set will be empty. Adjust any of the " + "aforementioned parameters.".format(n_samples, test_size, train_size) + ) + + return n_train, n_test + + +class PredefinedSplit(BaseCrossValidator): + """Predefined split cross-validator. + + Provides train/test indices to split data into train/test sets using a + predefined scheme specified by the user with the ``test_fold`` parameter. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.16 + + Parameters + ---------- + test_fold : array-like of shape (n_samples,) + The entry ``test_fold[i]`` represents the index of the test set that + sample ``i`` belongs to. It is possible to exclude sample ``i`` from + any test set (i.e. include sample ``i`` in every training set) by + setting ``test_fold[i]`` equal to -1. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import PredefinedSplit + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> test_fold = [0, 1, -1, 1] + >>> ps = PredefinedSplit(test_fold) + >>> ps.get_n_splits() + 2 + >>> print(ps) + PredefinedSplit(test_fold=array([ 0, 1, -1, 1])) + >>> for i, (train_index, test_index) in enumerate(ps.split()): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... 
print(f" Test: index={test_index}") + Fold 0: + Train: index=[1 2 3] + Test: index=[0] + Fold 1: + Train: index=[0 2] + Test: index=[1 3] + """ + + def __init__(self, test_fold): + self.test_fold = np.array(test_fold, dtype=int) + self.test_fold = column_or_1d(self.test_fold) + self.unique_folds = np.unique(self.test_fold) + self.unique_folds = self.unique_folds[self.unique_folds != -1] + + def split(self, X=None, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + return self._split() + + def _split(self): + """Generate indices to split data into training and test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + ind = np.arange(len(self.test_fold)) + for test_index in self._iter_test_masks(): + train_index = ind[np.logical_not(test_index)] + test_index = ind[test_index] + yield train_index, test_index + + def _iter_test_masks(self): + """Generates boolean masks corresponding to test sets.""" + for f in self.unique_folds: + test_index = np.where(self.test_fold == f)[0] + test_mask = np.zeros(len(self.test_fold), dtype=bool) + test_mask[test_index] = True + yield test_mask + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return len(self.unique_folds) + + +class _CVIterableWrapper(BaseCrossValidator): + """Wrapper class for old style cv objects and iterables.""" + + def __init__(self, cv): + self.cv = list(cv) + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return len(self.cv) + + def split(self, X=None, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + for train, test in self.cv: + yield train, test + + +def check_cv(cv=5, y=None, *, classifier=False): + """Input checker utility for building a cross-validator. 
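+
+    Depending on the inputs, this resolves to a concrete splitter: integers
+    (and `None`) become :class:`KFold` or :class:`StratifiedKFold`, objects
+    exposing a ``split`` method pass through unchanged, and any other
+    iterable of (train, test) index pairs is wrapped so it can be re-used.
+    A small sketch of the wrapping branch:
+
+    >>> from sklearn.model_selection import check_cv
+    >>> splits = [([0, 1], [2]), ([0, 2], [1])]
+    >>> check_cv(splits).get_n_splits()
+    2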
+ + Parameters + ---------- + cv : int, cross-validation generator, iterable or None, default=5 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable that generates (train, test) splits as arrays of indices. + + For integer/None inputs, if classifier is True and ``y`` is either + binary or multiclass, :class:`StratifiedKFold` is used. In all other + cases, :class:`KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value changed from 3-fold to 5-fold. + + y : array-like, default=None + The target variable for supervised learning problems. + + classifier : bool, default=False + Whether the task is a classification task, in which case + stratified KFold will be used. + + Returns + ------- + checked_cv : a cross-validator instance. + The return value is a cross-validator which generates the train/test + splits via the ``split`` method. + + Examples + -------- + >>> from sklearn.model_selection import check_cv + >>> check_cv(cv=5, y=None, classifier=False) + KFold(...) + >>> check_cv(cv=5, y=[1, 1, 0, 0, 0, 0], classifier=True) + StratifiedKFold(...) + """ + cv = 5 if cv is None else cv + if isinstance(cv, numbers.Integral): + if ( + classifier + and (y is not None) + and (type_of_target(y, input_name="y") in ("binary", "multiclass")) + ): + return StratifiedKFold(cv) + else: + return KFold(cv) + + if not hasattr(cv, "split") or isinstance(cv, str): + if not isinstance(cv, Iterable) or isinstance(cv, str): + raise ValueError( + "Expected cv as an integer, cross-validation " + "object (from sklearn.model_selection) " + "or an iterable. Got %s." % cv + ) + return _CVIterableWrapper(cv) + + return cv # New style cv objects are passed without any modification + + +@validate_params( + { + "test_size": [ + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(numbers.Integral, 1, None, closed="left"), + None, + ], + "train_size": [ + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(numbers.Integral, 1, None, closed="left"), + None, + ], + "random_state": ["random_state"], + "shuffle": ["boolean"], + "stratify": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def train_test_split( + *arrays, + test_size=None, + train_size=None, + random_state=None, + shuffle=True, + stratify=None, +): + """Split arrays or matrices into random train and test subsets. + + Quick utility that wraps input validation, + ``next(ShuffleSplit().split(X, y))``, and application to input data + into a single call for splitting (and optionally subsampling) data into a + one-liner. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + *arrays : sequence of indexables with same length / shape[0] + Allowed inputs are lists, numpy arrays, scipy-sparse + matrices or pandas dataframes. + + test_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.25. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. 
If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, default=None + Controls the shuffling applied to the data before applying the split. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + shuffle : bool, default=True + Whether or not to shuffle the data before splitting. If shuffle=False + then stratify must be None. + + stratify : array-like, default=None + If not None, data is split in a stratified fashion, using this as + the class labels. + Read more in the :ref:`User Guide `. + + Returns + ------- + splitting : list, length=2 * len(arrays) + List containing train-test split of inputs. + + .. versionadded:: 0.16 + If the input is sparse, the output will be a + ``scipy.sparse.csr_matrix``. Else, output type is the same as the + input type. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import train_test_split + >>> X, y = np.arange(10).reshape((5, 2)), range(5) + >>> X + array([[0, 1], + [2, 3], + [4, 5], + [6, 7], + [8, 9]]) + >>> list(y) + [0, 1, 2, 3, 4] + + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, test_size=0.33, random_state=42) + ... + >>> X_train + array([[4, 5], + [0, 1], + [6, 7]]) + >>> y_train + [2, 0, 3] + >>> X_test + array([[2, 3], + [8, 9]]) + >>> y_test + [1, 4] + + >>> train_test_split(y, shuffle=False) + [[0, 1, 2], [3, 4]] + """ + n_arrays = len(arrays) + if n_arrays == 0: + raise ValueError("At least one array required as input") + + arrays = indexable(*arrays) + + n_samples = _num_samples(arrays[0]) + n_train, n_test = _validate_shuffle_split( + n_samples, test_size, train_size, default_test_size=0.25 + ) + + if shuffle is False: + if stratify is not None: + raise ValueError( + "Stratified train/test split is not implemented for shuffle=False" + ) + + train = np.arange(n_train) + test = np.arange(n_train, n_train + n_test) + + else: + if stratify is not None: + CVClass = StratifiedShuffleSplit + else: + CVClass = ShuffleSplit + + cv = CVClass(test_size=n_test, train_size=n_train, random_state=random_state) + + train, test = next(cv.split(X=arrays[0], y=stratify)) + + train, test = ensure_common_namespace_device(arrays[0], train, test) + + return list( + chain.from_iterable( + (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays + ) + ) + + +# Tell nose that train_test_split is not a test. +# (Needed for external libraries that may use nose.) +# Use setattr to avoid mypy errors when monkeypatching. +setattr(train_test_split, "__test__", False) + + +def _pprint(params, offset=0, printer=repr): + """Pretty print the dictionary 'params' + + Parameters + ---------- + params : dict + The dictionary to pretty print + + offset : int, default=0 + The offset in characters to add at the begin of each line. + + printer : callable, default=repr + The function to convert entries to strings, typically + the builtin str or repr + + """ + # Do a multi-line justified repr: + options = np.get_printoptions() + np.set_printoptions(precision=5, threshold=64, edgeitems=2) + params_list = list() + this_line_length = offset + line_sep = ",\n" + (1 + offset // 2) * " " + for i, (k, v) in enumerate(sorted(params.items())): + if isinstance(v, float): + # use str for representing floating point numbers + # this way we get consistent representation across + # architectures and versions. 
+ this_repr = "%s=%s" % (k, str(v)) + else: + # use repr of the rest + this_repr = "%s=%s" % (k, printer(v)) + if len(this_repr) > 500: + this_repr = this_repr[:300] + "..." + this_repr[-100:] + if i > 0: + if this_line_length + len(this_repr) >= 75 or "\n" in this_repr: + params_list.append(line_sep) + this_line_length = len(line_sep) + else: + params_list.append(", ") + this_line_length += 2 + params_list.append(this_repr) + this_line_length += len(this_repr) + + np.set_printoptions(**options) + lines = "".join(params_list) + # Strip trailing space to avoid nightmare in doctests + lines = "\n".join(l.rstrip(" ") for l in lines.split("\n")) + return lines + + +def _build_repr(self): + # XXX This is copied from BaseEstimator's get_params + cls = self.__class__ + init = getattr(cls.__init__, "deprecated_original", cls.__init__) + # Ignore varargs, kw and default values and pop self + init_signature = signature(init) + # Consider the constructor parameters excluding 'self' + if init is object.__init__: + args = [] + else: + args = sorted( + [ + p.name + for p in init_signature.parameters.values() + if p.name != "self" and p.kind != p.VAR_KEYWORD + ] + ) + class_name = self.__class__.__name__ + params = dict() + for key in args: + # We need deprecation warnings to always be on in order to + # catch deprecated param values. + # This is set in utils/__init__.py but it gets overwritten + # when running under python3 somehow. + warnings.simplefilter("always", FutureWarning) + try: + with warnings.catch_warnings(record=True) as w: + value = getattr(self, key, None) + if value is None and hasattr(self, "cvargs"): + value = self.cvargs.get(key, None) + if len(w) and w[0].category is FutureWarning: + # if the parameter is deprecated, don't show it + continue + finally: + warnings.filters.pop(0) + params[key] = value + + return "%s(%s)" % (class_name, _pprint(params, offset=len(class_name))) + + +def _yields_constant_splits(cv): + # Return True if calling cv.split() always returns the same splits + # We assume that if a cv doesn't have a shuffle parameter, it shuffles by + # default (e.g. ShuffleSplit). If it actually doesn't shuffle (e.g. + # LeaveOneOut), then it won't have a random_state parameter anyway, in + # which case it will default to 0, leading to output=True + shuffle = getattr(cv, "shuffle", True) + random_state = getattr(cv, "random_state", 0) + return isinstance(random_state, numbers.Integral) or not shuffle diff --git a/.venv/Lib/site-packages/sklearn/model_selection/_validation.py b/.venv/Lib/site-packages/sklearn/model_selection/_validation.py new file mode 100644 index 0000000000000000000000000000000000000000..08e7c4bff566aefdf008c0470cc680fe8c76cdd0 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/_validation.py @@ -0,0 +1,2550 @@ +""" +The :mod:`sklearn.model_selection._validation` module includes classes and +functions to validate the model. 
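+
+The public entry points are :func:`cross_validate`, :func:`cross_val_score`,
+:func:`cross_val_predict`, :func:`permutation_test_score`,
+:func:`learning_curve` and :func:`validation_curve`.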
+"""
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+
+import numbers
+import time
+import warnings
+from collections import Counter
+from contextlib import suppress
+from functools import partial
+from numbers import Real
+from traceback import format_exc
+
+import numpy as np
+import scipy.sparse as sp
+from joblib import logger
+
+from ..base import clone, is_classifier
+from ..exceptions import FitFailedWarning, UnsetMetadataPassedError
+from ..metrics import check_scoring, get_scorer_names
+from ..metrics._scorer import _MultimetricScorer
+from ..preprocessing import LabelEncoder
+from ..utils import Bunch, _safe_indexing, check_random_state, indexable
+from ..utils._array_api import device, get_namespace
+from ..utils._param_validation import (
+    HasMethods,
+    Integral,
+    Interval,
+    StrOptions,
+    validate_params,
+)
+from ..utils.metadata_routing import (
+    MetadataRouter,
+    MethodMapping,
+    _routing_enabled,
+    process_routing,
+)
+from ..utils.metaestimators import _safe_split
+from ..utils.parallel import Parallel, delayed
+from ..utils.validation import _check_method_params, _num_samples
+from ._split import check_cv
+
+__all__ = [
+    "cross_validate",
+    "cross_val_score",
+    "cross_val_predict",
+    "permutation_test_score",
+    "learning_curve",
+    "validation_curve",
+]
+
+
+def _check_params_groups_deprecation(fit_params, params, groups, version):
+    """A helper function to check deprecations on `groups` and `fit_params`.
+
+    # TODO(SLEP6): To be removed when set_config(enable_metadata_routing=False) is not
+    # possible.
+    """
+    if params is not None and fit_params is not None:
+        raise ValueError(
+            "`params` and `fit_params` cannot both be provided. Pass parameters "
+            "via `params`. `fit_params` is deprecated and will be removed in "
+            f"version {version}."
+        )
+    elif fit_params is not None:
+        warnings.warn(
+            (
+                f"`fit_params` is deprecated and will be removed in version {version}. "
+                "Pass parameters via `params` instead."
+            ),
+            FutureWarning,
+        )
+        params = fit_params
+
+    params = {} if params is None else params
+
+    _check_groups_routing_disabled(groups)
+
+    return params
+
+
+# TODO(SLEP6): To be removed when set_config(enable_metadata_routing=False) is not
+# possible.
+def _check_groups_routing_disabled(groups):
+    if groups is not None and _routing_enabled():
+        raise ValueError(
+            "`groups` can only be passed if metadata routing is not enabled via"
+            " `sklearn.set_config(enable_metadata_routing=True)`. When routing is"
+            " enabled, pass `groups` alongside other metadata via the `params` argument"
+            " instead."
+    )
+
+
+@validate_params(
+    {
+        "estimator": [HasMethods("fit")],
+        "X": ["array-like", "sparse matrix"],
+        "y": ["array-like", None],
+        "groups": ["array-like", None],
+        "scoring": [
+            StrOptions(set(get_scorer_names())),
+            callable,
+            list,
+            tuple,
+            dict,
+            None,
+        ],
+        "cv": ["cv_object"],
+        "n_jobs": [Integral, None],
+        "verbose": ["verbose"],
+        "params": [dict, None],
+        "pre_dispatch": [Integral, str],
+        "return_train_score": ["boolean"],
+        "return_estimator": ["boolean"],
+        "return_indices": ["boolean"],
+        "error_score": [StrOptions({"raise"}), Real],
+    },
+    prefer_skip_nested_validation=False,  # estimator is not validated yet
+)
+def cross_validate(
+    estimator,
+    X,
+    y=None,
+    *,
+    groups=None,
+    scoring=None,
+    cv=None,
+    n_jobs=None,
+    verbose=0,
+    params=None,
+    pre_dispatch="2*n_jobs",
+    return_train_score=False,
+    return_estimator=False,
+    return_indices=False,
+    error_score=np.nan,
+):
+    """Evaluate metric(s) by cross-validation and also record fit/score times.
+
+    Read more in the :ref:`User Guide <multimetric_cross_validation>`.
+
+    Parameters
+    ----------
+    estimator : estimator object implementing 'fit'
+        The object to use to fit the data.
+
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        The data to fit. Can be for example a list, or an array.
+
+    y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
+        The target variable to try to predict in the case of
+        supervised learning.
+
+    groups : array-like of shape (n_samples,), default=None
+        Group labels for the samples used while splitting the dataset into
+        train/test set. Only used in conjunction with a "Group" :term:`cv`
+        instance (e.g., :class:`GroupKFold`).
+
+        .. versionchanged:: 1.4
+            ``groups`` can only be passed if metadata routing is not enabled
+            via ``sklearn.set_config(enable_metadata_routing=True)``. When routing
+            is enabled, pass ``groups`` alongside other metadata via the ``params``
+            argument instead. E.g.:
+            ``cross_validate(..., params={'groups': groups})``.
+
+    scoring : str, callable, list, tuple, or dict, default=None
+        Strategy to evaluate the performance of the cross-validated model on
+        the test set. If `None`, the
+        :ref:`default evaluation criterion ` of the estimator
+        is used.
+
+        If `scoring` represents a single score, one can use:
+
+        - a single string (see :ref:`scoring_parameter`);
+        - a callable (see :ref:`scoring_callable`) that returns a single value.
+
+        If `scoring` represents multiple scores, one can use:
+
+        - a list or tuple of unique strings;
+        - a callable returning a dictionary where the keys are the metric
+          names and the values are the metric scores;
+        - a dictionary with metric names as keys and callables as values.
+
+        See :ref:`multimetric_grid_search` for an example.
+
+    cv : int, cross-validation generator or an iterable, default=None
+        Determines the cross-validation splitting strategy.
+        Possible inputs for cv are:
+
+        - None, to use the default 5-fold cross validation,
+        - int, to specify the number of folds in a `(Stratified)KFold`,
+        - :term:`CV splitter`,
+        - An iterable yielding (train, test) splits as arrays of indices.
+
+        For int/None inputs, if the estimator is a classifier and ``y`` is
+        either binary or multiclass, :class:`StratifiedKFold` is used. In all
+        other cases, :class:`KFold` is used. These splitters are instantiated
+        with `shuffle=False` so the splits will be the same across calls.
+
+        Refer :ref:`User Guide <cross_validation>` for the various
+        cross-validation strategies that can be used here.
+
+        ..
versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and computing + the score are parallelized over the cross-validation splits. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + The verbosity level. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit``, the scorer, + and the CV splitter. + + .. versionadded:: 1.4 + + pre_dispatch : int or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + return_train_score : bool, default=False + Whether to include train scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + + return_estimator : bool, default=False + Whether to return the estimators fitted on each split. + + .. versionadded:: 0.20 + + return_indices : bool, default=False + Whether to return the train-test indices selected for each split. + + .. versionadded:: 1.3 + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. + If a numeric value is given, FitFailedWarning is raised. + + .. versionadded:: 0.20 + + Returns + ------- + scores : dict of float arrays of shape (n_splits,) + Array of scores of the estimator for each run of the cross validation. + + A dict of arrays containing the score/time arrays for each scorer is + returned. The possible keys for this ``dict`` are: + + ``test_score`` + The score array for test scores on each cv split. + Suffix ``_score`` in ``test_score`` changes to a specific + metric like ``test_r2`` or ``test_auc`` if there are + multiple scoring metrics in the scoring parameter. + ``train_score`` + The score array for train scores on each cv split. + Suffix ``_score`` in ``train_score`` changes to a specific + metric like ``train_r2`` or ``train_auc`` if there are + multiple scoring metrics in the scoring parameter. + This is available only if ``return_train_score`` parameter + is ``True``. + ``fit_time`` + The time for fitting the estimator on the train + set for each cv split. + ``score_time`` + The time for scoring the estimator on the test set for each + cv split. (Note time for scoring on the train set is not + included even if ``return_train_score`` is set to ``True`` + ``estimator`` + The estimator objects for each cv split. + This is available only if ``return_estimator`` parameter + is set to ``True``. + ``indices`` + The train/test positional indices for each cv split. 
A dictionary + is returned where the keys are either `"train"` or `"test"` + and the associated values are a list of integer-dtyped NumPy + arrays with the indices. Available only if `return_indices=True`. + + See Also + -------- + cross_val_score : Run cross-validation for single metric evaluation. + + cross_val_predict : Get predictions from each split of cross-validation for + diagnostic purposes. + + sklearn.metrics.make_scorer : Make a scorer from a performance metric or + loss function. + + Examples + -------- + >>> from sklearn import datasets, linear_model + >>> from sklearn.model_selection import cross_validate + >>> from sklearn.metrics import make_scorer + >>> from sklearn.metrics import confusion_matrix + >>> from sklearn.svm import LinearSVC + >>> diabetes = datasets.load_diabetes() + >>> X = diabetes.data[:150] + >>> y = diabetes.target[:150] + >>> lasso = linear_model.Lasso() + + Single metric evaluation using ``cross_validate`` + + >>> cv_results = cross_validate(lasso, X, y, cv=3) + >>> sorted(cv_results.keys()) + ['fit_time', 'score_time', 'test_score'] + >>> cv_results['test_score'] + array([0.3315057 , 0.08022103, 0.03531816]) + + Multiple metric evaluation using ``cross_validate`` + (please refer the ``scoring`` parameter doc for more information) + + >>> scores = cross_validate(lasso, X, y, cv=3, + ... scoring=('r2', 'neg_mean_squared_error'), + ... return_train_score=True) + >>> print(scores['test_neg_mean_squared_error']) + [-3635.5... -3573.3... -6114.7...] + >>> print(scores['train_r2']) + [0.28009951 0.3908844 0.22784907] + """ + _check_groups_routing_disabled(groups) + + X, y = indexable(X, y) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + + scorers = check_scoring( + estimator, scoring=scoring, raise_exc=(error_score == "raise") + ) + + if _routing_enabled(): + # For estimators, a MetadataRouter is created in get_metadata_routing + # methods. For these router methods, we create the router to use + # `process_routing` on it. + router = ( + MetadataRouter(owner="cross_validate") + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata to the predict method for + # scoring? + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + .add( + scorer=scorers, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + unrequested_params = sorted(e.unrequested_params) + raise UnsetMetadataPassedError( + message=( + f"{unrequested_params} are passed to cross validation but are not" + " explicitly set as requested or not requested for cross_validate's" + f" estimator: {estimator.__class__.__name__}. Call" + " `.set_fit_request({{metadata}}=True)` on the estimator for" + f" each metadata in {unrequested_params} that you" + " want to use and `metadata=False` for not using it. See the" + " Metadata Routing User guide" + " for more" + " information." 
+ ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.estimator = Bunch(fit=params) + routed_params.scorer = Bunch(score={}) + + indices = cv.split(X, y, **routed_params.splitter.split) + if return_indices: + # materialize the indices since we need to store them in the returned dict + indices = list(indices) + + # We clone the estimator to make sure that all the folds are + # independent, and that it is pickle-able. + parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) + results = parallel( + delayed(_fit_and_score)( + clone(estimator), + X, + y, + scorer=scorers, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + return_train_score=return_train_score, + return_times=True, + return_estimator=return_estimator, + error_score=error_score, + ) + for train, test in indices + ) + + _warn_or_raise_about_fit_failures(results, error_score) + + # For callable scoring, the return type is only know after calling. If the + # return type is a dictionary, the error scores can now be inserted with + # the correct key. + if callable(scoring): + _insert_error_scores(results, error_score) + + results = _aggregate_score_dicts(results) + + ret = {} + ret["fit_time"] = results["fit_time"] + ret["score_time"] = results["score_time"] + + if return_estimator: + ret["estimator"] = results["estimator"] + + if return_indices: + ret["indices"] = {} + ret["indices"]["train"], ret["indices"]["test"] = zip(*indices) + + test_scores_dict = _normalize_score_results(results["test_scores"]) + if return_train_score: + train_scores_dict = _normalize_score_results(results["train_scores"]) + + for name in test_scores_dict: + ret["test_%s" % name] = test_scores_dict[name] + if return_train_score: + key = "train_%s" % name + ret[key] = train_scores_dict[name] + + return ret + + +def _insert_error_scores(results, error_score): + """Insert error in `results` by replacing them inplace with `error_score`. + + This only applies to multimetric scores because `_fit_and_score` will + handle the single metric case. 
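+
+    A minimal sketch of the in-place replacement (hypothetical `results`
+    entries, with `nan` as the error score):
+
+    >>> results = [
+    ...     {"fit_error": "ValueError", "test_scores": None},
+    ...     {"fit_error": None, "test_scores": {"r2": 0.5}},
+    ... ]
+    >>> _insert_error_scores(results, error_score=float("nan"))
+    >>> results[0]["test_scores"]
+    {'r2': nan}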
+    """
+    successful_score = None
+    failed_indices = []
+    for i, result in enumerate(results):
+        if result["fit_error"] is not None:
+            failed_indices.append(i)
+        elif successful_score is None:
+            successful_score = result["test_scores"]
+
+    if isinstance(successful_score, dict):
+        formatted_error = {name: error_score for name in successful_score}
+        for i in failed_indices:
+            results[i]["test_scores"] = formatted_error.copy()
+            if "train_scores" in results[i]:
+                results[i]["train_scores"] = formatted_error.copy()
+
+
+def _normalize_score_results(scores, scalar_score_key="score"):
+    """Create a scoring dictionary based on the type of `scores`."""
+    if isinstance(scores[0], dict):
+        # multimetric scoring
+        return _aggregate_score_dicts(scores)
+    # scalar
+    return {scalar_score_key: scores}
+
+
+def _warn_or_raise_about_fit_failures(results, error_score):
+    fit_errors = [
+        result["fit_error"] for result in results if result["fit_error"] is not None
+    ]
+    if fit_errors:
+        num_failed_fits = len(fit_errors)
+        num_fits = len(results)
+        fit_errors_counter = Counter(fit_errors)
+        delimiter = "-" * 80 + "\n"
+        fit_errors_summary = "\n".join(
+            f"{delimiter}{n} fits failed with the following error:\n{error}"
+            for error, n in fit_errors_counter.items()
+        )
+
+        if num_failed_fits == num_fits:
+            all_fits_failed_message = (
+                f"\nAll the {num_fits} fits failed.\n"
+                "It is very likely that your model is misconfigured.\n"
+                "You can try to debug the error by setting error_score='raise'.\n\n"
+                f"Below are more details about the failures:\n{fit_errors_summary}"
+            )
+            raise ValueError(all_fits_failed_message)
+
+        else:
+            some_fits_failed_message = (
+                f"\n{num_failed_fits} fits failed out of a total of {num_fits}.\n"
+                "The score on these train-test partitions for these parameters"
+                f" will be set to {error_score}.\n"
+                "If these failures are not expected, you can try to debug them "
+                "by setting error_score='raise'.\n\n"
+                f"Below are more details about the failures:\n{fit_errors_summary}"
+            )
+            warnings.warn(some_fits_failed_message, FitFailedWarning)
+
+
+@validate_params(
+    {
+        "estimator": [HasMethods("fit")],
+        "X": ["array-like", "sparse matrix"],
+        "y": ["array-like", None],
+        "groups": ["array-like", None],
+        "scoring": [StrOptions(set(get_scorer_names())), callable, None],
+        "cv": ["cv_object"],
+        "n_jobs": [Integral, None],
+        "verbose": ["verbose"],
+        "params": [dict, None],
+        "pre_dispatch": [Integral, str, None],
+        "error_score": [StrOptions({"raise"}), Real],
+    },
+    prefer_skip_nested_validation=False,  # estimator is not validated yet
+)
+def cross_val_score(
+    estimator,
+    X,
+    y=None,
+    *,
+    groups=None,
+    scoring=None,
+    cv=None,
+    n_jobs=None,
+    verbose=0,
+    params=None,
+    pre_dispatch="2*n_jobs",
+    error_score=np.nan,
+):
+    """Evaluate a score by cross-validation.
+
+    Read more in the :ref:`User Guide <cross_validation>`.
+
+    Parameters
+    ----------
+    estimator : estimator object implementing 'fit'
+        The object to use to fit the data.
+
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        The data to fit. Can be for example a list, or an array.
+
+    y : array-like of shape (n_samples,) or (n_samples, n_outputs), \
+            default=None
+        The target variable to try to predict in the case of
+        supervised learning.
+
+    groups : array-like of shape (n_samples,), default=None
+        Group labels for the samples used while splitting the dataset into
+        train/test set. Only used in conjunction with a "Group" :term:`cv`
+        instance (e.g., :class:`GroupKFold`).
+
+        ..
versionchanged:: 1.4 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``cross_val_score(..., params={'groups': groups})``. + + scoring : str or callable, default=None + A str (see :ref:`scoring_parameter`) or a scorer callable object / function with + signature ``scorer(estimator, X, y)`` which should return only a single value. + + Similar to :func:`cross_validate` + but only a single metric is permitted. + + If `None`, the estimator's default scorer (if available) is used. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - `None`, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable that generates (train, test) splits as arrays of indices. + + For `int`/`None` inputs, if the estimator is a classifier and `y` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + `cv` default value if `None` changed from 3-fold to 5-fold. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and computing + the score are parallelized over the cross-validation splits. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + The verbosity level. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit``, the scorer, + and the CV splitter. + + .. versionadded:: 1.4 + + pre_dispatch : int or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - ``None``, in which case all the jobs are immediately created and spawned. Use + this for lightweight and fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. + If a numeric value is given, FitFailedWarning is raised. + + .. versionadded:: 0.20 + + Returns + ------- + scores : ndarray of float of shape=(len(list(cv)),) + Array of scores of the estimator for each run of the cross validation. + + See Also + -------- + cross_validate : To run cross-validation on multiple metrics and also to + return train scores, fit times and score times. + + cross_val_predict : Get predictions from each split of cross-validation for + diagnostic purposes. + + sklearn.metrics.make_scorer : Make a scorer from a performance metric or + loss function. 
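+
+    Notes
+    -----
+    This function is a thin convenience wrapper around :func:`cross_validate`:
+    it forwards all arguments with a single scorer and returns only the
+    ``test_score`` array from the resulting dictionary.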
+
+    Examples
+    --------
+    >>> from sklearn import datasets, linear_model
+    >>> from sklearn.model_selection import cross_val_score
+    >>> diabetes = datasets.load_diabetes()
+    >>> X = diabetes.data[:150]
+    >>> y = diabetes.target[:150]
+    >>> lasso = linear_model.Lasso()
+    >>> print(cross_val_score(lasso, X, y, cv=3))
+    [0.3315057 0.08022103 0.03531816]
+    """
+    # To ensure the multimetric format is not supported (single scorer only)
+    scorer = check_scoring(estimator, scoring=scoring)
+
+    cv_results = cross_validate(
+        estimator=estimator,
+        X=X,
+        y=y,
+        groups=groups,
+        scoring={"score": scorer},
+        cv=cv,
+        n_jobs=n_jobs,
+        verbose=verbose,
+        params=params,
+        pre_dispatch=pre_dispatch,
+        error_score=error_score,
+    )
+    return cv_results["test_score"]
+
+
+def _fit_and_score(
+    estimator,
+    X,
+    y,
+    *,
+    scorer,
+    train,
+    test,
+    verbose,
+    parameters,
+    fit_params,
+    score_params,
+    return_train_score=False,
+    return_parameters=False,
+    return_n_test_samples=False,
+    return_times=False,
+    return_estimator=False,
+    split_progress=None,
+    candidate_progress=None,
+    error_score=np.nan,
+):
+    """Fit estimator and compute scores for a given dataset split.
+
+    Parameters
+    ----------
+    estimator : estimator object implementing 'fit'
+        The object to use to fit the data.
+
+    X : array-like of shape (n_samples, n_features)
+        The data to fit.
+
+    y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None
+        The target variable to try to predict in the case of
+        supervised learning.
+
+    scorer : A single callable or dict mapping scorer name to the callable
+        If it is a single callable, the return value for ``train_scores`` and
+        ``test_scores`` is a single float.
+
+        For a dict, it should be one mapping the scorer name to the scorer
+        callable object / function.
+
+        The callable object / function should have signature
+        ``scorer(estimator, X, y)``.
+
+    train : array-like of shape (n_train_samples,)
+        Indices of training samples.
+
+    test : array-like of shape (n_test_samples,)
+        Indices of test samples.
+
+    verbose : int
+        The verbosity level.
+
+    error_score : 'raise' or numeric, default=np.nan
+        Value to assign to the score if an error occurs in estimator fitting.
+        If set to 'raise', the error is raised.
+        If a numeric value is given, FitFailedWarning is raised.
+
+    parameters : dict or None
+        Parameters to be set on the estimator.
+
+    fit_params : dict or None
+        Parameters that will be passed to ``estimator.fit``.
+
+    score_params : dict or None
+        Parameters that will be passed to the scorer.
+
+    return_train_score : bool, default=False
+        Compute and return score on training set.
+
+    return_parameters : bool, default=False
+        Return parameters that have been used for the estimator.
+
+    split_progress : {list, tuple} of int, default=None
+        A list or tuple of format (<current_split_id>, <max_nb_of_splits>).
+
+    candidate_progress : {list, tuple} of int, default=None
+        A list or tuple of format
+        (<current_candidate_id>, <max_nb_of_candidates>).
+
+    return_n_test_samples : bool, default=False
+        Whether to return the ``n_test_samples``.
+
+    return_times : bool, default=False
+        Whether to return the fit/score times.
+
+    return_estimator : bool, default=False
+        Whether to return the fitted estimator.
+
+    Returns
+    -------
+    result : dict with the following attributes
+        train_scores : dict of scorer name -> float
+            Score on training set (for all the scorers),
+            returned only if `return_train_score` is `True`.
+        test_scores : dict of scorer name -> float
+            Score on testing set (for all the scorers).
+        n_test_samples : int
+            Number of test samples.
+        fit_time : float
+            Time spent for fitting in seconds.
+ score_time : float + Time spent for scoring in seconds. + parameters : dict or None + The parameters that have been evaluated. + estimator : estimator object + The fitted estimator. + fit_error : str or None + Traceback str if the fit failed, None if the fit succeeded. + """ + xp, _ = get_namespace(X) + X_device = device(X) + + # Make sure that we can fancy index X even if train and test are provided + # as NumPy arrays by NumPy only cross-validation splitters. + train, test = xp.asarray(train, device=X_device), xp.asarray(test, device=X_device) + + if not isinstance(error_score, numbers.Number) and error_score != "raise": + raise ValueError( + "error_score must be the string 'raise' or a numeric value. " + "(Hint: if using 'raise', please make sure that it has been " + "spelled correctly.)" + ) + + progress_msg = "" + if verbose > 2: + if split_progress is not None: + progress_msg = f" {split_progress[0]+1}/{split_progress[1]}" + if candidate_progress and verbose > 9: + progress_msg += f"; {candidate_progress[0]+1}/{candidate_progress[1]}" + + if verbose > 1: + if parameters is None: + params_msg = "" + else: + sorted_keys = sorted(parameters) # Ensure deterministic o/p + params_msg = ", ".join(f"{k}={parameters[k]}" for k in sorted_keys) + if verbose > 9: + start_msg = f"[CV{progress_msg}] START {params_msg}" + print(f"{start_msg}{(80 - len(start_msg)) * '.'}") + + # Adjust length of sample weights + fit_params = fit_params if fit_params is not None else {} + fit_params = _check_method_params(X, params=fit_params, indices=train) + score_params = score_params if score_params is not None else {} + score_params_train = _check_method_params(X, params=score_params, indices=train) + score_params_test = _check_method_params(X, params=score_params, indices=test) + + if parameters is not None: + # here we clone the parameters, since sometimes the parameters + # themselves might be estimators, e.g. when we search over different + # estimators in a pipeline. 
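+        # (illustrative: a grid such as {"clf": [SVC(), LogisticRegression()]}
+        # makes `parameters["clf"]` an estimator instance, which must be
+        # cloned so that folds do not share fitted state)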
+ # ref: https://github.com/scikit-learn/scikit-learn/pull/26786 + estimator = estimator.set_params(**clone(parameters, safe=False)) + + start_time = time.time() + + X_train, y_train = _safe_split(estimator, X, y, train) + X_test, y_test = _safe_split(estimator, X, y, test, train) + + result = {} + try: + if y_train is None: + estimator.fit(X_train, **fit_params) + else: + estimator.fit(X_train, y_train, **fit_params) + + except Exception: + # Note fit time as time until error + fit_time = time.time() - start_time + score_time = 0.0 + if error_score == "raise": + raise + elif isinstance(error_score, numbers.Number): + if isinstance(scorer, _MultimetricScorer): + test_scores = {name: error_score for name in scorer._scorers} + if return_train_score: + train_scores = test_scores.copy() + else: + test_scores = error_score + if return_train_score: + train_scores = error_score + result["fit_error"] = format_exc() + else: + result["fit_error"] = None + + fit_time = time.time() - start_time + test_scores = _score( + estimator, X_test, y_test, scorer, score_params_test, error_score + ) + score_time = time.time() - start_time - fit_time + if return_train_score: + train_scores = _score( + estimator, X_train, y_train, scorer, score_params_train, error_score + ) + + if verbose > 1: + total_time = score_time + fit_time + end_msg = f"[CV{progress_msg}] END " + result_msg = params_msg + (";" if params_msg else "") + if verbose > 2: + if isinstance(test_scores, dict): + for scorer_name in sorted(test_scores): + result_msg += f" {scorer_name}: (" + if return_train_score: + scorer_scores = train_scores[scorer_name] + result_msg += f"train={scorer_scores:.3f}, " + result_msg += f"test={test_scores[scorer_name]:.3f})" + else: + result_msg += ", score=" + if return_train_score: + result_msg += f"(train={train_scores:.3f}, test={test_scores:.3f})" + else: + result_msg += f"{test_scores:.3f}" + result_msg += f" total time={logger.short_format_time(total_time)}" + + # Right align the result_msg + end_msg += "." * (80 - len(end_msg) - len(result_msg)) + end_msg += result_msg + print(end_msg) + + result["test_scores"] = test_scores + if return_train_score: + result["train_scores"] = train_scores + if return_n_test_samples: + result["n_test_samples"] = _num_samples(X_test) + if return_times: + result["fit_time"] = fit_time + result["score_time"] = score_time + if return_parameters: + result["parameters"] = parameters + if return_estimator: + result["estimator"] = estimator + return result + + +def _score(estimator, X_test, y_test, scorer, score_params, error_score="raise"): + """Compute the score(s) of an estimator on a given test set. + + Will return a dict of floats if `scorer` is a _MultiMetricScorer, otherwise a single + float is returned. + """ + score_params = {} if score_params is None else score_params + + try: + if y_test is None: + scores = scorer(estimator, X_test, **score_params) + else: + scores = scorer(estimator, X_test, y_test, **score_params) + except Exception: + if isinstance(scorer, _MultimetricScorer): + # If `_MultimetricScorer` raises exception, the `error_score` + # parameter is equal to "raise". + raise + else: + if error_score == "raise": + raise + else: + scores = error_score + warnings.warn( + ( + "Scoring failed. The score on this train-test partition for " + f"these parameters will be set to {error_score}. 
Details: \n"
+                    f"{format_exc()}"
+                ),
+                UserWarning,
+            )
+
+    # Check non-raised error messages in `_MultimetricScorer`
+    if isinstance(scorer, _MultimetricScorer):
+        exception_messages = [
+            (name, str_e) for name, str_e in scores.items() if isinstance(str_e, str)
+        ]
+        if exception_messages:
+            # error_score != "raise"
+            for name, str_e in exception_messages:
+                scores[name] = error_score
+                warnings.warn(
+                    (
+                        "Scoring failed. The score on this train-test partition for "
+                        f"these parameters will be set to {error_score}. Details: \n"
+                        f"{str_e}"
+                    ),
+                    UserWarning,
+                )
+
+    error_msg = "scoring must return a number, got %s (%s) instead. (scorer=%s)"
+    if isinstance(scores, dict):
+        for name, score in scores.items():
+            if hasattr(score, "item"):
+                with suppress(ValueError):
+                    # e.g. unwrap memmapped scalars
+                    score = score.item()
+            if not isinstance(score, numbers.Number):
+                raise ValueError(error_msg % (score, type(score), name))
+            scores[name] = score
+    else:  # scalar
+        if hasattr(scores, "item"):
+            with suppress(ValueError):
+                # e.g. unwrap memmapped scalars
+                scores = scores.item()
+        if not isinstance(scores, numbers.Number):
+            raise ValueError(error_msg % (scores, type(scores), scorer))
+    return scores
+
+
+@validate_params(
+    {
+        "estimator": [HasMethods(["fit", "predict"])],
+        "X": ["array-like", "sparse matrix"],
+        "y": ["array-like", "sparse matrix", None],
+        "groups": ["array-like", None],
+        "cv": ["cv_object"],
+        "n_jobs": [Integral, None],
+        "verbose": ["verbose"],
+        "params": [dict, None],
+        "pre_dispatch": [Integral, str, None],
+        "method": [
+            StrOptions(
+                {
+                    "predict",
+                    "predict_proba",
+                    "predict_log_proba",
+                    "decision_function",
+                }
+            )
+        ],
+    },
+    prefer_skip_nested_validation=False,  # estimator is not validated yet
+)
+def cross_val_predict(
+    estimator,
+    X,
+    y=None,
+    *,
+    groups=None,
+    cv=None,
+    n_jobs=None,
+    verbose=0,
+    params=None,
+    pre_dispatch="2*n_jobs",
+    method="predict",
+):
+    """Generate cross-validated estimates for each input data point.
+
+    The data is split according to the cv parameter. Each sample belongs
+    to exactly one test set, and its prediction is computed with an
+    estimator fitted on the corresponding training set.
+
+    Passing these predictions into an evaluation metric may not be a valid
+    way to measure generalization performance. Results can differ from
+    :func:`cross_validate` and :func:`cross_val_score` unless all test sets
+    have equal size and the metric decomposes over samples.
+
+    Read more in the :ref:`User Guide <cross_validation>`.
+
+    Parameters
+    ----------
+    estimator : estimator
+        The estimator instance to use to fit the data. It must implement a `fit`
+        method and the method given by the `method` parameter.
+
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        The data to fit. Can be, for example, a list or an array of at least 2d.
+
+    y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs), \
+            default=None
+        The target variable to try to predict in the case of
+        supervised learning.
+
+    groups : array-like of shape (n_samples,), default=None
+        Group labels for the samples used while splitting the dataset into
+        train/test set. Only used in conjunction with a "Group" :term:`cv`
+        instance (e.g., :class:`GroupKFold`).
+
+        .. versionchanged:: 1.4
+            ``groups`` can only be passed if metadata routing is not enabled
+            via ``sklearn.set_config(enable_metadata_routing=True)``. When routing
+            is enabled, pass ``groups`` alongside other metadata via the ``params``
+            argument instead.
E.g.: + ``cross_val_predict(..., params={'groups': groups})``. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable that generates (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and + predicting are parallelized over the cross-validation splits. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + The verbosity level. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit`` and the CV + splitter. + + .. versionadded:: 1.4 + + pre_dispatch : int or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately created and spawned. Use + this for lightweight and fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + method : {'predict', 'predict_proba', 'predict_log_proba', \ + 'decision_function'}, default='predict' + The method to be invoked by `estimator`. + + Returns + ------- + predictions : ndarray + This is the result of calling `method`. Shape: + + - When `method` is 'predict' and in special case where `method` is + 'decision_function' and the target is binary: (n_samples,) + - When `method` is one of {'predict_proba', 'predict_log_proba', + 'decision_function'} (unless special case above): + (n_samples, n_classes) + - If `estimator` is :term:`multioutput`, an extra dimension + 'n_outputs' is added to the end of each shape above. + + See Also + -------- + cross_val_score : Calculate score for each CV split. + cross_validate : Calculate one or more scores and timings for each CV + split. + + Notes + ----- + In the case that one or more classes are absent in a training portion, a + default score needs to be assigned to all instances for that class if + ``method`` produces columns per class, as in {'decision_function', + 'predict_proba', 'predict_log_proba'}. For ``predict_proba`` this value is + 0. In order to ensure finite output, we approximate negative infinity by + the minimum finite float value for the dtype in other cases. 
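+
+    For instance, if class ``2`` is missing from one training fold, that
+    fold's ``predict_proba`` output gains a column of zeros at position 2,
+    so every fold returns an array with the same ``(n_samples, n_classes)``
+    shape and column order.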
+ + Examples + -------- + >>> from sklearn import datasets, linear_model + >>> from sklearn.model_selection import cross_val_predict + >>> diabetes = datasets.load_diabetes() + >>> X = diabetes.data[:150] + >>> y = diabetes.target[:150] + >>> lasso = linear_model.Lasso() + >>> y_pred = cross_val_predict(lasso, X, y, cv=3) + """ + _check_groups_routing_disabled(groups) + X, y = indexable(X, y) + + if _routing_enabled(): + # For estimators, a MetadataRouter is created in get_metadata_routing + # methods. For these router methods, we create the router to use + # `process_routing` on it. + router = ( + MetadataRouter(owner="cross_validate") + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata for the predict method. + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + ) + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + unrequested_params = sorted(e.unrequested_params) + raise UnsetMetadataPassedError( + message=( + f"{unrequested_params} are passed to `cross_val_predict` but are" + " not explicitly set as requested or not requested for" + f" cross_validate's estimator: {estimator.__class__.__name__} Call" + " `.set_fit_request({{metadata}}=True)` on the estimator for" + f" each metadata in {unrequested_params} that you want to use and" + " `metadata=False` for not using it. See the Metadata Routing User" + " guide " + " for more information." + ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.estimator = Bunch(fit=params) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + splits = list(cv.split(X, y, **routed_params.splitter.split)) + + test_indices = np.concatenate([test for _, test in splits]) + if not _check_is_permutation(test_indices, _num_samples(X)): + raise ValueError("cross_val_predict only works for partitions") + + # If classification methods produce multiple columns of output, + # we need to manually encode classes to ensure consistent column ordering. + encode = ( + method in ["decision_function", "predict_proba", "predict_log_proba"] + and y is not None + ) + if encode: + y = np.asarray(y) + if y.ndim == 1: + le = LabelEncoder() + y = le.fit_transform(y) + elif y.ndim == 2: + y_enc = np.zeros_like(y, dtype=int) + for i_label in range(y.shape[1]): + y_enc[:, i_label] = LabelEncoder().fit_transform(y[:, i_label]) + y = y_enc + + # We clone the estimator to make sure that all the folds are + # independent, and that it is pickle-able. 
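+    # Illustrative note (not part of scikit-learn, assuming NumPy semantics):
+    # the fold predictions gathered below are concatenated in test-fold order
+    # and restored to input order with an inverse permutation, e.g.:
+    #     test_indices = np.array([2, 0, 1])
+    #     inv = np.empty(3, dtype=int)
+    #     inv[test_indices] = np.arange(3)   # inv == array([1, 2, 0])
+    #     predictions[inv]                   # back to sample order 0, 1, 2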
+ parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) + predictions = parallel( + delayed(_fit_and_predict)( + clone(estimator), + X, + y, + train, + test, + routed_params.estimator.fit, + method, + ) + for train, test in splits + ) + + inv_test_indices = np.empty(len(test_indices), dtype=int) + inv_test_indices[test_indices] = np.arange(len(test_indices)) + + if sp.issparse(predictions[0]): + predictions = sp.vstack(predictions, format=predictions[0].format) + elif encode and isinstance(predictions[0], list): + # `predictions` is a list of method outputs from each fold. + # If each of those is also a list, then treat this as a + # multioutput-multiclass task. We need to separately concatenate + # the method outputs for each label into an `n_labels` long list. + n_labels = y.shape[1] + concat_pred = [] + for i_label in range(n_labels): + label_preds = np.concatenate([p[i_label] for p in predictions]) + concat_pred.append(label_preds) + predictions = concat_pred + else: + predictions = np.concatenate(predictions) + + if isinstance(predictions, list): + return [p[inv_test_indices] for p in predictions] + else: + return predictions[inv_test_indices] + + +def _fit_and_predict(estimator, X, y, train, test, fit_params, method): + """Fit estimator and predict values for a given dataset split. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object implementing 'fit' and 'predict' + The object to use to fit the data. + + X : array-like of shape (n_samples, n_features) + The data to fit. + + .. versionchanged:: 0.20 + X is only required to be an object with finite length or shape now + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + The target variable to try to predict in the case of + supervised learning. + + train : array-like of shape (n_train_samples,) + Indices of training samples. + + test : array-like of shape (n_test_samples,) + Indices of test samples. + + fit_params : dict or None + Parameters that will be passed to ``estimator.fit``. + + method : str + Invokes the passed method name of the passed estimator. 
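+        For example, ``method='predict_proba'`` makes each fold return the
+        class-probability matrix for its test samples.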
+ + Returns + ------- + predictions : sequence + Result of calling 'estimator.method' + """ + # Adjust length of sample weights + fit_params = fit_params if fit_params is not None else {} + fit_params = _check_method_params(X, params=fit_params, indices=train) + + X_train, y_train = _safe_split(estimator, X, y, train) + X_test, _ = _safe_split(estimator, X, y, test, train) + + if y_train is None: + estimator.fit(X_train, **fit_params) + else: + estimator.fit(X_train, y_train, **fit_params) + func = getattr(estimator, method) + predictions = func(X_test) + + encode = ( + method in ["decision_function", "predict_proba", "predict_log_proba"] + and y is not None + ) + + if encode: + if isinstance(predictions, list): + predictions = [ + _enforce_prediction_order( + estimator.classes_[i_label], + predictions[i_label], + n_classes=len(set(y[:, i_label])), + method=method, + ) + for i_label in range(len(predictions)) + ] + else: + # A 2D y array should be a binary label indicator matrix + n_classes = len(set(y)) if y.ndim == 1 else y.shape[1] + predictions = _enforce_prediction_order( + estimator.classes_, predictions, n_classes, method + ) + return predictions + + +def _enforce_prediction_order(classes, predictions, n_classes, method): + """Ensure that prediction arrays have correct column order + + When doing cross-validation, if one or more classes are + not present in the subset of data used for training, + then the output prediction array might not have the same + columns as other folds. Use the list of class names + (assumed to be ints) to enforce the correct column order. + + Note that `classes` is the list of classes in this fold + (a subset of the classes in the full training set) + and `n_classes` is the number of classes in the full training set. + """ + if n_classes != len(classes): + recommendation = ( + "To fix this, use a cross-validation " + "technique resulting in properly " + "stratified folds" + ) + warnings.warn( + "Number of classes in training fold ({}) does " + "not match total number of classes ({}). " + "Results may not be appropriate for your use case. " + "{}".format(len(classes), n_classes, recommendation), + RuntimeWarning, + ) + if method == "decision_function": + if predictions.ndim == 2 and predictions.shape[1] != len(classes): + # This handles the case when the shape of predictions + # does not match the number of classes used to train + # it with. This case is found when sklearn.svm.SVC is + # set to `decision_function_shape='ovo'`. + raise ValueError( + "Output shape {} of {} does not match " + "number of classes ({}) in fold. " + "Irregular decision_function outputs " + "are not currently supported by " + "cross_val_predict".format(predictions.shape, method, len(classes)) + ) + if len(classes) <= 2: + # In this special case, `predictions` contains a 1D array. + raise ValueError( + "Only {} class/es in training fold, but {} " + "in overall dataset. This " + "is not supported for decision_function " + "with imbalanced folds. 
{}".format(
+                    len(classes), n_classes, recommendation
+                )
+            )
+
+        float_min = np.finfo(predictions.dtype).min
+        default_values = {
+            "decision_function": float_min,
+            "predict_log_proba": float_min,
+            "predict_proba": 0,
+        }
+        predictions_for_all_classes = np.full(
+            (_num_samples(predictions), n_classes),
+            default_values[method],
+            dtype=predictions.dtype,
+        )
+        predictions_for_all_classes[:, classes] = predictions
+        predictions = predictions_for_all_classes
+    return predictions
+
+
+def _check_is_permutation(indices, n_samples):
+    """Check whether `indices` is a reordering of the array np.arange(n_samples).
+
+    Parameters
+    ----------
+    indices : ndarray
+        int array to test
+    n_samples : int
+        number of expected elements
+
+    Returns
+    -------
+    is_partition : bool
+        True iff sorted(indices) is np.arange(n_samples)
+    """
+    if len(indices) != n_samples:
+        return False
+    hit = np.zeros(n_samples, dtype=bool)
+    hit[indices] = True
+    if not np.all(hit):
+        return False
+    return True
+
+
+@validate_params(
+    {
+        "estimator": [HasMethods("fit")],
+        "X": ["array-like", "sparse matrix"],
+        "y": ["array-like", None],
+        "groups": ["array-like", None],
+        "cv": ["cv_object"],
+        "n_permutations": [Interval(Integral, 1, None, closed="left")],
+        "n_jobs": [Integral, None],
+        "random_state": ["random_state"],
+        "verbose": ["verbose"],
+        "scoring": [StrOptions(set(get_scorer_names())), callable, None],
+        "fit_params": [dict, None],
+        "params": [dict, None],
+    },
+    prefer_skip_nested_validation=False,  # estimator is not validated yet
+)
+def permutation_test_score(
+    estimator,
+    X,
+    y,
+    *,
+    groups=None,
+    cv=None,
+    n_permutations=100,
+    n_jobs=None,
+    random_state=0,
+    verbose=0,
+    scoring=None,
+    fit_params=None,
+    params=None,
+):
+    """Evaluate the significance of a cross-validated score with permutations.
+
+    Permutes targets to generate 'randomized data' and computes the empirical
+    p-value against the null hypothesis that features and targets are
+    independent.
+
+    The p-value represents the fraction of randomized data sets where the
+    estimator performed as well as or better than on the original data. A
+    small p-value suggests that there is a real dependency between features
+    and targets which has been used by the estimator to give good predictions.
+    A large p-value may be due to a lack of real dependency between features
+    and targets, or to the estimator not being able to use the dependency to
+    give good predictions.
+
+    Read more in the :ref:`User Guide <permutation_test_score>`.
+
+    Parameters
+    ----------
+    estimator : estimator object implementing 'fit'
+        The object to use to fit the data.
+
+    X : array-like of shape at least 2D
+        The data to fit.
+
+    y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None
+        The target variable to try to predict in the case of
+        supervised learning.
+
+    groups : array-like of shape (n_samples,), default=None
+        Labels to constrain permutation within groups, i.e. ``y`` values
+        are permuted among samples with the same group identifier.
+        When not specified, ``y`` values are permuted among all samples.
+
+        When a grouped cross-validator is used, the group labels are
+        also passed on to the ``split`` method of the cross-validator. The
+        cross-validator uses them for grouping the samples while splitting
+        the dataset into train/test set.
+
+        .. versionchanged:: 1.6
+            ``groups`` can only be passed if metadata routing is not enabled
+            via ``sklearn.set_config(enable_metadata_routing=True)``. When routing
+            is enabled, pass ``groups`` alongside other metadata via the ``params``
+            argument instead. E.g.:
+            ``permutation_test_score(..., params={'groups': groups})``.
+
+    cv : int, cross-validation generator or an iterable, default=None
+        Determines the cross-validation splitting strategy.
+        Possible inputs for cv are:
+
+        - `None`, to use the default 5-fold cross validation,
+        - int, to specify the number of folds in a `(Stratified)KFold`,
+        - :term:`CV splitter`,
+        - An iterable yielding (train, test) splits as arrays of indices.
+
+        For `int`/`None` inputs, if the estimator is a classifier and `y` is
+        either binary or multiclass, :class:`StratifiedKFold` is used. In all
+        other cases, :class:`KFold` is used. These splitters are instantiated
+        with `shuffle=False` so the splits will be the same across calls.
+
+        Refer to the :ref:`User Guide <cross_validation>` for the various
+        cross-validation strategies that can be used here.
+
+        .. versionchanged:: 0.22
+            `cv` default value if `None` changed from 3-fold to 5-fold.
+
+    n_permutations : int, default=100
+        Number of times to permute ``y``.
+
+    n_jobs : int, default=None
+        Number of jobs to run in parallel. Training the estimator and computing
+        the cross-validated score are parallelized over the permutations.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    random_state : int, RandomState instance or None, default=0
+        Pass an int for reproducible output for permutation of
+        ``y`` values among samples. See :term:`Glossary <random_state>`.
+
+    verbose : int, default=0
+        The verbosity level.
+
+    scoring : str or callable, default=None
+        A single str (see :ref:`scoring_parameter`) or a callable
+        (see :ref:`scoring_callable`) to evaluate the predictions on the test set.
+
+        If `None`, the estimator's score method is used.
+
+    fit_params : dict, default=None
+        Parameters to pass to the fit method of the estimator.
+
+        .. deprecated:: 1.6
+            This parameter is deprecated and will be removed in version 1.8. Use
+            ``params`` instead.
+
+    params : dict, default=None
+        Parameters to pass to the `fit` method of the estimator, the scorer
+        and the cv splitter.
+
+        - If `enable_metadata_routing=False` (default): Parameters directly passed to
+          the `fit` method of the estimator.
+
+        - If `enable_metadata_routing=True`: Parameters safely routed to the `fit`
+          method of the estimator, `cv` object and `scorer`. See :ref:`Metadata
+          Routing User Guide <metadata_routing>` for more details.
+
+        .. versionadded:: 1.6
+
+    Returns
+    -------
+    score : float
+        The true score without permuting targets.
+
+    permutation_scores : array of shape (n_permutations,)
+        The scores obtained for each permutation.
+
+    pvalue : float
+        The p-value, which approximates the probability that the score would
+        be obtained by chance. This is calculated as:
+
+        `(C + 1) / (n_permutations + 1)`
+
+        where C is the number of permutations whose score >= the true score.
+
+        The best possible p-value is 1/(n_permutations + 1), the worst is 1.0.
+
+    Notes
+    -----
+    This function implements Test 1 in:
+
+    Ojala and Garriga. `Permutation Tests for Studying Classifier Performance
+    <http://www.jmlr.org/papers/volume11/ojala10a/ojala10a.pdf>`_. The
+    Journal of Machine Learning Research (2010) vol.
11 + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import permutation_test_score + >>> X, y = make_classification(random_state=0) + >>> estimator = LogisticRegression() + >>> score, permutation_scores, pvalue = permutation_test_score( + ... estimator, X, y, random_state=0 + ... ) + >>> print(f"Original Score: {score:.3f}") + Original Score: 0.810 + >>> print( + ... f"Permutation Scores: {permutation_scores.mean():.3f} +/- " + ... f"{permutation_scores.std():.3f}" + ... ) + Permutation Scores: 0.505 +/- 0.057 + >>> print(f"P-value: {pvalue:.3f}") + P-value: 0.010 + """ + params = _check_params_groups_deprecation(fit_params, params, groups, "1.8") + + X, y, groups = indexable(X, y, groups) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + scorer = check_scoring(estimator, scoring=scoring) + random_state = check_random_state(random_state) + + if _routing_enabled(): + router = ( + MetadataRouter(owner="permutation_test_score") + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata to the predict method for + # scoring? + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + scorer=scorer, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + unrequested_params = sorted(e.unrequested_params) + raise UnsetMetadataPassedError( + message=( + f"{unrequested_params} are passed to `permutation_test_score`" + " but are not explicitly set as requested or not requested" + " for permutation_test_score's" + f" estimator: {estimator.__class__.__name__}. Call" + " `.set_fit_request({{metadata}}=True)` on the estimator for" + f" each metadata in {unrequested_params} that you" + " want to use and `metadata=False` for not using it. See the" + " Metadata Routing User guide" + " for more" + " information." + ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + + else: + routed_params = Bunch() + routed_params.estimator = Bunch(fit=params) + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.scorer = Bunch(score={}) + + # We clone the estimator to make sure that all the folds are + # independent, and that it is pickle-able. 
+ score = _permutation_test_score( + clone(estimator), + X, + y, + cv, + scorer, + split_params=routed_params.splitter.split, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + ) + permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)( + delayed(_permutation_test_score)( + clone(estimator), + X, + _shuffle(y, groups, random_state), + cv, + scorer, + split_params=routed_params.splitter.split, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + ) + for _ in range(n_permutations) + ) + permutation_scores = np.array(permutation_scores) + pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1) + return score, permutation_scores, pvalue + + +def _permutation_test_score( + estimator, X, y, cv, scorer, split_params, fit_params, score_params +): + """Auxiliary function for permutation_test_score""" + # Adjust length of sample weights + fit_params = fit_params if fit_params is not None else {} + score_params = score_params if score_params is not None else {} + + avg_score = [] + for train, test in cv.split(X, y, **split_params): + X_train, y_train = _safe_split(estimator, X, y, train) + X_test, y_test = _safe_split(estimator, X, y, test, train) + fit_params_train = _check_method_params(X, params=fit_params, indices=train) + score_params_test = _check_method_params(X, params=score_params, indices=test) + estimator.fit(X_train, y_train, **fit_params_train) + avg_score.append(scorer(estimator, X_test, y_test, **score_params_test)) + return np.mean(avg_score) + + +def _shuffle(y, groups, random_state): + """Return a shuffled copy of y eventually shuffle among same groups.""" + if groups is None: + indices = random_state.permutation(len(y)) + else: + indices = np.arange(len(groups)) + for group in np.unique(groups): + this_mask = groups == group + indices[this_mask] = random_state.permutation(indices[this_mask]) + return _safe_indexing(y, indices) + + +@validate_params( + { + "estimator": [HasMethods(["fit"])], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", None], + "groups": ["array-like", None], + "train_sizes": ["array-like"], + "cv": ["cv_object"], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "exploit_incremental_learning": ["boolean"], + "n_jobs": [Integral, None], + "pre_dispatch": [Integral, str], + "verbose": ["verbose"], + "shuffle": ["boolean"], + "random_state": ["random_state"], + "error_score": [StrOptions({"raise"}), Real], + "return_times": ["boolean"], + "fit_params": [dict, None], + "params": [dict, None], + }, + prefer_skip_nested_validation=False, # estimator is not validated yet +) +def learning_curve( + estimator, + X, + y, + *, + groups=None, + train_sizes=np.linspace(0.1, 1.0, 5), + cv=None, + scoring=None, + exploit_incremental_learning=False, + n_jobs=None, + pre_dispatch="all", + verbose=0, + shuffle=False, + random_state=None, + error_score=np.nan, + return_times=False, + fit_params=None, + params=None, +): + """Learning curve. + + Determines cross-validated training and test scores for different training + set sizes. + + A cross-validation generator splits the whole dataset k times in training + and test data. Subsets of the training set with varying sizes will be used + to train the estimator and a score for each training subset size and the + test set will be computed. Afterwards, the scores will be averaged over + all k runs for each training subset size. + + Read more in the :ref:`User Guide `. 
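+
+    For example, with the default 5-fold strategy on 100 samples, each
+    training split holds 80 samples, so ``train_sizes=[0.3, 0.6, 0.9]``
+    maps to absolute subset sizes of 24, 48 and 72, as in the Examples
+    section below.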
+ + Parameters + ---------- + estimator : object type that implements the "fit" method + An object of that type which is cloned for each validation. It must + also implement "predict" unless `scoring` is a callable that doesn't + rely on "predict" to compute a score. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + .. versionchanged:: 1.6 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``learning_curve(..., params={'groups': groups})``. + + train_sizes : array-like of shape (n_ticks,), \ + default=np.linspace(0.1, 1.0, 5) + Relative or absolute numbers of training examples that will be used to + generate the learning curve. If the dtype is float, it is regarded as a + fraction of the maximum size of the training set (that is determined + by the selected validation method), i.e. it has to be within (0, 1]. + Otherwise it is interpreted as absolute sizes of the training sets. + Note that for classification the number of samples usually has to + be big enough to contain at least one sample from each class. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + scoring : str or callable, default=None + A str (see :ref:`scoring_parameter`) or a scorer callable object / function with + signature ``scorer(estimator, X, y)``. + + exploit_incremental_learning : bool, default=False + If the estimator supports incremental learning, this will be + used to speed up fitting for different training set sizes. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and computing + the score are parallelized over the different training and test sets. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. 
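+        For example, ``pre_dispatch='2*n_jobs'`` with ``n_jobs=4`` keeps at
+        most eight jobs dispatched at any time.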
+
+    verbose : int, default=0
+        Controls the verbosity: the higher, the more messages.
+
+    shuffle : bool, default=False
+        Whether to shuffle training data before taking prefixes of it
+        based on ``train_sizes``.
+
+    random_state : int, RandomState instance or None, default=None
+        Used when ``shuffle`` is True. Pass an int for reproducible
+        output across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    error_score : 'raise' or numeric, default=np.nan
+        Value to assign to the score if an error occurs in estimator fitting.
+        If set to 'raise', the error is raised.
+        If a numeric value is given, FitFailedWarning is raised.
+
+        .. versionadded:: 0.20
+
+    return_times : bool, default=False
+        Whether to return the fit and score times.
+
+    fit_params : dict, default=None
+        Parameters to pass to the fit method of the estimator.
+
+        .. deprecated:: 1.6
+            This parameter is deprecated and will be removed in version 1.8. Use
+            ``params`` instead.
+
+    params : dict, default=None
+        Parameters to pass to the `fit` method of the estimator and to the scorer.
+
+        - If `enable_metadata_routing=False` (default): Parameters directly passed to
+          the `fit` method of the estimator.
+
+        - If `enable_metadata_routing=True`: Parameters safely routed to the `fit`
+          method of the estimator. See :ref:`Metadata Routing User Guide
+          <metadata_routing>` for more details.
+
+        .. versionadded:: 1.6
+
+    Returns
+    -------
+    train_sizes_abs : array of shape (n_unique_ticks,)
+        Numbers of training examples that have been used to generate the
+        learning curve. Note that the number of ticks might be less
+        than n_ticks because duplicate entries will be removed.
+
+    train_scores : array of shape (n_ticks, n_cv_folds)
+        Scores on training sets.
+
+    test_scores : array of shape (n_ticks, n_cv_folds)
+        Scores on test sets.
+
+    fit_times : array of shape (n_ticks, n_cv_folds)
+        Times spent for fitting in seconds. Only present if ``return_times``
+        is True.
+
+    score_times : array of shape (n_ticks, n_cv_folds)
+        Times spent for scoring in seconds. Only present if ``return_times``
+        is True.
+
+    Examples
+    --------
+    >>> from sklearn.datasets import make_classification
+    >>> from sklearn.tree import DecisionTreeClassifier
+    >>> from sklearn.model_selection import learning_curve
+    >>> X, y = make_classification(n_samples=100, n_features=10, random_state=42)
+    >>> tree = DecisionTreeClassifier(max_depth=4, random_state=42)
+    >>> train_size_abs, train_scores, test_scores = learning_curve(
+    ...     tree, X, y, train_sizes=[0.3, 0.6, 0.9]
+    ... )
+    >>> for train_size, cv_train_scores, cv_test_scores in zip(
+    ...     train_size_abs, train_scores, test_scores
+    ... ):
+    ...     print(f"{train_size} samples were used to train the model")
+    ...     print(f"The average train accuracy is {cv_train_scores.mean():.2f}")
+    ...     
print(f"The average test accuracy is {cv_test_scores.mean():.2f}") + 24 samples were used to train the model + The average train accuracy is 1.00 + The average test accuracy is 0.85 + 48 samples were used to train the model + The average train accuracy is 1.00 + The average test accuracy is 0.90 + 72 samples were used to train the model + The average train accuracy is 1.00 + The average test accuracy is 0.93 + """ + if exploit_incremental_learning and not hasattr(estimator, "partial_fit"): + raise ValueError( + "An estimator must support the partial_fit interface " + "to exploit incremental learning" + ) + + params = _check_params_groups_deprecation(fit_params, params, groups, "1.8") + + X, y, groups = indexable(X, y, groups) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + + scorer = check_scoring(estimator, scoring=scoring) + + if _routing_enabled(): + router = ( + MetadataRouter(owner="learning_curve") + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata to the predict method for + # scoring? + method_mapping=MethodMapping() + .add(caller="fit", callee="fit") + .add(caller="fit", callee="partial_fit"), + ) + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + scorer=scorer, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + unrequested_params = sorted(e.unrequested_params) + raise UnsetMetadataPassedError( + message=( + f"{unrequested_params} are passed to `learning_curve` but are not" + " explicitly set as requested or not requested for learning_curve's" + f" estimator: {estimator.__class__.__name__}. Call" + " `.set_fit_request({{metadata}}=True)` on the estimator for" + f" each metadata in {unrequested_params} that you" + " want to use and `metadata=False` for not using it. See the" + " Metadata Routing User guide" + " for more" + " information." + ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + + else: + routed_params = Bunch() + routed_params.estimator = Bunch(fit=params, partial_fit=params) + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.scorer = Bunch(score={}) + + # Store cv as list as we will be iterating over the list multiple times + cv_iter = list(cv.split(X, y, **routed_params.splitter.split)) + + n_max_training_samples = len(cv_iter[0][0]) + # Because the lengths of folds can be significantly different, it is + # not guaranteed that we use all of the available training data when we + # use the first 'n_max_training_samples' samples. 
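+    # Illustrative note (not part of scikit-learn): _translate_train_sizes
+    # maps fractional ticks to absolute counts relative to that first-fold
+    # training size, e.g. with n_max_training_samples == 80:
+    #     [0.5, 1.0] -> [40, 80]   (floats are fractions of the maximum)
+    #     [5, 10]    -> [5, 10]    (ints are taken as absolute sizes)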
+ train_sizes_abs = _translate_train_sizes(train_sizes, n_max_training_samples) + n_unique_ticks = train_sizes_abs.shape[0] + if verbose > 0: + print("[learning_curve] Training set sizes: " + str(train_sizes_abs)) + + parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose) + + if shuffle: + rng = check_random_state(random_state) + cv_iter = ((rng.permutation(train), test) for train, test in cv_iter) + + if exploit_incremental_learning: + classes = np.unique(y) if is_classifier(estimator) else None + out = parallel( + delayed(_incremental_fit_estimator)( + clone(estimator), + X, + y, + classes, + train, + test, + train_sizes_abs, + scorer, + return_times, + error_score=error_score, + fit_params=routed_params.estimator.partial_fit, + score_params=routed_params.scorer.score, + ) + for train, test in cv_iter + ) + out = np.asarray(out).transpose((2, 1, 0)) + else: + train_test_proportions = [] + for train, test in cv_iter: + for n_train_samples in train_sizes_abs: + train_test_proportions.append((train[:n_train_samples], test)) + + results = parallel( + delayed(_fit_and_score)( + clone(estimator), + X, + y, + scorer=scorer, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + return_train_score=True, + error_score=error_score, + return_times=return_times, + ) + for train, test in train_test_proportions + ) + _warn_or_raise_about_fit_failures(results, error_score) + results = _aggregate_score_dicts(results) + train_scores = results["train_scores"].reshape(-1, n_unique_ticks).T + test_scores = results["test_scores"].reshape(-1, n_unique_ticks).T + out = [train_scores, test_scores] + + if return_times: + fit_times = results["fit_time"].reshape(-1, n_unique_ticks).T + score_times = results["score_time"].reshape(-1, n_unique_ticks).T + out.extend([fit_times, score_times]) + + ret = train_sizes_abs, out[0], out[1] + + if return_times: + ret = ret + (out[2], out[3]) + + return ret + + +def _translate_train_sizes(train_sizes, n_max_training_samples): + """Determine absolute sizes of training subsets and validate 'train_sizes'. + + Examples: + _translate_train_sizes([0.5, 1.0], 10) -> [5, 10] + _translate_train_sizes([5, 10], 10) -> [5, 10] + + Parameters + ---------- + train_sizes : array-like of shape (n_ticks,) + Numbers of training examples that will be used to generate the + learning curve. If the dtype is float, it is regarded as a + fraction of 'n_max_training_samples', i.e. it has to be within (0, 1]. + + n_max_training_samples : int + Maximum number of training samples (upper bound of 'train_sizes'). + + Returns + ------- + train_sizes_abs : array of shape (n_unique_ticks,) + Numbers of training examples that will be used to generate the + learning curve. Note that the number of ticks might be less + than n_ticks because duplicate entries will be removed. + """ + train_sizes_abs = np.asarray(train_sizes) + n_ticks = train_sizes_abs.shape[0] + n_min_required_samples = np.min(train_sizes_abs) + n_max_required_samples = np.max(train_sizes_abs) + if np.issubdtype(train_sizes_abs.dtype, np.floating): + if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: + raise ValueError( + "train_sizes has been interpreted as fractions " + "of the maximum number of training samples and " + "must be within (0, 1], but is within [%f, %f]." 
+ % (n_min_required_samples, n_max_required_samples) + ) + train_sizes_abs = (train_sizes_abs * n_max_training_samples).astype( + dtype=int, copy=False + ) + train_sizes_abs = np.clip(train_sizes_abs, 1, n_max_training_samples) + else: + if ( + n_min_required_samples <= 0 + or n_max_required_samples > n_max_training_samples + ): + raise ValueError( + "train_sizes has been interpreted as absolute " + "numbers of training samples and must be within " + "(0, %d], but is within [%d, %d]." + % ( + n_max_training_samples, + n_min_required_samples, + n_max_required_samples, + ) + ) + + train_sizes_abs = np.unique(train_sizes_abs) + if n_ticks > train_sizes_abs.shape[0]: + warnings.warn( + "Removed duplicate entries from 'train_sizes'. Number " + "of ticks will be less than the size of " + "'train_sizes': %d instead of %d." % (train_sizes_abs.shape[0], n_ticks), + RuntimeWarning, + ) + + return train_sizes_abs + + +def _incremental_fit_estimator( + estimator, + X, + y, + classes, + train, + test, + train_sizes, + scorer, + return_times, + error_score, + fit_params, + score_params, +): + """Train estimator on training subsets incrementally and compute scores.""" + train_scores, test_scores, fit_times, score_times = [], [], [], [] + partitions = zip(train_sizes, np.split(train, train_sizes)[:-1]) + if fit_params is None: + fit_params = {} + if classes is None: + partial_fit_func = partial(estimator.partial_fit, **fit_params) + else: + partial_fit_func = partial(estimator.partial_fit, classes=classes, **fit_params) + score_params = score_params if score_params is not None else {} + score_params_train = _check_method_params(X, params=score_params, indices=train) + score_params_test = _check_method_params(X, params=score_params, indices=test) + + for n_train_samples, partial_train in partitions: + train_subset = train[:n_train_samples] + X_train, y_train = _safe_split(estimator, X, y, train_subset) + X_partial_train, y_partial_train = _safe_split(estimator, X, y, partial_train) + X_test, y_test = _safe_split(estimator, X, y, test, train_subset) + start_fit = time.time() + if y_partial_train is None: + partial_fit_func(X_partial_train) + else: + partial_fit_func(X_partial_train, y_partial_train) + fit_time = time.time() - start_fit + fit_times.append(fit_time) + + start_score = time.time() + + test_scores.append( + _score( + estimator, + X_test, + y_test, + scorer, + score_params=score_params_test, + error_score=error_score, + ) + ) + train_scores.append( + _score( + estimator, + X_train, + y_train, + scorer, + score_params=score_params_train, + error_score=error_score, + ) + ) + score_time = time.time() - start_score + score_times.append(score_time) + + ret = ( + (train_scores, test_scores, fit_times, score_times) + if return_times + else (train_scores, test_scores) + ) + + return np.array(ret).T + + +@validate_params( + { + "estimator": [HasMethods(["fit"])], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", None], + "param_name": [str], + "param_range": ["array-like"], + "groups": ["array-like", None], + "cv": ["cv_object"], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "n_jobs": [Integral, None], + "pre_dispatch": [Integral, str], + "verbose": ["verbose"], + "error_score": [StrOptions({"raise"}), Real], + "fit_params": [dict, None], + "params": [dict, None], + }, + prefer_skip_nested_validation=False, # estimator is not validated yet +) +def validation_curve( + estimator, + X, + y, + *, + param_name, + param_range, + groups=None, + cv=None, + scoring=None, + 
n_jobs=None, + pre_dispatch="all", + verbose=0, + error_score=np.nan, + fit_params=None, + params=None, +): + """Validation curve. + + Determine training and test scores for varying parameter values. + + Compute scores for an estimator with different values of a specified + parameter. This is similar to grid search with one parameter. However, this + will also compute training scores and is merely a utility for plotting the + results. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : object type that implements the "fit" method + An object of that type which is cloned for each validation. It must + also implement "predict" unless `scoring` is a callable that doesn't + rely on "predict" to compute a score. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + param_name : str + Name of the parameter that will be varied. + + param_range : array-like of shape (n_values,) + The values of the parameter that will be evaluated. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + .. versionchanged:: 1.6 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``validation_curve(..., params={'groups': groups})``. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + scoring : str or callable, default=None + A str (see :ref:`scoring_parameter`) or a scorer callable object / function with + signature ``scorer(estimator, X, y)``. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and computing + the score are parallelized over the combinations of each parameter + value and each cross-validation split. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. 
+ + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. + If a numeric value is given, FitFailedWarning is raised. + + .. versionadded:: 0.20 + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + .. deprecated:: 1.6 + This parameter is deprecated and will be removed in version 1.8. Use + ``params`` instead. + + params : dict, default=None + Parameters to pass to the estimator, scorer and cross-validation object. + + - If `enable_metadata_routing=False` (default): Parameters directly passed to + the `fit` method of the estimator. + + - If `enable_metadata_routing=True`: Parameters safely routed to the `fit` + method of the estimator, to the scorer and to the cross-validation object. + See :ref:`Metadata Routing User Guide ` for more details. + + .. versionadded:: 1.6 + + Returns + ------- + train_scores : array of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : array of shape (n_ticks, n_cv_folds) + Scores on test set. + + Notes + ----- + See :ref:`sphx_glr_auto_examples_model_selection_plot_train_error_vs_test_error.py` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import validation_curve + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> train_scores, test_scores = validation_curve( + ... logistic_regression, X, y, param_name=param_name, param_range=param_range + ... ) + >>> print(f"The average train accuracy is {train_scores.mean():.2f}") + The average train accuracy is 0.81 + >>> print(f"The average test accuracy is {test_scores.mean():.2f}") + The average test accuracy is 0.81 + """ + params = _check_params_groups_deprecation(fit_params, params, groups, "1.8") + X, y, groups = indexable(X, y, groups) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + scorer = check_scoring(estimator, scoring=scoring) + + if _routing_enabled(): + router = ( + MetadataRouter(owner="validation_curve") + .add( + estimator=estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + scorer=scorer, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + unrequested_params = sorted(e.unrequested_params) + raise UnsetMetadataPassedError( + message=( + f"{unrequested_params} are passed to `validation_curve` but are not" + " explicitly set as requested or not requested for" + f" validation_curve's estimator: {estimator.__class__.__name__}." + " Call `.set_fit_request({{metadata}}=True)` on the estimator for" + f" each metadata in {unrequested_params} that you" + " want to use and `metadata=False` for not using it. See the" + " Metadata Routing User guide" + " for more" + " information." 
+                ),
+                unrequested_params=e.unrequested_params,
+                routed_params=e.routed_params,
+            )
+
+    else:
+        routed_params = Bunch()
+        routed_params.estimator = Bunch(fit=params)
+        routed_params.splitter = Bunch(split={"groups": groups})
+        routed_params.scorer = Bunch(score={})
+
+    parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose)
+    results = parallel(
+        delayed(_fit_and_score)(
+            clone(estimator),
+            X,
+            y,
+            scorer=scorer,
+            train=train,
+            test=test,
+            verbose=verbose,
+            parameters={param_name: v},
+            fit_params=routed_params.estimator.fit,
+            score_params=routed_params.scorer.score,
+            return_train_score=True,
+            error_score=error_score,
+        )
+        # NOTE do not change order of iteration to allow one time cv splitters
+        for train, test in cv.split(X, y, **routed_params.splitter.split)
+        for v in param_range
+    )
+    n_params = len(param_range)
+
+    results = _aggregate_score_dicts(results)
+    train_scores = results["train_scores"].reshape(-1, n_params).T
+    test_scores = results["test_scores"].reshape(-1, n_params).T
+
+    return train_scores, test_scores
+
+
+def _aggregate_score_dicts(scores):
+    """Aggregate a list of dicts into a dict of np ndarrays.
+
+    The input is a flat list of per-split score dicts of the form
+    [{'prec': 0.1, 'acc': 1.0}, {'prec': 0.1, 'acc': 1.0}, ...]; it is
+    converted to a dict of arrays {'prec': np.array([0.1 ...]), ...}.
+
+    Parameters
+    ----------
+    scores : list of dict
+        List of dicts of the scores for all scorers. This is a flat list,
+        assumed originally to be of row major order.
+
+    Examples
+    --------
+    >>> scores = [{'a': 1, 'b': 10}, {'a': 2, 'b': 2}, {'a': 3, 'b': 3},
+    ...           {'a': 10, 'b': 10}]  # doctest: +SKIP
+    >>> _aggregate_score_dicts(scores)  # doctest: +SKIP
+    {'a': array([1, 2, 3, 10]),
+     'b': array([10, 2, 3, 10])}
+    """
+    return {
+        key: (
+            np.asarray([score[key] for score in scores])
+            if isinstance(scores[0][key], numbers.Number)
+            else [score[key] for score in scores]
+        )
+        for key in scores[0]
+    }
diff --git a/.venv/Lib/site-packages/sklearn/model_selection/tests/__init__.py b/.venv/Lib/site-packages/sklearn/model_selection/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/Lib/site-packages/sklearn/model_selection/tests/common.py b/.venv/Lib/site-packages/sklearn/model_selection/tests/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..007843822febc5f4cf9145f41e667cef17b9e988
--- /dev/null
+++ b/.venv/Lib/site-packages/sklearn/model_selection/tests/common.py
@@ -0,0 +1,24 @@
+"""
+Common utilities for testing model selection.
+""" + +import numpy as np + +from sklearn.model_selection import KFold + + +class OneTimeSplitter: + """A wrapper to make KFold single entry cv iterator""" + + def __init__(self, n_splits=4, n_samples=99): + self.n_splits = n_splits + self.n_samples = n_samples + self.indices = iter(KFold(n_splits=n_splits).split(np.ones(n_samples))) + + def split(self, X=None, y=None, groups=None): + """Split can be called only once""" + for index in self.indices: + yield index + + def get_n_splits(self, X=None, y=None, groups=None): + return self.n_splits diff --git a/.venv/Lib/site-packages/sklearn/model_selection/tests/test_classification_threshold.py b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_classification_threshold.py new file mode 100644 index 0000000000000000000000000000000000000000..2920bc7a68ab9cd6ff1cb8b5b736b9f647d65e46 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_classification_threshold.py @@ -0,0 +1,618 @@ +import numpy as np +import pytest + +from sklearn import config_context +from sklearn.base import clone +from sklearn.datasets import ( + load_breast_cancer, + load_iris, + make_classification, + make_multilabel_classification, +) +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import ( + balanced_accuracy_score, + f1_score, + fbeta_score, + make_scorer, +) +from sklearn.metrics._scorer import _CurveScorer +from sklearn.model_selection import ( + FixedThresholdClassifier, + StratifiedShuffleSplit, + TunedThresholdClassifierCV, +) +from sklearn.model_selection._classification_threshold import ( + _fit_and_score_over_thresholds, +) +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils._mocking import CheckingClassifier +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_array_equal, +) + + +def test_fit_and_score_over_thresholds_curve_scorers(): + """Check that `_fit_and_score_over_thresholds` returns thresholds in ascending order + for the different accepted curve scorers.""" + X, y = make_classification(n_samples=100, random_state=0) + train_idx, val_idx = np.arange(50), np.arange(50, 100) + classifier = LogisticRegression() + + curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) + scores, thresholds = _fit_and_score_over_thresholds( + classifier, + X, + y, + fit_params={}, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=curve_scorer, + score_params={}, + ) + + assert np.all(thresholds[:-1] <= thresholds[1:]) + assert isinstance(scores, np.ndarray) + assert np.logical_and(scores >= 0, scores <= 1).all() + + +def test_fit_and_score_over_thresholds_prefit(): + """Check the behaviour with a prefit classifier.""" + X, y = make_classification(n_samples=100, random_state=0) + + # `train_idx is None` to indicate that the classifier is prefit + train_idx, val_idx = None, np.arange(50, 100) + classifier = DecisionTreeClassifier(random_state=0).fit(X, y) + # make sure that the classifier memorized the full dataset such that + # we get perfect predictions and thus match the expected score + assert classifier.score(X[val_idx], y[val_idx]) == pytest.approx(1.0) + + curve_scorer = _CurveScorer( + 
score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=2, + kwargs={}, + ) + scores, thresholds = _fit_and_score_over_thresholds( + classifier, + X, + y, + fit_params={}, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=curve_scorer, + score_params={}, + ) + assert np.all(thresholds[:-1] <= thresholds[1:]) + assert_allclose(scores, [0.5, 1.0]) + + +@config_context(enable_metadata_routing=True) +def test_fit_and_score_over_thresholds_sample_weight(): + """Check that we dispatch the sample-weight to fit and score the classifier.""" + X, y = load_iris(return_X_y=True) + X, y = X[:100], y[:100] # only 2 classes + + # create a dataset and repeat twice the sample of class #0 + X_repeated, y_repeated = np.vstack([X, X[y == 0]]), np.hstack([y, y[y == 0]]) + # create a sample weight vector that is equivalent to the repeated dataset + sample_weight = np.ones_like(y) + sample_weight[:50] *= 2 + + classifier = LogisticRegression() + train_repeated_idx = np.arange(X_repeated.shape[0]) + val_repeated_idx = np.arange(X_repeated.shape[0]) + curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) + scores_repeated, thresholds_repeated = _fit_and_score_over_thresholds( + classifier, + X_repeated, + y_repeated, + fit_params={}, + train_idx=train_repeated_idx, + val_idx=val_repeated_idx, + curve_scorer=curve_scorer, + score_params={}, + ) + + train_idx, val_idx = np.arange(X.shape[0]), np.arange(X.shape[0]) + scores, thresholds = _fit_and_score_over_thresholds( + classifier.set_fit_request(sample_weight=True), + X, + y, + fit_params={"sample_weight": sample_weight}, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=curve_scorer.set_score_request(sample_weight=True), + score_params={"sample_weight": sample_weight}, + ) + + assert_allclose(thresholds_repeated, thresholds) + assert_allclose(scores_repeated, scores) + + +@pytest.mark.parametrize("fit_params_type", ["list", "array"]) +@config_context(enable_metadata_routing=True) +def test_fit_and_score_over_thresholds_fit_params(fit_params_type): + """Check that we pass `fit_params` to the classifier when calling `fit`.""" + X, y = make_classification(n_samples=100, random_state=0) + fit_params = { + "a": _convert_container(y, fit_params_type), + "b": _convert_container(y, fit_params_type), + } + + classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) + classifier.set_fit_request(a=True, b=True) + train_idx, val_idx = np.arange(50), np.arange(50, 100) + + curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) + _fit_and_score_over_thresholds( + classifier, + X, + y, + fit_params=fit_params, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=curve_scorer, + score_params={}, + ) + + +@pytest.mark.parametrize( + "data", + [ + make_classification(n_classes=3, n_clusters_per_class=1, random_state=0), + make_multilabel_classification(random_state=0), + ], +) +def test_tuned_threshold_classifier_no_binary(data): + """Check that we raise an informative error message for non-binary problem.""" + err_msg = "Only binary classification is supported." 
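The two metadata-routing tests above rely on the convention exercised throughout this file: once `enable_metadata_routing=True`, metadata such as `sample_weight` or custom fit parameters only reach a consumer that explicitly requested it via `set_fit_request` / `set_score_request`. A minimal, self-contained sketch of that pattern (the `*_demo` names are illustrative and it assumes scikit-learn >= 1.4 for the `params` argument of `cross_validate`; this is not part of the test suite):

```python
import numpy as np

from sklearn import config_context
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

X_demo, y_demo = make_classification(n_samples=60, random_state=0)
weights = np.ones(60)
weights[:10] = 2.0  # up-weight the first ten samples

with config_context(enable_metadata_routing=True):
    # The estimator opts in to receiving the weights at fit time, while the
    # default scorer explicitly declines them; with routing enabled, passing
    # metadata that a consumer has no request set for raises an error instead
    # of being silently dropped.
    clf = (
        LogisticRegression()
        .set_fit_request(sample_weight=True)
        .set_score_request(sample_weight=False)
    )
    cv_res = cross_validate(clf, X_demo, y_demo, params={"sample_weight": weights})

print(cv_res["test_score"])
```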
+ with pytest.raises(ValueError, match=err_msg): + TunedThresholdClassifierCV(LogisticRegression()).fit(*data) + + +@pytest.mark.parametrize( + "params, err_type, err_msg", + [ + ( + {"cv": "prefit", "refit": True}, + ValueError, + "When cv='prefit', refit cannot be True.", + ), + ( + {"cv": 10, "refit": False}, + ValueError, + "When cv has several folds, refit cannot be False.", + ), + ( + {"cv": "prefit", "refit": False}, + NotFittedError, + "`estimator` must be fitted.", + ), + ], +) +def test_tuned_threshold_classifier_conflict_cv_refit(params, err_type, err_msg): + """Check that we raise an informative error message when `cv` and `refit` + cannot be used together. + """ + X, y = make_classification(n_samples=100, random_state=0) + with pytest.raises(err_type, match=err_msg): + TunedThresholdClassifierCV(LogisticRegression(), **params).fit(X, y) + + +@pytest.mark.parametrize( + "estimator", + [LogisticRegression(), SVC(), GradientBoostingClassifier(n_estimators=4)], +) +@pytest.mark.parametrize( + "response_method", ["predict_proba", "predict_log_proba", "decision_function"] +) +@pytest.mark.parametrize( + "ThresholdClassifier", [FixedThresholdClassifier, TunedThresholdClassifierCV] +) +def test_threshold_classifier_estimator_response_methods( + ThresholdClassifier, estimator, response_method +): + """Check that `TunedThresholdClassifierCV` exposes the same response methods as the + underlying estimator. + """ + X, y = make_classification(n_samples=100, random_state=0) + + model = ThresholdClassifier(estimator=estimator) + assert hasattr(model, response_method) == hasattr(estimator, response_method) + + model.fit(X, y) + assert hasattr(model, response_method) == hasattr(estimator, response_method) + + if hasattr(model, response_method): + y_pred_cutoff = getattr(model, response_method)(X) + y_pred_underlying_estimator = getattr(model.estimator_, response_method)(X) + + assert_allclose(y_pred_cutoff, y_pred_underlying_estimator) + + +@pytest.mark.parametrize( + "response_method", ["auto", "decision_function", "predict_proba"] +) +def test_tuned_threshold_classifier_without_constraint_value(response_method): + """Check that `TunedThresholdClassifierCV` is optimizing a given objective + metric.""" + X, y = load_breast_cancer(return_X_y=True) + # remove feature to degrade performances + X = X[:, :5] + + # make the problem completely imbalanced such that the balanced accuracy is low + indices_pos = np.flatnonzero(y == 1) + indices_pos = indices_pos[: indices_pos.size // 50] + indices_neg = np.flatnonzero(y == 0) + + X = np.vstack([X[indices_neg], X[indices_pos]]) + y = np.hstack([y[indices_neg], y[indices_pos]]) + + lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) + thresholds = 100 + model = TunedThresholdClassifierCV( + estimator=lr, + scoring="balanced_accuracy", + response_method=response_method, + thresholds=thresholds, + store_cv_results=True, + ) + score_optimized = balanced_accuracy_score(y, model.fit(X, y).predict(X)) + score_baseline = balanced_accuracy_score(y, lr.predict(X)) + assert score_optimized > score_baseline + assert model.cv_results_["thresholds"].shape == (thresholds,) + assert model.cv_results_["scores"].shape == (thresholds,) + + +def test_tuned_threshold_classifier_metric_with_parameter(): + """Check that we can pass a metric with a parameter in addition check that + `f_beta` with `beta=1` is equivalent to `f1` and different from `f_beta` with + `beta=2`. 
+ """ + X, y = load_breast_cancer(return_X_y=True) + lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) + model_fbeta_1 = TunedThresholdClassifierCV( + estimator=lr, scoring=make_scorer(fbeta_score, beta=1) + ).fit(X, y) + model_fbeta_2 = TunedThresholdClassifierCV( + estimator=lr, scoring=make_scorer(fbeta_score, beta=2) + ).fit(X, y) + model_f1 = TunedThresholdClassifierCV( + estimator=lr, scoring=make_scorer(f1_score) + ).fit(X, y) + + assert model_fbeta_1.best_threshold_ == pytest.approx(model_f1.best_threshold_) + assert model_fbeta_1.best_threshold_ != pytest.approx(model_fbeta_2.best_threshold_) + + +@pytest.mark.parametrize( + "response_method", ["auto", "decision_function", "predict_proba"] +) +@pytest.mark.parametrize( + "metric", + [ + make_scorer(balanced_accuracy_score), + make_scorer(f1_score, pos_label="cancer"), + ], +) +def test_tuned_threshold_classifier_with_string_targets(response_method, metric): + """Check that targets represented by str are properly managed. + Also, check with several metrics to be sure that `pos_label` is properly + dispatched. + """ + X, y = load_breast_cancer(return_X_y=True) + # Encode numeric targets by meaningful strings. We purposely designed the class + # names such that the `pos_label` is the first alphabetically sorted class and thus + # encoded as 0. + classes = np.array(["cancer", "healthy"], dtype=object) + y = classes[y] + model = TunedThresholdClassifierCV( + estimator=make_pipeline(StandardScaler(), LogisticRegression()), + scoring=metric, + response_method=response_method, + thresholds=100, + ).fit(X, y) + assert_array_equal(model.classes_, np.sort(classes)) + y_pred = model.predict(X) + assert_array_equal(np.unique(y_pred), np.sort(classes)) + + +@pytest.mark.parametrize("with_sample_weight", [True, False]) +@config_context(enable_metadata_routing=True) +def test_tuned_threshold_classifier_refit(with_sample_weight, global_random_seed): + """Check the behaviour of the `refit` parameter.""" + rng = np.random.RandomState(global_random_seed) + X, y = make_classification(n_samples=100, random_state=0) + if with_sample_weight: + sample_weight = rng.randn(X.shape[0]) + sample_weight = np.abs(sample_weight, out=sample_weight) + else: + sample_weight = None + + # check that `estimator_` if fitted on the full dataset when `refit=True` + estimator = LogisticRegression().set_fit_request(sample_weight=True) + model = TunedThresholdClassifierCV(estimator, refit=True).fit( + X, y, sample_weight=sample_weight + ) + + assert model.estimator_ is not estimator + estimator.fit(X, y, sample_weight=sample_weight) + assert_allclose(model.estimator_.coef_, estimator.coef_) + assert_allclose(model.estimator_.intercept_, estimator.intercept_) + + # check that `estimator_` was not altered when `refit=False` and `cv="prefit"` + estimator = LogisticRegression().set_fit_request(sample_weight=True) + estimator.fit(X, y, sample_weight=sample_weight) + coef = estimator.coef_.copy() + model = TunedThresholdClassifierCV(estimator, cv="prefit", refit=False).fit( + X, y, sample_weight=sample_weight + ) + + assert model.estimator_ is estimator + assert_allclose(model.estimator_.coef_, coef) + + # check that we train `estimator_` on the training split of a given cross-validation + estimator = LogisticRegression().set_fit_request(sample_weight=True) + cv = [ + (np.arange(50), np.arange(50, 100)), + ] # single split + model = TunedThresholdClassifierCV(estimator, cv=cv, refit=False).fit( + X, y, sample_weight=sample_weight + ) + + assert model.estimator_ 
is not estimator + if with_sample_weight: + sw_train = sample_weight[cv[0][0]] + else: + sw_train = None + estimator.fit(X[cv[0][0]], y[cv[0][0]], sample_weight=sw_train) + assert_allclose(model.estimator_.coef_, estimator.coef_) + + +@pytest.mark.parametrize("fit_params_type", ["list", "array"]) +@config_context(enable_metadata_routing=True) +def test_tuned_threshold_classifier_fit_params(fit_params_type): + """Check that we pass `fit_params` to the classifier when calling `fit`.""" + X, y = make_classification(n_samples=100, random_state=0) + fit_params = { + "a": _convert_container(y, fit_params_type), + "b": _convert_container(y, fit_params_type), + } + + classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) + classifier.set_fit_request(a=True, b=True) + model = TunedThresholdClassifierCV(classifier) + model.fit(X, y, **fit_params) + + +@config_context(enable_metadata_routing=True) +def test_tuned_threshold_classifier_cv_zeros_sample_weights_equivalence(): + """Check that passing removing some sample from the dataset `X` is + equivalent to passing a `sample_weight` with a factor 0.""" + X, y = load_iris(return_X_y=True) + # Scale the data to avoid any convergence issue + X = StandardScaler().fit_transform(X) + # Only use 2 classes and select samples such that 2-fold cross-validation + # split will lead to an equivalence with a `sample_weight` of 0 + X = np.vstack((X[:40], X[50:90])) + y = np.hstack((y[:40], y[50:90])) + sample_weight = np.zeros_like(y) + sample_weight[::2] = 1 + + estimator = LogisticRegression().set_fit_request(sample_weight=True) + model_without_weights = TunedThresholdClassifierCV(estimator, cv=2) + model_with_weights = clone(model_without_weights) + + model_with_weights.fit(X, y, sample_weight=sample_weight) + model_without_weights.fit(X[::2], y[::2]) + + assert_allclose( + model_with_weights.estimator_.coef_, model_without_weights.estimator_.coef_ + ) + + y_pred_with_weights = model_with_weights.predict_proba(X) + y_pred_without_weights = model_without_weights.predict_proba(X) + assert_allclose(y_pred_with_weights, y_pred_without_weights) + + +def test_tuned_threshold_classifier_thresholds_array(): + """Check that we can pass an array to `thresholds` and it is used as candidate + threshold internally.""" + X, y = make_classification(random_state=0) + estimator = LogisticRegression() + thresholds = np.linspace(0, 1, 11) + tuned_model = TunedThresholdClassifierCV( + estimator, + thresholds=thresholds, + response_method="predict_proba", + store_cv_results=True, + ).fit(X, y) + assert_allclose(tuned_model.cv_results_["thresholds"], thresholds) + + +@pytest.mark.parametrize("store_cv_results", [True, False]) +def test_tuned_threshold_classifier_store_cv_results(store_cv_results): + """Check that if `cv_results_` exists depending on `store_cv_results`.""" + X, y = make_classification(random_state=0) + estimator = LogisticRegression() + tuned_model = TunedThresholdClassifierCV( + estimator, store_cv_results=store_cv_results + ).fit(X, y) + if store_cv_results: + assert hasattr(tuned_model, "cv_results_") + else: + assert not hasattr(tuned_model, "cv_results_") + + +def test_tuned_threshold_classifier_cv_float(): + """Check the behaviour when `cv` is set to a float.""" + X, y = make_classification(random_state=0) + + # case where `refit=False` and cv is a float: the underlying estimator will be fit + # on the training set given by a ShuffleSplit. We check that we get the same model + # coefficients. 
+ test_size = 0.3
+ estimator = LogisticRegression()
+ tuned_model = TunedThresholdClassifierCV(
+ estimator, cv=test_size, refit=False, random_state=0
+ ).fit(X, y)
+ tuned_model.fit(X, y)
+
+ cv = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=0)
+ train_idx, val_idx = next(cv.split(X, y))
+ cloned_estimator = clone(estimator).fit(X[train_idx], y[train_idx])
+
+ assert_allclose(tuned_model.estimator_.coef_, cloned_estimator.coef_)
+
+ # case where `refit=True`, then the underlying estimator is fitted on the full
+ # dataset.
+ tuned_model.set_params(refit=True).fit(X, y)
+ cloned_estimator = clone(estimator).fit(X, y)
+
+ assert_allclose(tuned_model.estimator_.coef_, cloned_estimator.coef_)
+
+
+def test_tuned_threshold_classifier_error_constant_predictor():
+ """Check that we raise a ValueError if the underlying classifier returns constant
+ probabilities such that we cannot find any threshold.
+ """
+ X, y = make_classification(random_state=0)
+ estimator = DummyClassifier(strategy="constant", constant=1)
+ tuned_model = TunedThresholdClassifierCV(estimator, response_method="predict_proba")
+ err_msg = "The provided estimator makes constant predictions"
+ with pytest.raises(ValueError, match=err_msg):
+ tuned_model.fit(X, y)
+
+
+@pytest.mark.parametrize(
+ "response_method", ["auto", "predict_proba", "decision_function"]
+)
+def test_fixed_threshold_classifier_equivalence_default(response_method):
+ """Check that `FixedThresholdClassifier` has the same behaviour as the vanilla
+ classifier.
+ """
+ X, y = make_classification(random_state=0)
+ classifier = LogisticRegression().fit(X, y)
+ classifier_default_threshold = FixedThresholdClassifier(
+ estimator=clone(classifier), response_method=response_method
+ )
+ classifier_default_threshold.fit(X, y)
+
+ # emulate the response method that should take into account the `pos_label`
+ if response_method in ("auto", "predict_proba"):
+ y_score = classifier_default_threshold.predict_proba(X)[:, 1]
+ threshold = 0.5
+ else: # response_method == "decision_function"
+ y_score = classifier_default_threshold.decision_function(X)
+ threshold = 0.0
+
+ y_pred_lr = (y_score >= threshold).astype(int)
+ assert_allclose(classifier_default_threshold.predict(X), y_pred_lr)
+
+
+@pytest.mark.parametrize(
+ "response_method, threshold", [("predict_proba", 0.7), ("decision_function", 2.0)]
+)
+@pytest.mark.parametrize("pos_label", [0, 1])
+def test_fixed_threshold_classifier(response_method, threshold, pos_label):
+ """Check that applying `predict` leads to the same prediction as applying the
+ threshold to the output of the response method.
+ """ + X, y = make_classification(n_samples=50, random_state=0) + logistic_regression = LogisticRegression().fit(X, y) + model = FixedThresholdClassifier( + estimator=clone(logistic_regression), + threshold=threshold, + response_method=response_method, + pos_label=pos_label, + ).fit(X, y) + + # check that the underlying estimator is the same + assert_allclose(model.estimator_.coef_, logistic_regression.coef_) + + # emulate the response method that should take into account the `pos_label` + if response_method == "predict_proba": + y_score = model.predict_proba(X)[:, pos_label] + else: # response_method == "decision_function" + y_score = model.decision_function(X) + y_score = y_score if pos_label == 1 else -y_score + + # create a mapping from boolean values to class labels + map_to_label = np.array([0, 1]) if pos_label == 1 else np.array([1, 0]) + y_pred_lr = map_to_label[(y_score >= threshold).astype(int)] + assert_allclose(model.predict(X), y_pred_lr) + + for method in ("predict_proba", "predict_log_proba", "decision_function"): + assert_allclose( + getattr(model, method)(X), getattr(logistic_regression, method)(X) + ) + assert_allclose( + getattr(model.estimator_, method)(X), + getattr(logistic_regression, method)(X), + ) + + +@config_context(enable_metadata_routing=True) +def test_fixed_threshold_classifier_metadata_routing(): + """Check that everything works with metadata routing.""" + X, y = make_classification(random_state=0) + sample_weight = np.ones_like(y) + sample_weight[::2] = 2 + classifier = LogisticRegression().set_fit_request(sample_weight=True) + classifier.fit(X, y, sample_weight=sample_weight) + classifier_default_threshold = FixedThresholdClassifier(estimator=clone(classifier)) + classifier_default_threshold.fit(X, y, sample_weight=sample_weight) + assert_allclose(classifier_default_threshold.estimator_.coef_, classifier.coef_) + + +@pytest.mark.parametrize( + "method", ["predict_proba", "decision_function", "predict", "predict_log_proba"] +) +def test_fixed_threshold_classifier_fitted_estimator(method): + """Check that if the underlying estimator is already fitted, no fit is required.""" + X, y = make_classification(random_state=0) + classifier = LogisticRegression().fit(X, y) + fixed_threshold_classifier = FixedThresholdClassifier(estimator=classifier) + # This should not raise an error + getattr(fixed_threshold_classifier, method)(X) + + +def test_fixed_threshold_classifier_classes_(): + """Check that the classes_ attribute is properly set.""" + X, y = make_classification(random_state=0) + with pytest.raises( + AttributeError, match="The underlying estimator is not fitted yet." 
+ ): + FixedThresholdClassifier(estimator=LogisticRegression()).classes_ + + classifier = LogisticRegression().fit(X, y) + fixed_threshold_classifier = FixedThresholdClassifier(estimator=classifier) + assert_array_equal(fixed_threshold_classifier.classes_, classifier.classes_) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/tests/test_plot.py b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_plot.py new file mode 100644 index 0000000000000000000000000000000000000000..1566f02acb3d361c2ca783b899a452e2e1d45418 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_plot.py @@ -0,0 +1,572 @@ +import numpy as np +import pytest + +from sklearn.datasets import load_iris +from sklearn.model_selection import ( + LearningCurveDisplay, + ValidationCurveDisplay, + learning_curve, + validation_curve, +) +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils import shuffle +from sklearn.utils._testing import assert_allclose, assert_array_equal + + +@pytest.fixture +def data(): + return shuffle(*load_iris(return_X_y=True), random_state=0) + + +@pytest.mark.parametrize( + "params, err_type, err_msg", + [ + ({"std_display_style": "invalid"}, ValueError, "Unknown std_display_style:"), + ({"score_type": "invalid"}, ValueError, "Unknown score_type:"), + ], +) +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_parameters_validation( + pyplot, data, params, err_type, err_msg, CurveDisplay, specific_params +): + """Check that we raise a proper error when passing invalid parameters.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + with pytest.raises(err_type, match=err_msg): + CurveDisplay.from_estimator(estimator, X, y, **specific_params, **params) + + +def test_learning_curve_display_default_usage(pyplot, data): + """Check the default usage of the LearningCurveDisplay class.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + train_sizes = [0.3, 0.6, 0.9] + display = LearningCurveDisplay.from_estimator( + estimator, X, y, train_sizes=train_sizes + ) + + import matplotlib as mpl + + assert display.errorbar_ is None + + assert isinstance(display.lines_, list) + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + + assert isinstance(display.fill_between_, list) + for fill in display.fill_between_: + assert isinstance(fill, mpl.collections.PolyCollection) + assert fill.get_alpha() == 0.5 + + assert display.score_name == "Score" + assert display.ax_.get_xlabel() == "Number of samples in the training set" + assert display.ax_.get_ylabel() == "Score" + + _, legend_labels = display.ax_.get_legend_handles_labels() + assert legend_labels == ["Train", "Test"] + + train_sizes_abs, train_scores, test_scores = learning_curve( + estimator, X, y, train_sizes=train_sizes + ) + + assert_array_equal(display.train_sizes, train_sizes_abs) + assert_allclose(display.train_scores, train_scores) + assert_allclose(display.test_scores, test_scores) + + +def test_validation_curve_display_default_usage(pyplot, data): + """Check the default usage of the ValidationCurveDisplay class.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + param_name, param_range = "max_depth", [1, 3, 5] + display = ValidationCurveDisplay.from_estimator( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + 
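For orientation, the API under test here is a one-call convenience: `ValidationCurveDisplay.from_estimator` runs cross-validation over the parameter range, plots the mean train/test curves, and keeps the Matplotlib artists as attributes. A standalone sketch (requires Matplotlib; the `*_demo` names are illustrative):

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import ValidationCurveDisplay
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = load_iris(return_X_y=True)
display = ValidationCurveDisplay.from_estimator(
    DecisionTreeClassifier(random_state=0),
    X_demo,
    y_demo,
    param_name="max_depth",
    param_range=[1, 3, 5],
)
# The drawn objects stay accessible for later customization.
display.ax_.set_title("Validation curve for DecisionTreeClassifier")
print(display.train_scores.shape)  # (n_param_values, n_cv_folds)
```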
import matplotlib as mpl + + assert display.errorbar_ is None + + assert isinstance(display.lines_, list) + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + + assert isinstance(display.fill_between_, list) + for fill in display.fill_between_: + assert isinstance(fill, mpl.collections.PolyCollection) + assert fill.get_alpha() == 0.5 + + assert display.score_name == "Score" + assert display.ax_.get_xlabel() == f"{param_name}" + assert display.ax_.get_ylabel() == "Score" + + _, legend_labels = display.ax_.get_legend_handles_labels() + assert legend_labels == ["Train", "Test"] + + train_scores, test_scores = validation_curve( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + assert_array_equal(display.param_range, param_range) + assert_allclose(display.train_scores, train_scores) + assert_allclose(display.test_scores, test_scores) + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_negate_score(pyplot, data, CurveDisplay, specific_params): + """Check the behaviour of the `negate_score` parameter calling `from_estimator` and + `plot`. + """ + X, y = data + estimator = DecisionTreeClassifier(max_depth=1, random_state=0) + + negate_score = False + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score + ) + + positive_scores = display.lines_[0].get_data()[1] + assert (positive_scores >= 0).all() + assert display.ax_.get_ylabel() == "Score" + + negate_score = True + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score + ) + + negative_scores = display.lines_[0].get_data()[1] + assert (negative_scores <= 0).all() + assert_allclose(negative_scores, -positive_scores) + assert display.ax_.get_ylabel() == "Negative score" + + negate_score = False + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score + ) + assert display.ax_.get_ylabel() == "Score" + display.plot(negate_score=not negate_score) + assert display.ax_.get_ylabel() == "Score" + assert (display.lines_[0].get_data()[1] < 0).all() + + +@pytest.mark.parametrize( + "score_name, ylabel", [(None, "Score"), ("Accuracy", "Accuracy")] +) +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_score_name( + pyplot, data, score_name, ylabel, CurveDisplay, specific_params +): + """Check that we can overwrite the default score name shown on the y-axis.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, score_name=score_name + ) + + assert display.ax_.get_ylabel() == ylabel + X, y = data + estimator = DecisionTreeClassifier(max_depth=1, random_state=0) + + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, score_name=score_name + ) + + assert display.score_name == ylabel + + +@pytest.mark.parametrize("std_display_style", (None, "errorbar")) +def test_learning_curve_display_score_type(pyplot, data, std_display_style): + """Check the behaviour of setting the `score_type` parameter.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + train_sizes = 
[0.3, 0.6, 0.9] + train_sizes_abs, train_scores, test_scores = learning_curve( + estimator, X, y, train_sizes=train_sizes + ) + + score_type = "train" + display = LearningCurveDisplay.from_estimator( + estimator, + X, + y, + train_sizes=train_sizes, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, train_sizes_abs) + assert_allclose(y_data, train_scores.mean(axis=1)) + + score_type = "test" + display = LearningCurveDisplay.from_estimator( + estimator, + X, + y, + train_sizes=train_sizes, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Test"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, train_sizes_abs) + assert_allclose(y_data, test_scores.mean(axis=1)) + + score_type = "both" + display = LearningCurveDisplay.from_estimator( + estimator, + X, + y, + train_sizes=train_sizes, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train", "Test"] + + if std_display_style is None: + assert len(display.lines_) == 2 + assert display.errorbar_ is None + x_data_train, y_data_train = display.lines_[0].get_data() + x_data_test, y_data_test = display.lines_[1].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 2 + x_data_train, y_data_train = display.errorbar_[0].lines[0].get_data() + x_data_test, y_data_test = display.errorbar_[1].lines[0].get_data() + + assert_array_equal(x_data_train, train_sizes_abs) + assert_allclose(y_data_train, train_scores.mean(axis=1)) + assert_array_equal(x_data_test, train_sizes_abs) + assert_allclose(y_data_test, test_scores.mean(axis=1)) + + +@pytest.mark.parametrize("std_display_style", (None, "errorbar")) +def test_validation_curve_display_score_type(pyplot, data, std_display_style): + """Check the behaviour of setting the `score_type` parameter.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + param_name, param_range = "max_depth", [1, 3, 5] + train_scores, test_scores = validation_curve( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + score_type = "train" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, param_range) + 
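The assertions in this test all reduce to one shape convention, defined by the underlying `validation_curve` function: scores come back as `(n_param_values, n_cv_folds)` arrays and the display plots the per-parameter average over folds. A short standalone sketch of that convention (illustrative names only):

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import validation_curve
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = load_iris(return_X_y=True)
train_sc, test_sc = validation_curve(
    DecisionTreeClassifier(random_state=0),
    X_demo,
    y_demo,
    param_name="max_depth",
    param_range=[1, 3, 5],
    cv=5,
)
assert train_sc.shape == (3, 5)  # (n_param_values, n_cv_folds)
# These fold averages are what the display classes draw as curves.
print(train_sc.mean(axis=1), test_sc.mean(axis=1))
```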
assert_allclose(y_data, train_scores.mean(axis=1)) + + score_type = "test" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Test"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, param_range) + assert_allclose(y_data, test_scores.mean(axis=1)) + + score_type = "both" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train", "Test"] + + if std_display_style is None: + assert len(display.lines_) == 2 + assert display.errorbar_ is None + x_data_train, y_data_train = display.lines_[0].get_data() + x_data_test, y_data_test = display.lines_[1].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 2 + x_data_train, y_data_train = display.errorbar_[0].lines[0].get_data() + x_data_test, y_data_test = display.errorbar_[1].lines[0].get_data() + + assert_array_equal(x_data_train, param_range) + assert_allclose(y_data_train, train_scores.mean(axis=1)) + assert_array_equal(x_data_test, param_range) + assert_allclose(y_data_test, test_scores.mean(axis=1)) + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params, expected_xscale", + [ + ( + ValidationCurveDisplay, + {"param_name": "max_depth", "param_range": np.arange(1, 5)}, + "linear", + ), + (LearningCurveDisplay, {"train_sizes": np.linspace(0.1, 0.9, num=5)}, "linear"), + ( + ValidationCurveDisplay, + { + "param_name": "max_depth", + "param_range": np.round(np.logspace(0, 2, num=5)).astype(np.int64), + }, + "log", + ), + (LearningCurveDisplay, {"train_sizes": np.logspace(-1, 0, num=5)}, "log"), + ], +) +def test_curve_display_xscale_auto( + pyplot, data, CurveDisplay, specific_params, expected_xscale +): + """Check the behaviour of the x-axis scaling depending on the data provided.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + display = CurveDisplay.from_estimator(estimator, X, y, **specific_params) + assert display.ax_.get_xscale() == expected_xscale + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_std_display_style(pyplot, data, CurveDisplay, specific_params): + """Check the behaviour of the parameter `std_display_style`.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + import matplotlib as mpl + + std_display_style = None + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + ) + + assert len(display.lines_) == 2 + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + assert display.errorbar_ is None + assert display.fill_between_ is None + _, legend_label = display.ax_.get_legend_handles_labels() + assert len(legend_label) == 2 + + std_display_style = 
"fill_between" + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + ) + + assert len(display.lines_) == 2 + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + assert display.errorbar_ is None + assert len(display.fill_between_) == 2 + for fill_between in display.fill_between_: + assert isinstance(fill_between, mpl.collections.PolyCollection) + _, legend_label = display.ax_.get_legend_handles_labels() + assert len(legend_label) == 2 + + std_display_style = "errorbar" + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + ) + + assert display.lines_ is None + assert len(display.errorbar_) == 2 + for errorbar in display.errorbar_: + assert isinstance(errorbar, mpl.container.ErrorbarContainer) + assert display.fill_between_ is None + _, legend_label = display.ax_.get_legend_handles_labels() + assert len(legend_label) == 2 + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_plot_kwargs(pyplot, data, CurveDisplay, specific_params): + """Check the behaviour of the different plotting keyword arguments: `line_kw`, + `fill_between_kw`, and `errorbar_kw`.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + std_display_style = "fill_between" + line_kw = {"color": "red"} + fill_between_kw = {"color": "red", "alpha": 1.0} + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + ) + + assert display.lines_[0].get_color() == "red" + assert_allclose( + display.fill_between_[0].get_facecolor(), + [[1.0, 0.0, 0.0, 1.0]], # trust me, it's red + ) + + std_display_style = "errorbar" + errorbar_kw = {"color": "red"} + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + errorbar_kw=errorbar_kw, + ) + + assert display.errorbar_[0].lines[0].get_color() == "red" + + +@pytest.mark.parametrize( + "param_range, xscale", + [([5, 10, 15], "linear"), ([-50, 5, 50, 500], "symlog"), ([5, 50, 500], "log")], +) +def test_validation_curve_xscale_from_param_range_provided_as_a_list( + pyplot, data, param_range, xscale +): + """Check the induced xscale from the provided param_range values.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + param_name = "max_depth" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + ) + + assert display.ax_.get_xscale() == xscale + + +@pytest.mark.parametrize( + "Display, params", + [ + (LearningCurveDisplay, {}), + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + ], +) +def test_subclassing_displays(pyplot, data, Display, params): + """Check that named constructors return the correct type when subclassed. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/pull/27675 + """ + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + class SubclassOfDisplay(Display): + pass + + display = SubclassOfDisplay.from_estimator(estimator, X, y, **params) + assert isinstance(display, SubclassOfDisplay) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/tests/test_search.py b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_search.py new file mode 100644 index 0000000000000000000000000000000000000000..9c387d62b80d9321a3dcb9c595f471bfcdba4d70 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_search.py @@ -0,0 +1,2874 @@ +"""Test the search module""" + +import pickle +import re +import sys +import warnings +from collections.abc import Iterable, Sized +from functools import partial +from io import StringIO +from itertools import chain, product +from types import GeneratorType + +import numpy as np +import pytest +from scipy.stats import bernoulli, expon, uniform + +from sklearn import config_context +from sklearn.base import BaseEstimator, ClassifierMixin, is_classifier +from sklearn.cluster import KMeans +from sklearn.compose import ColumnTransformer +from sklearn.datasets import ( + make_blobs, + make_classification, + make_multilabel_classification, +) +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.exceptions import FitFailedWarning +from sklearn.experimental import enable_halving_search_cv # noqa +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.impute import SimpleImputer +from sklearn.linear_model import ( + LinearRegression, + LogisticRegression, + Ridge, + SGDClassifier, +) +from sklearn.metrics import ( + accuracy_score, + confusion_matrix, + f1_score, + make_scorer, + r2_score, + recall_score, + roc_auc_score, +) +from sklearn.metrics.pairwise import euclidean_distances +from sklearn.model_selection import ( + GridSearchCV, + GroupKFold, + GroupShuffleSplit, + HalvingGridSearchCV, + KFold, + LeaveOneGroupOut, + LeavePGroupsOut, + ParameterGrid, + ParameterSampler, + RandomizedSearchCV, + StratifiedKFold, + StratifiedShuffleSplit, + train_test_split, +) +from sklearn.model_selection._search import ( + BaseSearchCV, + _yield_masked_array_for_each_param, +) +from sklearn.model_selection.tests.common import OneTimeSplitter +from sklearn.naive_bayes import ComplementNB +from sklearn.neighbors import KernelDensity, KNeighborsClassifier, LocalOutlierFactor +from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.preprocessing import ( + OneHotEncoder, + OrdinalEncoder, + SplineTransformer, + StandardScaler, +) +from sklearn.svm import SVC, LinearSVC +from sklearn.tests.metadata_routing_common import ( + ConsumingScorer, + _Registry, + check_recorded_metadata, +) +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils._array_api import yield_namespace_device_dtype_combinations +from sklearn.utils._mocking import CheckingClassifier, MockDataFrame +from sklearn.utils._testing import ( + MinimalClassifier, + MinimalRegressor, + MinimalTransformer, + _array_api_for_tests, + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS +from sklearn.utils.validation import _num_samples + + +# Neither of the following two estimators 
is a full-fledged classifier; both are kept deliberately minimal
+# to exercise hyperparameter search on user-defined classifiers.
+class MockClassifier(ClassifierMixin, BaseEstimator):
+ """Dummy classifier to test the parameter search algorithms"""
+
+ def __init__(self, foo_param=0):
+ self.foo_param = foo_param
+
+ def fit(self, X, Y):
+ assert len(X) == len(Y)
+ self.classes_ = np.unique(Y)
+ return self
+
+ def predict(self, T):
+ return T.shape[0]
+
+ def transform(self, X):
+ return X + self.foo_param
+
+ def inverse_transform(self, X):
+ return X - self.foo_param
+
+ predict_proba = predict
+ predict_log_proba = predict
+ decision_function = predict
+
+ def score(self, X=None, Y=None):
+ if self.foo_param > 1:
+ score = 1.0
+ else:
+ score = 0.0
+ return score
+
+ def get_params(self, deep=False):
+ return {"foo_param": self.foo_param}
+
+ def set_params(self, **params):
+ self.foo_param = params["foo_param"]
+ return self
+
+
+class LinearSVCNoScore(LinearSVC):
+ """A LinearSVC classifier that has no score method."""
+
+ @property
+ def score(self):
+ raise AttributeError
+
+
+X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
+y = np.array([1, 1, 2, 2])
+
+
+def assert_grid_iter_equals_getitem(grid):
+ assert list(grid) == [grid[i] for i in range(len(grid))]
+
+
+@pytest.mark.parametrize("klass", [ParameterGrid, partial(ParameterSampler, n_iter=10)])
+@pytest.mark.parametrize(
+ "input, error_type, error_message",
+ [
+ (0, TypeError, r"Parameter .* a dict or a list, got: 0 of type int"),
+ ([{"foo": [0]}, 0], TypeError, r"Parameter .* is not a dict \(0\)"),
+ (
+ {"foo": 0},
+ TypeError,
+ r"Parameter (grid|distribution) for parameter 'foo' (is not|needs to be) "
+ r"(a list or a numpy array|iterable or a distribution).*",
+ ),
+ ],
+)
+def test_validate_parameter_input(klass, input, error_type, error_message):
+ with pytest.raises(error_type, match=error_message):
+ klass(input)
+
+
+def test_parameter_grid():
+ # Test basic properties of ParameterGrid.
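Before the detailed property checks below, a compact standalone reminder of the `ParameterGrid` semantics being tested: it materializes the Cartesian product of the value lists as a sequence of dicts supporting `len`, iteration, and integer indexing, with keys handled in sorted order (illustrative sketch, not part of the test suite):

```python
from sklearn.model_selection import ParameterGrid

grid = ParameterGrid({"kernel": ["rbf", "linear"], "C": [1, 10]})
assert len(grid) == 4  # 2 values of C x 2 kernels
# Keys are sorted ("C" before "kernel") and the last key varies fastest.
assert grid[0] == {"C": 1, "kernel": "rbf"}
assert list(grid)[-1] == {"C": 10, "kernel": "linear"}
```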
+ params1 = {"foo": [1, 2, 3]} + grid1 = ParameterGrid(params1) + assert isinstance(grid1, Iterable) + assert isinstance(grid1, Sized) + assert len(grid1) == 3 + assert_grid_iter_equals_getitem(grid1) + + params2 = {"foo": [4, 2], "bar": ["ham", "spam", "eggs"]} + grid2 = ParameterGrid(params2) + assert len(grid2) == 6 + + # loop to assert we can iterate over the grid multiple times + for i in range(2): + # tuple + chain transforms {"a": 1, "b": 2} to ("a", 1, "b", 2) + points = set(tuple(chain(*(sorted(p.items())))) for p in grid2) + assert points == set( + ("bar", x, "foo", y) for x, y in product(params2["bar"], params2["foo"]) + ) + assert_grid_iter_equals_getitem(grid2) + + # Special case: empty grid (useful to get default estimator settings) + empty = ParameterGrid({}) + assert len(empty) == 1 + assert list(empty) == [{}] + assert_grid_iter_equals_getitem(empty) + with pytest.raises(IndexError): + empty[1] + + has_empty = ParameterGrid([{"C": [1, 10]}, {}, {"C": [0.5]}]) + assert len(has_empty) == 4 + assert list(has_empty) == [{"C": 1}, {"C": 10}, {}, {"C": 0.5}] + assert_grid_iter_equals_getitem(has_empty) + + +def test_grid_search(): + # Test that the best estimator contains the right value for foo_param + clf = MockClassifier() + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=2, verbose=3) + # make sure it selects the smallest parameter in case of ties + old_stdout = sys.stdout + sys.stdout = StringIO() + grid_search.fit(X, y) + sys.stdout = old_stdout + assert grid_search.best_estimator_.foo_param == 2 + + assert_array_equal(grid_search.cv_results_["param_foo_param"].data, [1, 2, 3]) + + # Smoke test the score etc: + grid_search.score(X, y) + grid_search.predict_proba(X) + grid_search.decision_function(X) + grid_search.transform(X) + + # Test exception handling on scoring + grid_search.scoring = "sklearn" + with pytest.raises(ValueError): + grid_search.fit(X, y) + + +def test_grid_search_pipeline_steps(): + # check that parameters that are estimators are cloned before fitting + pipe = Pipeline([("regressor", LinearRegression())]) + param_grid = {"regressor": [LinearRegression(), Ridge()]} + grid_search = GridSearchCV(pipe, param_grid, cv=2) + grid_search.fit(X, y) + regressor_results = grid_search.cv_results_["param_regressor"] + assert isinstance(regressor_results[0], LinearRegression) + assert isinstance(regressor_results[1], Ridge) + assert not hasattr(regressor_results[0], "coef_") + assert not hasattr(regressor_results[1], "coef_") + assert regressor_results[0] is not grid_search.best_estimator_ + assert regressor_results[1] is not grid_search.best_estimator_ + # check that we didn't modify the parameter grid that was passed + assert not hasattr(param_grid["regressor"][0], "coef_") + assert not hasattr(param_grid["regressor"][1], "coef_") + + +@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV]) +def test_SearchCV_with_fit_params(SearchCV): + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + clf = CheckingClassifier(expected_fit_params=["spam", "eggs"]) + searcher = SearchCV(clf, {"foo_param": [1, 2, 3]}, cv=2, error_score="raise") + + # The CheckingClassifier generates an assertion error if + # a parameter is missing or has length != len(X). + err_msg = r"Expected fit parameter\(s\) \['eggs'\] not seen." 
+ with pytest.raises(AssertionError, match=err_msg): + searcher.fit(X, y, spam=np.ones(10)) + + err_msg = "Fit parameter spam has length 1; expected" + with pytest.raises(AssertionError, match=err_msg): + searcher.fit(X, y, spam=np.ones(1), eggs=np.zeros(10)) + searcher.fit(X, y, spam=np.ones(10), eggs=np.zeros(10)) + + +def test_grid_search_no_score(): + # Test grid-search on classifier that has no score function. + clf = LinearSVC(random_state=0) + X, y = make_blobs(random_state=0, centers=2) + Cs = [0.1, 1, 10] + clf_no_score = LinearSVCNoScore(random_state=0) + grid_search = GridSearchCV(clf, {"C": Cs}, scoring="accuracy") + grid_search.fit(X, y) + + grid_search_no_score = GridSearchCV(clf_no_score, {"C": Cs}, scoring="accuracy") + # smoketest grid search + grid_search_no_score.fit(X, y) + + # check that best params are equal + assert grid_search_no_score.best_params_ == grid_search.best_params_ + # check that we can call score and that it gives the correct result + assert grid_search.score(X, y) == grid_search_no_score.score(X, y) + + # giving no scoring function raises an error + grid_search_no_score = GridSearchCV(clf_no_score, {"C": Cs}) + with pytest.raises(TypeError, match="no scoring"): + grid_search_no_score.fit([[1]]) + + +def test_grid_search_score_method(): + X, y = make_classification(n_samples=100, n_classes=2, flip_y=0.2, random_state=0) + clf = LinearSVC(random_state=0) + grid = {"C": [0.1]} + + search_no_scoring = GridSearchCV(clf, grid, scoring=None).fit(X, y) + search_accuracy = GridSearchCV(clf, grid, scoring="accuracy").fit(X, y) + search_no_score_method_auc = GridSearchCV( + LinearSVCNoScore(), grid, scoring="roc_auc" + ).fit(X, y) + search_auc = GridSearchCV(clf, grid, scoring="roc_auc").fit(X, y) + + # Check warning only occurs in situation where behavior changed: + # estimator requires score method to compete with scoring parameter + score_no_scoring = search_no_scoring.score(X, y) + score_accuracy = search_accuracy.score(X, y) + score_no_score_auc = search_no_score_method_auc.score(X, y) + score_auc = search_auc.score(X, y) + + # ensure the test is sane + assert score_auc < 1.0 + assert score_accuracy < 1.0 + assert score_auc != score_accuracy + + assert_almost_equal(score_accuracy, score_no_scoring) + assert_almost_equal(score_auc, score_no_score_auc) + + +def test_grid_search_groups(): + # Check if ValueError (when groups is None) propagates to GridSearchCV + # And also check if groups is correctly passed to the cv object + rng = np.random.RandomState(0) + + X, y = make_classification(n_samples=15, n_classes=2, random_state=0) + groups = rng.randint(0, 3, 15) + + clf = LinearSVC(random_state=0) + grid = {"C": [1]} + + group_cvs = [ + LeaveOneGroupOut(), + LeavePGroupsOut(2), + GroupKFold(n_splits=3), + GroupShuffleSplit(), + ] + error_msg = "The 'groups' parameter should not be None." 
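As the loop below demonstrates, group-aware splitters cannot produce splits without group labels, so `groups` must be forwarded through `fit`; omitting it triggers the ValueError checked here. A standalone sketch of the working call (illustrative data):

```python
import numpy as np

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.svm import LinearSVC

X_demo, y_demo = make_classification(n_samples=15, random_state=0)
groups_demo = np.random.RandomState(0).randint(0, 3, 15)  # three groups

search = GridSearchCV(LinearSVC(), {"C": [1]}, cv=GroupKFold(n_splits=3))
# Every split keeps whole groups together, so samples from the same group
# never appear in both the train and the test fold.
search.fit(X_demo, y_demo, groups=groups_demo)
print(search.best_score_)
```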
+ for cv in group_cvs: + gs = GridSearchCV(clf, grid, cv=cv) + with pytest.raises(ValueError, match=error_msg): + gs.fit(X, y) + gs.fit(X, y, groups=groups) + + non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()] + for cv in non_group_cvs: + gs = GridSearchCV(clf, grid, cv=cv) + # Should not raise an error + gs.fit(X, y) + + +def test_classes__property(): + # Test that classes_ property matches best_estimator_.classes_ + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + Cs = [0.1, 1, 10] + + grid_search = GridSearchCV(LinearSVC(random_state=0), {"C": Cs}) + grid_search.fit(X, y) + assert_array_equal(grid_search.best_estimator_.classes_, grid_search.classes_) + + # Test that regressors do not have a classes_ attribute + grid_search = GridSearchCV(Ridge(), {"alpha": [1.0, 2.0]}) + grid_search.fit(X, y) + assert not hasattr(grid_search, "classes_") + + # Test that the grid searcher has no classes_ attribute before it's fit + grid_search = GridSearchCV(LinearSVC(random_state=0), {"C": Cs}) + assert not hasattr(grid_search, "classes_") + + # Test that the grid searcher has no classes_ attribute without a refit + grid_search = GridSearchCV(LinearSVC(random_state=0), {"C": Cs}, refit=False) + grid_search.fit(X, y) + assert not hasattr(grid_search, "classes_") + + +def test_trivial_cv_results_attr(): + # Test search over a "grid" with only one point. + clf = MockClassifier() + grid_search = GridSearchCV(clf, {"foo_param": [1]}, cv=2) + grid_search.fit(X, y) + assert hasattr(grid_search, "cv_results_") + + random_search = RandomizedSearchCV(clf, {"foo_param": [0]}, n_iter=1, cv=2) + random_search.fit(X, y) + assert hasattr(grid_search, "cv_results_") + + +def test_no_refit(): + # Test that GSCV can be used for model selection alone without refitting + clf = MockClassifier() + for scoring in [None, ["accuracy", "precision"]]: + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, refit=False, cv=2) + grid_search.fit(X, y) + assert ( + not hasattr(grid_search, "best_estimator_") + and hasattr(grid_search, "best_index_") + and hasattr(grid_search, "best_params_") + ) + + # Make sure the functions predict/transform etc. raise meaningful + # error messages + for fn_name in ( + "predict", + "predict_proba", + "predict_log_proba", + "transform", + "inverse_transform", + ): + outer_msg = f"has no attribute '{fn_name}'" + inner_msg = ( + f"`refit=False`. 
{fn_name} is available only after " + "refitting on the best parameters" + ) + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + getattr(grid_search, fn_name)(X) + + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + # Test that an invalid refit param raises appropriate error messages + error_msg = ( + "For multi-metric scoring, the parameter refit must be set to a scorer key" + ) + for refit in [True, "recall", "accuracy"]: + with pytest.raises(ValueError, match=error_msg): + GridSearchCV( + clf, {}, refit=refit, scoring={"acc": "accuracy", "prec": "precision"} + ).fit(X, y) + + +def test_grid_search_error(): + # Test that grid search will capture errors on data with different length + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + with pytest.raises(ValueError): + cv.fit(X_[:180], y_) + + +def test_grid_search_one_grid_point(): + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]} + + clf = SVC(gamma="auto") + cv = GridSearchCV(clf, param_dict) + cv.fit(X_, y_) + + clf = SVC(C=1.0, kernel="rbf", gamma=0.1) + clf.fit(X_, y_) + + assert_array_equal(clf.dual_coef_, cv.best_estimator_.dual_coef_) + + +def test_grid_search_when_param_grid_includes_range(): + # Test that the best estimator contains the right value for foo_param + clf = MockClassifier() + grid_search = None + grid_search = GridSearchCV(clf, {"foo_param": range(1, 4)}, cv=2) + grid_search.fit(X, y) + assert grid_search.best_estimator_.foo_param == 2 + + +def test_grid_search_bad_param_grid(): + X, y = make_classification(n_samples=10, n_features=5, random_state=0) + param_dict = {"C": 1} + clf = SVC(gamma="auto") + error_msg = re.escape( + "Parameter grid for parameter 'C' needs to be a list or " + "a numpy array, but got 1 (of type int) instead. Single " + "values need to be wrapped in a list with one element." + ) + search = GridSearchCV(clf, param_dict) + with pytest.raises(TypeError, match=error_msg): + search.fit(X, y) + + param_dict = {"C": []} + clf = SVC() + error_msg = re.escape( + "Parameter grid for parameter 'C' need to be a non-empty sequence, got: []" + ) + search = GridSearchCV(clf, param_dict) + with pytest.raises(ValueError, match=error_msg): + search.fit(X, y) + + param_dict = {"C": "1,2,3"} + clf = SVC(gamma="auto") + error_msg = re.escape( + "Parameter grid for parameter 'C' needs to be a list or a numpy array, " + "but got '1,2,3' (of type str) instead. Single values need to be " + "wrapped in a list with one element." 
+ ) + search = GridSearchCV(clf, param_dict) + with pytest.raises(TypeError, match=error_msg): + search.fit(X, y) + + param_dict = {"C": np.ones((3, 2))} + clf = SVC() + search = GridSearchCV(clf, param_dict) + with pytest.raises(ValueError): + search.fit(X, y) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_grid_search_sparse(csr_container): + # Test that grid search works with both dense and sparse matrices + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + cv.fit(X_[:180], y_[:180]) + y_pred = cv.predict(X_[180:]) + C = cv.best_estimator_.C + + X_ = csr_container(X_) + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + cv.fit(X_[:180].tocoo(), y_[:180]) + y_pred2 = cv.predict(X_[180:]) + C2 = cv.best_estimator_.C + + assert np.mean(y_pred == y_pred2) >= 0.9 + assert C == C2 + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_grid_search_sparse_scoring(csr_container): + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}, scoring="f1") + cv.fit(X_[:180], y_[:180]) + y_pred = cv.predict(X_[180:]) + C = cv.best_estimator_.C + + X_ = csr_container(X_) + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}, scoring="f1") + cv.fit(X_[:180], y_[:180]) + y_pred2 = cv.predict(X_[180:]) + C2 = cv.best_estimator_.C + + assert_array_equal(y_pred, y_pred2) + assert C == C2 + # Smoke test the score + # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]), + # cv.score(X_[:180], y[:180])) + + # test loss where greater is worse + def f1_loss(y_true_, y_pred_): + return -f1_score(y_true_, y_pred_) + + F1Loss = make_scorer(f1_loss, greater_is_better=False) + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}, scoring=F1Loss) + cv.fit(X_[:180], y_[:180]) + y_pred3 = cv.predict(X_[180:]) + C3 = cv.best_estimator_.C + + assert C == C3 + assert_array_equal(y_pred, y_pred3) + + +def test_grid_search_precomputed_kernel(): + # Test that grid search works when the input features are given in the + # form of a precomputed kernel matrix + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + + # compute the training kernel matrix corresponding to the linear kernel + K_train = np.dot(X_[:180], X_[:180].T) + y_train = y_[:180] + + clf = SVC(kernel="precomputed") + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + cv.fit(K_train, y_train) + + assert cv.best_score_ >= 0 + + # compute the test kernel matrix + K_test = np.dot(X_[180:], X_[:180].T) + y_test = y_[180:] + + y_pred = cv.predict(K_test) + + assert np.mean(y_pred == y_test) >= 0 + + # test error is raised when the precomputed kernel is not array-like + # or sparse + with pytest.raises(ValueError): + cv.fit(K_train.tolist(), y_train) + + +def test_grid_search_precomputed_kernel_error_nonsquare(): + # Test that grid search returns an error with a non-square precomputed + # training kernel matrix + K_train = np.zeros((10, 20)) + y_train = np.ones((10,)) + clf = SVC(kernel="precomputed") + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + with pytest.raises(ValueError): + cv.fit(K_train, y_train) + + +class BrokenClassifier(BaseEstimator): + """Broken classifier that cannot be fit twice""" + + def __init__(self, parameter=None): + self.parameter = parameter + + def fit(self, X, y): + assert not hasattr(self, "has_been_fit_") + self.has_been_fit_ = True + + def predict(self, X): + return 
np.zeros(X.shape[0]) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.UndefinedMetricWarning") +def test_refit(): + # Regression test for bug in refitting + # Simulates re-fitting a broken estimator; this used to break with + # sparse SVMs. + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + + clf = GridSearchCV( + BrokenClassifier(), [{"parameter": [0, 1]}], scoring="precision", refit=True + ) + clf.fit(X, y) + + +def test_refit_callable(): + """ + Test refit=callable, which adds flexibility in identifying the + "best" estimator. + """ + + def refit_callable(cv_results): + """ + A dummy function tests `refit=callable` interface. + Return the index of a model that has the least + `mean_test_score`. + """ + # Fit a dummy clf with `refit=True` to get a list of keys in + # clf.cv_results_. + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.01, 0.1, 1]}, + scoring="precision", + refit=True, + ) + clf.fit(X, y) + # Ensure that `best_index_ != 0` for this dummy clf + assert clf.best_index_ != 0 + + # Assert every key matches those in `cv_results` + for key in clf.cv_results_.keys(): + assert key in cv_results + + return cv_results["mean_test_score"].argmin() + + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.01, 0.1, 1]}, + scoring="precision", + refit=refit_callable, + ) + clf.fit(X, y) + + assert clf.best_index_ == 0 + # Ensure `best_score_` is disabled when using `refit=callable` + assert not hasattr(clf, "best_score_") + + +def test_refit_callable_invalid_type(): + """ + Test implementation catches the errors when 'best_index_' returns an + invalid result. + """ + + def refit_callable_invalid_type(cv_results): + """ + A dummy function tests when returned 'best_index_' is not integer. + """ + return None + + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.1, 1]}, + scoring="precision", + refit=refit_callable_invalid_type, + ) + with pytest.raises(TypeError, match="best_index_ returned is not an integer"): + clf.fit(X, y) + + +@pytest.mark.parametrize("out_bound_value", [-1, 2]) +@pytest.mark.parametrize("search_cv", [RandomizedSearchCV, GridSearchCV]) +def test_refit_callable_out_bound(out_bound_value, search_cv): + """ + Test implementation catches the errors when 'best_index_' returns an + out of bound result. + """ + + def refit_callable_out_bound(cv_results): + """ + A dummy function tests when returned 'best_index_' is out of bounds. + """ + return out_bound_value + + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + + clf = search_cv( + LinearSVC(random_state=42), + {"C": [0.1, 1]}, + scoring="precision", + refit=refit_callable_out_bound, + ) + with pytest.raises(IndexError, match="best_index_ index out of range"): + clf.fit(X, y) + + +def test_refit_callable_multi_metric(): + """ + Test refit=callable in multiple metric evaluation setting + """ + + def refit_callable(cv_results): + """ + A dummy function tests `refit=callable` interface. + Return the index of a model that has the least + `mean_test_prec`. 
+ """ + assert "mean_test_prec" in cv_results + return cv_results["mean_test_prec"].argmin() + + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + scoring = {"Accuracy": make_scorer(accuracy_score), "prec": "precision"} + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.01, 0.1, 1]}, + scoring=scoring, + refit=refit_callable, + ) + clf.fit(X, y) + + assert clf.best_index_ == 0 + # Ensure `best_score_` is disabled when using `refit=callable` + assert not hasattr(clf, "best_score_") + + +def test_gridsearch_nd(): + # Pass X as list in GridSearchCV + X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2) + y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11) + + def check_X(x): + return x.shape[1:] == (5, 3, 2) + + def check_y(x): + return x.shape[1:] == (7, 11) + + clf = CheckingClassifier( + check_X=check_X, + check_y=check_y, + methods_to_check=["fit"], + ) + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}) + grid_search.fit(X_4d, y_3d).score(X, y) + assert hasattr(grid_search, "cv_results_") + + +def test_X_as_list(): + # Pass X as list in GridSearchCV + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + + clf = CheckingClassifier( + check_X=lambda x: isinstance(x, list), + methods_to_check=["fit"], + ) + cv = KFold(n_splits=3) + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=cv) + grid_search.fit(X.tolist(), y).score(X, y) + assert hasattr(grid_search, "cv_results_") + + +def test_y_as_list(): + # Pass y as list in GridSearchCV + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + + clf = CheckingClassifier( + check_y=lambda x: isinstance(x, list), + methods_to_check=["fit"], + ) + cv = KFold(n_splits=3) + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=cv) + grid_search.fit(X, y.tolist()).score(X, y) + assert hasattr(grid_search, "cv_results_") + + +def test_pandas_input(): + # check cross_val_score doesn't destroy pandas dataframe + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((DataFrame, Series)) + except ImportError: + pass + + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + + for InputFeatureType, TargetType in types: + # X dataframe, y series + X_df, y_ser = InputFeatureType(X), TargetType(y) + + def check_df(x): + return isinstance(x, InputFeatureType) + + def check_series(x): + return isinstance(x, TargetType) + + clf = CheckingClassifier(check_X=check_df, check_y=check_series) + + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}) + grid_search.fit(X_df, y_ser).score(X_df, y_ser) + grid_search.predict(X_df) + assert hasattr(grid_search, "cv_results_") + + +def test_unsupervised_grid_search(): + # test grid-search with unsupervised estimator + X, y = make_blobs(n_samples=50, random_state=0) + km = KMeans(random_state=0, init="random", n_init=1) + + # Multi-metric evaluation unsupervised + scoring = ["adjusted_rand_score", "fowlkes_mallows_score"] + for refit in ["adjusted_rand_score", "fowlkes_mallows_score"]: + grid_search = GridSearchCV( + km, param_grid=dict(n_clusters=[2, 3, 4]), scoring=scoring, refit=refit + ) + grid_search.fit(X, y) + # Both ARI and FMS can find the right number :) + assert grid_search.best_params_["n_clusters"] == 3 + + # Single metric evaluation unsupervised + grid_search = GridSearchCV( + km, param_grid=dict(n_clusters=[2, 3, 4]), scoring="fowlkes_mallows_score" + ) + grid_search.fit(X, y) + assert grid_search.best_params_["n_clusters"] == 3 + + # Now without a 
score, and without y + grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4])) + grid_search.fit(X) + assert grid_search.best_params_["n_clusters"] == 4 + + +def test_gridsearch_no_predict(): + # test grid-search with an estimator without predict. + # slight duplication of a test from KDE + def custom_scoring(estimator, X): + return 42 if estimator.bandwidth == 0.1 else 0 + + X, _ = make_blobs(cluster_std=0.1, random_state=1, centers=[[0, 1], [1, 0], [0, 0]]) + search = GridSearchCV( + KernelDensity(), + param_grid=dict(bandwidth=[0.01, 0.1, 1]), + scoring=custom_scoring, + ) + search.fit(X) + assert search.best_params_["bandwidth"] == 0.1 + assert search.best_score_ == 42 + + +def test_param_sampler(): + # test basic properties of param sampler + param_distributions = {"kernel": ["rbf", "linear"], "C": uniform(0, 1)} + sampler = ParameterSampler( + param_distributions=param_distributions, n_iter=10, random_state=0 + ) + samples = [x for x in sampler] + assert len(samples) == 10 + for sample in samples: + assert sample["kernel"] in ["rbf", "linear"] + assert 0 <= sample["C"] <= 1 + + # test that repeated calls yield identical parameters + param_distributions = {"C": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]} + sampler = ParameterSampler( + param_distributions=param_distributions, n_iter=3, random_state=0 + ) + assert [x for x in sampler] == [x for x in sampler] + + param_distributions = {"C": uniform(0, 1)} + sampler = ParameterSampler( + param_distributions=param_distributions, n_iter=10, random_state=0 + ) + assert [x for x in sampler] == [x for x in sampler] + + +def check_cv_results_array_types( + search, param_keys, score_keys, expected_cv_results_kinds +): + # Check if the search `cv_results`'s array are of correct types + cv_results = search.cv_results_ + assert all(isinstance(cv_results[param], np.ma.MaskedArray) for param in param_keys) + assert { + key: cv_results[key].dtype.kind for key in param_keys + } == expected_cv_results_kinds + assert not any(isinstance(cv_results[key], np.ma.MaskedArray) for key in score_keys) + assert all( + cv_results[key].dtype == np.float64 + for key in score_keys + if not key.startswith("rank") + ) + + scorer_keys = search.scorer_.keys() if search.multimetric_ else ["score"] + + for key in scorer_keys: + assert cv_results["rank_test_%s" % key].dtype == np.int32 + + +def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand, extra_keys=()): + # Test the search.cv_results_ contains all the required results + all_keys = param_keys + score_keys + extra_keys + assert_array_equal(sorted(cv_results.keys()), sorted(all_keys + ("params",))) + assert all(cv_results[key].shape == (n_cand,) for key in param_keys + score_keys) + + +def test_grid_search_cv_results(): + X, y = make_classification(n_samples=50, n_features=4, random_state=42) + + n_grid_points = 6 + params = [ + dict( + kernel=[ + "rbf", + ], + C=[1, 10], + gamma=[0.1, 1], + ), + dict( + kernel=[ + "poly", + ], + degree=[1, 2], + ), + ] + + param_keys = ("param_C", "param_degree", "param_gamma", "param_kernel") + score_keys = ( + "mean_test_score", + "mean_train_score", + "rank_test_score", + "split0_test_score", + "split1_test_score", + "split2_test_score", + "split0_train_score", + "split1_train_score", + "split2_train_score", + "std_test_score", + "std_train_score", + "mean_fit_time", + "std_fit_time", + "mean_score_time", + "std_score_time", + ) + n_candidates = n_grid_points + + search = GridSearchCV(SVC(), cv=3, param_grid=params, return_train_score=True) + search.fit(X, y) + 
cv_results = search.cv_results_
+    # Check if score and timing are reasonable
+    assert all(cv_results["rank_test_score"] >= 1)
+    assert all(
+        np.all(cv_results[k] >= 0) for k in score_keys if k != "rank_test_score"
+    )
+    assert all(
+        np.all(cv_results[k] <= 1)
+        for k in score_keys
+        if "time" not in k and k != "rank_test_score"
+    )
+    # Check cv_results structure
+    expected_cv_results_kinds = {
+        "param_C": "i",
+        "param_degree": "i",
+        "param_gamma": "f",
+        "param_kernel": "O",
+    }
+    check_cv_results_array_types(
+        search, param_keys, score_keys, expected_cv_results_kinds
+    )
+    check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
+    # Check masking
+    cv_results = search.cv_results_
+
+    poly_results = [
+        (
+            cv_results["param_C"].mask[i]
+            and cv_results["param_gamma"].mask[i]
+            and not cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "poly"
+    ]
+    assert all(poly_results)
+    assert len(poly_results) == 2
+
+    rbf_results = [
+        (
+            not cv_results["param_C"].mask[i]
+            and not cv_results["param_gamma"].mask[i]
+            and cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "rbf"
+    ]
+    assert all(rbf_results)
+    assert len(rbf_results) == 4
+
+
+def test_random_search_cv_results():
+    X, y = make_classification(n_samples=50, n_features=4, random_state=42)
+
+    n_search_iter = 30
+
+    params = [
+        {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)},
+        {"kernel": ["poly"], "degree": [2, 3]},
+    ]
+    param_keys = ("param_C", "param_degree", "param_gamma", "param_kernel")
+    score_keys = (
+        "mean_test_score",
+        "mean_train_score",
+        "rank_test_score",
+        "split0_test_score",
+        "split1_test_score",
+        "split2_test_score",
+        "split0_train_score",
+        "split1_train_score",
+        "split2_train_score",
+        "std_test_score",
+        "std_train_score",
+        "mean_fit_time",
+        "std_fit_time",
+        "mean_score_time",
+        "std_score_time",
+    )
+    n_candidates = n_search_iter
+
+    search = RandomizedSearchCV(
+        SVC(),
+        n_iter=n_search_iter,
+        cv=3,
+        param_distributions=params,
+        return_train_score=True,
+    )
+    search.fit(X, y)
+    cv_results = search.cv_results_
+    # Check results structure
+    expected_cv_results_kinds = {
+        "param_C": "f",
+        "param_degree": "i",
+        "param_gamma": "f",
+        "param_kernel": "O",
+    }
+    check_cv_results_array_types(
+        search, param_keys, score_keys, expected_cv_results_kinds
+    )
+    check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
+    assert all(
+        (
+            cv_results["param_C"].mask[i]
+            and cv_results["param_gamma"].mask[i]
+            and not cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "poly"
+    )
+    assert all(
+        (
+            not cv_results["param_C"].mask[i]
+            and not cv_results["param_gamma"].mask[i]
+            and cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "rbf"
+    )
+
+
+@pytest.mark.parametrize(
+    "SearchCV, specialized_params",
+    [
+        (GridSearchCV, {"param_grid": {"C": [1, 10]}}),
+        (RandomizedSearchCV, {"param_distributions": {"C": [1, 10]}, "n_iter": 2}),
+    ],
+)
+def test_search_default_iid(SearchCV, specialized_params):
+    # Check that the mean/std of the cv scores are computed as plain,
+    # unweighted aggregates over the splits (historically this checked the
+    # removed `iid` weighting behavior).
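+    # The two custom folds built below have different sizes (60 and 20
+    # samples), so a fold-size-weighted mean of the split scores would differ
+    # from the plain unweighted mean asserted at the end of this test.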
+ # noise-free simple 2d-data + X, y = make_blobs( + centers=[[0, 0], [1, 0], [0, 1], [1, 1]], + random_state=0, + cluster_std=0.1, + shuffle=False, + n_samples=80, + ) + # split dataset into two folds that are not iid + # first one contains data of all 4 blobs, second only from two. + mask = np.ones(X.shape[0], dtype=bool) + mask[np.where(y == 1)[0][::2]] = 0 + mask[np.where(y == 2)[0][::2]] = 0 + # this leads to perfect classification on one fold and a score of 1/3 on + # the other + # create "cv" for splits + cv = [[mask, ~mask], [~mask, mask]] + + common_params = {"estimator": SVC(), "cv": cv, "return_train_score": True} + search = SearchCV(**common_params, **specialized_params) + search.fit(X, y) + + test_cv_scores = np.array( + [ + search.cv_results_["split%d_test_score" % s][0] + for s in range(search.n_splits_) + ] + ) + test_mean = search.cv_results_["mean_test_score"][0] + test_std = search.cv_results_["std_test_score"][0] + + train_cv_scores = np.array( + [ + search.cv_results_["split%d_train_score" % s][0] + for s in range(search.n_splits_) + ] + ) + train_mean = search.cv_results_["mean_train_score"][0] + train_std = search.cv_results_["std_train_score"][0] + + assert search.cv_results_["param_C"][0] == 1 + # scores are the same as above + assert_allclose(test_cv_scores, [1, 1.0 / 3.0]) + assert_allclose(train_cv_scores, [1, 1]) + # Unweighted mean/std is used + assert test_mean == pytest.approx(np.mean(test_cv_scores)) + assert test_std == pytest.approx(np.std(test_cv_scores)) + + # For the train scores, we do not take a weighted mean irrespective of + # i.i.d. or not + assert train_mean == pytest.approx(1) + assert train_std == pytest.approx(0) + + +def test_grid_search_cv_results_multimetric(): + X, y = make_classification(n_samples=50, n_features=4, random_state=42) + + n_splits = 3 + params = [ + dict( + kernel=[ + "rbf", + ], + C=[1, 10], + gamma=[0.1, 1], + ), + dict( + kernel=[ + "poly", + ], + degree=[1, 2], + ), + ] + + grid_searches = [] + for scoring in ( + {"accuracy": make_scorer(accuracy_score), "recall": make_scorer(recall_score)}, + "accuracy", + "recall", + ): + grid_search = GridSearchCV( + SVC(), cv=n_splits, param_grid=params, scoring=scoring, refit=False + ) + grid_search.fit(X, y) + grid_searches.append(grid_search) + + compare_cv_results_multimetric_with_single(*grid_searches) + + +def test_random_search_cv_results_multimetric(): + X, y = make_classification(n_samples=50, n_features=4, random_state=42) + + n_splits = 3 + n_search_iter = 30 + + # Scipy 0.12's stats dists do not accept seed, hence we use param grid + params = dict(C=np.logspace(-4, 1, 3), gamma=np.logspace(-5, 0, 3, base=0.1)) + for refit in (True, False): + random_searches = [] + for scoring in (("accuracy", "recall"), "accuracy", "recall"): + # If True, for multi-metric pass refit='accuracy' + if refit: + probability = True + refit = "accuracy" if isinstance(scoring, tuple) else refit + else: + probability = False + clf = SVC(probability=probability, random_state=42) + random_search = RandomizedSearchCV( + clf, + n_iter=n_search_iter, + cv=n_splits, + param_distributions=params, + scoring=scoring, + refit=refit, + random_state=0, + ) + random_search.fit(X, y) + random_searches.append(random_search) + + compare_cv_results_multimetric_with_single(*random_searches) + compare_refit_methods_when_refit_with_acc( + random_searches[0], random_searches[1], refit + ) + + +def compare_cv_results_multimetric_with_single(search_multi, search_acc, search_rec): + """Compare multi-metric cv_results 
with the ensemble of multiple + single metric cv_results from single metric grid/random search""" + + assert search_multi.multimetric_ + assert_array_equal(sorted(search_multi.scorer_), ("accuracy", "recall")) + + cv_results_multi = search_multi.cv_results_ + cv_results_acc_rec = { + re.sub("_score$", "_accuracy", k): v for k, v in search_acc.cv_results_.items() + } + cv_results_acc_rec.update( + {re.sub("_score$", "_recall", k): v for k, v in search_rec.cv_results_.items()} + ) + + # Check if score and timing are reasonable, also checks if the keys + # are present + assert all( + ( + np.all(cv_results_multi[k] <= 1) + for k in ( + "mean_score_time", + "std_score_time", + "mean_fit_time", + "std_fit_time", + ) + ) + ) + + # Compare the keys, other than time keys, among multi-metric and + # single metric grid search results. np.testing.assert_equal performs a + # deep nested comparison of the two cv_results dicts + np.testing.assert_equal( + {k: v for k, v in cv_results_multi.items() if not k.endswith("_time")}, + {k: v for k, v in cv_results_acc_rec.items() if not k.endswith("_time")}, + ) + + +def compare_refit_methods_when_refit_with_acc(search_multi, search_acc, refit): + """Compare refit multi-metric search methods with single metric methods""" + assert search_acc.refit == refit + if refit: + assert search_multi.refit == "accuracy" + else: + assert not search_multi.refit + return # search cannot predict/score without refit + + X, y = make_blobs(n_samples=100, n_features=4, random_state=42) + for method in ("predict", "predict_proba", "predict_log_proba"): + assert_almost_equal( + getattr(search_multi, method)(X), getattr(search_acc, method)(X) + ) + assert_almost_equal(search_multi.score(X, y), search_acc.score(X, y)) + for key in ("best_index_", "best_score_", "best_params_"): + assert getattr(search_multi, key) == getattr(search_acc, key) + + +@pytest.mark.parametrize( + "search_cv", + [ + RandomizedSearchCV( + estimator=DecisionTreeClassifier(), + param_distributions={"max_depth": [5, 10]}, + ), + GridSearchCV( + estimator=DecisionTreeClassifier(), param_grid={"max_depth": [5, 10]} + ), + ], +) +def test_search_cv_score_samples_error(search_cv): + X, y = make_blobs(n_samples=100, n_features=4, random_state=42) + search_cv.fit(X, y) + + # Make sure to error out when underlying estimator does not implement + # the method `score_samples` + outer_msg = f"'{search_cv.__class__.__name__}' has no attribute 'score_samples'" + inner_msg = "'DecisionTreeClassifier' object has no attribute 'score_samples'" + + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + search_cv.score_samples(X) + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg == str(exec_info.value.__cause__) + + +@pytest.mark.parametrize( + "search_cv", + [ + RandomizedSearchCV( + estimator=LocalOutlierFactor(novelty=True), + param_distributions={"n_neighbors": [5, 10]}, + scoring="precision", + ), + GridSearchCV( + estimator=LocalOutlierFactor(novelty=True), + param_grid={"n_neighbors": [5, 10]}, + scoring="precision", + ), + ], +) +def test_search_cv_score_samples_method(search_cv): + # Set parameters + rng = np.random.RandomState(42) + n_samples = 300 + outliers_fraction = 0.15 + n_outliers = int(outliers_fraction * n_samples) + n_inliers = n_samples - n_outliers + + # Create dataset + X = make_blobs( + n_samples=n_inliers, + n_features=2, + centers=[[0, 0], [0, 0]], + cluster_std=0.5, + random_state=0, + )[0] + # Add some noisy points + X = np.concatenate([X, rng.uniform(low=-6, 
high=6, size=(n_outliers, 2))], axis=0) + + # Define labels to be able to score the estimator with `search_cv` + y_true = np.array([1] * n_samples) + y_true[-n_outliers:] = -1 + + # Fit on data + search_cv.fit(X, y_true) + + # Verify that the stand alone estimator yields the same results + # as the ones obtained with *SearchCV + assert_allclose( + search_cv.score_samples(X), search_cv.best_estimator_.score_samples(X) + ) + + +def test_search_cv_results_rank_tie_breaking(): + X, y = make_blobs(n_samples=50, random_state=42) + + # The two C values are close enough to give similar models + # which would result in a tie of their mean cv-scores + param_grid = {"C": [1, 1.001, 0.001]} + + grid_search = GridSearchCV(SVC(), param_grid=param_grid, return_train_score=True) + random_search = RandomizedSearchCV( + SVC(), n_iter=3, param_distributions=param_grid, return_train_score=True + ) + + for search in (grid_search, random_search): + search.fit(X, y) + cv_results = search.cv_results_ + # Check tie breaking strategy - + # Check that there is a tie in the mean scores between + # candidates 1 and 2 alone + assert_almost_equal( + cv_results["mean_test_score"][0], cv_results["mean_test_score"][1] + ) + assert_almost_equal( + cv_results["mean_train_score"][0], cv_results["mean_train_score"][1] + ) + assert not np.allclose( + cv_results["mean_test_score"][1], cv_results["mean_test_score"][2] + ) + assert not np.allclose( + cv_results["mean_train_score"][1], cv_results["mean_train_score"][2] + ) + # 'min' rank should be assigned to the tied candidates + assert_almost_equal(search.cv_results_["rank_test_score"], [1, 1, 3]) + + +def test_search_cv_results_none_param(): + X, y = [[1], [2], [3], [4], [5]], [0, 0, 0, 0, 1] + estimators = (DecisionTreeRegressor(), DecisionTreeClassifier()) + est_parameters = {"random_state": [0, None]} + cv = KFold() + + for est in estimators: + grid_search = GridSearchCV( + est, + est_parameters, + cv=cv, + ).fit(X, y) + assert_array_equal(grid_search.cv_results_["param_random_state"], [0, None]) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.FitFailedWarning") +def test_search_cv_timing(): + svc = LinearSVC(random_state=0) + + X = [ + [ + 1, + ], + [ + 2, + ], + [ + 3, + ], + [ + 4, + ], + ] + y = [0, 1, 1, 0] + + gs = GridSearchCV(svc, {"C": [0, 1]}, cv=2, error_score=0) + rs = RandomizedSearchCV(svc, {"C": [0, 1]}, cv=2, error_score=0, n_iter=2) + + for search in (gs, rs): + search.fit(X, y) + for key in ["mean_fit_time", "std_fit_time"]: + # NOTE The precision of time.time in windows is not high + # enough for the fit/score times to be non-zero for trivial X and y + assert np.all(search.cv_results_[key] >= 0) + assert np.all(search.cv_results_[key] < 1) + + for key in ["mean_score_time", "std_score_time"]: + assert search.cv_results_[key][1] >= 0 + assert search.cv_results_[key][0] == 0.0 + assert np.all(search.cv_results_[key] < 1) + + assert hasattr(search, "refit_time_") + assert isinstance(search.refit_time_, float) + assert search.refit_time_ >= 0 + + +def test_grid_search_correct_score_results(): + # test that correct scores are used + n_splits = 3 + clf = LinearSVC(random_state=0) + X, y = make_blobs(random_state=0, centers=2) + Cs = [0.1, 1, 10] + for score in ["f1", "roc_auc"]: + grid_search = GridSearchCV(clf, {"C": Cs}, scoring=score, cv=n_splits) + cv_results = grid_search.fit(X, y).cv_results_ + + # Test scorer names + result_keys = list(cv_results.keys()) + expected_keys = ("mean_test_score", "rank_test_score") + tuple( + "split%d_test_score" % 
cv_i for cv_i in range(n_splits) + ) + assert all(np.isin(expected_keys, result_keys)) + + cv = StratifiedKFold(n_splits=n_splits) + n_splits = grid_search.n_splits_ + for candidate_i, C in enumerate(Cs): + clf.set_params(C=C) + cv_scores = np.array( + [ + grid_search.cv_results_["split%d_test_score" % s][candidate_i] + for s in range(n_splits) + ] + ) + for i, (train, test) in enumerate(cv.split(X, y)): + clf.fit(X[train], y[train]) + if score == "f1": + correct_score = f1_score(y[test], clf.predict(X[test])) + elif score == "roc_auc": + dec = clf.decision_function(X[test]) + correct_score = roc_auc_score(y[test], dec) + assert_almost_equal(correct_score, cv_scores[i]) + + +def test_pickle(): + # Test that a fit search can be pickled + clf = MockClassifier() + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, refit=True, cv=2) + grid_search.fit(X, y) + grid_search_pickled = pickle.loads(pickle.dumps(grid_search)) + assert_array_almost_equal(grid_search.predict(X), grid_search_pickled.predict(X)) + + random_search = RandomizedSearchCV( + clf, {"foo_param": [1, 2, 3]}, refit=True, n_iter=3, cv=2 + ) + random_search.fit(X, y) + random_search_pickled = pickle.loads(pickle.dumps(random_search)) + assert_array_almost_equal( + random_search.predict(X), random_search_pickled.predict(X) + ) + + +def test_grid_search_with_multioutput_data(): + # Test search with multi-output estimator + + X, y = make_multilabel_classification(return_indicator=True, random_state=0) + + est_parameters = {"max_depth": [1, 2, 3, 4]} + cv = KFold() + + estimators = [ + DecisionTreeRegressor(random_state=0), + DecisionTreeClassifier(random_state=0), + ] + + # Test with grid search cv + for est in estimators: + grid_search = GridSearchCV(est, est_parameters, cv=cv) + grid_search.fit(X, y) + res_params = grid_search.cv_results_["params"] + for cand_i in range(len(res_params)): + est.set_params(**res_params[cand_i]) + + for i, (train, test) in enumerate(cv.split(X, y)): + est.fit(X[train], y[train]) + correct_score = est.score(X[test], y[test]) + assert_almost_equal( + correct_score, + grid_search.cv_results_["split%d_test_score" % i][cand_i], + ) + + # Test with a randomized search + for est in estimators: + random_search = RandomizedSearchCV(est, est_parameters, cv=cv, n_iter=3) + random_search.fit(X, y) + res_params = random_search.cv_results_["params"] + for cand_i in range(len(res_params)): + est.set_params(**res_params[cand_i]) + + for i, (train, test) in enumerate(cv.split(X, y)): + est.fit(X[train], y[train]) + correct_score = est.score(X[test], y[test]) + assert_almost_equal( + correct_score, + random_search.cv_results_["split%d_test_score" % i][cand_i], + ) + + +def test_predict_proba_disabled(): + # Test predict_proba when disabled on estimator. 
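+    # SVC exposes `predict_proba` only when constructed with
+    # `probability=True`; the search delegates the attribute lookup to the
+    # underlying estimator, so the fitted search lacks the method here too.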
+ X = np.arange(20).reshape(5, -1) + y = [0, 0, 1, 1, 1] + clf = SVC(probability=False) + gs = GridSearchCV(clf, {}, cv=2).fit(X, y) + assert not hasattr(gs, "predict_proba") + + +def test_grid_search_allows_nans(): + # Test GridSearchCV with SimpleImputer + X = np.arange(20, dtype=np.float64).reshape(5, -1) + X[2, :] = np.nan + y = [0, 0, 1, 1, 1] + p = Pipeline( + [ + ("imputer", SimpleImputer(strategy="mean", missing_values=np.nan)), + ("classifier", MockClassifier()), + ] + ) + GridSearchCV(p, {"classifier__foo_param": [1, 2, 3]}, cv=2).fit(X, y) + + +class FailingClassifier(BaseEstimator): + """Classifier that raises a ValueError on fit()""" + + FAILING_PARAMETER = 2 + + def __init__(self, parameter=None): + self.parameter = parameter + + def fit(self, X, y=None): + if self.parameter == FailingClassifier.FAILING_PARAMETER: + raise ValueError("Failing classifier failed as required") + + def predict(self, X): + return np.zeros(X.shape[0]) + + def score(self, X=None, Y=None): + return 0.0 + + +def test_grid_search_failing_classifier(): + # GridSearchCV with on_error != 'raise' + # Ensures that a warning is raised and score reset where appropriate. + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + # refit=False because we only want to check that errors caused by fits + # to individual folds will be caught and warnings raised instead. If + # refit was done, then an exception would be raised on refit and not + # caught by grid_search (expected behavior), and this would cause an + # error in this test. + gs = GridSearchCV( + clf, + [{"parameter": [0, 1, 2]}], + scoring="accuracy", + refit=False, + error_score=0.0, + ) + + warning_message = re.compile( + "5 fits failed.+total of 15.+The score on these" + r" train-test partitions for these parameters will be set to 0\.0.+" + "5 fits failed with the following error.+ValueError.+Failing classifier failed" + " as required", + flags=re.DOTALL, + ) + with pytest.warns(FitFailedWarning, match=warning_message): + gs.fit(X, y) + n_candidates = len(gs.cv_results_["params"]) + + # Ensure that grid scores were set to zero as required for those fits + # that are expected to fail. 
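+    # With a numeric error_score, the ValueError raised by fit() is caught,
+    # the affected split scores are recorded as error_score (0.0 here), and
+    # the failures are summarized in the FitFailedWarning matched above.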
+ def get_cand_scores(i): + return np.array( + [gs.cv_results_["split%d_test_score" % s][i] for s in range(gs.n_splits_)] + ) + + assert all( + ( + np.all(get_cand_scores(cand_i) == 0.0) + for cand_i in range(n_candidates) + if gs.cv_results_["param_parameter"][cand_i] + == FailingClassifier.FAILING_PARAMETER + ) + ) + + gs = GridSearchCV( + clf, + [{"parameter": [0, 1, 2]}], + scoring="accuracy", + refit=False, + error_score=float("nan"), + ) + warning_message = re.compile( + "5 fits failed.+total of 15.+The score on these" + r" train-test partitions for these parameters will be set to nan.+" + "5 fits failed with the following error.+ValueError.+Failing classifier failed" + " as required", + flags=re.DOTALL, + ) + with pytest.warns(FitFailedWarning, match=warning_message): + gs.fit(X, y) + n_candidates = len(gs.cv_results_["params"]) + assert all( + np.all(np.isnan(get_cand_scores(cand_i))) + for cand_i in range(n_candidates) + if gs.cv_results_["param_parameter"][cand_i] + == FailingClassifier.FAILING_PARAMETER + ) + + ranks = gs.cv_results_["rank_test_score"] + + # Check that succeeded estimators have lower ranks + assert ranks[0] <= 2 and ranks[1] <= 2 + # Check that failed estimator has the highest rank + assert ranks[clf.FAILING_PARAMETER] == 3 + assert gs.best_index_ != clf.FAILING_PARAMETER + + +def test_grid_search_classifier_all_fits_fail(): + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + gs = GridSearchCV( + clf, + [{"parameter": [FailingClassifier.FAILING_PARAMETER] * 3}], + error_score=0.0, + ) + + warning_message = re.compile( + ( + "All the 15 fits failed.+15 fits failed with the following" + " error.+ValueError.+Failing classifier failed as required" + ), + flags=re.DOTALL, + ) + with pytest.raises(ValueError, match=warning_message): + gs.fit(X, y) + + +def test_grid_search_failing_classifier_raise(): + # GridSearchCV with on_error == 'raise' raises the error + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + # refit=False because we want to test the behaviour of the grid search part + gs = GridSearchCV( + clf, + [{"parameter": [0, 1, 2]}], + scoring="accuracy", + refit=False, + error_score="raise", + ) + + # FailingClassifier issues a ValueError so this is what we look for. + with pytest.raises(ValueError): + gs.fit(X, y) + + +def test_parameters_sampler_replacement(): + # raise warning if n_iter is bigger than total parameter space + params = [ + {"first": [0, 1], "second": ["a", "b", "c"]}, + {"third": ["two", "values"]}, + ] + sampler = ParameterSampler(params, n_iter=9) + n_iter = 9 + grid_size = 8 + expected_warning = ( + "The total space of parameters %d is smaller " + "than n_iter=%d. Running %d iterations. For " + "exhaustive searches, use GridSearchCV." 
% (grid_size, n_iter, grid_size)
+    )
+    with pytest.warns(UserWarning, match=expected_warning):
+        list(sampler)
+
+    # degenerates to GridSearchCV if n_iter is the same as grid_size
+    sampler = ParameterSampler(params, n_iter=8)
+    samples = list(sampler)
+    assert len(samples) == 8
+    for values in ParameterGrid(params):
+        assert values in samples
+    assert len(ParameterSampler(params, n_iter=1000)) == 8
+
+    # test sampling without replacement in a large grid
+    params = {"a": range(10), "b": range(10), "c": range(10)}
+    sampler = ParameterSampler(params, n_iter=99, random_state=42)
+    samples = list(sampler)
+    assert len(samples) == 99
+    hashable_samples = ["a%db%dc%d" % (p["a"], p["b"], p["c"]) for p in samples]
+    assert len(set(hashable_samples)) == 99
+
+    # doesn't go into infinite loops
+    params_distribution = {"first": bernoulli(0.5), "second": ["a", "b", "c"]}
+    sampler = ParameterSampler(params_distribution, n_iter=7)
+    samples = list(sampler)
+    assert len(samples) == 7
+
+
+def test_stochastic_gradient_loss_param():
+    # Make sure predict_proba works when the loss is specified
+    # as one of the parameters in the param_grid.
+    param_grid = {
+        "loss": ["log_loss"],
+    }
+    X = np.arange(24).reshape(6, -1)
+    y = [0, 0, 0, 1, 1, 1]
+    clf = GridSearchCV(
+        estimator=SGDClassifier(loss="hinge"), param_grid=param_grid, cv=3
+    )
+
+    # When the estimator is not fitted, `predict_proba` is not available as the
+    # loss is 'hinge'.
+    assert not hasattr(clf, "predict_proba")
+    clf.fit(X, y)
+    clf.predict_proba(X)
+    clf.predict_log_proba(X)
+
+    # Make sure `predict_proba` is not available when setting loss=['hinge']
+    # in param_grid
+    param_grid = {
+        "loss": ["hinge"],
+    }
+    clf = GridSearchCV(
+        estimator=SGDClassifier(loss="hinge"), param_grid=param_grid, cv=3
+    )
+    assert not hasattr(clf, "predict_proba")
+    clf.fit(X, y)
+    assert not hasattr(clf, "predict_proba")
+
+
+def test_search_train_scores_set_to_false():
+    X = np.arange(6).reshape(6, -1)
+    y = [0, 0, 0, 1, 1, 1]
+    clf = LinearSVC(random_state=0)
+
+    gs = GridSearchCV(clf, param_grid={"C": [0.1, 0.2]}, cv=3)
+    gs.fit(X, y)
+
+    # With the default return_train_score=False, no train-score keys should
+    # appear in cv_results_.
+    assert not any(key.endswith("_train_score") for key in gs.cv_results_)
+
+
+def test_grid_search_cv_splits_consistency():
+    # Check that a one-time iterable is accepted as a cv parameter.
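+    # `OneTimeSplitter` comes from the test helpers; roughly, it wraps a KFold
+    # split in a one-shot iterator (a sketch of the idea, not the exact
+    # implementation):
+    #
+    #     class OneTimeSplitter:
+    #         def __init__(self, n_splits, n_samples):
+    #             self.n_splits = n_splits
+    #             self._iter = iter(KFold(n_splits).split(np.ones(n_samples)))
+    #
+    #         def split(self, X=None, y=None, groups=None):
+    #             yield from self._iter  # exhausted after one full pass
+    #
+    #         def get_n_splits(self, X=None, y=None, groups=None):
+    #             return self.n_splits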
+ n_samples = 100 + n_splits = 5 + X, y = make_classification(n_samples=n_samples, random_state=0) + + gs = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.2, 0.3]}, + cv=OneTimeSplitter(n_splits=n_splits, n_samples=n_samples), + return_train_score=True, + ) + gs.fit(X, y) + + gs2 = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.2, 0.3]}, + cv=KFold(n_splits=n_splits), + return_train_score=True, + ) + gs2.fit(X, y) + + # Give generator as a cv parameter + assert isinstance( + KFold(n_splits=n_splits, shuffle=True, random_state=0).split(X, y), + GeneratorType, + ) + gs3 = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.2, 0.3]}, + cv=KFold(n_splits=n_splits, shuffle=True, random_state=0).split(X, y), + return_train_score=True, + ) + gs3.fit(X, y) + + gs4 = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.2, 0.3]}, + cv=KFold(n_splits=n_splits, shuffle=True, random_state=0), + return_train_score=True, + ) + gs4.fit(X, y) + + def _pop_time_keys(cv_results): + for key in ( + "mean_fit_time", + "std_fit_time", + "mean_score_time", + "std_score_time", + ): + cv_results.pop(key) + return cv_results + + # Check if generators are supported as cv and + # that the splits are consistent + np.testing.assert_equal( + _pop_time_keys(gs3.cv_results_), _pop_time_keys(gs4.cv_results_) + ) + + # OneTimeSplitter is a non-re-entrant cv where split can be called only + # once if ``cv.split`` is called once per param setting in GridSearchCV.fit + # the 2nd and 3rd parameter will not be evaluated as no train/test indices + # will be generated for the 2nd and subsequent cv.split calls. + # This is a check to make sure cv.split is not called once per param + # setting. + np.testing.assert_equal( + {k: v for k, v in gs.cv_results_.items() if not k.endswith("_time")}, + {k: v for k, v in gs2.cv_results_.items() if not k.endswith("_time")}, + ) + + # Check consistency of folds across the parameters + gs = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.1, 0.2, 0.2]}, + cv=KFold(n_splits=n_splits, shuffle=True), + return_train_score=True, + ) + gs.fit(X, y) + + # As the first two param settings (C=0.1) and the next two param + # settings (C=0.2) are same, the test and train scores must also be + # same as long as the same train/test indices are generated for all + # the cv splits, for both param setting + for score_type in ("train", "test"): + per_param_scores = {} + for param_i in range(4): + per_param_scores[param_i] = [ + gs.cv_results_["split%d_%s_score" % (s, score_type)][param_i] + for s in range(5) + ] + + assert_array_almost_equal(per_param_scores[0], per_param_scores[1]) + assert_array_almost_equal(per_param_scores[2], per_param_scores[3]) + + +def test_transform_inverse_transform_round_trip(): + clf = MockClassifier() + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=2, verbose=3) + + grid_search.fit(X, y) + X_round_trip = grid_search.inverse_transform(grid_search.transform(X)) + assert_array_equal(X, X_round_trip) + + +def test_custom_run_search(): + def check_results(results, gscv): + exp_results = gscv.cv_results_ + assert sorted(results.keys()) == sorted(exp_results) + for k in results: + if not k.endswith("_time"): + # XXX: results['params'] is a list :| + results[k] = np.asanyarray(results[k]) + if results[k].dtype.kind == "O": + assert_array_equal( + exp_results[k], results[k], err_msg="Checking " + k + ) + else: + assert_allclose(exp_results[k], results[k], err_msg="Checking " + 
k) + + def fit_grid(param_grid): + return GridSearchCV(clf, param_grid, return_train_score=True).fit(X, y) + + class CustomSearchCV(BaseSearchCV): + def __init__(self, estimator, **kwargs): + super().__init__(estimator, **kwargs) + + def _run_search(self, evaluate): + results = evaluate([{"max_depth": 1}, {"max_depth": 2}]) + check_results(results, fit_grid({"max_depth": [1, 2]})) + results = evaluate([{"min_samples_split": 5}, {"min_samples_split": 10}]) + check_results( + results, + fit_grid([{"max_depth": [1, 2]}, {"min_samples_split": [5, 10]}]), + ) + + # Using regressor to make sure each score differs + clf = DecisionTreeRegressor(random_state=0) + X, y = make_classification(n_samples=100, n_informative=4, random_state=0) + mycv = CustomSearchCV(clf, return_train_score=True).fit(X, y) + gscv = fit_grid([{"max_depth": [1, 2]}, {"min_samples_split": [5, 10]}]) + + results = mycv.cv_results_ + check_results(results, gscv) + for attr in dir(gscv): + if ( + attr[0].islower() + and attr[-1:] == "_" + and attr + not in { + "cv_results_", + "best_estimator_", + "refit_time_", + "classes_", + "scorer_", + } + ): + assert getattr(gscv, attr) == getattr(mycv, attr), ( + "Attribute %s not equal" % attr + ) + + +def test__custom_fit_no_run_search(): + class NoRunSearchSearchCV(BaseSearchCV): + def __init__(self, estimator, **kwargs): + super().__init__(estimator, **kwargs) + + def fit(self, X, y=None, groups=None, **fit_params): + return self + + # this should not raise any exceptions + NoRunSearchSearchCV(SVC()).fit(X, y) + + class BadSearchCV(BaseSearchCV): + def __init__(self, estimator, **kwargs): + super().__init__(estimator, **kwargs) + + with pytest.raises(NotImplementedError, match="_run_search not implemented."): + # this should raise a NotImplementedError + BadSearchCV(SVC()).fit(X, y) + + +def test_empty_cv_iterator_error(): + # Use global X, y + + # create cv + cv = KFold(n_splits=3).split(X) + + # pop all of it, this should cause the expected ValueError + [u for u in cv] + # cv is empty now + + train_size = 100 + ridge = RandomizedSearchCV(Ridge(), {"alpha": [1e-3, 1e-2, 1e-1]}, cv=cv, n_jobs=4) + + # assert that this raises an error + with pytest.raises( + ValueError, + match=( + "No fits were performed. " + "Was the CV iterator empty\\? " + "Were there no candidates\\?" + ), + ): + ridge.fit(X[:train_size], y[:train_size]) + + +def test_random_search_bad_cv(): + # Use global X, y + + class BrokenKFold(KFold): + def get_n_splits(self, *args, **kw): + return 1 + + # create bad cv + cv = BrokenKFold(n_splits=3) + + train_size = 100 + ridge = RandomizedSearchCV(Ridge(), {"alpha": [1e-3, 1e-2, 1e-1]}, cv=cv, n_jobs=4) + + # assert that this raises an error + with pytest.raises( + ValueError, + match=( + "cv.split and cv.get_n_splits returned " + "inconsistent results. 
Expected \\d+ " + "splits, got \\d+" + ), + ): + ridge.fit(X[:train_size], y[:train_size]) + + +@pytest.mark.parametrize("return_train_score", [False, True]) +@pytest.mark.parametrize( + "SearchCV, specialized_params", + [ + (GridSearchCV, {"param_grid": {"max_depth": [2, 3, 5, 8]}}), + ( + RandomizedSearchCV, + {"param_distributions": {"max_depth": [2, 3, 5, 8]}, "n_iter": 4}, + ), + ], +) +def test_searchcv_raise_warning_with_non_finite_score( + SearchCV, specialized_params, return_train_score +): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/10529 + # Check that we raise a UserWarning when a non-finite score is + # computed in the SearchCV + X, y = make_classification(n_classes=2, random_state=0) + + class FailingScorer: + """Scorer that will fail for some split but not all.""" + + def __init__(self): + self.n_counts = 0 + + def __call__(self, estimator, X, y): + self.n_counts += 1 + if self.n_counts % 5 == 0: + return np.nan + return 1 + + grid = SearchCV( + DecisionTreeClassifier(), + scoring=FailingScorer(), + cv=3, + return_train_score=return_train_score, + **specialized_params, + ) + + with pytest.warns(UserWarning) as warn_msg: + grid.fit(X, y) + + set_with_warning = ["test", "train"] if return_train_score else ["test"] + assert len(warn_msg) == len(set_with_warning) + for msg, dataset in zip(warn_msg, set_with_warning): + assert f"One or more of the {dataset} scores are non-finite" in str(msg.message) + + # all non-finite scores should be equally ranked last + last_rank = grid.cv_results_["rank_test_score"].max() + non_finite_mask = np.isnan(grid.cv_results_["mean_test_score"]) + assert_array_equal(grid.cv_results_["rank_test_score"][non_finite_mask], last_rank) + # all finite scores should be better ranked than the non-finite scores + assert np.all(grid.cv_results_["rank_test_score"][~non_finite_mask] < last_rank) + + +def test_callable_multimetric_confusion_matrix(): + # Test callable with many metrics inserts the correct names and metrics + # into the search cv object + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + cm = confusion_matrix(y, y_pred) + return {"tn": cm[0, 0], "fp": cm[0, 1], "fn": cm[1, 0], "tp": cm[1, 1]} + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + est = LinearSVC(random_state=42) + search = GridSearchCV(est, {"C": [0.1, 1]}, scoring=custom_scorer, refit="fp") + + search.fit(X, y) + + score_names = ["tn", "fp", "fn", "tp"] + for name in score_names: + assert "mean_test_{}".format(name) in search.cv_results_ + + y_pred = search.predict(X) + cm = confusion_matrix(y, y_pred) + assert search.score(X, y) == pytest.approx(cm[0, 1]) + + +def test_callable_multimetric_same_as_list_of_strings(): + # Test callable multimetric is the same as a list of strings + def custom_scorer(est, X, y): + y_pred = est.predict(X) + return { + "recall": recall_score(y, y_pred), + "accuracy": accuracy_score(y, y_pred), + } + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + est = LinearSVC(random_state=42) + search_callable = GridSearchCV( + est, {"C": [0.1, 1]}, scoring=custom_scorer, refit="recall" + ) + search_str = GridSearchCV( + est, {"C": [0.1, 1]}, scoring=["recall", "accuracy"], refit="recall" + ) + + search_callable.fit(X, y) + search_str.fit(X, y) + + assert search_callable.best_score_ == pytest.approx(search_str.best_score_) + assert search_callable.best_index_ == search_str.best_index_ + assert search_callable.score(X, y) == pytest.approx(search_str.score(X, y)) + + 
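+# A callable scorer receives ``(estimator, X, y)`` and returns either a single
+# float or a dict mapping score names to floats; with a dict, each key ``k``
+# surfaces in ``cv_results_`` as ``mean_test_k``, ``rank_test_k``, and so on.
+# A minimal illustrative helper (not used by any test; the name is ours):
+def _example_multimetric_scorer(estimator, X, y):
+    """Return two named scores for use as a multi-metric callable."""
+    y_pred = estimator.predict(X)
+    return {
+        "recall": recall_score(y, y_pred),
+        "accuracy": accuracy_score(y, y_pred),
+    }
+
+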
+def test_callable_single_metric_same_as_single_string(): + # Tests callable scorer is the same as scoring with a single string + def custom_scorer(est, X, y): + y_pred = est.predict(X) + return recall_score(y, y_pred) + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + est = LinearSVC(random_state=42) + search_callable = GridSearchCV( + est, {"C": [0.1, 1]}, scoring=custom_scorer, refit=True + ) + search_str = GridSearchCV(est, {"C": [0.1, 1]}, scoring="recall", refit="recall") + search_list_str = GridSearchCV( + est, {"C": [0.1, 1]}, scoring=["recall"], refit="recall" + ) + search_callable.fit(X, y) + search_str.fit(X, y) + search_list_str.fit(X, y) + + assert search_callable.best_score_ == pytest.approx(search_str.best_score_) + assert search_callable.best_index_ == search_str.best_index_ + assert search_callable.score(X, y) == pytest.approx(search_str.score(X, y)) + + assert search_list_str.best_score_ == pytest.approx(search_str.best_score_) + assert search_list_str.best_index_ == search_str.best_index_ + assert search_list_str.score(X, y) == pytest.approx(search_str.score(X, y)) + + +def test_callable_multimetric_error_on_invalid_key(): + # Raises when the callable scorer does not return a dict with `refit` key. + def bad_scorer(est, X, y): + return {"bad_name": 1} + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.1, 1]}, + scoring=bad_scorer, + refit="good_name", + ) + + msg = ( + "For multi-metric scoring, the parameter refit must be set to a " + "scorer key or a callable to refit" + ) + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) + + +def test_callable_multimetric_error_failing_clf(): + # Warns when there is an estimator the fails to fit with a float + # error_score + def custom_scorer(est, X, y): + return {"acc": 1} + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + gs = GridSearchCV( + clf, + [{"parameter": [0, 1, 2]}], + scoring=custom_scorer, + refit=False, + error_score=0.1, + ) + + warning_message = re.compile( + "5 fits failed.+total of 15.+The score on these" + r" train-test partitions for these parameters will be set to 0\.1", + flags=re.DOTALL, + ) + with pytest.warns(FitFailedWarning, match=warning_message): + gs.fit(X, y) + + assert_allclose(gs.cv_results_["mean_test_acc"], [1, 1, 0.1]) + + +def test_callable_multimetric_clf_all_fits_fail(): + # Warns and raises when all estimator fails to fit. 
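+    # Unlike the partial-failure tests above, every candidate fails to fit
+    # here, so the search raises a ValueError rather than merely warning:
+    # there is no successfully fitted candidate left to rank or refit.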
+ def custom_scorer(est, X, y): + return {"acc": 1} + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + gs = GridSearchCV( + clf, + [{"parameter": [FailingClassifier.FAILING_PARAMETER] * 3}], + scoring=custom_scorer, + refit=False, + error_score=0.1, + ) + + individual_fit_error_message = "ValueError: Failing classifier failed as required" + error_message = re.compile( + ( + "All the 15 fits failed.+your model is misconfigured.+" + f"{individual_fit_error_message}" + ), + flags=re.DOTALL, + ) + + with pytest.raises(ValueError, match=error_message): + gs.fit(X, y) + + +def test_n_features_in(): + # make sure grid search and random search delegate n_features_in to the + # best estimator + n_features = 4 + X, y = make_classification(n_features=n_features) + gbdt = HistGradientBoostingClassifier() + param_grid = {"max_iter": [3, 4]} + gs = GridSearchCV(gbdt, param_grid) + rs = RandomizedSearchCV(gbdt, param_grid, n_iter=1) + assert not hasattr(gs, "n_features_in_") + assert not hasattr(rs, "n_features_in_") + gs.fit(X, y) + rs.fit(X, y) + assert gs.n_features_in_ == n_features + assert rs.n_features_in_ == n_features + + +@pytest.mark.parametrize("pairwise", [True, False]) +def test_search_cv_pairwise_property_delegated_to_base_estimator(pairwise): + """ + Test implementation of BaseSearchCV has the pairwise tag + which matches the pairwise tag of its estimator. + This test make sure pairwise tag is delegated to the base estimator. + + Non-regression test for issue #13920. + """ + + class TestEstimator(BaseEstimator): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = pairwise + return tags + + est = TestEstimator() + attr_message = "BaseSearchCV pairwise tag must match estimator" + cv = GridSearchCV(est, {"n_neighbors": [10]}) + assert pairwise == cv.__sklearn_tags__().input_tags.pairwise, attr_message + + +def test_search_cv__pairwise_property_delegated_to_base_estimator(): + """ + Test implementation of BaseSearchCV has the pairwise property + which matches the pairwise tag of its estimator. + This test make sure pairwise tag is delegated to the base estimator. + + Non-regression test for issue #13920. + """ + + class EstimatorPairwise(BaseEstimator): + def __init__(self, pairwise=True): + self.pairwise = pairwise + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.pairwise + return tags + + est = EstimatorPairwise() + attr_message = "BaseSearchCV _pairwise property must match estimator" + + for _pairwise_setting in [True, False]: + est.set_params(pairwise=_pairwise_setting) + cv = GridSearchCV(est, {"n_neighbors": [10]}) + assert ( + _pairwise_setting == cv.__sklearn_tags__().input_tags.pairwise + ), attr_message + + +def test_search_cv_pairwise_property_equivalence_of_precomputed(): + """ + Test implementation of BaseSearchCV has the pairwise tag + which matches the pairwise tag of its estimator. + This test ensures the equivalence of 'precomputed'. + + Non-regression test for issue #13920. 
+ """ + n_samples = 50 + n_splits = 2 + X, y = make_classification(n_samples=n_samples, random_state=0) + grid_params = {"n_neighbors": [10]} + + # defaults to euclidean metric (minkowski p = 2) + clf = KNeighborsClassifier() + cv = GridSearchCV(clf, grid_params, cv=n_splits) + cv.fit(X, y) + preds_original = cv.predict(X) + + # precompute euclidean metric to validate pairwise is working + X_precomputed = euclidean_distances(X) + clf = KNeighborsClassifier(metric="precomputed") + cv = GridSearchCV(clf, grid_params, cv=n_splits) + cv.fit(X_precomputed, y) + preds_precomputed = cv.predict(X_precomputed) + + attr_message = "GridSearchCV not identical with precomputed metric" + assert (preds_original == preds_precomputed).all(), attr_message + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [(GridSearchCV, {"a": [0.1, 0.01]}), (RandomizedSearchCV, {"a": uniform(1, 3)})], +) +def test_scalar_fit_param(SearchCV, param_search): + # unofficially sanctioned tolerance for scalar values in fit_params + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15805 + class TestEstimator(ClassifierMixin, BaseEstimator): + def __init__(self, a=None): + self.a = a + + def fit(self, X, y, r=None): + self.r_ = r + + def predict(self, X): + return np.zeros(shape=(len(X))) + + model = SearchCV(TestEstimator(), param_search) + X, y = make_classification(random_state=42) + model.fit(X, y, r=42) + assert model.best_estimator_.r_ == 42 + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, {"alpha": [0.1, 0.01]}), + (RandomizedSearchCV, {"alpha": uniform(0.01, 0.1)}), + ], +) +def test_scalar_fit_param_compat(SearchCV, param_search): + # check support for scalar values in fit_params, for instance in LightGBM + # that do not exactly respect the scikit-learn API contract but that we do + # not want to break without an explicit deprecation cycle and API + # recommendations for implementing early stopping with a user provided + # validation set. non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15805 + X_train, X_valid, y_train, y_valid = train_test_split( + *make_classification(random_state=42), random_state=42 + ) + + class _FitParamClassifier(SGDClassifier): + def fit( + self, + X, + y, + sample_weight=None, + tuple_of_arrays=None, + scalar_param=None, + callable_param=None, + ): + super().fit(X, y, sample_weight=sample_weight) + assert scalar_param > 0 + assert callable(callable_param) + + # The tuple of arrays should be preserved as tuple. + assert isinstance(tuple_of_arrays, tuple) + assert tuple_of_arrays[0].ndim == 2 + assert tuple_of_arrays[1].ndim == 1 + return self + + def _fit_param_callable(): + pass + + model = SearchCV(_FitParamClassifier(), param_search) + + # NOTE: `fit_params` should be data dependent (e.g. `sample_weight`) which + # is not the case for the following parameters. But this abuse is common in + # popular third-party libraries and we should tolerate this behavior for + # now and be careful not to break support for those without following + # proper deprecation cycle. + fit_params = { + "tuple_of_arrays": (X_valid, y_valid), + "callable_param": _fit_param_callable, + "scalar_param": 42, + } + model.fit(X_train, y_train, **fit_params) + + +# FIXME: Replace this test with a full `check_estimator` once we have API only +# checks. 
+@pytest.mark.filterwarnings("ignore:The total space of parameters 4 is") +@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV]) +@pytest.mark.parametrize("Predictor", [MinimalRegressor, MinimalClassifier]) +def test_search_cv_using_minimal_compatible_estimator(SearchCV, Predictor): + # Check that third-party library can run tests without inheriting from + # BaseEstimator. + rng = np.random.RandomState(0) + X, y = rng.randn(25, 2), np.array([0] * 5 + [1] * 20) + + model = Pipeline( + [("transformer", MinimalTransformer()), ("predictor", Predictor())] + ) + + params = { + "transformer__param": [1, 10], + "predictor__parama": [1, 10], + } + search = SearchCV(model, params, error_score="raise") + search.fit(X, y) + + assert search.best_params_.keys() == params.keys() + + y_pred = search.predict(X) + if is_classifier(search): + assert_array_equal(y_pred, 1) + assert search.score(X, y) == pytest.approx(accuracy_score(y, y_pred)) + else: + assert_allclose(y_pred, y.mean()) + assert search.score(X, y) == pytest.approx(r2_score(y, y_pred)) + + +@pytest.mark.parametrize("return_train_score", [True, False]) +def test_search_cv_verbose_3(capsys, return_train_score): + """Check that search cv with verbose>2 shows the score for single + metrics. non-regression test for #19658.""" + X, y = make_classification(n_samples=100, n_classes=2, flip_y=0.2, random_state=0) + clf = LinearSVC(random_state=0) + grid = {"C": [0.1]} + + GridSearchCV( + clf, + grid, + scoring="accuracy", + verbose=3, + cv=3, + return_train_score=return_train_score, + ).fit(X, y) + captured = capsys.readouterr().out + if return_train_score: + match = re.findall(r"score=\(train=[\d\.]+, test=[\d.]+\)", captured) + else: + match = re.findall(r"score=[\d\.]+", captured) + assert len(match) == 3 + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, "param_grid"), + (RandomizedSearchCV, "param_distributions"), + (HalvingGridSearchCV, "param_grid"), + ], +) +def test_search_estimator_param(SearchCV, param_search): + # test that SearchCV object doesn't change the object given in the parameter grid + X, y = make_classification(random_state=42) + + params = {"clf": [LinearSVC()], "clf__C": [0.01]} + orig_C = params["clf"][0].C + + pipe = Pipeline([("trs", MinimalTransformer()), ("clf", None)]) + + param_grid_search = {param_search: params} + gs = SearchCV(pipe, refit=True, cv=2, scoring="accuracy", **param_grid_search).fit( + X, y + ) + + # testing that the original object in params is not changed + assert params["clf"][0].C == orig_C + # testing that the GS is setting the parameter of the step correctly + assert gs.best_estimator_.named_steps["clf"].C == 0.01 + + +def test_search_with_2d_array(): + parameter_grid = { + "vect__ngram_range": ((1, 1), (1, 2)), # unigrams or bigrams + "vect__norm": ("l1", "l2"), + } + pipeline = Pipeline( + [ + ("vect", TfidfVectorizer()), + ("clf", ComplementNB()), + ] + ) + random_search = RandomizedSearchCV( + estimator=pipeline, + param_distributions=parameter_grid, + n_iter=3, + random_state=0, + n_jobs=2, + verbose=1, + cv=3, + ) + data_train = ["one", "two", "three", "four", "five"] + data_target = [0, 0, 1, 0, 1] + random_search.fit(data_train, data_target) + result = random_search.cv_results_["param_vect__ngram_range"] + expected_data = np.empty(3, dtype=object) + expected_data[:] = [(1, 2), (1, 2), (1, 1)] + np.testing.assert_array_equal(result.data, expected_data) + + +def test_search_html_repr(): + """Test different HTML representations for GridSearchCV.""" 
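+    # With display="diagram", `_repr_html_` embeds each estimator's repr in
+    # <pre> tags, which is what the substring assertions below rely on.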
+ X, y = make_classification(random_state=42) + + pipeline = Pipeline([("scale", StandardScaler()), ("clf", DummyClassifier())]) + param_grid = {"clf": [DummyClassifier(), LogisticRegression()]} + + # Unfitted shows the original pipeline + search_cv = GridSearchCV(pipeline, param_grid=param_grid, refit=False) + with config_context(display="diagram"): + repr_html = search_cv._repr_html_() + assert "
<pre>DummyClassifier()</pre>
" in repr_html + + # Fitted with `refit=False` shows the original pipeline + search_cv.fit(X, y) + with config_context(display="diagram"): + repr_html = search_cv._repr_html_() + assert "
<pre>DummyClassifier()</pre>
" in repr_html + + # Fitted with `refit=True` shows the best estimator + search_cv = GridSearchCV(pipeline, param_grid=param_grid, refit=True) + search_cv.fit(X, y) + with config_context(display="diagram"): + repr_html = search_cv._repr_html_() + assert "
<pre>DummyClassifier()</pre>
" not in repr_html + assert "
<pre>LogisticRegression()</pre>
" in repr_html + + +# TODO(1.7): remove this test +@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV]) +def test_inverse_transform_Xt_deprecation(SearchCV): + clf = MockClassifier() + search = SearchCV(clf, {"foo_param": [1, 2, 3]}, cv=2, verbose=3) + + X2 = search.fit(X, y).transform(X) + + with pytest.raises(TypeError, match="Missing required positional argument"): + search.inverse_transform() + + with pytest.raises(TypeError, match="Cannot use both X and Xt. Use X only"): + search.inverse_transform(X=X2, Xt=X2) + + with warnings.catch_warnings(record=True): + warnings.simplefilter("error") + search.inverse_transform(X2) + + with pytest.warns(FutureWarning, match="Xt was renamed X in version 1.5"): + search.inverse_transform(Xt=X2) + + +# Metadata Routing Tests +# ====================== + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, "param_grid"), + (RandomizedSearchCV, "param_distributions"), + ], +) +@config_context(enable_metadata_routing=True) +def test_multi_metric_search_forwards_metadata(SearchCV, param_search): + """Test that *SearchCV forwards metadata correctly when passed multiple metrics.""" + X, y = make_classification(random_state=42) + n_samples = _num_samples(X) + rng = np.random.RandomState(0) + score_weights = rng.rand(n_samples) + score_metadata = rng.rand(n_samples) + + est = LinearSVC() + param_grid_search = {param_search: {"C": [1]}} + + scorer_registry = _Registry() + scorer = ConsumingScorer(registry=scorer_registry).set_score_request( + sample_weight="score_weights", metadata="score_metadata" + ) + scoring = dict(my_scorer=scorer, accuracy="accuracy") + SearchCV(est, refit="accuracy", cv=2, scoring=scoring, **param_grid_search).fit( + X, y, score_weights=score_weights, score_metadata=score_metadata + ) + assert len(scorer_registry) + for _scorer in scorer_registry: + check_recorded_metadata( + obj=_scorer, + method="score", + parent="_score", + split_params=("sample_weight", "metadata"), + sample_weight=score_weights, + metadata=score_metadata, + ) + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, "param_grid"), + (RandomizedSearchCV, "param_distributions"), + (HalvingGridSearchCV, "param_grid"), + ], +) +def test_score_rejects_params_with_no_routing_enabled(SearchCV, param_search): + """*SearchCV should reject **params when metadata routing is not enabled + since this is added only when routing is enabled.""" + X, y = make_classification(random_state=42) + est = LinearSVC() + param_grid_search = {param_search: {"C": [1]}} + + gs = SearchCV(est, cv=2, **param_grid_search).fit(X, y) + + with pytest.raises(ValueError, match="is only supported if"): + gs.score(X, y, metadata=1) + + +# End of Metadata Routing Tests +# ============================= + + +def test_cv_results_dtype_issue_29074(): + """Non-regression test for https://github.com/scikit-learn/scikit-learn/issues/29074""" + + class MetaEstimator(BaseEstimator, ClassifierMixin): + def __init__( + self, + base_clf, + parameter1=None, + parameter2=None, + parameter3=None, + parameter4=None, + ): + self.base_clf = base_clf + self.parameter1 = parameter1 + self.parameter2 = parameter2 + self.parameter3 = parameter3 + self.parameter4 = parameter4 + + def fit(self, X, y=None): + self.base_clf.fit(X, y) + return self + + def score(self, X, y): + return self.base_clf.score(X, y) + + # Values of param_grid are such that np.result_type gives slightly + # different errors, in particular ValueError and TypeError + param_grid = { + 
"parameter1": [None, {"option": "A"}, {"option": "B"}], + "parameter2": [None, [1, 2]], + "parameter3": [{"a": 1}], + "parameter4": ["str1", "str2"], + } + grid_search = GridSearchCV( + estimator=MetaEstimator(LogisticRegression()), + param_grid=param_grid, + cv=3, + ) + + X, y = make_blobs(random_state=0) + grid_search.fit(X, y) + for param in param_grid: + assert grid_search.cv_results_[f"param_{param}"].dtype == object + + +def test_search_with_estimators_issue_29157(): + """Check cv_results_ for estimators with a `dtype` parameter, e.g. OneHotEncoder.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame( + { + "numeric_1": [1, 2, 3, 4, 5], + "object_1": ["a", "a", "a", "a", "a"], + "target": [1.0, 4.1, 2.0, 3.0, 1.0], + } + ) + X = df.drop("target", axis=1) + y = df["target"] + enc = ColumnTransformer( + [("enc", OneHotEncoder(sparse_output=False), ["object_1"])], + remainder="passthrough", + ) + pipe = Pipeline( + [ + ("enc", enc), + ("regressor", LinearRegression()), + ] + ) + grid_params = { + "enc__enc": [ + OneHotEncoder(sparse_output=False), + OrdinalEncoder(), + ] + } + grid_search = GridSearchCV(pipe, grid_params, cv=2) + grid_search.fit(X, y) + assert grid_search.cv_results_["param_enc__enc"].dtype == object + + +def test_cv_results_multi_size_array(): + """Check that GridSearchCV works with params that are arrays of different sizes. + + Non-regression test for #29277. + """ + n_features = 10 + X, y = make_classification(n_features=10) + + spline_reg_pipe = make_pipeline( + SplineTransformer(extrapolation="periodic"), + LogisticRegression(), + ) + + n_knots_list = [n_features * i for i in [10, 11, 12]] + knots_list = [ + np.linspace(0, np.pi * 2, n_knots).reshape((-1, n_features)) + for n_knots in n_knots_list + ] + spline_reg_pipe_cv = GridSearchCV( + estimator=spline_reg_pipe, + param_grid={ + "splinetransformer__knots": knots_list, + }, + ) + + spline_reg_pipe_cv.fit(X, y) + assert ( + spline_reg_pipe_cv.cv_results_["param_splinetransformer__knots"].dtype == object + ) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype", yield_namespace_device_dtype_combinations() +) +@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV]) +def test_array_api_search_cv_classifier(SearchCV, array_namespace, device, dtype): + xp = _array_api_for_tests(array_namespace, device) + + X = np.arange(100).reshape((10, 10)) + X_np = X.astype(dtype) + X_xp = xp.asarray(X_np, device=device) + + # y should always be an integer, no matter what `dtype` is + y_np = np.array([0] * 5 + [1] * 5) + y_xp = xp.asarray(y_np, device=device) + + with config_context(array_api_dispatch=True): + searcher = SearchCV( + LinearDiscriminantAnalysis(), + {"tol": [1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]}, + cv=2, + error_score="raise", + ) + searcher.fit(X_xp, y_xp) + searcher.score(X_xp, y_xp) + + +# Construct these outside the tests so that the same object is used +# for both input and `expected` +one_hot_encoder = OneHotEncoder() +ordinal_encoder = OrdinalEncoder() + +# If we construct this directly via `MaskedArray`, the list of tuples +# gets auto-converted to a 2D array. 
+ma_with_tuples = np.ma.MaskedArray(np.empty(2), mask=True, dtype=object) +ma_with_tuples[0] = (1, 2) +ma_with_tuples[1] = (3, 4) + + +@pytest.mark.parametrize( + ("candidate_params", "expected"), + [ + pytest.param( + [{"foo": 1}, {"foo": 2}], + [ + ("param_foo", np.ma.MaskedArray(np.array([1, 2]))), + ], + id="simple numeric, single param", + ), + pytest.param( + [{"foo": 1, "bar": 3}, {"foo": 2, "bar": 4}, {"foo": 3}], + [ + ("param_foo", np.ma.MaskedArray(np.array([1, 2, 3]))), + ( + "param_bar", + np.ma.MaskedArray(np.array([3, 4, 0]), mask=[False, False, True]), + ), + ], + id="simple numeric, one param is missing in one round", + ), + pytest.param( + [{"foo": [[1], [2], [3]]}, {"foo": [[1], [2]]}], + [ + ( + "param_foo", + np.ma.MaskedArray([[[1], [2], [3]], [[1], [2]]], dtype=object), + ), + ], + id="lists of different lengths", + ), + pytest.param( + [{"foo": (1, 2)}, {"foo": (3, 4)}], + [ + ( + "param_foo", + ma_with_tuples, + ), + ], + id="lists tuples", + ), + pytest.param( + [{"foo": ordinal_encoder}, {"foo": one_hot_encoder}], + [ + ( + "param_foo", + np.ma.MaskedArray([ordinal_encoder, one_hot_encoder], dtype=object), + ), + ], + id="estimators", + ), + ], +) +def test_yield_masked_array_for_each_param(candidate_params, expected): + result = list(_yield_masked_array_for_each_param(candidate_params)) + for (key, value), (expected_key, expected_value) in zip(result, expected): + assert key == expected_key + assert value.dtype == expected_value.dtype + np.testing.assert_array_equal(value, expected_value) + np.testing.assert_array_equal(value.mask, expected_value.mask) + + +def test_yield_masked_array_no_runtime_warning(): + # non-regression test for https://github.com/scikit-learn/scikit-learn/issues/29929 + candidate_params = [{"param": i} for i in range(1000)] + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + list(_yield_masked_array_for_each_param(candidate_params)) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/tests/test_split.py b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_split.py new file mode 100644 index 0000000000000000000000000000000000000000..b333d49f5058efcde5edce15d27f0b8c9f0e053f --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_split.py @@ -0,0 +1,2099 @@ +"""Test the split module""" + +import re +import warnings +from itertools import combinations, combinations_with_replacement, permutations + +import numpy as np +import pytest +from scipy import stats +from scipy.sparse import issparse +from scipy.special import comb + +from sklearn import config_context +from sklearn.datasets import load_digits, make_classification +from sklearn.dummy import DummyClassifier +from sklearn.model_selection import ( + GridSearchCV, + GroupKFold, + GroupShuffleSplit, + KFold, + LeaveOneGroupOut, + LeaveOneOut, + LeavePGroupsOut, + LeavePOut, + PredefinedSplit, + RepeatedKFold, + RepeatedStratifiedKFold, + ShuffleSplit, + StratifiedGroupKFold, + StratifiedKFold, + StratifiedShuffleSplit, + TimeSeriesSplit, + check_cv, + cross_val_score, + train_test_split, +) +from sklearn.model_selection._split import ( + _build_repr, + _validate_shuffle_split, + _yields_constant_splits, +) +from sklearn.svm import SVC +from sklearn.tests.metadata_routing_common import assert_request_is_empty +from sklearn.utils._array_api import ( + _convert_to_numpy, + get_namespace, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._array_api import ( + device as array_api_device, +) +from 
sklearn.utils._mocking import MockDataFrame +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.estimator_checks import ( + _array_api_for_tests, +) +from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS +from sklearn.utils.validation import _num_samples + +NO_GROUP_SPLITTERS = [ + KFold(), + StratifiedKFold(), + TimeSeriesSplit(), + LeaveOneOut(), + LeavePOut(p=2), + ShuffleSplit(), + StratifiedShuffleSplit(test_size=0.5), + PredefinedSplit([1, 1, 2, 2]), + RepeatedKFold(), + RepeatedStratifiedKFold(), +] + +GROUP_SPLITTERS = [ + GroupKFold(), + LeavePGroupsOut(n_groups=1), + StratifiedGroupKFold(), + LeaveOneGroupOut(), + GroupShuffleSplit(), +] +GROUP_SPLITTER_NAMES = set(splitter.__class__.__name__ for splitter in GROUP_SPLITTERS) + +ALL_SPLITTERS = NO_GROUP_SPLITTERS + GROUP_SPLITTERS # type: ignore + +SPLITTERS_REQUIRING_TARGET = [ + StratifiedKFold(), + StratifiedShuffleSplit(), + RepeatedStratifiedKFold(), +] + +X = np.ones(10) +y = np.arange(10) // 2 +test_groups = ( + np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), + np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]), + np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), + [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3], + ["1", "1", "1", "1", "2", "2", "2", "3", "3", "3", "3", "3"], +) +digits = load_digits() + +pytestmark = pytest.mark.filterwarnings( + "error:The groups parameter:UserWarning:sklearn.*" +) + + +def _split(splitter, X, y, groups): + if splitter.__class__.__name__ in GROUP_SPLITTER_NAMES: + return splitter.split(X, y, groups=groups) + else: + return splitter.split(X, y) + + +def test_cross_validator_with_default_params(): + n_samples = 4 + n_unique_groups = 4 + n_splits = 2 + p = 2 + n_shuffle_splits = 10 # (the default value) + + X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + X_1d = np.array([1, 2, 3, 4]) + y = np.array([1, 1, 2, 2]) + groups = np.array([1, 2, 3, 4]) + loo = LeaveOneOut() + lpo = LeavePOut(p) + kf = KFold(n_splits) + skf = StratifiedKFold(n_splits) + lolo = LeaveOneGroupOut() + lopo = LeavePGroupsOut(p) + ss = ShuffleSplit(random_state=0) + ps = PredefinedSplit([1, 1, 2, 2]) # n_splits = np of unique folds = 2 + sgkf = StratifiedGroupKFold(n_splits) + + loo_repr = "LeaveOneOut()" + lpo_repr = "LeavePOut(p=2)" + kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)" + skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)" + lolo_repr = "LeaveOneGroupOut()" + lopo_repr = "LeavePGroupsOut(n_groups=2)" + ss_repr = ( + "ShuffleSplit(n_splits=10, random_state=0, test_size=None, train_size=None)" + ) + ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))" + sgkf_repr = "StratifiedGroupKFold(n_splits=2, random_state=None, shuffle=False)" + + n_splits_expected = [ + n_samples, + comb(n_samples, p), + n_splits, + n_splits, + n_unique_groups, + comb(n_unique_groups, p), + n_shuffle_splits, + 2, + n_splits, + ] + + for i, (cv, cv_repr) in enumerate( + zip( + [loo, lpo, kf, skf, lolo, lopo, ss, ps, sgkf], + [ + loo_repr, + lpo_repr, + kf_repr, + skf_repr, + lolo_repr, + lopo_repr, + ss_repr, + ps_repr, + sgkf_repr, + ], + ) + ): + # Test if get_n_splits works correctly + assert n_splits_expected[i] == cv.get_n_splits(X, y, groups) + + # Test if the cross-validator works as expected even if + # the data is 1d + np.testing.assert_equal( + list(_split(cv, X, y, groups)), list(_split(cv, X_1d, y, groups)) 
+ ) + # Test that train, test indices returned are integers + for train, test in _split(cv, X, y, groups): + assert np.asarray(train).dtype.kind == "i" + assert np.asarray(test).dtype.kind == "i" + + # Test if the repr works without any errors + assert cv_repr == repr(cv) + + # ValueError for get_n_splits methods + msg = "The 'X' parameter should not be None." + with pytest.raises(ValueError, match=msg): + loo.get_n_splits(None, y, groups) + with pytest.raises(ValueError, match=msg): + lpo.get_n_splits(None, y, groups) + + +def test_2d_y(): + # smoke test for 2d y and multi-label + n_samples = 30 + rng = np.random.RandomState(1) + X = rng.randint(0, 3, size=(n_samples, 2)) + y = rng.randint(0, 3, size=(n_samples,)) + y_2d = y.reshape(-1, 1) + y_multilabel = rng.randint(0, 2, size=(n_samples, 3)) + groups = rng.randint(0, 3, size=(n_samples,)) + splitters = [ + LeaveOneOut(), + LeavePOut(p=2), + KFold(), + StratifiedKFold(), + RepeatedKFold(), + RepeatedStratifiedKFold(), + StratifiedGroupKFold(), + ShuffleSplit(), + StratifiedShuffleSplit(test_size=0.5), + GroupShuffleSplit(), + LeaveOneGroupOut(), + LeavePGroupsOut(n_groups=2), + GroupKFold(n_splits=3), + TimeSeriesSplit(), + PredefinedSplit(test_fold=groups), + ] + for splitter in splitters: + list(_split(splitter, X, y, groups=groups)) + list(_split(splitter, X, y_2d, groups=groups)) + try: + list(_split(splitter, X, y_multilabel, groups=groups)) + except ValueError as e: + allowed_target_types = ("binary", "multiclass") + msg = "Supported target types are: {}. Got 'multilabel".format( + allowed_target_types + ) + assert msg in str(e) + + +def check_valid_split(train, test, n_samples=None): + # Use Python sets to get more informative assertion failure messages + train, test = set(train), set(test) + + # Train and test split should not overlap + assert train.intersection(test) == set() + + if n_samples is not None: + # Check that the union of the train and test splits covers all the indices + assert train.union(test) == set(range(n_samples)) + + +def check_cv_coverage(cv, X, y, groups, expected_n_splits): + n_samples = _num_samples(X) + # Check that all the samples appear at least once in a test fold + assert cv.get_n_splits(X, y, groups) == expected_n_splits + + collected_test_samples = set() + iterations = 0 + for train, test in cv.split(X, y, groups): + check_valid_split(train, test, n_samples=n_samples) + iterations += 1 + collected_test_samples.update(test) + + # Check that the accumulated test samples cover the whole dataset + assert iterations == expected_n_splits + if n_samples is not None: + assert collected_test_samples == set(range(n_samples)) + + +def test_kfold_valueerrors(): + X1 = np.array([[1, 2], [3, 4], [5, 6]]) + X2 = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]) + # Check that errors are raised if there are not enough samples + with pytest.raises(ValueError): + next(KFold(4).split(X1)) + + # Check that a warning is raised if the least populated class has too few + members.
+ y = np.array([3, 3, -1, -1, 3]) + + skf_3 = StratifiedKFold(3) + with pytest.warns(Warning, match="The least populated class"): + next(skf_3.split(X2, y)) + + sgkf_3 = StratifiedGroupKFold(3) + naive_groups = np.arange(len(y)) + with pytest.warns(Warning, match="The least populated class"): + next(sgkf_3.split(X2, y, naive_groups)) + + # Check that despite the warning the folds are still computed even + though all the classes are not necessarily represented on each + side of each split + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + check_cv_coverage(skf_3, X2, y, groups=None, expected_n_splits=3) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + check_cv_coverage(sgkf_3, X2, y, groups=naive_groups, expected_n_splits=3) + + # Check that errors are raised if all n_groups for individual + classes are less than n_splits. + y = np.array([3, 3, -1, -1, 2]) + + with pytest.raises(ValueError): + next(skf_3.split(X2, y)) + with pytest.raises(ValueError): + next(sgkf_3.split(X2, y)) + + # Error when number of folds is <= 1 + with pytest.raises(ValueError): + KFold(0) + with pytest.raises(ValueError): + KFold(1) + error_string = "k-fold cross-validation requires at least one train/test split" + with pytest.raises(ValueError, match=error_string): + StratifiedKFold(0) + with pytest.raises(ValueError, match=error_string): + StratifiedKFold(1) + with pytest.raises(ValueError, match=error_string): + StratifiedGroupKFold(0) + with pytest.raises(ValueError, match=error_string): + StratifiedGroupKFold(1) + + # When n_splits is not an integer: + with pytest.raises(ValueError): + KFold(1.5) + with pytest.raises(ValueError): + KFold(2.0) + with pytest.raises(ValueError): + StratifiedKFold(1.5) + with pytest.raises(ValueError): + StratifiedKFold(2.0) + with pytest.raises(ValueError): + StratifiedGroupKFold(1.5) + with pytest.raises(ValueError): + StratifiedGroupKFold(2.0) + + # When shuffle is not a bool: + with pytest.raises(TypeError): + KFold(n_splits=4, shuffle=None) + + +def test_kfold_indices(): + # Check all indices are returned in the test folds + X1 = np.ones(18) + kf = KFold(3) + check_cv_coverage(kf, X1, y=None, groups=None, expected_n_splits=3) + + # Check all indices are returned in the test folds even when equal-sized + folds are not possible + X2 = np.ones(17) + kf = KFold(3) + check_cv_coverage(kf, X2, y=None, groups=None, expected_n_splits=3) + + # Check if get_n_splits returns the number of folds + assert 5 == KFold(5).get_n_splits(X2) + + +def test_kfold_no_shuffle(): + # Manually check that KFold preserves the data ordering on toy datasets + X2 = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] + + splits = KFold(2).split(X2[:-1]) + train, test = next(splits) + assert_array_equal(test, [0, 1]) + assert_array_equal(train, [2, 3]) + + train, test = next(splits) + assert_array_equal(test, [2, 3]) + assert_array_equal(train, [0, 1]) + + splits = KFold(2).split(X2) + train, test = next(splits) + assert_array_equal(test, [0, 1, 2]) + assert_array_equal(train, [3, 4]) + + train, test = next(splits) + assert_array_equal(test, [3, 4]) + assert_array_equal(train, [0, 1, 2]) + + +def test_stratified_kfold_no_shuffle(): + # Manually check that StratifiedKFold preserves the data ordering as much + as possible on toy datasets in order to avoid hiding sample dependencies + when possible + X, y = np.ones(4), [1, 1, 0, 0] + splits = StratifiedKFold(2).split(X, y) + train, test = next(splits) + assert_array_equal(test, [0, 2]) +
assert_array_equal(train, [1, 3]) + + train, test = next(splits) + assert_array_equal(test, [1, 3]) + assert_array_equal(train, [0, 2]) + + X, y = np.ones(7), [1, 1, 1, 0, 0, 0, 0] + splits = StratifiedKFold(2).split(X, y) + train, test = next(splits) + assert_array_equal(test, [0, 1, 3, 4]) + assert_array_equal(train, [2, 5, 6]) + + train, test = next(splits) + assert_array_equal(test, [2, 5, 6]) + assert_array_equal(train, [0, 1, 3, 4]) + + # Check if get_n_splits returns the number of folds + assert 5 == StratifiedKFold(5).get_n_splits(X, y) + + # Make sure string labels are also supported + X = np.ones(7) + y1 = ["1", "1", "1", "0", "0", "0", "0"] + y2 = [1, 1, 1, 0, 0, 0, 0] + np.testing.assert_equal( + list(StratifiedKFold(2).split(X, y1)), list(StratifiedKFold(2).split(X, y2)) + ) + + # Check equivalence to KFold + y = [0, 1, 0, 1, 0, 1, 0, 1] + X = np.ones_like(y) + np.testing.assert_equal( + list(StratifiedKFold(3).split(X, y)), list(KFold(3).split(X, y)) + ) + + +@pytest.mark.parametrize("shuffle", [False, True]) +@pytest.mark.parametrize("k", [4, 5, 6, 7, 8, 9, 10]) +@pytest.mark.parametrize("kfold", [StratifiedKFold, StratifiedGroupKFold]) +def test_stratified_kfold_ratios(k, shuffle, kfold): + # Check that stratified kfold preserves class ratios in individual splits + # Repeat with shuffling turned off and on + n_samples = 1000 + X = np.ones(n_samples) + y = np.array( + [4] * int(0.10 * n_samples) + + [0] * int(0.89 * n_samples) + + [1] * int(0.01 * n_samples) + ) + # ensure perfect stratification with StratifiedGroupKFold + groups = np.arange(len(y)) + distr = np.bincount(y) / len(y) + + test_sizes = [] + random_state = None if not shuffle else 0 + skf = kfold(k, random_state=random_state, shuffle=shuffle) + for train, test in _split(skf, X, y, groups=groups): + assert_allclose(np.bincount(y[train]) / len(train), distr, atol=0.02) + assert_allclose(np.bincount(y[test]) / len(test), distr, atol=0.02) + test_sizes.append(len(test)) + assert np.ptp(test_sizes) <= 1 + + +@pytest.mark.parametrize("shuffle", [False, True]) +@pytest.mark.parametrize("k", [4, 6, 7]) +@pytest.mark.parametrize("kfold", [StratifiedKFold, StratifiedGroupKFold]) +def test_stratified_kfold_label_invariance(k, shuffle, kfold): + # Check that stratified kfold gives the same indices regardless of labels + n_samples = 100 + y = np.array( + [2] * int(0.10 * n_samples) + + [0] * int(0.89 * n_samples) + + [1] * int(0.01 * n_samples) + ) + X = np.ones(len(y)) + # ensure perfect stratification with StratifiedGroupKFold + groups = np.arange(len(y)) + + def get_splits(y): + random_state = None if not shuffle else 0 + return [ + (list(train), list(test)) + for train, test in _split( + kfold(k, random_state=random_state, shuffle=shuffle), + X, + y, + groups=groups, + ) + ] + + splits_base = get_splits(y) + for perm in permutations([0, 1, 2]): + y_perm = np.take(perm, y) + splits_perm = get_splits(y_perm) + assert splits_perm == splits_base + + +def test_kfold_balance(): + # Check that KFold returns folds with balanced sizes + for i in range(11, 17): + kf = KFold(5).split(X=np.ones(i)) + sizes = [len(test) for _, test in kf] + + assert (np.max(sizes) - np.min(sizes)) <= 1 + assert np.sum(sizes) == i + + +@pytest.mark.parametrize("kfold", [StratifiedKFold, StratifiedGroupKFold]) +def test_stratifiedkfold_balance(kfold): + # Check that KFold returns folds with balanced sizes (only when + # stratification is possible) + # Repeat with shuffling turned off and on + X = np.ones(17) + y = [0] * 3 + [1] * 14 + # ensure perfect 
stratification with StratifiedGroupKFold + groups = np.arange(len(y)) + + for shuffle in (True, False): + cv = kfold(3, shuffle=shuffle) + for i in range(11, 17): + skf = _split(cv, X[:i], y[:i], groups[:i]) + sizes = [len(test) for _, test in skf] + + assert (np.max(sizes) - np.min(sizes)) <= 1 + assert np.sum(sizes) == i + + +def test_shuffle_kfold(): + # Check the indices are shuffled properly + kf = KFold(3) + kf2 = KFold(3, shuffle=True, random_state=0) + kf3 = KFold(3, shuffle=True, random_state=1) + + X = np.ones(300) + + all_folds = np.zeros(300) + for (tr1, te1), (tr2, te2), (tr3, te3) in zip( + kf.split(X), kf2.split(X), kf3.split(X) + ): + for tr_a, tr_b in combinations((tr1, tr2, tr3), 2): + # Assert that there is no complete overlap + assert len(np.intersect1d(tr_a, tr_b)) != len(tr1) + + # Set all test indices in successive iterations of kf2 to 1 + all_folds[te2] = 1 + + # Check that all indices are returned in the different test folds + assert sum(all_folds) == 300 + + +@pytest.mark.parametrize("kfold", [KFold, StratifiedKFold, StratifiedGroupKFold]) +def test_shuffle_kfold_stratifiedkfold_reproducibility(kfold): + X = np.ones(15) # Divisible by 3 + y = [0] * 7 + [1] * 8 + groups_1 = np.arange(len(y)) + X2 = np.ones(16) # Not divisible by 3 + y2 = [0] * 8 + [1] * 8 + groups_2 = np.arange(len(y2)) + + # Check that when the shuffle is True, multiple split calls produce the + # same split when random_state is int + kf = kfold(3, shuffle=True, random_state=0) + + np.testing.assert_equal( + list(_split(kf, X, y, groups_1)), list(_split(kf, X, y, groups_1)) + ) + + # Check that when the shuffle is True, multiple split calls often + # (not always) produce different splits when random_state is + # RandomState instance or None + kf = kfold(3, shuffle=True, random_state=np.random.RandomState(0)) + for data in zip((X, X2), (y, y2), (groups_1, groups_2)): + # Test if the two splits are different cv + for (_, test_a), (_, test_b) in zip(_split(kf, *data), _split(kf, *data)): + # cv.split(...) 
returns an array of tuples, each tuple + consisting of an array with train indices and test indices + # Ensure that the splits for data are not the same + when the random state is not set + with pytest.raises(AssertionError): + np.testing.assert_array_equal(test_a, test_b) + + +def test_shuffle_stratifiedkfold(): + # Check that shuffling is happening when requested, and for proper + sample coverage + X_40 = np.ones(40) + y = [0] * 20 + [1] * 20 + kf0 = StratifiedKFold(5, shuffle=True, random_state=0) + kf1 = StratifiedKFold(5, shuffle=True, random_state=1) + for (_, test0), (_, test1) in zip(kf0.split(X_40, y), kf1.split(X_40, y)): + assert set(test0) != set(test1) + check_cv_coverage(kf0, X_40, y, groups=None, expected_n_splits=5) + + # Ensure that we shuffle each class's samples with different + random_state in StratifiedKFold + # See https://github.com/scikit-learn/scikit-learn/pull/13124 + X = np.arange(10) + y = [0] * 5 + [1] * 5 + kf1 = StratifiedKFold(5, shuffle=True, random_state=0) + kf2 = StratifiedKFold(5, shuffle=True, random_state=1) + test_set1 = sorted([tuple(s[1]) for s in kf1.split(X, y)]) + test_set2 = sorted([tuple(s[1]) for s in kf2.split(X, y)]) + assert test_set1 != test_set2 + + +def test_shuffle_groupkfold(): + # Check that shuffling is happening when requested, and for proper + sample coverage + X = np.ones(40) + y = [0] * 20 + [1] * 20 + groups = np.arange(40) // 3 + gkf0 = GroupKFold(4, shuffle=True, random_state=0) + gkf1 = GroupKFold(4, shuffle=True, random_state=1) + + # Check that the groups are shuffled differently + test_groups0 = [ + set(groups[test_idx]) for _, test_idx in gkf0.split(X, None, groups) + ] + test_groups1 = [ + set(groups[test_idx]) for _, test_idx in gkf1.split(X, None, groups) + ] + for g0, g1 in zip(test_groups0, test_groups1): + assert g0 != g1, "Test groups should differ with different random states" + + # Check coverage and splits + check_cv_coverage(gkf0, X, y, groups, expected_n_splits=4) + check_cv_coverage(gkf1, X, y, groups, expected_n_splits=4) + + +def test_kfold_can_detect_dependent_samples_on_digits():  # see #2372 + # The digits samples are dependent: they are apparently grouped by authors + although we don't have any information on the groups segment locations + for this data. We can highlight this fact by computing k-fold cross- + validation with and without shuffling: we observe that the shuffling case + wrongly makes the IID assumption and is therefore too optimistic: it + estimates a much higher accuracy (around 0.93) than the + non-shuffling variant (around 0.81).
+ + X, y = digits.data[:600], digits.target[:600] + model = SVC(C=10, gamma=0.005) + + n_splits = 3 + + cv = KFold(n_splits=n_splits, shuffle=False) + mean_score = cross_val_score(model, X, y, cv=cv).mean() + assert 0.92 > mean_score + assert mean_score > 0.80 + + # Shuffling the data artificially breaks the dependency and hides the + # overfitting of the model with regards to the writing style of the authors + # by yielding a seriously overestimated score: + + cv = KFold(n_splits, shuffle=True, random_state=0) + mean_score = cross_val_score(model, X, y, cv=cv).mean() + assert mean_score > 0.92 + + cv = KFold(n_splits, shuffle=True, random_state=1) + mean_score = cross_val_score(model, X, y, cv=cv).mean() + assert mean_score > 0.92 + + # Similarly, StratifiedKFold should try to shuffle the data as little + # as possible (while respecting the balanced class constraints) + # and thus be able to detect the dependency by not overestimating + # the CV score either. As the digits dataset is approximately balanced + # the estimated mean score is close to the score measured with + # non-shuffled KFold + + cv = StratifiedKFold(n_splits) + mean_score = cross_val_score(model, X, y, cv=cv).mean() + assert 0.94 > mean_score + assert mean_score > 0.80 + + +def test_stratified_group_kfold_trivial(): + sgkf = StratifiedGroupKFold(n_splits=3) + # Trivial example - groups with the same distribution + y = np.array([1] * 6 + [0] * 12) + X = np.ones_like(y).reshape(-1, 1) + groups = np.asarray((1, 2, 3, 4, 5, 6, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6)) + distr = np.bincount(y) / len(y) + test_sizes = [] + for train, test in sgkf.split(X, y, groups): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + # check y distribution + assert_allclose(np.bincount(y[train]) / len(train), distr, atol=0.02) + assert_allclose(np.bincount(y[test]) / len(test), distr, atol=0.02) + test_sizes.append(len(test)) + assert np.ptp(test_sizes) <= 1 + + +def test_stratified_group_kfold_approximate(): + # Not perfect stratification (even though it is possible) because of + # iteration over groups + sgkf = StratifiedGroupKFold(n_splits=3) + y = np.array([1] * 6 + [0] * 12) + X = np.ones_like(y).reshape(-1, 1) + groups = np.array([1, 2, 3, 3, 4, 4, 1, 1, 2, 2, 3, 4, 5, 5, 5, 6, 6, 6]) + expected = np.asarray([[0.833, 0.166], [0.666, 0.333], [0.5, 0.5]]) + test_sizes = [] + for (train, test), expect_dist in zip(sgkf.split(X, y, groups), expected): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + split_dist = np.bincount(y[test]) / len(test) + assert_allclose(split_dist, expect_dist, atol=0.001) + test_sizes.append(len(test)) + assert np.ptp(test_sizes) <= 1 + + +@pytest.mark.parametrize( + "y, groups, expected", + [ + ( + np.array([0] * 6 + [1] * 6), + np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]), + np.asarray([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]), + ), + ( + np.array([0] * 9 + [1] * 3), + np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 6]), + np.asarray([[0.75, 0.25], [0.75, 0.25], [0.75, 0.25]]), + ), + ], +) +def test_stratified_group_kfold_homogeneous_groups(y, groups, expected): + sgkf = StratifiedGroupKFold(n_splits=3) + X = np.ones_like(y).reshape(-1, 1) + for (train, test), expect_dist in zip(sgkf.split(X, y, groups), expected): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + split_dist = np.bincount(y[test]) / len(test) + assert_allclose(split_dist, expect_dist, atol=0.001) + + 
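+# The guarantees exercised above can also be seen in a tiny standalone + sketch (illustrative only; it reuses the numpy and StratifiedGroupKFold + imports at the top of this module): every group stays on a single side of + each split, while each test fold approximately keeps the overall class + ratio. +def _demo_stratified_group_kfold():  # leading underscore: not collected + rng = np.random.RandomState(0) + y = rng.choice(2, size=600, p=[0.8, 0.2])  # imbalanced binary labels + groups = rng.randint(0, 30, size=600)  # 30 groups of dependent samples + X = np.ones((600, 1)) + for train, test in StratifiedGroupKFold(n_splits=3).split(X, y, groups): + # no group straddles the train/test boundary + assert np.intersect1d(groups[train], groups[test]).size == 0 + # each test fold roughly preserves the ~[0.8, 0.2] class ratio + print(np.bincount(y[test], minlength=2) / len(test)) + +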
+@pytest.mark.parametrize("cls_distr", [(0.4, 0.6), (0.3, 0.7), (0.2, 0.8), (0.8, 0.2)]) +@pytest.mark.parametrize("n_groups", [5, 30, 70]) +def test_stratified_group_kfold_against_group_kfold(cls_distr, n_groups): + # Check that given sufficient amount of samples StratifiedGroupKFold + # produces better stratified folds than regular GroupKFold + n_splits = 5 + sgkf = StratifiedGroupKFold(n_splits=n_splits) + gkf = GroupKFold(n_splits=n_splits) + rng = np.random.RandomState(0) + n_points = 1000 + y = rng.choice(2, size=n_points, p=cls_distr) + X = np.ones_like(y).reshape(-1, 1) + g = rng.choice(n_groups, n_points) + sgkf_folds = sgkf.split(X, y, groups=g) + gkf_folds = gkf.split(X, y, groups=g) + sgkf_entr = 0 + gkf_entr = 0 + for (sgkf_train, sgkf_test), (_, gkf_test) in zip(sgkf_folds, gkf_folds): + # check group constraint + assert np.intersect1d(g[sgkf_train], g[sgkf_test]).size == 0 + sgkf_distr = np.bincount(y[sgkf_test]) / len(sgkf_test) + gkf_distr = np.bincount(y[gkf_test]) / len(gkf_test) + sgkf_entr += stats.entropy(sgkf_distr, qk=cls_distr) + gkf_entr += stats.entropy(gkf_distr, qk=cls_distr) + sgkf_entr /= n_splits + gkf_entr /= n_splits + assert sgkf_entr <= gkf_entr + + +def test_shuffle_split(): + ss1 = ShuffleSplit(test_size=0.2, random_state=0).split(X) + ss2 = ShuffleSplit(test_size=2, random_state=0).split(X) + ss3 = ShuffleSplit(test_size=np.int32(2), random_state=0).split(X) + ss4 = ShuffleSplit(test_size=int(2), random_state=0).split(X) + for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4): + assert_array_equal(t1[0], t2[0]) + assert_array_equal(t2[0], t3[0]) + assert_array_equal(t3[0], t4[0]) + assert_array_equal(t1[1], t2[1]) + assert_array_equal(t2[1], t3[1]) + assert_array_equal(t3[1], t4[1]) + + +@pytest.mark.parametrize("split_class", [ShuffleSplit, StratifiedShuffleSplit]) +@pytest.mark.parametrize( + "train_size, exp_train, exp_test", [(None, 9, 1), (8, 8, 2), (0.8, 8, 2)] +) +def test_shuffle_split_default_test_size(split_class, train_size, exp_train, exp_test): + # Check that the default value has the expected behavior, i.e. 0.1 if both + # unspecified or complement train_size unless both are specified. + X = np.ones(10) + y = np.ones(10) + + X_train, X_test = next(split_class(train_size=train_size).split(X, y)) + + assert len(X_train) == exp_train + assert len(X_test) == exp_test + + +@pytest.mark.parametrize( + "train_size, exp_train, exp_test", [(None, 8, 2), (7, 7, 3), (0.7, 7, 3)] +) +def test_group_shuffle_split_default_test_size(train_size, exp_train, exp_test): + # Check that the default value has the expected behavior, i.e. 0.2 if both + # unspecified or complement train_size unless both are specified. 
+ X = np.ones(10) + y = np.ones(10) + groups = range(10) + + X_train, X_test = next(GroupShuffleSplit(train_size=train_size).split(X, y, groups)) + + assert len(X_train) == exp_train + assert len(X_test) == exp_test + + +def test_stratified_shuffle_split_init(): + X = np.arange(7) + y = np.asarray([0, 1, 1, 1, 2, 2, 2]) + # Check that an error is raised if there is a class with only one sample + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, test_size=0.2).split(X, y)) + + # Check that an error is raised if the test set size is smaller than n_classes + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, test_size=2).split(X, y)) + # Check that an error is raised if the train set size is smaller than + n_classes + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, test_size=3, train_size=2).split(X, y)) + + X = np.arange(9) + y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2]) + + # Train size or test size too small + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(train_size=2).split(X, y)) + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(test_size=2).split(X, y)) + + +def test_stratified_shuffle_split_respects_test_size(): + y = np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]) + test_size = 5 + train_size = 10 + sss = StratifiedShuffleSplit( + 6, test_size=test_size, train_size=train_size, random_state=0 + ).split(np.ones(len(y)), y) + for train, test in sss: + assert len(train) == train_size + assert len(test) == test_size + + +def test_stratified_shuffle_split_iter(): + ys = [ + np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), + np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2), + np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), + np.array([-1] * 800 + [1] * 50), + np.concatenate([[i] * (100 + i) for i in range(11)]), + [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3], + ["1", "1", "1", "1", "2", "2", "2", "3", "3", "3", "3", "3"], + ] + + for y in ys: + sss = StratifiedShuffleSplit(6, test_size=0.33, random_state=0).split( + np.ones(len(y)), y + ) + y = np.asanyarray(y)  # To make it indexable for y[train] + # this is how test-size is computed internally + in _validate_shuffle_split + test_size = np.ceil(0.33 * len(y)) + train_size = len(y) - test_size + for train, test in sss: + assert_array_equal(np.unique(y[train]), np.unique(y[test])) + # Check that folds keep class proportions + p_train = np.bincount(np.unique(y[train], return_inverse=True)[1]) / float( + len(y[train]) + ) + p_test = np.bincount(np.unique(y[test], return_inverse=True)[1]) / float( + len(y[test]) + ) + assert_array_almost_equal(p_train, p_test, 1) + assert len(train) + len(test) == y.size + assert len(train) == train_size + assert len(test) == test_size + assert_array_equal(np.intersect1d(train, test), []) + + +def test_stratified_shuffle_split_even(): + # Test the StratifiedShuffleSplit, indices are drawn with an + equal chance + n_folds = 5 + n_splits = 1000 + + def assert_counts_are_ok(idx_counts, p): + # Here we test that the distribution of the counts + per index is close enough to a binomial + threshold = 0.05 / n_splits + bf = stats.binom(n_splits, p) + for count in idx_counts: + prob = bf.pmf(count) + assert ( + prob > threshold + ), "An index is not drawn with chance corresponding to even draws" + + for n_samples in (6, 22): + groups = np.array((n_samples // 2) * [0, 1]) + splits = StratifiedShuffleSplit( + n_splits=n_splits, test_size=1.0 / n_folds, random_state=0 + ) + +
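# Tally, across n_splits draws, how often each sample index lands on the + train and test side; under even sampling these counts follow a binomial + distribution, which assert_counts_are_ok verifies. +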
train_counts = [0] * n_samples + test_counts = [0] * n_samples + n_splits_actual = 0 + for train, test in splits.split(X=np.ones(n_samples), y=groups): + n_splits_actual += 1 + for counter, ids in [(train_counts, train), (test_counts, test)]: + for id in ids: + counter[id] += 1 + assert n_splits_actual == n_splits + + n_train, n_test = _validate_shuffle_split( + n_samples, test_size=1.0 / n_folds, train_size=1.0 - (1.0 / n_folds) + ) + + assert len(train) == n_train + assert len(test) == n_test + assert len(set(train).intersection(test)) == 0 + + group_counts = np.unique(groups) + assert splits.test_size == 1.0 / n_folds + assert n_train + n_test == len(groups) + assert len(group_counts) == 2 + ex_test_p = float(n_test) / n_samples + ex_train_p = float(n_train) / n_samples + + assert_counts_are_ok(train_counts, ex_train_p) + assert_counts_are_ok(test_counts, ex_test_p) + + +def test_stratified_shuffle_split_overlap_train_test_bug(): + # See https://github.com/scikit-learn/scikit-learn/issues/6121 for + # the original bug report + y = [0, 1, 2, 3] * 3 + [4, 5] * 5 + X = np.ones_like(y) + + sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0) + + train, test = next(sss.split(X=X, y=y)) + + # no overlap + assert_array_equal(np.intersect1d(train, test), []) + + # complete partition + assert_array_equal(np.union1d(train, test), np.arange(len(y))) + + +def test_stratified_shuffle_split_multilabel(): + # fix for issue 9037 + for y in [ + np.array([[0, 1], [1, 0], [1, 0], [0, 1]]), + np.array([[0, 1], [1, 1], [1, 1], [0, 1]]), + ]: + X = np.ones_like(y) + sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0) + train, test = next(sss.split(X=X, y=y)) + y_train = y[train] + y_test = y[test] + + # no overlap + assert_array_equal(np.intersect1d(train, test), []) + + # complete partition + assert_array_equal(np.union1d(train, test), np.arange(len(y))) + + # correct stratification of entire rows + # (by design, here y[:, 0] uniquely determines the entire row of y) + expected_ratio = np.mean(y[:, 0]) + assert expected_ratio == np.mean(y_train[:, 0]) + assert expected_ratio == np.mean(y_test[:, 0]) + + +def test_stratified_shuffle_split_multilabel_many_labels(): + # fix in PR #9922: for multilabel data with > 1000 labels, str(row) + # truncates with an ellipsis for elements in positions 4 through + # len(row) - 4, so labels were not being correctly split using the powerset + # method for transforming a multilabel problem to a multiclass one; this + # test checks that this problem is fixed. + row_with_many_zeros = [1, 0, 1] + [0] * 1000 + [1, 0, 1] + row_with_many_ones = [1, 0, 1] + [1] * 1000 + [1, 0, 1] + y = np.array([row_with_many_zeros] * 10 + [row_with_many_ones] * 100) + X = np.ones_like(y) + + sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0) + train, test = next(sss.split(X=X, y=y)) + y_train = y[train] + y_test = y[test] + + # correct stratification of entire rows + # (by design, here y[:, 4] uniquely determines the entire row of y) + expected_ratio = np.mean(y[:, 4]) + assert expected_ratio == np.mean(y_train[:, 4]) + assert expected_ratio == np.mean(y_test[:, 4]) + + +def test_predefinedsplit_with_kfold_split(): + # Check that PredefinedSplit can reproduce a split generated by Kfold. 
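+ # Tag each sample with the index of the KFold split in which it appears + as a test sample; PredefinedSplit must then reproduce those exact splits.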
+ folds = np.full(10, -1.0) + kf_train = [] + kf_test = [] + for i, (train_ind, test_ind) in enumerate(KFold(5, shuffle=True).split(X)): + kf_train.append(train_ind) + kf_test.append(test_ind) + folds[test_ind] = i + ps = PredefinedSplit(folds) + # n_splits is simply the no of unique folds + assert len(np.unique(folds)) == ps.get_n_splits() + ps_train, ps_test = zip(*ps.split()) + assert_array_equal(ps_train, kf_train) + assert_array_equal(ps_test, kf_test) + + +def test_group_shuffle_split(): + for groups_i in test_groups: + X = y = np.ones(len(groups_i)) + n_splits = 6 + test_size = 1.0 / 3 + slo = GroupShuffleSplit(n_splits, test_size=test_size, random_state=0) + + # Make sure the repr works + repr(slo) + + # Test that the length is correct + assert slo.get_n_splits(X, y, groups=groups_i) == n_splits + + l_unique = np.unique(groups_i) + l = np.asarray(groups_i) + + for train, test in slo.split(X, y, groups=groups_i): + # First test: no train group is in the test set and vice versa + l_train_unique = np.unique(l[train]) + l_test_unique = np.unique(l[test]) + assert not np.any(np.isin(l[train], l_test_unique)) + assert not np.any(np.isin(l[test], l_train_unique)) + + # Second test: train and test add up to all the data + assert l[train].size + l[test].size == l.size + + # Third test: train and test are disjoint + assert_array_equal(np.intersect1d(train, test), []) + + # Fourth test: + # unique train and test groups are correct, +- 1 for rounding error + assert abs(len(l_test_unique) - round(test_size * len(l_unique))) <= 1 + assert ( + abs(len(l_train_unique) - round((1.0 - test_size) * len(l_unique))) <= 1 + ) + + +def test_leave_one_p_group_out(): + logo = LeaveOneGroupOut() + lpgo_1 = LeavePGroupsOut(n_groups=1) + lpgo_2 = LeavePGroupsOut(n_groups=2) + + # Make sure the repr works + assert repr(logo) == "LeaveOneGroupOut()" + assert repr(lpgo_1) == "LeavePGroupsOut(n_groups=1)" + assert repr(lpgo_2) == "LeavePGroupsOut(n_groups=2)" + assert repr(LeavePGroupsOut(n_groups=3)) == "LeavePGroupsOut(n_groups=3)" + + for j, (cv, p_groups_out) in enumerate(((logo, 1), (lpgo_1, 1), (lpgo_2, 2))): + for i, groups_i in enumerate(test_groups): + n_groups = len(np.unique(groups_i)) + n_splits = n_groups if p_groups_out == 1 else n_groups * (n_groups - 1) / 2 + X = y = np.ones(len(groups_i)) + + # Test that the length is correct + assert cv.get_n_splits(X, y, groups=groups_i) == n_splits + + groups_arr = np.asarray(groups_i) + + # Split using the original list / array / list of string groups_i + for train, test in cv.split(X, y, groups=groups_i): + # First test: no train group is in the test set and vice versa + assert_array_equal( + np.intersect1d(groups_arr[train], groups_arr[test]).tolist(), [] + ) + + # Second test: train and test add up to all the data + assert len(train) + len(test) == len(groups_i) + + # Third test: + # The number of groups in test must be equal to p_groups_out + assert np.unique(groups_arr[test]).shape[0], p_groups_out + + # check get_n_splits() with dummy parameters + assert logo.get_n_splits(None, None, ["a", "b", "c", "b", "c"]) == 3 + assert logo.get_n_splits(groups=[1.0, 1.1, 1.0, 1.2]) == 3 + assert lpgo_2.get_n_splits(None, None, np.arange(4)) == 6 + assert lpgo_1.get_n_splits(groups=np.arange(4)) == 4 + + # raise ValueError if a `groups` parameter is illegal + with pytest.raises(ValueError): + logo.get_n_splits(None, None, [0.0, np.nan, 0.0]) + with pytest.raises(ValueError): + lpgo_2.get_n_splits(None, None, [0.0, np.inf, 0.0]) + + msg = "The 'groups' parameter 
should not be None." + with pytest.raises(ValueError, match=msg): + logo.get_n_splits(None, None, None) + with pytest.raises(ValueError, match=msg): + lpgo_1.get_n_splits(None, None, None) + + +def test_leave_group_out_changing_groups(): + # Check that LeaveOneGroupOut and LeavePGroupsOut work normally if + # the groups variable is changed before calling split + groups = np.array([0, 1, 2, 1, 1, 2, 0, 0]) + X = np.ones(len(groups)) + groups_changing = np.array(groups, copy=True) + lolo = LeaveOneGroupOut().split(X, groups=groups) + lolo_changing = LeaveOneGroupOut().split(X, groups=groups) + lplo = LeavePGroupsOut(n_groups=2).split(X, groups=groups) + lplo_changing = LeavePGroupsOut(n_groups=2).split(X, groups=groups) + groups_changing[:] = 0 + for llo, llo_changing in [(lolo, lolo_changing), (lplo, lplo_changing)]: + for (train, test), (train_chan, test_chan) in zip(llo, llo_changing): + assert_array_equal(train, train_chan) + assert_array_equal(test, test_chan) + + # n_splits = no of 2 (p) group combinations of the unique groups = 3C2 = 3 + assert 3 == LeavePGroupsOut(n_groups=2).get_n_splits(X, y=X, groups=groups) + # n_splits = no of unique groups (C(uniq_lbls, 1) = n_unique_groups) + assert 3 == LeaveOneGroupOut().get_n_splits(X, y=X, groups=groups) + + +def test_leave_group_out_order_dependence(): + # Check that LeaveOneGroupOut orders the splits according to the index + # of the group left out. + groups = np.array([2, 2, 0, 0, 1, 1]) + X = np.ones(len(groups)) + + splits = iter(LeaveOneGroupOut().split(X, groups=groups)) + + expected_indices = [ + ([0, 1, 4, 5], [2, 3]), + ([0, 1, 2, 3], [4, 5]), + ([2, 3, 4, 5], [0, 1]), + ] + + for expected_train, expected_test in expected_indices: + train, test = next(splits) + assert_array_equal(train, expected_train) + assert_array_equal(test, expected_test) + + +def test_leave_one_p_group_out_error_on_fewer_number_of_groups(): + X = y = groups = np.ones(0) + msg = re.escape("Found array with 0 sample(s)") + with pytest.raises(ValueError, match=msg): + next(LeaveOneGroupOut().split(X, y, groups)) + + X = y = groups = np.ones(1) + msg = re.escape( + f"The groups parameter contains fewer than 2 unique groups ({groups})." + " LeaveOneGroupOut expects at least 2." + ) + with pytest.raises(ValueError, match=msg): + next(LeaveOneGroupOut().split(X, y, groups)) + + X = y = groups = np.ones(1) + msg = re.escape( + "The groups parameter contains fewer than (or equal to) n_groups " + f"(3) numbers of unique groups ({groups}). LeavePGroupsOut expects " + "that at least n_groups + 1 (4) unique groups " + "be present" + ) + with pytest.raises(ValueError, match=msg): + next(LeavePGroupsOut(n_groups=3).split(X, y, groups)) + + X = y = groups = np.arange(3) + msg = re.escape( + "The groups parameter contains fewer than (or equal to) n_groups " + f"(3) numbers of unique groups ({groups}). 
LeavePGroupsOut expects " + "that at least n_groups + 1 (4) unique groups " + "be present" + ) + with pytest.raises(ValueError, match=msg): + next(LeavePGroupsOut(n_groups=3).split(X, y, groups)) + + +def test_repeated_cv_value_errors(): + # n_repeats is not an integer or is <= 0 + for cv in (RepeatedKFold, RepeatedStratifiedKFold): + with pytest.raises(ValueError): + cv(n_repeats=0) + with pytest.raises(ValueError): + cv(n_repeats=1.5) + + +@pytest.mark.parametrize("RepeatedCV", [RepeatedKFold, RepeatedStratifiedKFold]) +def test_repeated_cv_repr(RepeatedCV): + n_splits, n_repeats = 2, 6 + repeated_cv = RepeatedCV(n_splits=n_splits, n_repeats=n_repeats) + repeated_cv_repr = "{}(n_repeats=6, n_splits=2, random_state=None)".format( + repeated_cv.__class__.__name__ + ) + assert repeated_cv_repr == repr(repeated_cv) + + +def test_repeated_kfold_deterministic_split(): + X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] + random_state = 258173307 + rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=random_state) + + # split should produce the same, deterministic splits on + each call + for _ in range(3): + splits = rkf.split(X) + train, test = next(splits) + assert_array_equal(train, [2, 4]) + assert_array_equal(test, [0, 1, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 3]) + assert_array_equal(test, [2, 4]) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [2, 3, 4]) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4]) + assert_array_equal(test, [0, 1]) + + with pytest.raises(StopIteration): + next(splits) + + +def test_get_n_splits_for_repeated_kfold(): + n_splits = 3 + n_repeats = 4 + rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats) + expected_n_splits = n_splits * n_repeats + assert expected_n_splits == rkf.get_n_splits() + + +def test_get_n_splits_for_repeated_stratified_kfold(): + n_splits = 3 + n_repeats = 4 + rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats) + expected_n_splits = n_splits * n_repeats + assert expected_n_splits == rskf.get_n_splits() + + +def test_repeated_stratified_kfold_deterministic_split(): + X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] + y = [1, 1, 1, 0, 0] + random_state = 1944695409 + rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2, random_state=random_state) + + # split should produce the same, deterministic splits on + each call + for _ in range(3): + splits = rskf.split(X, y) + train, test = next(splits) + assert_array_equal(train, [1, 4]) + assert_array_equal(test, [0, 2, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 2, 3]) + assert_array_equal(test, [1, 4]) + + train, test = next(splits) + assert_array_equal(train, [2, 3]) + assert_array_equal(test, [0, 1, 4]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 4]) + assert_array_equal(test, [2, 3]) + + with pytest.raises(StopIteration): + next(splits) + + +def test_train_test_split_errors(): + pytest.raises(ValueError, train_test_split) + + pytest.raises(ValueError, train_test_split, range(3), train_size=1.1) + + pytest.raises(ValueError, train_test_split, range(3), test_size=0.6, train_size=0.6) + pytest.raises( + ValueError, + train_test_split, + range(3), + test_size=np.float32(0.6), + train_size=np.float32(0.6), + ) + pytest.raises(ValueError, train_test_split, range(3), test_size="wrong_type") + pytest.raises(ValueError, train_test_split, range(3), test_size=2, train_size=4) + pytest.raises(TypeError, train_test_split, range(3), some_argument=1.1)
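+ # inputs with inconsistent numbers of samples (3 vs. 42) are rejected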
+ pytest.raises(ValueError, train_test_split, range(3), range(42)) + pytest.raises(ValueError, train_test_split, range(10), shuffle=False, stratify=True) + + with pytest.raises( + ValueError, + match=r"train_size=11 should be either positive and " + r"smaller than the number of samples 10 or a " + r"float in the \(0, 1\) range", + ): + train_test_split(range(10), train_size=11, test_size=1) + + +@pytest.mark.parametrize( + "train_size, exp_train, exp_test", [(None, 7, 3), (8, 8, 2), (0.8, 8, 2)] +) +def test_train_test_split_default_test_size(train_size, exp_train, exp_test): + # Check that the default value has the expected behavior, i.e. complement + # train_size unless both are specified. + X_train, X_test = train_test_split(X, train_size=train_size) + + assert len(X_train) == exp_train + assert len(X_test) == exp_test + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations() +) +@pytest.mark.parametrize( + "shuffle,stratify", + ( + (True, None), + (True, np.hstack((np.ones(6), np.zeros(4)))), + # stratification only works with shuffling + (False, None), + ), +) +def test_array_api_train_test_split( + shuffle, stratify, array_namespace, device, dtype_name +): + xp = _array_api_for_tests(array_namespace, device) + + X = np.arange(100).reshape((10, 10)) + y = np.arange(10) + + X_np = X.astype(dtype_name) + X_xp = xp.asarray(X_np, device=device) + + y_np = y.astype(dtype_name) + y_xp = xp.asarray(y_np, device=device) + + X_train_np, X_test_np, y_train_np, y_test_np = train_test_split( + X_np, y, random_state=0, shuffle=shuffle, stratify=stratify + ) + with config_context(array_api_dispatch=True): + if stratify is not None: + stratify_xp = xp.asarray(stratify) + else: + stratify_xp = stratify + X_train_xp, X_test_xp, y_train_xp, y_test_xp = train_test_split( + X_xp, y_xp, shuffle=shuffle, stratify=stratify_xp, random_state=0 + ) + + # Check that namespace is preserved, has to happen with + # array_api_dispatch enabled. 
+ assert get_namespace(X_train_xp)[0] == get_namespace(X_xp)[0] + assert get_namespace(X_test_xp)[0] == get_namespace(X_xp)[0] + assert get_namespace(y_train_xp)[0] == get_namespace(y_xp)[0] + assert get_namespace(y_test_xp)[0] == get_namespace(y_xp)[0] + + # Check device and dtype is preserved on output + assert array_api_device(X_train_xp) == array_api_device(X_xp) + assert array_api_device(y_train_xp) == array_api_device(y_xp) + assert array_api_device(X_test_xp) == array_api_device(X_xp) + assert array_api_device(y_test_xp) == array_api_device(y_xp) + + assert X_train_xp.dtype == X_xp.dtype + assert y_train_xp.dtype == y_xp.dtype + assert X_test_xp.dtype == X_xp.dtype + assert y_test_xp.dtype == y_xp.dtype + + assert_allclose( + _convert_to_numpy(X_train_xp, xp=xp), + X_train_np, + ) + assert_allclose( + _convert_to_numpy(X_test_xp, xp=xp), + X_test_np, + ) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_train_test_split(coo_container): + X = np.arange(100).reshape((10, 10)) + X_s = coo_container(X) + y = np.arange(10) + + # simple test + split = train_test_split(X, y, test_size=None, train_size=0.5) + X_train, X_test, y_train, y_test = split + assert len(y_test) == len(y_train) + # test correspondence of X and y + assert_array_equal(X_train[:, 0], y_train * 10) + assert_array_equal(X_test[:, 0], y_test * 10) + + # don't convert lists to anything else by default + split = train_test_split(X, X_s, y.tolist()) + X_train, X_test, X_s_train, X_s_test, y_train, y_test = split + assert isinstance(y_train, list) + assert isinstance(y_test, list) + + # allow nd-arrays + X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2) + y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11) + split = train_test_split(X_4d, y_3d) + assert split[0].shape == (7, 5, 3, 2) + assert split[1].shape == (3, 5, 3, 2) + assert split[2].shape == (7, 7, 11) + assert split[3].shape == (3, 7, 11) + + # test stratification option + y = np.array([1, 1, 1, 1, 2, 2, 2, 2]) + for test_size, exp_test_size in zip([2, 4, 0.25, 0.5, 0.75], [2, 4, 2, 4, 6]): + train, test = train_test_split( + y, test_size=test_size, stratify=y, random_state=0 + ) + assert len(test) == exp_test_size + assert len(test) + len(train) == len(y) + # check the 1:1 ratio of ones and twos in the data is preserved + assert np.sum(train == 1) == np.sum(train == 2) + + # test unshuffled split + y = np.arange(10) + for test_size in [2, 0.2]: + train, test = train_test_split(y, shuffle=False, test_size=test_size) + assert_array_equal(test, [8, 9]) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7]) + + +def test_train_test_split_32bit_overflow(): + """Check for integer overflow on 32-bit platforms. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/20774 + """ + + # A number 'n' big enough for the expression 'n * n * train_size' to cause + an overflow for a signed 32-bit integer + big_number = 100000 + + # The definition of 'y' is part of the reproduction: the population of at + least one class should be of the same order of magnitude as the size of X + X = np.arange(big_number) + y = X > (0.99 * big_number) + + split = train_test_split(X, y, stratify=y, train_size=0.25) + X_train, X_test, y_train, y_test = split + + assert X_train.size + X_test.size == big_number + assert y_train.size + y_test.size == big_number + + +def test_train_test_split_pandas(): + # check train_test_split doesn't destroy pandas dataframe + types = [MockDataFrame] + try: + from pandas import DataFrame + + types.append(DataFrame) + except ImportError: + pass + for InputFeatureType in types: + # X dataframe + X_df = InputFeatureType(X) + X_train, X_test = train_test_split(X_df) + assert isinstance(X_train, InputFeatureType) + assert isinstance(X_test, InputFeatureType) + + +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_train_test_split_sparse(sparse_container): + # check that train_test_split converts scipy sparse matrices + to csr, as stated in the documentation + X = np.arange(100).reshape((10, 10)) + X_s = sparse_container(X) + X_train, X_test = train_test_split(X_s) + assert issparse(X_train) and X_train.format == "csr" + assert issparse(X_test) and X_test.format == "csr" + + +def test_train_test_split_mock_pandas(): + # X mock dataframe + X_df = MockDataFrame(X) + X_train, X_test = train_test_split(X_df) + assert isinstance(X_train, MockDataFrame) + assert isinstance(X_test, MockDataFrame) + X_train_arr, X_test_arr = train_test_split(X_df) + + +def test_train_test_split_list_input(): + # Check that when y is a list / list of string labels, it works. + X = np.ones(7) + y1 = ["1"] * 4 + ["0"] * 3 + y2 = np.hstack((np.ones(4), np.zeros(3))) + y3 = y2.tolist() + + for stratify in (True, False): + X_train1, X_test1, y_train1, y_test1 = train_test_split( + X, y1, stratify=y1 if stratify else None, random_state=0 + ) + X_train2, X_test2, y_train2, y_test2 = train_test_split( + X, y2, stratify=y2 if stratify else None, random_state=0 + ) + X_train3, X_test3, y_train3, y_test3 = train_test_split( + X, y3, stratify=y3 if stratify else None, random_state=0 + ) + + np.testing.assert_equal(X_train1, X_train2) + np.testing.assert_equal(y_train2, y_train3) + np.testing.assert_equal(X_test1, X_test3) + np.testing.assert_equal(y_test3, y_test2) + + +@pytest.mark.parametrize( + "test_size, train_size", + [(2.0, None), (1.0, None), (0.1, 0.95), (None, 1j), (11, None), (10, None), (8, 3)], +) +def test_shufflesplit_errors(test_size, train_size): + with pytest.raises(ValueError): + next(ShuffleSplit(test_size=test_size, train_size=train_size).split(X)) + + +def test_shufflesplit_reproducible(): + # Check that iterating twice on the ShuffleSplit gives the same + sequence of train-test when the random_state is given + ss = ShuffleSplit(random_state=21) + assert_array_equal([a for a, b in ss.split(X)], [a for a, b in ss.split(X)]) + + +def test_stratifiedshufflesplit_list_input(): + # Check that when y is a list / list of string labels, it works.
+ sss = StratifiedShuffleSplit(test_size=2, random_state=42) + X = np.ones(7) + y1 = ["1"] * 4 + ["0"] * 3 + y2 = np.hstack((np.ones(4), np.zeros(3))) + y3 = y2.tolist() + + np.testing.assert_equal(list(sss.split(X, y1)), list(sss.split(X, y2))) + np.testing.assert_equal(list(sss.split(X, y3)), list(sss.split(X, y2))) + + +def test_train_test_split_allow_nans(): + # Check that train_test_split allows input data with NaNs + X = np.arange(200, dtype=np.float64).reshape(10, -1) + X[2, :] = np.nan + y = np.repeat([0, 1], X.shape[0] / 2) + train_test_split(X, y, test_size=0.2, random_state=42) + + +def test_check_cv(): + X = np.ones(9) + cv = check_cv(3, classifier=False) + # Use numpy.testing.assert_equal which recursively compares + # lists of lists + np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X))) + + y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1]) + cv = check_cv(3, y_binary, classifier=True) + np.testing.assert_equal( + list(StratifiedKFold(3).split(X, y_binary)), list(cv.split(X, y_binary)) + ) + + y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2]) + cv = check_cv(3, y_multiclass, classifier=True) + np.testing.assert_equal( + list(StratifiedKFold(3).split(X, y_multiclass)), list(cv.split(X, y_multiclass)) + ) + # also works with 2d multiclass + y_multiclass_2d = y_multiclass.reshape(-1, 1) + cv = check_cv(3, y_multiclass_2d, classifier=True) + np.testing.assert_equal( + list(StratifiedKFold(3).split(X, y_multiclass_2d)), + list(cv.split(X, y_multiclass_2d)), + ) + + assert not np.all( + next(StratifiedKFold(3).split(X, y_multiclass_2d))[0] + == next(KFold(3).split(X, y_multiclass_2d))[0] + ) + + X = np.ones(5) + y_multilabel = np.array( + [[0, 0, 0, 0], [0, 1, 1, 0], [0, 0, 0, 1], [1, 1, 0, 1], [0, 0, 1, 0]] + ) + cv = check_cv(3, y_multilabel, classifier=True) + np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X))) + + y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]]) + cv = check_cv(3, y_multioutput, classifier=True) + np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X))) + + with pytest.raises(ValueError): + check_cv(cv="lolo") + + +def test_cv_iterable_wrapper(): + kf_iter = KFold().split(X, y) + kf_iter_wrapped = check_cv(kf_iter) + # Since the wrapped iterable is enlisted and stored, + # split can be called any number of times to produce + # consistent results. 
+ np.testing.assert_equal( + list(kf_iter_wrapped.split(X, y)), list(kf_iter_wrapped.split(X, y)) + ) + # If the splits are randomized, successive calls to split yields different + # results + kf_randomized_iter = KFold(shuffle=True, random_state=0).split(X, y) + kf_randomized_iter_wrapped = check_cv(kf_randomized_iter) + # numpy's assert_array_equal properly compares nested lists + np.testing.assert_equal( + list(kf_randomized_iter_wrapped.split(X, y)), + list(kf_randomized_iter_wrapped.split(X, y)), + ) + + try: + splits_are_equal = True + np.testing.assert_equal( + list(kf_iter_wrapped.split(X, y)), + list(kf_randomized_iter_wrapped.split(X, y)), + ) + except AssertionError: + splits_are_equal = False + assert not splits_are_equal, ( + "If the splits are randomized, " + "successive calls to split should yield different results" + ) + + +@pytest.mark.parametrize("kfold", [GroupKFold, StratifiedGroupKFold]) +@pytest.mark.parametrize("shuffle", [True, False]) +def test_group_kfold(kfold, shuffle, global_random_seed): + rng = np.random.RandomState(global_random_seed) + + # Parameters of the test + n_groups = 15 + n_samples = 1000 + n_splits = 5 + + X = y = np.ones(n_samples) + + # Construct the test data + tolerance = 0.05 * n_samples # 5 percent error allowed + groups = rng.randint(0, n_groups, n_samples) + + ideal_n_groups_per_fold = n_samples // n_splits + + len(np.unique(groups)) + # Get the test fold indices from the test set indices of each fold + folds = np.zeros(n_samples) + random_state = None if not shuffle else global_random_seed + lkf = kfold(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + for i, (_, test) in enumerate(lkf.split(X, y, groups)): + folds[test] = i + + # Check that folds have approximately the same size + assert len(folds) == len(groups) + for i in np.unique(folds): + assert tolerance >= abs(sum(folds == i) - ideal_n_groups_per_fold) + + # Check that each group appears only in 1 fold + for group in np.unique(groups): + assert len(np.unique(folds[groups == group])) == 1 + + # Check that no group is on both sides of the split + groups = np.asarray(groups, dtype=object) + for train, test in lkf.split(X, y, groups): + assert len(np.intersect1d(groups[train], groups[test])) == 0 + + # Construct the test data + groups = np.array( + [ + "Albert", + "Jean", + "Bertrand", + "Michel", + "Jean", + "Francis", + "Robert", + "Michel", + "Rachel", + "Lois", + "Michelle", + "Bernard", + "Marion", + "Laura", + "Jean", + "Rachel", + "Franck", + "John", + "Gael", + "Anna", + "Alix", + "Robert", + "Marion", + "David", + "Tony", + "Abel", + "Becky", + "Madmood", + "Cary", + "Mary", + "Alexandre", + "David", + "Francis", + "Barack", + "Abdoul", + "Rasha", + "Xi", + "Silvia", + ] + ) + + n_groups = len(np.unique(groups)) + n_samples = len(groups) + n_splits = 5 + tolerance = 0.05 * n_samples # 5 percent error allowed + ideal_n_groups_per_fold = n_samples // n_splits + + X = y = np.ones(n_samples) + + # Get the test fold indices from the test set indices of each fold + folds = np.zeros(n_samples) + for i, (_, test) in enumerate(lkf.split(X, y, groups)): + folds[test] = i + + # Check that folds have approximately the same size + assert len(folds) == len(groups) + if not shuffle: + for i in np.unique(folds): + assert tolerance >= abs(sum(folds == i) - ideal_n_groups_per_fold) + + # Check that each group appears only in 1 fold + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + for group in np.unique(groups): + assert 
len(np.unique(folds[groups == group])) == 1 + + # Check that no group is on both sides of the split + groups = np.asarray(groups, dtype=object) + for train, test in lkf.split(X, y, groups): + assert len(np.intersect1d(groups[train], groups[test])) == 0 + + # groups can also be a list + # use a new instance for reproducibility when shuffle=True + lkf_copy = kfold(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + cv_iter = list(lkf.split(X, y, groups.tolist())) + for (train1, test1), (train2, test2) in zip(lkf_copy.split(X, y, groups), cv_iter): + assert_array_equal(train1, train2) + assert_array_equal(test1, test2) + + # Should fail if there are more folds than groups + groups = np.array([1, 1, 1, 2, 2]) + X = y = np.ones(len(groups)) + with pytest.raises(ValueError, match="Cannot have number of splits.*greater"): + next(GroupKFold(n_splits=3).split(X, y, groups)) + + +def test_time_series_cv(): + X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]] + + # Should fail if there are more folds than samples + with pytest.raises(ValueError, match="Cannot have number of folds.*greater"): + next(TimeSeriesSplit(n_splits=7).split(X)) + + tscv = TimeSeriesSplit(2) + + # Manually check that Time Series CV preserves the data + # ordering on toy datasets + splits = tscv.split(X[:-1]) + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [2, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3]) + assert_array_equal(test, [4, 5]) + + splits = TimeSeriesSplit(2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2]) + assert_array_equal(test, [3, 4]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [5, 6]) + + # Check get_n_splits returns the correct number of splits + splits = TimeSeriesSplit(2).split(X) + n_splits_actual = len(list(splits)) + assert n_splits_actual == tscv.get_n_splits() + assert n_splits_actual == 2 + + +def _check_time_series_max_train_size(splits, check_splits, max_train_size): + for (train, test), (check_train, check_test) in zip(splits, check_splits): + assert_array_equal(test, check_test) + assert len(check_train) <= max_train_size + suffix_start = max(len(train) - max_train_size, 0) + assert_array_equal(check_train, train[suffix_start:]) + + +def test_time_series_max_train_size(): + X = np.zeros((6, 1)) + splits = TimeSeriesSplit(n_splits=3).split(X) + check_splits = TimeSeriesSplit(n_splits=3, max_train_size=3).split(X) + _check_time_series_max_train_size(splits, check_splits, max_train_size=3) + + # Test for the case where the size of a fold is greater than max_train_size + check_splits = TimeSeriesSplit(n_splits=3, max_train_size=2).split(X) + _check_time_series_max_train_size(splits, check_splits, max_train_size=2) + + # Test for the case where the size of each fold is less than max_train_size + check_splits = TimeSeriesSplit(n_splits=3, max_train_size=5).split(X) + _check_time_series_max_train_size(splits, check_splits, max_train_size=2) + + +def test_time_series_test_size(): + X = np.zeros((10, 1)) + + # Test alone + splits = TimeSeriesSplit(n_splits=3, test_size=3).split(X) + + train, test = next(splits) + assert_array_equal(train, [0]) + assert_array_equal(test, [1, 2, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6]) + assert_array_equal(test, [7, 8, 9]) + + # 
Test with max_train_size + splits = TimeSeriesSplit(n_splits=2, test_size=2, max_train_size=4).split(X) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4, 5]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [4, 5, 6, 7]) + assert_array_equal(test, [8, 9]) + + # Should fail with not enough data points for configuration + with pytest.raises(ValueError, match="Too many splits.*with test_size"): + splits = TimeSeriesSplit(n_splits=5, test_size=2).split(X) + next(splits) + + +def test_time_series_gap(): + X = np.zeros((10, 1)) + + # Test alone + splits = TimeSeriesSplit(n_splits=2, gap=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [7, 8, 9]) + + # Test with max_train_size + splits = TimeSeriesSplit(n_splits=3, gap=2, max_train_size=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5]) + + train, test = next(splits) + assert_array_equal(train, [2, 3]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [4, 5]) + assert_array_equal(test, [8, 9]) + + # Test with test_size + splits = TimeSeriesSplit(n_splits=2, gap=2, max_train_size=4, test_size=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4, 5]) + assert_array_equal(test, [8, 9]) + + # Test with additional test_size + splits = TimeSeriesSplit(n_splits=2, gap=2, test_size=3).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [7, 8, 9]) + + # Verify proper error is thrown + with pytest.raises(ValueError, match="Too many splits.*and gap"): + splits = TimeSeriesSplit(n_splits=4, gap=2).split(X) + next(splits) + + +@ignore_warnings +def test_nested_cv(): + # Test if nested cross validation works with different combinations of cv + rng = np.random.RandomState(0) + + X, y = make_classification(n_samples=15, n_classes=2, random_state=0) + groups = rng.randint(0, 5, 15) + + cvs = [ + LeaveOneGroupOut(), + StratifiedKFold(n_splits=2), + LeaveOneOut(), + GroupKFold(n_splits=3), + StratifiedKFold(), + StratifiedGroupKFold(), + StratifiedShuffleSplit(n_splits=3, random_state=0), + ] + + for inner_cv, outer_cv in combinations_with_replacement(cvs, 2): + gs = GridSearchCV( + DummyClassifier(), + param_grid={"strategy": ["stratified", "most_frequent"]}, + cv=inner_cv, + error_score="raise", + ) + cross_val_score( + gs, X=X, y=y, groups=groups, cv=outer_cv, params={"groups": groups} + ) + + +def test_build_repr(): + class MockSplitter: + def __init__(self, a, b=0, c=None): + self.a = a + self.b = b + self.c = c + + def __repr__(self): + return _build_repr(self) + + assert repr(MockSplitter(5, 6)) == "MockSplitter(a=5, b=6, c=None)" + + +@pytest.mark.parametrize( + "CVSplitter", (ShuffleSplit, GroupShuffleSplit, StratifiedShuffleSplit) +) +def test_shuffle_split_empty_trainset(CVSplitter): + cv = CVSplitter(test_size=0.99) + X, y = [[1]], [0] # 1 sample + with pytest.raises( + ValueError, + match=( + "With n_samples=1, test_size=0.99 and train_size=None, " + "the resulting train set will be empty" + ), + ): + next(_split(cv, X, y, 
groups=[1])) + + +def test_train_test_split_empty_trainset(): + (X,) = [[1]] # 1 sample + with pytest.raises( + ValueError, + match=( + "With n_samples=1, test_size=0.99 and train_size=None, " + "the resulting train set will be empty" + ), + ): + train_test_split(X, test_size=0.99) + + X = [[1], [1], [1]] # 3 samples, ask for more than 2 thirds + with pytest.raises( + ValueError, + match=( + "With n_samples=3, test_size=0.67 and train_size=None, " + "the resulting train set will be empty" + ), + ): + train_test_split(X, test_size=0.67) + + +def test_leave_one_out_empty_trainset(): + # LeaveOneGroup out expect at least 2 groups so no need to check + cv = LeaveOneOut() + X, y = [[1]], [0] # 1 sample + with pytest.raises(ValueError, match="Cannot perform LeaveOneOut with n_samples=1"): + next(cv.split(X, y)) + + +def test_leave_p_out_empty_trainset(): + # No need to check LeavePGroupsOut + cv = LeavePOut(p=2) + X, y = [[1], [2]], [0, 3] # 2 samples + with pytest.raises( + ValueError, match="p=2 must be strictly less than the number of samples=2" + ): + next(cv.split(X, y)) + + +@pytest.mark.parametrize( + "Klass", (KFold, StratifiedKFold, StratifiedGroupKFold, GroupKFold) +) +def test_random_state_shuffle_false(Klass): + # passing a non-default random_state when shuffle=False makes no sense + with pytest.raises(ValueError, match="has no effect since shuffle is False"): + Klass(3, shuffle=False, random_state=0) + + +@pytest.mark.parametrize( + "cv, expected", + [ + (KFold(), True), + (KFold(shuffle=True, random_state=123), True), + (StratifiedKFold(), True), + (StratifiedKFold(shuffle=True, random_state=123), True), + (StratifiedGroupKFold(shuffle=True, random_state=123), True), + (StratifiedGroupKFold(), True), + (RepeatedKFold(random_state=123), True), + (RepeatedStratifiedKFold(random_state=123), True), + (ShuffleSplit(random_state=123), True), + (GroupShuffleSplit(random_state=123), True), + (StratifiedShuffleSplit(random_state=123), True), + (GroupKFold(), True), + (GroupKFold(shuffle=True, random_state=123), True), + (TimeSeriesSplit(), True), + (LeaveOneOut(), True), + (LeaveOneGroupOut(), True), + (LeavePGroupsOut(n_groups=2), True), + (LeavePOut(p=2), True), + (KFold(shuffle=True, random_state=None), False), + (KFold(shuffle=True, random_state=None), False), + (StratifiedKFold(shuffle=True, random_state=np.random.RandomState(0)), False), + (StratifiedKFold(shuffle=True, random_state=np.random.RandomState(0)), False), + (RepeatedKFold(random_state=None), False), + (RepeatedKFold(random_state=np.random.RandomState(0)), False), + (RepeatedStratifiedKFold(random_state=None), False), + (RepeatedStratifiedKFold(random_state=np.random.RandomState(0)), False), + (ShuffleSplit(random_state=None), False), + (ShuffleSplit(random_state=np.random.RandomState(0)), False), + (GroupShuffleSplit(random_state=None), False), + (GroupShuffleSplit(random_state=np.random.RandomState(0)), False), + (StratifiedShuffleSplit(random_state=None), False), + (StratifiedShuffleSplit(random_state=np.random.RandomState(0)), False), + ], +) +def test_yields_constant_splits(cv, expected): + assert _yields_constant_splits(cv) == expected + + +@pytest.mark.parametrize("cv", ALL_SPLITTERS, ids=[str(cv) for cv in ALL_SPLITTERS]) +def test_splitter_get_metadata_routing(cv): + """Check get_metadata_routing returns the correct MetadataRouter.""" + assert hasattr(cv, "get_metadata_routing") + metadata = cv.get_metadata_routing() + if cv in GROUP_SPLITTERS: + assert metadata.split.requests["groups"] is True + elif cv in 
NO_GROUP_SPLITTERS: + assert not metadata.split.requests + + assert_request_is_empty(metadata, exclude=["split"]) + + +@pytest.mark.parametrize("cv", ALL_SPLITTERS, ids=[str(cv) for cv in ALL_SPLITTERS]) +def test_splitter_set_split_request(cv): + """Check set_split_request is defined for group splitters and not for others.""" + if cv in GROUP_SPLITTERS: + assert hasattr(cv, "set_split_request") + elif cv in NO_GROUP_SPLITTERS: + assert not hasattr(cv, "set_split_request") + + +@pytest.mark.parametrize("cv", NO_GROUP_SPLITTERS, ids=str) +def test_no_group_splitters_warns_with_groups(cv): + msg = f"The groups parameter is ignored by {cv.__class__.__name__}" + + n_samples = 30 + rng = np.random.RandomState(1) + X = rng.randint(0, 3, size=(n_samples, 2)) + y = rng.randint(0, 3, size=(n_samples,)) + groups = rng.randint(0, 3, size=(n_samples,)) + + with pytest.warns(UserWarning, match=msg): + cv.split(X, y, groups=groups) + + +@pytest.mark.parametrize( + "cv", SPLITTERS_REQUIRING_TARGET, ids=[str(cv) for cv in SPLITTERS_REQUIRING_TARGET] +) +def test_stratified_splitter_without_y(cv): + msg = "missing 1 required positional argument: 'y'" + with pytest.raises(TypeError, match=msg): + cv.split(X) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/tests/test_successive_halving.py b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_successive_halving.py new file mode 100644 index 0000000000000000000000000000000000000000..c75798437bc862dc64fbb9a5a504a1a2a9522c84 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_successive_halving.py @@ -0,0 +1,856 @@ +from math import ceil + +import numpy as np +import pytest +from scipy.stats import expon, norm, randint + +from sklearn.datasets import make_classification +from sklearn.dummy import DummyClassifier +from sklearn.experimental import enable_halving_search_cv # noqa +from sklearn.model_selection import ( + GroupKFold, + GroupShuffleSplit, + HalvingGridSearchCV, + HalvingRandomSearchCV, + KFold, + LeaveOneGroupOut, + LeavePGroupsOut, + ShuffleSplit, + StratifiedKFold, + StratifiedShuffleSplit, +) +from sklearn.model_selection._search_successive_halving import ( + _SubsampleMetaSplitter, + _top_k, +) +from sklearn.model_selection.tests.test_search import ( + check_cv_results_array_types, + check_cv_results_keys, +) +from sklearn.svm import SVC, LinearSVC + + +class FastClassifier(DummyClassifier): + """Dummy classifier that accepts parameters a, b, ... z. 
+
+    These parameters don't affect the predictions and are useful for fast
+    grid searching."""
+
+    # update the constraints such that we accept all parameters from a to z
+    _parameter_constraints: dict = {
+        **DummyClassifier._parameter_constraints,
+        **{
+            chr(key): "no_validation"  # type: ignore
+            for key in range(ord("a"), ord("z") + 1)
+        },
+    }
+
+    def __init__(
+        self, strategy="stratified", random_state=None, constant=None, **kwargs
+    ):
+        super().__init__(
+            strategy=strategy, random_state=random_state, constant=constant
+        )
+
+    def get_params(self, deep=False):
+        params = super().get_params(deep=deep)
+        for char in range(ord("a"), ord("z") + 1):
+            params[chr(char)] = "whatever"
+        return params
+
+
+class SometimesFailClassifier(DummyClassifier):
+    def __init__(
+        self,
+        strategy="stratified",
+        random_state=None,
+        constant=None,
+        n_estimators=10,
+        fail_fit=False,
+        fail_predict=False,
+        a=0,
+    ):
+        self.fail_fit = fail_fit
+        self.fail_predict = fail_predict
+        self.n_estimators = n_estimators
+        self.a = a
+
+        super().__init__(
+            strategy=strategy, random_state=random_state, constant=constant
+        )
+
+    def fit(self, X, y):
+        if self.fail_fit:
+            raise Exception("fitting failed")
+        return super().fit(X, y)
+
+    def predict(self, X):
+        if self.fail_predict:
+            raise Exception("predict failed")
+        return super().predict(X)
+
+
+@pytest.mark.filterwarnings("ignore::sklearn.exceptions.FitFailedWarning")
+@pytest.mark.filterwarnings("ignore:Scoring failed:UserWarning")
+@pytest.mark.filterwarnings("ignore:One or more of the:UserWarning")
+@pytest.mark.parametrize("HalvingSearch", (HalvingGridSearchCV, HalvingRandomSearchCV))
+@pytest.mark.parametrize("fail_at", ("fit", "predict"))
+def test_nan_handling(HalvingSearch, fail_at):
+    """Check the selection of the best scores in the presence of failures
+    represented by NaN values."""
+    n_samples = 1_000
+    X, y = make_classification(n_samples=n_samples, random_state=0)
+
+    search = HalvingSearch(
+        SometimesFailClassifier(),
+        {f"fail_{fail_at}": [False, True], "a": range(3)},
+        resource="n_estimators",
+        max_resources=6,
+        min_resources=1,
+        factor=2,
+    )
+
+    search.fit(X, y)
+
+    # estimators that failed during fit/predict should always rank lower
+    # than ones where the fit/predict succeeded
+    assert not search.best_params_[f"fail_{fail_at}"]
+    scores = search.cv_results_["mean_test_score"]
+    ranks = search.cv_results_["rank_test_score"]
+
+    # some scores should be NaN
+    assert np.isnan(scores).any()
+
+    unique_nan_ranks = np.unique(ranks[np.isnan(scores)])
+    # all NaN scores should have the same rank
+    assert unique_nan_ranks.shape[0] == 1
+    # NaNs should have the lowest rank
+    assert (unique_nan_ranks[0] >= ranks).all()
+
+
+@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV))
+@pytest.mark.parametrize(
+    (
+        "aggressive_elimination,"
+        "max_resources,"
+        "expected_n_iterations,"
+        "expected_n_required_iterations,"
+        "expected_n_possible_iterations,"
+        "expected_n_remaining_candidates,"
+        "expected_n_candidates,"
+        "expected_n_resources,"
+    ),
+    [
+        # notice how it loops at the beginning
+        # also, the number of candidates evaluated at the last iteration is
+        # <= factor
+        (True, "limited", 4, 4, 3, 1, [60, 20, 7, 3], [20, 20, 60, 180]),
+        # no aggressive elimination: we end up with fewer iterations, and
+        # the number of candidates at the last iter is > factor, which isn't
+        # ideal
+        (False, "limited", 3, 4, 3, 3, [60, 20, 7], [20, 60, 180]),
+        # # When the amount of resource isn't limited, aggressive_elimination
+        # # has
no effect. Here the default min_resources='exhaust' will take + # # over. + (True, "unlimited", 4, 4, 4, 1, [60, 20, 7, 3], [37, 111, 333, 999]), + (False, "unlimited", 4, 4, 4, 1, [60, 20, 7, 3], [37, 111, 333, 999]), + ], +) +def test_aggressive_elimination( + Est, + aggressive_elimination, + max_resources, + expected_n_iterations, + expected_n_required_iterations, + expected_n_possible_iterations, + expected_n_remaining_candidates, + expected_n_candidates, + expected_n_resources, +): + # Test the aggressive_elimination parameter. + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": ("l1", "l2"), "b": list(range(30))} + base_estimator = FastClassifier() + + if max_resources == "limited": + max_resources = 180 + else: + max_resources = n_samples + + sh = Est( + base_estimator, + param_grid, + aggressive_elimination=aggressive_elimination, + max_resources=max_resources, + factor=3, + ) + sh.set_params(verbose=True) # just for test coverage + + if Est is HalvingRandomSearchCV: + # same number of candidates as with the grid + sh.set_params(n_candidates=2 * 30, min_resources="exhaust") + + sh.fit(X, y) + + assert sh.n_iterations_ == expected_n_iterations + assert sh.n_required_iterations_ == expected_n_required_iterations + assert sh.n_possible_iterations_ == expected_n_possible_iterations + assert sh.n_resources_ == expected_n_resources + assert sh.n_candidates_ == expected_n_candidates + assert sh.n_remaining_candidates_ == expected_n_remaining_candidates + assert ceil(sh.n_candidates_[-1] / sh.factor) == sh.n_remaining_candidates_ + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize( + ( + "min_resources," + "max_resources," + "expected_n_iterations," + "expected_n_possible_iterations," + "expected_n_resources," + ), + [ + # with enough resources + ("smallest", "auto", 2, 4, [20, 60]), + # with enough resources but min_resources set manually + (50, "auto", 2, 3, [50, 150]), + # without enough resources, only one iteration can be done + ("smallest", 30, 1, 1, [20]), + # with exhaust: use as much resources as possible at the last iter + ("exhaust", "auto", 2, 2, [333, 999]), + ("exhaust", 1000, 2, 2, [333, 999]), + ("exhaust", 999, 2, 2, [333, 999]), + ("exhaust", 600, 2, 2, [200, 600]), + ("exhaust", 599, 2, 2, [199, 597]), + ("exhaust", 300, 2, 2, [100, 300]), + ("exhaust", 60, 2, 2, [20, 60]), + ("exhaust", 50, 1, 1, [20]), + ("exhaust", 20, 1, 1, [20]), + ], +) +def test_min_max_resources( + Est, + min_resources, + max_resources, + expected_n_iterations, + expected_n_possible_iterations, + expected_n_resources, +): + # Test the min_resources and max_resources parameters, and how they affect + # the number of resources used at each iteration + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": [1, 2], "b": [1, 2, 3]} + base_estimator = FastClassifier() + + sh = Est( + base_estimator, + param_grid, + factor=3, + min_resources=min_resources, + max_resources=max_resources, + ) + if Est is HalvingRandomSearchCV: + sh.set_params(n_candidates=6) # same number as with the grid + + sh.fit(X, y) + + expected_n_required_iterations = 2 # given 6 combinations and factor = 3 + assert sh.n_iterations_ == expected_n_iterations + assert sh.n_required_iterations_ == expected_n_required_iterations + assert sh.n_possible_iterations_ == expected_n_possible_iterations + assert sh.n_resources_ == expected_n_resources + if min_resources == "exhaust": 
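+        # 'exhaust' sizes min_resources so that the last iteration can use as
+        # many resources as possible; the search should then run every
+        # possible iteration (see the expected_n_resources cases above).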
+ assert sh.n_possible_iterations_ == sh.n_iterations_ == len(sh.n_resources_) + + +@pytest.mark.parametrize("Est", (HalvingRandomSearchCV, HalvingGridSearchCV)) +@pytest.mark.parametrize( + "max_resources, n_iterations, n_possible_iterations", + [ + ("auto", 5, 9), # all resources are used + (1024, 5, 9), + (700, 5, 8), + (512, 5, 8), + (511, 5, 7), + (32, 4, 4), + (31, 3, 3), + (16, 3, 3), + (4, 1, 1), # max_resources == min_resources, only one iteration is + # possible + ], +) +def test_n_iterations(Est, max_resources, n_iterations, n_possible_iterations): + # test the number of actual iterations that were run depending on + # max_resources + + n_samples = 1024 + X, y = make_classification(n_samples=n_samples, random_state=1) + param_grid = {"a": [1, 2], "b": list(range(10))} + base_estimator = FastClassifier() + factor = 2 + + sh = Est( + base_estimator, + param_grid, + cv=2, + factor=factor, + max_resources=max_resources, + min_resources=4, + ) + if Est is HalvingRandomSearchCV: + sh.set_params(n_candidates=20) # same as for HalvingGridSearchCV + sh.fit(X, y) + assert sh.n_required_iterations_ == 5 + assert sh.n_iterations_ == n_iterations + assert sh.n_possible_iterations_ == n_possible_iterations + + +@pytest.mark.parametrize("Est", (HalvingRandomSearchCV, HalvingGridSearchCV)) +def test_resource_parameter(Est): + # Test the resource parameter + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": [1, 2], "b": list(range(10))} + base_estimator = FastClassifier() + sh = Est(base_estimator, param_grid, cv=2, resource="c", max_resources=10, factor=3) + sh.fit(X, y) + assert set(sh.n_resources_) == set([1, 3, 9]) + for r_i, params, param_c in zip( + sh.cv_results_["n_resources"], + sh.cv_results_["params"], + sh.cv_results_["param_c"], + ): + assert r_i == params["c"] == param_c + + with pytest.raises( + ValueError, match="Cannot use resource=1234 which is not supported " + ): + sh = HalvingGridSearchCV( + base_estimator, param_grid, cv=2, resource="1234", max_resources=10 + ) + sh.fit(X, y) + + with pytest.raises( + ValueError, + match=( + "Cannot use parameter c as the resource since it is part " + "of the searched parameters." 
+ ), + ): + param_grid = {"a": [1, 2], "b": [1, 2], "c": [1, 3]} + sh = HalvingGridSearchCV( + base_estimator, param_grid, cv=2, resource="c", max_resources=10 + ) + sh.fit(X, y) + + +@pytest.mark.parametrize( + "max_resources, n_candidates, expected_n_candidates", + [ + (512, "exhaust", 128), # generate exactly as much as needed + (32, "exhaust", 8), + (32, 8, 8), + (32, 7, 7), # ask for less than what we could + (32, 9, 9), # ask for more than 'reasonable' + ], +) +def test_random_search(max_resources, n_candidates, expected_n_candidates): + # Test random search and make sure the number of generated candidates is + # as expected + + n_samples = 1024 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": norm, "b": norm} + base_estimator = FastClassifier() + sh = HalvingRandomSearchCV( + base_estimator, + param_grid, + n_candidates=n_candidates, + cv=2, + max_resources=max_resources, + factor=2, + min_resources=4, + ) + sh.fit(X, y) + assert sh.n_candidates_[0] == expected_n_candidates + if n_candidates == "exhaust": + # Make sure 'exhaust' makes the last iteration use as much resources as + # we can + assert sh.n_resources_[-1] == max_resources + + +@pytest.mark.parametrize( + "param_distributions, expected_n_candidates", + [ + ({"a": [1, 2]}, 2), # all lists, sample less than n_candidates + ({"a": randint(1, 3)}, 10), # not all list, respect n_candidates + ], +) +def test_random_search_discrete_distributions( + param_distributions, expected_n_candidates +): + # Make sure random search samples the appropriate number of candidates when + # we ask for more than what's possible. How many parameters are sampled + # depends whether the distributions are 'all lists' or not (see + # ParameterSampler for details). This is somewhat redundant with the checks + # in ParameterSampler but interaction bugs were discovered during + # development of SH + + n_samples = 1024 + X, y = make_classification(n_samples=n_samples, random_state=0) + base_estimator = FastClassifier() + sh = HalvingRandomSearchCV(base_estimator, param_distributions, n_candidates=10) + sh.fit(X, y) + assert sh.n_candidates_[0] == expected_n_candidates + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize( + "params, expected_error_message", + [ + ( + {"resource": "not_a_parameter"}, + "Cannot use resource=not_a_parameter which is not supported", + ), + ( + {"resource": "a", "max_resources": 100}, + "Cannot use parameter a as the resource since it is part of", + ), + ( + {"max_resources": "auto", "resource": "b"}, + "resource can only be 'n_samples' when max_resources='auto'", + ), + ( + {"min_resources": 15, "max_resources": 14}, + "min_resources_=15 is greater than max_resources_=14", + ), + ({"cv": KFold(shuffle=True)}, "must yield consistent folds"), + ({"cv": ShuffleSplit()}, "must yield consistent folds"), + ], +) +def test_input_errors(Est, params, expected_error_message): + base_estimator = FastClassifier() + param_grid = {"a": [1]} + X, y = make_classification(100) + + sh = Est(base_estimator, param_grid, **params) + + with pytest.raises(ValueError, match=expected_error_message): + sh.fit(X, y) + + +@pytest.mark.parametrize( + "params, expected_error_message", + [ + ( + {"n_candidates": "exhaust", "min_resources": "exhaust"}, + "cannot be both set to 'exhaust'", + ), + ], +) +def test_input_errors_randomized(params, expected_error_message): + # tests specific to HalvingRandomSearchCV + + base_estimator = FastClassifier() + param_grid = 
{"a": [1]} + X, y = make_classification(100) + + sh = HalvingRandomSearchCV(base_estimator, param_grid, **params) + + with pytest.raises(ValueError, match=expected_error_message): + sh.fit(X, y) + + +@pytest.mark.parametrize( + "fraction, subsample_test, expected_train_size, expected_test_size", + [ + (0.5, True, 40, 10), + (0.5, False, 40, 20), + (0.2, True, 16, 4), + (0.2, False, 16, 20), + ], +) +def test_subsample_splitter_shapes( + fraction, subsample_test, expected_train_size, expected_test_size +): + # Make sure splits returned by SubsampleMetaSplitter are of appropriate + # size + + n_samples = 100 + X, y = make_classification(n_samples) + cv = _SubsampleMetaSplitter( + base_cv=KFold(5), + fraction=fraction, + subsample_test=subsample_test, + random_state=None, + ) + + for train, test in cv.split(X, y): + assert train.shape[0] == expected_train_size + assert test.shape[0] == expected_test_size + if subsample_test: + assert train.shape[0] + test.shape[0] == int(n_samples * fraction) + else: + assert test.shape[0] == n_samples // cv.base_cv.get_n_splits() + + +@pytest.mark.parametrize("subsample_test", (True, False)) +def test_subsample_splitter_determinism(subsample_test): + # Make sure _SubsampleMetaSplitter is consistent across calls to split(): + # - we're OK having training sets differ (they're always sampled with a + # different fraction anyway) + # - when we don't subsample the test set, we want it to be always the same. + # This check is the most important. This is ensured by the determinism + # of the base_cv. + + # Note: we could force both train and test splits to be always the same if + # we drew an int seed in _SubsampleMetaSplitter.__init__ + + n_samples = 100 + X, y = make_classification(n_samples) + cv = _SubsampleMetaSplitter( + base_cv=KFold(5), fraction=0.5, subsample_test=subsample_test, random_state=None + ) + + folds_a = list(cv.split(X, y, groups=None)) + folds_b = list(cv.split(X, y, groups=None)) + + for (train_a, test_a), (train_b, test_b) in zip(folds_a, folds_b): + assert not np.all(train_a == train_b) + + if subsample_test: + assert not np.all(test_a == test_b) + else: + assert np.all(test_a == test_b) + assert np.all(X[test_a] == X[test_b]) + + +@pytest.mark.parametrize( + "k, itr, expected", + [ + (1, 0, ["c"]), + (2, 0, ["a", "c"]), + (4, 0, ["d", "b", "a", "c"]), + (10, 0, ["d", "b", "a", "c"]), + (1, 1, ["e"]), + (2, 1, ["f", "e"]), + (10, 1, ["f", "e"]), + (1, 2, ["i"]), + (10, 2, ["g", "h", "i"]), + ], +) +def test_top_k(k, itr, expected): + results = { # this isn't a 'real world' result dict + "iter": [0, 0, 0, 0, 1, 1, 2, 2, 2], + "mean_test_score": [4, 3, 5, 1, 11, 10, 5, 6, 9], + "params": ["a", "b", "c", "d", "e", "f", "g", "h", "i"], + } + got = _top_k(results, k=k, itr=itr) + assert np.all(got == expected) + + +@pytest.mark.parametrize("Est", (HalvingRandomSearchCV, HalvingGridSearchCV)) +def test_cv_results(Est): + # test that the cv_results_ matches correctly the logic of the + # tournament: in particular that the candidates continued in each + # successive iteration are those that were best in the previous iteration + pd = pytest.importorskip("pandas") + + rng = np.random.RandomState(0) + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": ("l1", "l2"), "b": list(range(30))} + base_estimator = FastClassifier() + + # generate random scores: we want to avoid ties, which would otherwise + # mess with the ordering and make testing harder + def scorer(est, X, y): + return rng.rand() + + sh 
= Est(base_estimator, param_grid, factor=2, scoring=scorer) + if Est is HalvingRandomSearchCV: + # same number of candidates as with the grid + sh.set_params(n_candidates=2 * 30, min_resources="exhaust") + + sh.fit(X, y) + + # non-regression check for + # https://github.com/scikit-learn/scikit-learn/issues/19203 + assert isinstance(sh.cv_results_["iter"], np.ndarray) + assert isinstance(sh.cv_results_["n_resources"], np.ndarray) + + cv_results_df = pd.DataFrame(sh.cv_results_) + + # just make sure we don't have ties + assert len(cv_results_df["mean_test_score"].unique()) == len(cv_results_df) + + cv_results_df["params_str"] = cv_results_df["params"].apply(str) + table = cv_results_df.pivot( + index="params_str", columns="iter", values="mean_test_score" + ) + + # table looks like something like this: + # iter 0 1 2 3 4 5 + # params_str + # {'a': 'l2', 'b': 23} 0.75 NaN NaN NaN NaN NaN + # {'a': 'l1', 'b': 30} 0.90 0.875 NaN NaN NaN NaN + # {'a': 'l1', 'b': 0} 0.75 NaN NaN NaN NaN NaN + # {'a': 'l2', 'b': 3} 0.85 0.925 0.9125 0.90625 NaN NaN + # {'a': 'l1', 'b': 5} 0.80 NaN NaN NaN NaN NaN + # ... + + # where a NaN indicates that the candidate wasn't evaluated at a given + # iteration, because it wasn't part of the top-K at some previous + # iteration. We here make sure that candidates that aren't in the top-k at + # any given iteration are indeed not evaluated at the subsequent + # iterations. + nan_mask = pd.isna(table) + n_iter = sh.n_iterations_ + for it in range(n_iter - 1): + already_discarded_mask = nan_mask[it] + + # make sure that if a candidate is already discarded, we don't evaluate + # it later + assert ( + already_discarded_mask & nan_mask[it + 1] == already_discarded_mask + ).all() + + # make sure that the number of discarded candidate is correct + discarded_now_mask = ~already_discarded_mask & nan_mask[it + 1] + kept_mask = ~already_discarded_mask & ~discarded_now_mask + assert kept_mask.sum() == sh.n_candidates_[it + 1] + + # make sure that all discarded candidates have a lower score than the + # kept candidates + discarded_max_score = table[it].where(discarded_now_mask).max() + kept_min_score = table[it].where(kept_mask).min() + assert discarded_max_score < kept_min_score + + # We now make sure that the best candidate is chosen only from the last + # iteration. + # We also make sure this is true even if there were higher scores in + # earlier rounds (this isn't generally the case, but worth ensuring it's + # possible). + + last_iter = cv_results_df["iter"].max() + idx_best_last_iter = cv_results_df[cv_results_df["iter"] == last_iter][ + "mean_test_score" + ].idxmax() + idx_best_all_iters = cv_results_df["mean_test_score"].idxmax() + + assert sh.best_params_ == cv_results_df.iloc[idx_best_last_iter]["params"] + assert ( + cv_results_df.iloc[idx_best_last_iter]["mean_test_score"] + < cv_results_df.iloc[idx_best_all_iters]["mean_test_score"] + ) + assert ( + cv_results_df.iloc[idx_best_last_iter]["params"] + != cv_results_df.iloc[idx_best_all_iters]["params"] + ) + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +def test_base_estimator_inputs(Est): + # make sure that the base estimators are passed the correct parameters and + # number of samples at each iteration. 
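+    # Strategy: subclass FastClassifier to record the number of samples seen
+    # by fit/predict and every parameter combination passed through
+    # set_params, then compare the recordings against n_resources_,
+    # n_candidates_ and cv_results_ once the search has run.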
+ pd = pytest.importorskip("pandas") + + passed_n_samples_fit = [] + passed_n_samples_predict = [] + passed_params = [] + + class FastClassifierBookKeeping(FastClassifier): + def fit(self, X, y): + passed_n_samples_fit.append(X.shape[0]) + return super().fit(X, y) + + def predict(self, X): + passed_n_samples_predict.append(X.shape[0]) + return super().predict(X) + + def set_params(self, **params): + passed_params.append(params) + return super().set_params(**params) + + n_samples = 1024 + n_splits = 2 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": ("l1", "l2"), "b": list(range(30))} + base_estimator = FastClassifierBookKeeping() + + sh = Est( + base_estimator, + param_grid, + factor=2, + cv=n_splits, + return_train_score=False, + refit=False, + ) + if Est is HalvingRandomSearchCV: + # same number of candidates as with the grid + sh.set_params(n_candidates=2 * 30, min_resources="exhaust") + + sh.fit(X, y) + + assert len(passed_n_samples_fit) == len(passed_n_samples_predict) + passed_n_samples = [ + x + y for (x, y) in zip(passed_n_samples_fit, passed_n_samples_predict) + ] + + # Lists are of length n_splits * n_iter * n_candidates_at_i. + # Each chunk of size n_splits corresponds to the n_splits folds for the + # same candidate at the same iteration, so they contain equal values. We + # subsample such that the lists are of length n_iter * n_candidates_at_it + passed_n_samples = passed_n_samples[::n_splits] + passed_params = passed_params[::n_splits] + + cv_results_df = pd.DataFrame(sh.cv_results_) + + assert len(passed_params) == len(passed_n_samples) == len(cv_results_df) + + uniques, counts = np.unique(passed_n_samples, return_counts=True) + assert (sh.n_resources_ == uniques).all() + assert (sh.n_candidates_ == counts).all() + + assert (cv_results_df["params"] == passed_params).all() + assert (cv_results_df["n_resources"] == passed_n_samples).all() + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +def test_groups_support(Est): + # Check if ValueError (when groups is None) propagates to + # HalvingGridSearchCV and HalvingRandomSearchCV + # And also check if groups is correctly passed to the cv object + rng = np.random.RandomState(0) + + X, y = make_classification(n_samples=50, n_classes=2, random_state=0) + groups = rng.randint(0, 3, 50) + + clf = LinearSVC(random_state=0) + grid = {"C": [1]} + + group_cvs = [ + LeaveOneGroupOut(), + LeavePGroupsOut(2), + GroupKFold(n_splits=3), + GroupShuffleSplit(random_state=0), + ] + error_msg = "The 'groups' parameter should not be None." + for cv in group_cvs: + gs = Est(clf, grid, cv=cv, random_state=0) + with pytest.raises(ValueError, match=error_msg): + gs.fit(X, y) + gs.fit(X, y, groups=groups) + + non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit(random_state=0)] + for cv in non_group_cvs: + gs = Est(clf, grid, cv=cv) + # Should not raise an error + gs.fit(X, y) + + +@pytest.mark.parametrize("SearchCV", [HalvingRandomSearchCV, HalvingGridSearchCV]) +def test_min_resources_null(SearchCV): + """Check that we raise an error if the minimum resources is set to 0.""" + base_estimator = FastClassifier() + param_grid = {"a": [1]} + X = np.empty(0).reshape(0, 3) + + search = SearchCV(base_estimator, param_grid, min_resources="smallest") + + err_msg = "min_resources_=0: you might have passed an empty dataset X." 
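+    # an empty X makes the smallest admissible resource budget collapse to
+    # zero, which the search must reject at fit time with the error above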
+ with pytest.raises(ValueError, match=err_msg): + search.fit(X, []) + + +@pytest.mark.parametrize("SearchCV", [HalvingGridSearchCV, HalvingRandomSearchCV]) +def test_select_best_index(SearchCV): + """Check the selection strategy of the halving search.""" + results = { # this isn't a 'real world' result dict + "iter": np.array([0, 0, 0, 0, 1, 1, 2, 2, 2]), + "mean_test_score": np.array([4, 3, 5, 1, 11, 10, 5, 6, 9]), + "params": np.array(["a", "b", "c", "d", "e", "f", "g", "h", "i"]), + } + + # we expect the index of 'i' + best_index = SearchCV._select_best_index(None, None, results) + assert best_index == 8 + + +def test_halving_random_search_list_of_dicts(): + """Check the behaviour of the `HalvingRandomSearchCV` with `param_distribution` + being a list of dictionary. + """ + X, y = make_classification(n_samples=150, n_features=4, random_state=42) + + params = [ + {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)}, + {"kernel": ["poly"], "degree": [2, 3]}, + ] + param_keys = ( + "param_C", + "param_degree", + "param_gamma", + "param_kernel", + ) + score_keys = ( + "mean_test_score", + "mean_train_score", + "rank_test_score", + "split0_test_score", + "split1_test_score", + "split2_test_score", + "split0_train_score", + "split1_train_score", + "split2_train_score", + "std_test_score", + "std_train_score", + "mean_fit_time", + "std_fit_time", + "mean_score_time", + "std_score_time", + ) + extra_keys = ("n_resources", "iter") + + search = HalvingRandomSearchCV( + SVC(), cv=3, param_distributions=params, return_train_score=True, random_state=0 + ) + search.fit(X, y) + n_candidates = sum(search.n_candidates_) + cv_results = search.cv_results_ + # Check results structure + check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates, extra_keys) + expected_cv_results_kinds = { + "param_C": "f", + "param_degree": "i", + "param_gamma": "f", + "param_kernel": "O", + } + check_cv_results_array_types( + search, param_keys, score_keys, expected_cv_results_kinds + ) + + assert all( + ( + cv_results["param_C"].mask[i] + and cv_results["param_gamma"].mask[i] + and not cv_results["param_degree"].mask[i] + ) + for i in range(n_candidates) + if cv_results["param_kernel"][i] == "poly" + ) + assert all( + ( + not cv_results["param_C"].mask[i] + and not cv_results["param_gamma"].mask[i] + and cv_results["param_degree"].mask[i] + ) + for i in range(n_candidates) + if cv_results["param_kernel"][i] == "rbf" + ) diff --git a/.venv/Lib/site-packages/sklearn/model_selection/tests/test_validation.py b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_validation.py new file mode 100644 index 0000000000000000000000000000000000000000..c2a96d9b1e97f9c973a8b53f072fe50ff9f5ae13 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/model_selection/tests/test_validation.py @@ -0,0 +1,2711 @@ +"""Test the validation module""" + +import os +import re +import sys +import tempfile +import warnings +from functools import partial +from io import StringIO +from time import sleep + +import numpy as np +import pytest +from scipy.sparse import issparse + +from sklearn import config_context +from sklearn.base import BaseEstimator, ClassifierMixin, clone +from sklearn.cluster import KMeans +from sklearn.datasets import ( + load_diabetes, + load_digits, + load_iris, + make_classification, + make_multilabel_classification, + make_regression, +) +from sklearn.ensemble import RandomForestClassifier +from sklearn.exceptions import FitFailedWarning +from sklearn.impute import SimpleImputer +from 
sklearn.linear_model import ( + LogisticRegression, + PassiveAggressiveClassifier, + Ridge, + RidgeClassifier, + SGDClassifier, +) +from sklearn.metrics import ( + accuracy_score, + check_scoring, + confusion_matrix, + explained_variance_score, + make_scorer, + mean_squared_error, + precision_recall_fscore_support, + precision_score, + r2_score, +) +from sklearn.metrics._scorer import _MultimetricScorer +from sklearn.model_selection import ( + GridSearchCV, + GroupKFold, + GroupShuffleSplit, + KFold, + LeaveOneGroupOut, + LeaveOneOut, + LeavePGroupsOut, + ShuffleSplit, + StratifiedKFold, + cross_val_predict, + cross_val_score, + cross_validate, + learning_curve, + permutation_test_score, + validation_curve, +) +from sklearn.model_selection._validation import ( + _check_is_permutation, + _fit_and_score, + _score, +) +from sklearn.model_selection.tests.common import OneTimeSplitter +from sklearn.model_selection.tests.test_search import FailingClassifier +from sklearn.multiclass import OneVsRestClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.neural_network import MLPRegressor +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import LabelEncoder, scale +from sklearn.svm import SVC, LinearSVC +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingScorer, + ConsumingSplitter, + _Registry, + check_recorded_metadata, +) +from sklearn.utils import shuffle +from sklearn.utils._mocking import CheckingClassifier, MockDataFrame +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import COO_CONTAINERS, CSR_CONTAINERS +from sklearn.utils.validation import _num_samples + + +class MockImprovingEstimator(BaseEstimator): + """Dummy classifier to test the learning curve""" + + def __init__(self, n_max_train_sizes): + self.n_max_train_sizes = n_max_train_sizes + self.train_sizes = 0 + self.X_subset = None + + def fit(self, X_subset, y_subset=None): + self.X_subset = X_subset + self.train_sizes = X_subset.shape[0] + return self + + def predict(self, X): + raise NotImplementedError + + def score(self, X=None, Y=None): + # training score becomes worse (2 -> 1), test error better (0 -> 1) + if self._is_training_data(X): + return 2.0 - float(self.train_sizes) / self.n_max_train_sizes + else: + return float(self.train_sizes) / self.n_max_train_sizes + + def _is_training_data(self, X): + return X is self.X_subset + + +class MockIncrementalImprovingEstimator(MockImprovingEstimator): + """Dummy classifier that provides partial_fit""" + + def __init__(self, n_max_train_sizes, expected_fit_params=None): + super().__init__(n_max_train_sizes) + self.x = None + self.expected_fit_params = expected_fit_params + + def _is_training_data(self, X): + return self.x in X + + def partial_fit(self, X, y=None, **params): + self.train_sizes += X.shape[0] + self.x = X[0] + if self.expected_fit_params: + missing = set(self.expected_fit_params) - set(params) + if missing: + raise AssertionError( + f"Expected fit parameter(s) {list(missing)} not seen." + ) + for key, value in params.items(): + if key in self.expected_fit_params and _num_samples( + value + ) != _num_samples(X): + raise AssertionError( + f"Fit parameter {key} has length {_num_samples(value)}" + f"; expected {_num_samples(X)}." 
+ ) + + +class MockEstimatorWithParameter(BaseEstimator): + """Dummy classifier to test the validation curve""" + + def __init__(self, param=0.5): + self.X_subset = None + self.param = param + + def fit(self, X_subset, y_subset): + self.X_subset = X_subset + self.train_sizes = X_subset.shape[0] + return self + + def predict(self, X): + raise NotImplementedError + + def score(self, X=None, y=None): + return self.param if self._is_training_data(X) else 1 - self.param + + def _is_training_data(self, X): + return X is self.X_subset + + +class MockEstimatorWithSingleFitCallAllowed(MockEstimatorWithParameter): + """Dummy classifier that disallows repeated calls of fit method""" + + def fit(self, X_subset, y_subset): + assert not hasattr(self, "fit_called_"), "fit is called the second time" + self.fit_called_ = True + return super().fit(X_subset, y_subset) + + def predict(self, X): + raise NotImplementedError + + +class MockClassifier(ClassifierMixin, BaseEstimator): + """Dummy classifier to test the cross-validation""" + + def __init__(self, a=0, allow_nd=False): + self.a = a + self.allow_nd = allow_nd + + def fit( + self, + X, + Y=None, + sample_weight=None, + class_prior=None, + sparse_sample_weight=None, + sparse_param=None, + dummy_int=None, + dummy_str=None, + dummy_obj=None, + callback=None, + ): + """The dummy arguments are to test that this fit function can + accept non-array arguments through cross-validation, such as: + - int + - str (this is actually array-like) + - object + - function + """ + self.dummy_int = dummy_int + self.dummy_str = dummy_str + self.dummy_obj = dummy_obj + if callback is not None: + callback(self) + + if self.allow_nd: + X = X.reshape(len(X), -1) + if X.ndim >= 3 and not self.allow_nd: + raise ValueError("X cannot be d") + if sample_weight is not None: + assert sample_weight.shape[0] == X.shape[0], ( + "MockClassifier extra fit_param " + "sample_weight.shape[0] is {0}, should be {1}".format( + sample_weight.shape[0], X.shape[0] + ) + ) + if class_prior is not None: + assert class_prior.shape[0] == len(np.unique(y)), ( + "MockClassifier extra fit_param class_prior.shape[0]" + " is {0}, should be {1}".format(class_prior.shape[0], len(np.unique(y))) + ) + if sparse_sample_weight is not None: + fmt = ( + "MockClassifier extra fit_param sparse_sample_weight" + ".shape[0] is {0}, should be {1}" + ) + assert sparse_sample_weight.shape[0] == X.shape[0], fmt.format( + sparse_sample_weight.shape[0], X.shape[0] + ) + if sparse_param is not None: + fmt = ( + "MockClassifier extra fit_param sparse_param.shape " + "is ({0}, {1}), should be ({2}, {3})" + ) + assert sparse_param.shape == P.shape, fmt.format( + sparse_param.shape[0], + sparse_param.shape[1], + P.shape[0], + P.shape[1], + ) + self.classes_ = np.unique(y) + return self + + def predict(self, T): + if self.allow_nd: + T = T.reshape(len(T), -1) + return T[:, 0] + + def predict_proba(self, T): + return T + + def score(self, X=None, Y=None): + return 1.0 / (1 + np.abs(self.a)) + + def get_params(self, deep=False): + return {"a": self.a, "allow_nd": self.allow_nd} + + +# XXX: use 2D array, since 1D X is being detected as a single sample in +# check_consistent_length +X = np.ones((15, 2)) +y = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6]) +# The number of samples per class needs to be > n_splits, +# for StratifiedKFold(n_splits=3) +y2 = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) +P = np.eye(5) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_score(coo_container): + 
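+    # MockClassifier.score returns the constant 1 / (1 + |a|) for any input
+    # (e.g. a=0 -> 1.0, a=3 -> 0.25), so every fold score must equal the
+    # score on the full data; this smoke-tests cross_val_score across many
+    # values of `a` and across dense, sparse, and list inputs.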
clf = MockClassifier() + X_sparse = coo_container(X) + + for a in range(-10, 10): + clf.a = a + # Smoke test + scores = cross_val_score(clf, X, y2) + assert_array_equal(scores, clf.score(X, y2)) + + # test with multioutput y + multioutput_y = np.column_stack([y2, y2[::-1]]) + scores = cross_val_score(clf, X_sparse, multioutput_y) + assert_array_equal(scores, clf.score(X_sparse, multioutput_y)) + + scores = cross_val_score(clf, X_sparse, y2) + assert_array_equal(scores, clf.score(X_sparse, y2)) + + # test with multioutput y + scores = cross_val_score(clf, X_sparse, multioutput_y) + assert_array_equal(scores, clf.score(X_sparse, multioutput_y)) + + # test with X and y as list + list_check = lambda x: isinstance(x, list) + clf = CheckingClassifier(check_X=list_check) + scores = cross_val_score(clf, X.tolist(), y2.tolist(), cv=3) + + clf = CheckingClassifier(check_y=list_check) + scores = cross_val_score(clf, X, y2.tolist(), cv=3) + + # test with 3d X and + X_3d = X[:, :, np.newaxis] + clf = MockClassifier(allow_nd=True) + scores = cross_val_score(clf, X_3d, y2) + + clf = MockClassifier(allow_nd=False) + with pytest.raises(ValueError): + cross_val_score(clf, X_3d, y2, error_score="raise") + + +def test_cross_validate_many_jobs(): + # regression test for #12154: cv='warn' with n_jobs>1 trigger a copy of + # the parameters leading to a failure in check_cv due to cv is 'warn' + # instead of cv == 'warn'. + X, y = load_iris(return_X_y=True) + clf = SVC(gamma="auto") + grid = GridSearchCV(clf, param_grid={"C": [1, 10]}) + cross_validate(grid, X, y, n_jobs=2) + + +def test_cross_validate_invalid_scoring_param(): + X, y = make_classification(random_state=0) + estimator = MockClassifier() + + # Test the errors + error_message_regexp = ".*must be unique strings.*" + + # List/tuple of callables should raise a message advising users to use + # dict of names to callables mapping + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate( + estimator, + X, + y, + scoring=(make_scorer(precision_score), make_scorer(accuracy_score)), + ) + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate(estimator, X, y, scoring=(make_scorer(precision_score),)) + + # So should empty lists/tuples + with pytest.raises(ValueError, match=error_message_regexp + "Empty list.*"): + cross_validate(estimator, X, y, scoring=()) + + # So should duplicated entries + with pytest.raises(ValueError, match=error_message_regexp + "Duplicate.*"): + cross_validate(estimator, X, y, scoring=("f1_micro", "f1_micro")) + + # Nested Lists should raise a generic error message + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate(estimator, X, y, scoring=[[make_scorer(precision_score)]]) + + # Empty dict should raise invalid scoring error + with pytest.raises(ValueError, match="An empty dict"): + cross_validate(estimator, X, y, scoring=(dict())) + + multiclass_scorer = make_scorer(precision_recall_fscore_support) + + # Multiclass Scorers that return multiple values are not supported yet + # the warning message we're expecting to see + warning_message = ( + "Scoring failed. The score on this train-test " + f"partition for these parameters will be set to {np.nan}. 
" + "Details: \n" + ) + + with pytest.warns(UserWarning, match=warning_message): + cross_validate(estimator, X, y, scoring=multiclass_scorer) + + with pytest.warns(UserWarning, match=warning_message): + cross_validate(estimator, X, y, scoring={"foo": multiclass_scorer}) + + +def test_cross_validate_nested_estimator(): + # Non-regression test to ensure that nested + # estimators are properly returned in a list + # https://github.com/scikit-learn/scikit-learn/pull/17745 + (X, y) = load_iris(return_X_y=True) + pipeline = Pipeline( + [ + ("imputer", SimpleImputer()), + ("classifier", MockClassifier()), + ] + ) + + results = cross_validate(pipeline, X, y, return_estimator=True) + estimators = results["estimator"] + + assert isinstance(estimators, list) + assert all(isinstance(estimator, Pipeline) for estimator in estimators) + + +@pytest.mark.parametrize("use_sparse", [False, True]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_cross_validate(use_sparse: bool, csr_container): + # Compute train and test mse/r2 scores + cv = KFold() + + # Regression + X_reg, y_reg = make_regression(n_samples=30, random_state=0) + reg = Ridge(random_state=0) + + # Classification + X_clf, y_clf = make_classification(n_samples=30, random_state=0) + clf = SVC(kernel="linear", random_state=0) + + if use_sparse: + X_reg = csr_container(X_reg) + X_clf = csr_container(X_clf) + + for X, y, est in ((X_reg, y_reg, reg), (X_clf, y_clf, clf)): + # It's okay to evaluate regression metrics on classification too + mse_scorer = check_scoring(est, scoring="neg_mean_squared_error") + r2_scorer = check_scoring(est, scoring="r2") + train_mse_scores = [] + test_mse_scores = [] + train_r2_scores = [] + test_r2_scores = [] + fitted_estimators = [] + + for train, test in cv.split(X, y): + est = clone(est).fit(X[train], y[train]) + train_mse_scores.append(mse_scorer(est, X[train], y[train])) + train_r2_scores.append(r2_scorer(est, X[train], y[train])) + test_mse_scores.append(mse_scorer(est, X[test], y[test])) + test_r2_scores.append(r2_scorer(est, X[test], y[test])) + fitted_estimators.append(est) + + train_mse_scores = np.array(train_mse_scores) + test_mse_scores = np.array(test_mse_scores) + train_r2_scores = np.array(train_r2_scores) + test_r2_scores = np.array(test_r2_scores) + fitted_estimators = np.array(fitted_estimators) + + scores = ( + train_mse_scores, + test_mse_scores, + train_r2_scores, + test_r2_scores, + fitted_estimators, + ) + + # To ensure that the test does not suffer from + # large statistical fluctuations due to slicing small datasets, + # we pass the cross-validation instance + check_cross_validate_single_metric(est, X, y, scores, cv) + check_cross_validate_multi_metric(est, X, y, scores, cv) + + +def check_cross_validate_single_metric(clf, X, y, scores, cv): + ( + train_mse_scores, + test_mse_scores, + train_r2_scores, + test_r2_scores, + fitted_estimators, + ) = scores + # Test single metric evaluation when scoring is string or singleton list + for return_train_score, dict_len in ((True, 4), (False, 3)): + # Single metric passed as a string + if return_train_score: + mse_scores_dict = cross_validate( + clf, + X, + y, + scoring="neg_mean_squared_error", + return_train_score=True, + cv=cv, + ) + assert_array_almost_equal(mse_scores_dict["train_score"], train_mse_scores) + else: + mse_scores_dict = cross_validate( + clf, + X, + y, + scoring="neg_mean_squared_error", + return_train_score=False, + cv=cv, + ) + assert isinstance(mse_scores_dict, dict) + assert len(mse_scores_dict) == dict_len + 
assert_array_almost_equal(mse_scores_dict["test_score"], test_mse_scores) + + # Single metric passed as a list + if return_train_score: + # return_train_score used to default to True (deprecated behavior) + r2_scores_dict = cross_validate( + clf, X, y, scoring=["r2"], return_train_score=True, cv=cv + ) + assert_array_almost_equal(r2_scores_dict["train_r2"], train_r2_scores) + else: + r2_scores_dict = cross_validate( + clf, X, y, scoring=["r2"], return_train_score=False, cv=cv + ) + assert isinstance(r2_scores_dict, dict) + assert len(r2_scores_dict) == dict_len + assert_array_almost_equal(r2_scores_dict["test_r2"], test_r2_scores) + + # Test return_estimator option + mse_scores_dict = cross_validate( + clf, X, y, scoring="neg_mean_squared_error", return_estimator=True, cv=cv + ) + for k, est in enumerate(mse_scores_dict["estimator"]): + est_coef = est.coef_.copy() + if issparse(est_coef): + est_coef = est_coef.toarray() + + fitted_est_coef = fitted_estimators[k].coef_.copy() + if issparse(fitted_est_coef): + fitted_est_coef = fitted_est_coef.toarray() + + assert_almost_equal(est_coef, fitted_est_coef) + assert_almost_equal(est.intercept_, fitted_estimators[k].intercept_) + + +def check_cross_validate_multi_metric(clf, X, y, scores, cv): + # Test multimetric evaluation when scoring is a list / dict + ( + train_mse_scores, + test_mse_scores, + train_r2_scores, + test_r2_scores, + fitted_estimators, + ) = scores + + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + return { + "r2": r2_score(y, y_pred), + "neg_mean_squared_error": -mean_squared_error(y, y_pred), + } + + all_scoring = ( + ("r2", "neg_mean_squared_error"), + { + "r2": make_scorer(r2_score), + "neg_mean_squared_error": "neg_mean_squared_error", + }, + custom_scorer, + ) + + keys_sans_train = { + "test_r2", + "test_neg_mean_squared_error", + "fit_time", + "score_time", + } + keys_with_train = keys_sans_train.union( + {"train_r2", "train_neg_mean_squared_error"} + ) + + for return_train_score in (True, False): + for scoring in all_scoring: + if return_train_score: + # return_train_score used to default to True (deprecated behavior) + cv_results = cross_validate( + clf, X, y, scoring=scoring, return_train_score=True, cv=cv + ) + assert_array_almost_equal(cv_results["train_r2"], train_r2_scores) + assert_array_almost_equal( + cv_results["train_neg_mean_squared_error"], train_mse_scores + ) + else: + cv_results = cross_validate( + clf, X, y, scoring=scoring, return_train_score=False, cv=cv + ) + assert isinstance(cv_results, dict) + assert set(cv_results.keys()) == ( + keys_with_train if return_train_score else keys_sans_train + ) + assert_array_almost_equal(cv_results["test_r2"], test_r2_scores) + assert_array_almost_equal( + cv_results["test_neg_mean_squared_error"], test_mse_scores + ) + + # Make sure all the arrays are of np.ndarray type + assert isinstance(cv_results["test_r2"], np.ndarray) + assert isinstance(cv_results["test_neg_mean_squared_error"], np.ndarray) + assert isinstance(cv_results["fit_time"], np.ndarray) + assert isinstance(cv_results["score_time"], np.ndarray) + + # Ensure all the times are within sane limits + assert np.all(cv_results["fit_time"] >= 0) + assert np.all(cv_results["fit_time"] < 10) + assert np.all(cv_results["score_time"] >= 0) + assert np.all(cv_results["score_time"] < 10) + + +def test_cross_val_score_predict_groups(): + # Check that the ValueError raised when groups is None propagates to + # cross_val_score and cross_val_predict, and that groups is correctly + # passed to the cv object + X, y = 
make_classification(n_samples=20, n_classes=2, random_state=0) + + clf = SVC(kernel="linear") + + group_cvs = [ + LeaveOneGroupOut(), + LeavePGroupsOut(2), + GroupKFold(), + GroupShuffleSplit(), + ] + error_message = "The 'groups' parameter should not be None." + for cv in group_cvs: + with pytest.raises(ValueError, match=error_message): + cross_val_score(estimator=clf, X=X, y=y, cv=cv) + with pytest.raises(ValueError, match=error_message): + cross_val_predict(estimator=clf, X=X, y=y, cv=cv) + + +def test_cross_val_score_pandas(): + # check cross_val_score doesn't destroy pandas dataframe + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((Series, DataFrame)) + except ImportError: + pass + for TargetType, InputFeatureType in types: + # X dataframe, y series + # 3 fold cross val is used so we need at least 3 samples per class + X_df, y_ser = InputFeatureType(X), TargetType(y2) + check_df = lambda x: isinstance(x, InputFeatureType) + check_series = lambda x: isinstance(x, TargetType) + clf = CheckingClassifier(check_X=check_df, check_y=check_series) + cross_val_score(clf, X_df, y_ser, cv=3) + + +def test_cross_val_score_mask(): + # test that cross_val_score works with boolean masks + svm = SVC(kernel="linear") + iris = load_iris() + X, y = iris.data, iris.target + kfold = KFold(5) + scores_indices = cross_val_score(svm, X, y, cv=kfold) + kfold = KFold(5) + cv_masks = [] + for train, test in kfold.split(X, y): + mask_train = np.zeros(len(y), dtype=bool) + mask_test = np.zeros(len(y), dtype=bool) + mask_train[train] = 1 + mask_test[test] = 1 + cv_masks.append((mask_train, mask_test)) + scores_masks = cross_val_score(svm, X, y, cv=cv_masks) + assert_array_equal(scores_indices, scores_masks) + + +def test_cross_val_score_precomputed(): + # test for svm with precomputed kernel + svm = SVC(kernel="precomputed") + iris = load_iris() + X, y = iris.data, iris.target + linear_kernel = np.dot(X, X.T) + score_precomputed = cross_val_score(svm, linear_kernel, y) + svm = SVC(kernel="linear") + score_linear = cross_val_score(svm, X, y) + assert_array_almost_equal(score_precomputed, score_linear) + + # test with callable + svm = SVC(kernel=lambda x, y: np.dot(x, y.T)) + score_callable = cross_val_score(svm, X, y) + assert_array_almost_equal(score_precomputed, score_callable) + + # Error raised for non-square X + svm = SVC(kernel="precomputed") + with pytest.raises(ValueError): + cross_val_score(svm, X, y) + + # test error is raised when the precomputed kernel is not array-like + # or sparse + with pytest.raises(ValueError): + cross_val_score(svm, linear_kernel.tolist(), y) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_score_fit_params(coo_container): + clf = MockClassifier() + n_samples = X.shape[0] + n_classes = len(np.unique(y)) + + W_sparse = coo_container( + (np.array([1]), (np.array([1]), np.array([0]))), shape=(15, 1) + ) + P_sparse = coo_container(np.eye(5)) + + DUMMY_INT = 42 + DUMMY_STR = "42" + DUMMY_OBJ = object() + + def assert_fit_params(clf): + # Check that the non-array fit param values are passed through to the + # classifier unchanged + + assert clf.dummy_int == DUMMY_INT + assert clf.dummy_str == DUMMY_STR + assert clf.dummy_obj == DUMMY_OBJ + + fit_params = { + "sample_weight": np.ones(n_samples), + "class_prior": np.full(n_classes, 1.0 / n_classes), + "sparse_sample_weight": W_sparse, + "sparse_param": P_sparse, + "dummy_int": DUMMY_INT, + "dummy_str": DUMMY_STR, + "dummy_obj": DUMMY_OBJ, + 
"callback": assert_fit_params, + } + cross_val_score(clf, X, y2, params=fit_params) + + +def test_cross_val_score_score_func(): + clf = MockClassifier() + _score_func_args = [] + + def score_func(y_test, y_predict): + _score_func_args.append((y_test, y_predict)) + return 1.0 + + with warnings.catch_warnings(record=True): + scoring = make_scorer(score_func) + score = cross_val_score(clf, X, y, scoring=scoring, cv=3) + assert_array_equal(score, [1.0, 1.0, 1.0]) + # Test that score function is called only 3 times (for cv=3) + assert len(_score_func_args) == 3 + + +def test_cross_val_score_with_score_func_classification(): + iris = load_iris() + clf = SVC(kernel="linear") + + # Default score (should be the accuracy score) + scores = cross_val_score(clf, iris.data, iris.target) + assert_array_almost_equal(scores, [0.97, 1.0, 0.97, 0.97, 1.0], 2) + + # Correct classification score (aka. zero / one score) - should be the + # same as the default estimator score + zo_scores = cross_val_score(clf, iris.data, iris.target, scoring="accuracy") + assert_array_almost_equal(zo_scores, [0.97, 1.0, 0.97, 0.97, 1.0], 2) + + # F1 score (class are balanced so f1_score should be equal to zero/one + # score + f1_scores = cross_val_score(clf, iris.data, iris.target, scoring="f1_weighted") + assert_array_almost_equal(f1_scores, [0.97, 1.0, 0.97, 0.97, 1.0], 2) + + +def test_cross_val_score_with_score_func_regression(): + X, y = make_regression(n_samples=30, n_features=20, n_informative=5, random_state=0) + reg = Ridge() + + # Default score of the Ridge regression estimator + scores = cross_val_score(reg, X, y) + assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) + + # R2 score (aka. determination coefficient) - should be the + # same as the default estimator score + r2_scores = cross_val_score(reg, X, y, scoring="r2") + assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) + + # Mean squared error; this is a loss function, so "scores" are negative + neg_mse_scores = cross_val_score(reg, X, y, scoring="neg_mean_squared_error") + expected_neg_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99]) + assert_array_almost_equal(neg_mse_scores, expected_neg_mse, 2) + + # Explained variance + scoring = make_scorer(explained_variance_score) + ev_scores = cross_val_score(reg, X, y, scoring=scoring) + assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_permutation_score(coo_container): + iris = load_iris() + X = iris.data + X_sparse = coo_container(X) + y = iris.target + svm = SVC(kernel="linear") + cv = StratifiedKFold(2) + + score, scores, pvalue = permutation_test_score( + svm, X, y, n_permutations=30, cv=cv, scoring="accuracy" + ) + assert score > 0.9 + assert_almost_equal(pvalue, 0.0, 1) + + score_group, _, pvalue_group = permutation_test_score( + svm, + X, + y, + n_permutations=30, + cv=cv, + scoring="accuracy", + groups=np.ones(y.size), + random_state=0, + ) + assert score_group == score + assert pvalue_group == pvalue + + # check that we obtain the same results with a sparse representation + svm_sparse = SVC(kernel="linear") + cv_sparse = StratifiedKFold(2) + score_group, _, pvalue_group = permutation_test_score( + svm_sparse, + X_sparse, + y, + n_permutations=30, + cv=cv_sparse, + scoring="accuracy", + groups=np.ones(y.size), + random_state=0, + ) + + assert score_group == score + assert pvalue_group == pvalue + + # test with custom scoring object + def custom_score(y_true, 
y_pred): + return ((y_true == y_pred).sum() - (y_true != y_pred).sum()) / y_true.shape[0] + + scorer = make_scorer(custom_score) + score, _, pvalue = permutation_test_score( + svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0 + ) + assert_almost_equal(score, 0.93, 2) + assert_almost_equal(pvalue, 0.01, 3) + + # set random y + y = np.mod(np.arange(len(y)), 3) + + score, scores, pvalue = permutation_test_score( + svm, X, y, n_permutations=30, cv=cv, scoring="accuracy" + ) + + assert score < 0.5 + assert pvalue > 0.2 + + +def test_permutation_test_score_allow_nans(): + # Check that permutation_test_score allows input data with NaNs + X = np.arange(200, dtype=np.float64).reshape(10, -1) + X[2, :] = np.nan + y = np.repeat([0, 1], X.shape[0] / 2) + p = Pipeline( + [ + ("imputer", SimpleImputer(strategy="mean", missing_values=np.nan)), + ("classifier", MockClassifier()), + ] + ) + permutation_test_score(p, X, y) + + +def test_permutation_test_score_params(): + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + clf = CheckingClassifier(expected_sample_weight=True) + + err_msg = r"Expected sample_weight to be passed" + with pytest.raises(AssertionError, match=err_msg): + permutation_test_score(clf, X, y) + + err_msg = r"sample_weight.shape == \(1,\), expected \(8,\)!" + with pytest.raises(ValueError, match=err_msg): + permutation_test_score(clf, X, y, params={"sample_weight": np.ones(1)}) + permutation_test_score(clf, X, y, params={"sample_weight": np.ones(10)}) + + +def test_cross_val_score_allow_nans(): + # Check that cross_val_score allows input data with NaNs + X = np.arange(200, dtype=np.float64).reshape(10, -1) + X[2, :] = np.nan + y = np.repeat([0, 1], X.shape[0] / 2) + p = Pipeline( + [ + ("imputer", SimpleImputer(strategy="mean", missing_values=np.nan)), + ("classifier", MockClassifier()), + ] + ) + cross_val_score(p, X, y) + + +def test_cross_val_score_multilabel(): + X = np.array( + [ + [-3, 4], + [2, 4], + [3, 3], + [0, 2], + [-3, 1], + [-2, 1], + [0, 0], + [-2, -1], + [-1, -2], + [1, -2], + ] + ) + y = np.array( + [[1, 1], [0, 1], [0, 1], [0, 1], [1, 1], [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]] + ) + clf = KNeighborsClassifier(n_neighbors=1) + scoring_micro = make_scorer(precision_score, average="micro") + scoring_macro = make_scorer(precision_score, average="macro") + scoring_samples = make_scorer(precision_score, average="samples") + score_micro = cross_val_score(clf, X, y, scoring=scoring_micro) + score_macro = cross_val_score(clf, X, y, scoring=scoring_macro) + score_samples = cross_val_score(clf, X, y, scoring=scoring_samples) + assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3]) + assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]) + assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_predict(coo_container): + X, y = load_diabetes(return_X_y=True) + cv = KFold() + + est = Ridge() + + # Naive loop (should be same as cross_val_predict): + preds2 = np.zeros_like(y) + for train, test in cv.split(X, y): + est.fit(X[train], y[train]) + preds2[test] = est.predict(X[test]) + + preds = cross_val_predict(est, X, y, cv=cv) + assert_array_almost_equal(preds, preds2) + + preds = cross_val_predict(est, X, y) + assert len(preds) == len(y) + + cv = LeaveOneOut() + preds = cross_val_predict(est, X, y, cv=cv) + assert len(preds) == len(y) + + Xsp = X.copy() + Xsp *= Xsp > np.median(Xsp) + Xsp = coo_container(Xsp) + preds = 
cross_val_predict(est, Xsp, y) + assert_array_almost_equal(len(preds), len(y)) + + preds = cross_val_predict(KMeans(n_init="auto"), X) + assert len(preds) == len(y) + + class BadCV: + def split(self, X, y=None, groups=None): + for i in range(4): + yield np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7, 8]) + + with pytest.raises(ValueError): + cross_val_predict(est, X, y, cv=BadCV()) + + X, y = load_iris(return_X_y=True) + + warning_message = ( + r"Number of classes in training fold \(2\) does " + r"not match total number of classes \(3\). " + "Results may not be appropriate for your use case." + ) + with pytest.warns(RuntimeWarning, match=warning_message): + cross_val_predict( + LogisticRegression(solver="liblinear"), + X, + y, + method="predict_proba", + cv=KFold(2), + ) + + +def test_cross_val_predict_decision_function_shape(): + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + + preds = cross_val_predict( + LogisticRegression(solver="liblinear"), X, y, method="decision_function" + ) + assert preds.shape == (50,) + + X, y = load_iris(return_X_y=True) + + preds = cross_val_predict( + LogisticRegression(solver="liblinear"), X, y, method="decision_function" + ) + assert preds.shape == (150, 3) + + # This specifically tests imbalanced splits for binary + # classification with decision_function. This is only + # applicable to classifiers that can be fit on a single + # class. + X = X[:100] + y = y[:100] + error_message = ( + "Only 1 class/es in training fold," + " but 2 in overall dataset. This" + " is not supported for decision_function" + " with imbalanced folds. To fix " + "this, use a cross-validation technique " + "resulting in properly stratified folds" + ) + with pytest.raises(ValueError, match=error_message): + cross_val_predict( + RidgeClassifier(), X, y, method="decision_function", cv=KFold(2) + ) + + X, y = load_digits(return_X_y=True) + est = SVC(kernel="linear", decision_function_shape="ovo") + + preds = cross_val_predict(est, X, y, method="decision_function") + assert preds.shape == (1797, 45) + + ind = np.argsort(y) + X, y = X[ind], y[ind] + error_message_regexp = ( + r"Output shape \(599L?, 21L?\) of " + "decision_function does not match number of " + r"classes \(7\) in fold. 
Irregular " + "decision_function .*" + ) + with pytest.raises(ValueError, match=error_message_regexp): + cross_val_predict(est, X, y, cv=KFold(n_splits=3), method="decision_function") + + +def test_cross_val_predict_predict_proba_shape(): + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + + preds = cross_val_predict( + LogisticRegression(solver="liblinear"), X, y, method="predict_proba" + ) + assert preds.shape == (50, 2) + + X, y = load_iris(return_X_y=True) + + preds = cross_val_predict( + LogisticRegression(solver="liblinear"), X, y, method="predict_proba" + ) + assert preds.shape == (150, 3) + + +def test_cross_val_predict_predict_log_proba_shape(): + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + + preds = cross_val_predict( + LogisticRegression(solver="liblinear"), X, y, method="predict_log_proba" + ) + assert preds.shape == (50, 2) + + X, y = load_iris(return_X_y=True) + + preds = cross_val_predict( + LogisticRegression(solver="liblinear"), X, y, method="predict_log_proba" + ) + assert preds.shape == (150, 3) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_predict_input_types(coo_container): + iris = load_iris() + X, y = iris.data, iris.target + X_sparse = coo_container(X) + multioutput_y = np.column_stack([y, y[::-1]]) + + clf = Ridge(fit_intercept=False, random_state=0) + # 3 fold cv is used --> at least 3 samples per class + # Smoke test + predictions = cross_val_predict(clf, X, y) + assert predictions.shape == (150,) + + # test with multioutput y + predictions = cross_val_predict(clf, X_sparse, multioutput_y) + assert predictions.shape == (150, 2) + + predictions = cross_val_predict(clf, X_sparse, y) + assert_array_equal(predictions.shape, (150,)) + + # test with multioutput y + predictions = cross_val_predict(clf, X_sparse, multioutput_y) + assert_array_equal(predictions.shape, (150, 2)) + + # test with X and y as list + list_check = lambda x: isinstance(x, list) + clf = CheckingClassifier(check_X=list_check) + predictions = cross_val_predict(clf, X.tolist(), y.tolist()) + + clf = CheckingClassifier(check_y=list_check) + predictions = cross_val_predict(clf, X, y.tolist()) + + # test with X and y as list and non empty method + predictions = cross_val_predict( + LogisticRegression(solver="liblinear"), + X.tolist(), + y.tolist(), + method="decision_function", + ) + predictions = cross_val_predict( + LogisticRegression(solver="liblinear"), + X, + y.tolist(), + method="decision_function", + ) + + # test with 3d X and + X_3d = X[:, :, np.newaxis] + check_3d = lambda x: x.ndim == 3 + clf = CheckingClassifier(check_X=check_3d) + predictions = cross_val_predict(clf, X_3d, y) + assert_array_equal(predictions.shape, (150,)) + + +def test_cross_val_predict_pandas(): + # check cross_val_score doesn't destroy pandas dataframe + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((Series, DataFrame)) + except ImportError: + pass + for TargetType, InputFeatureType in types: + # X dataframe, y series + X_df, y_ser = InputFeatureType(X), TargetType(y2) + check_df = lambda x: isinstance(x, InputFeatureType) + check_series = lambda x: isinstance(x, TargetType) + clf = CheckingClassifier(check_X=check_df, check_y=check_series) + cross_val_predict(clf, X_df, y_ser, cv=3) + + +def test_cross_val_predict_unbalanced(): + X, y = make_classification( + n_samples=100, + n_features=2, + n_redundant=0, + n_informative=2, + n_clusters_per_class=1, + random_state=1, + ) + # 
Change the first sample to a new class + y[0] = 2 + clf = LogisticRegression(random_state=1, solver="liblinear") + cv = StratifiedKFold(n_splits=2) + train, test = list(cv.split(X, y)) + yhat_proba = cross_val_predict(clf, X, y, cv=cv, method="predict_proba") + assert y[test[0]][0] == 2 # sanity check for further assertions + assert np.all(yhat_proba[test[0]][:, 2] == 0) + assert np.all(yhat_proba[test[0]][:, 0:1] > 0) + assert np.all(yhat_proba[test[1]] > 0) + assert_array_almost_equal(yhat_proba.sum(axis=1), np.ones(y.shape), decimal=12) + + +def test_cross_val_predict_y_none(): + # ensure that cross_val_predict works when y is None + mock_classifier = MockClassifier() + rng = np.random.RandomState(42) + X = rng.rand(100, 10) + y_hat = cross_val_predict(mock_classifier, X, y=None, cv=5, method="predict") + assert_allclose(X[:, 0], y_hat) + y_hat_proba = cross_val_predict( + mock_classifier, X, y=None, cv=5, method="predict_proba" + ) + assert_allclose(X, y_hat_proba) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_score_sparse_fit_params(coo_container): + iris = load_iris() + X, y = iris.data, iris.target + clf = MockClassifier() + fit_params = {"sparse_sample_weight": coo_container(np.eye(X.shape[0]))} + a = cross_val_score(clf, X, y, params=fit_params, cv=3) + assert_array_equal(a, np.ones(3)) + + +def test_learning_curve(): + n_samples = 30 + n_splits = 3 + X, y = make_classification( + n_samples=n_samples, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(n_samples * ((n_splits - 1) / n_splits)) + for shuffle_train in [False, True]: + with warnings.catch_warnings(record=True) as w: + ( + train_sizes, + train_scores, + test_scores, + fit_times, + score_times, + ) = learning_curve( + estimator, + X, + y, + cv=KFold(n_splits=n_splits), + train_sizes=np.linspace(0.1, 1.0, 10), + shuffle=shuffle_train, + return_times=True, + ) + if len(w) > 0: + raise RuntimeError("Unexpected warning: %r" % w[0].message) + assert train_scores.shape == (10, 3) + assert test_scores.shape == (10, 3) + assert fit_times.shape == (10, 3) + assert score_times.shape == (10, 3) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + # Cannot use assert_array_almost_equal for fit and score times because + # the values are hardware-dependant + assert fit_times.dtype == "float64" + assert score_times.dtype == "float64" + + # Test a custom cv splitter that can iterate only once + with warnings.catch_warnings(record=True) as w: + train_sizes2, train_scores2, test_scores2 = learning_curve( + estimator, + X, + y, + cv=OneTimeSplitter(n_splits=n_splits, n_samples=n_samples), + train_sizes=np.linspace(0.1, 1.0, 10), + shuffle=shuffle_train, + ) + if len(w) > 0: + raise RuntimeError("Unexpected warning: %r" % w[0].message) + assert_array_almost_equal(train_scores2, train_scores) + assert_array_almost_equal(test_scores2, test_scores) + + +def test_learning_curve_unsupervised(): + X, _ = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(20) + train_sizes, train_scores, test_scores = learning_curve( + estimator, X, y=None, cv=3, train_sizes=np.linspace(0.1, 1.0, 10) + ) + 
assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + +def test_learning_curve_verbose(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(20) + + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + train_sizes, train_scores, test_scores = learning_curve( + estimator, X, y, cv=3, verbose=1 + ) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + + assert "[learning_curve]" in out + + +def test_learning_curve_incremental_learning_not_possible(): + X, y = make_classification( + n_samples=2, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + # The mockup does not have partial_fit() + estimator = MockImprovingEstimator(1) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, exploit_incremental_learning=True) + + +def test_learning_curve_incremental_learning(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockIncrementalImprovingEstimator(20) + for shuffle_train in [False, True]: + train_sizes, train_scores, test_scores = learning_curve( + estimator, + X, + y, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + shuffle=shuffle_train, + ) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + +def test_learning_curve_incremental_learning_unsupervised(): + X, _ = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockIncrementalImprovingEstimator(20) + train_sizes, train_scores, test_scores = learning_curve( + estimator, + X, + y=None, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + ) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + +def test_learning_curve_batch_and_incremental_learning_are_equal(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + train_sizes = np.linspace(0.2, 1.0, 5) + estimator = PassiveAggressiveClassifier(max_iter=1, tol=None, shuffle=False) + + train_sizes_inc, train_scores_inc, test_scores_inc = learning_curve( + estimator, + X, + y, + train_sizes=train_sizes, + cv=3, + exploit_incremental_learning=True, + ) + train_sizes_batch, train_scores_batch, test_scores_batch = learning_curve( + estimator, + X, + y, + cv=3, + train_sizes=train_sizes, + exploit_incremental_learning=False, + ) + + assert_array_equal(train_sizes_inc, train_sizes_batch) + assert_array_almost_equal( + train_scores_inc.mean(axis=1), train_scores_batch.mean(axis=1) + ) + assert_array_almost_equal( + test_scores_inc.mean(axis=1), test_scores_batch.mean(axis=1) + ) + + +def 
test_learning_curve_n_sample_range_out_of_bounds(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(20) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0, 1]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0.0, 1.0]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0.1, 1.1]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0, 20]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[1, 21]) + + +def test_learning_curve_remove_duplicate_sample_sizes(): + X, y = make_classification( + n_samples=3, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(2) + warning_message = ( + "Removed duplicate entries from 'train_sizes'. Number of ticks " + "will be less than the size of 'train_sizes': 2 instead of 3." + ) + with pytest.warns(RuntimeWarning, match=warning_message): + train_sizes, _, _ = learning_curve( + estimator, X, y, cv=3, train_sizes=np.linspace(0.33, 1.0, 3) + ) + assert_array_equal(train_sizes, [1, 2]) + + +def test_learning_curve_with_boolean_indices(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(20) + cv = KFold(n_splits=3) + train_sizes, train_scores, test_scores = learning_curve( + estimator, X, y, cv=cv, train_sizes=np.linspace(0.1, 1.0, 10) + ) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + +def test_learning_curve_with_shuffle(): + # Following test case was designed this way to verify the code + # changes made in pull request: #7506. + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [11, 12], + [13, 14], + [15, 16], + [17, 18], + [19, 20], + [7, 8], + [9, 10], + [11, 12], + [13, 14], + [15, 16], + [17, 18], + ] + ) + y = np.array([1, 1, 1, 2, 3, 4, 1, 1, 2, 3, 4, 1, 2, 3, 4]) + groups = np.array([1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 4, 4, 4, 4]) + # Splits on these groups fail without shuffle as the first iteration + # of the learning curve doesn't contain label 4 in the training set. 
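+ # (Without shuffling, learning_curve takes a prefix of each training fold
+ # when subsampling to the smaller train_sizes, and the leading groups here
+ # never contain label 4; shuffle=True permutes the training fold before
+ # slicing, so every tick can see all classes.)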
+ estimator = PassiveAggressiveClassifier(max_iter=5, tol=None, shuffle=False) + + cv = GroupKFold(n_splits=2) + train_sizes_batch, train_scores_batch, test_scores_batch = learning_curve( + estimator, + X, + y, + cv=cv, + n_jobs=1, + train_sizes=np.linspace(0.3, 1.0, 3), + groups=groups, + shuffle=True, + random_state=2, + ) + assert_array_almost_equal( + train_scores_batch.mean(axis=1), np.array([0.75, 0.3, 0.36111111]) + ) + assert_array_almost_equal( + test_scores_batch.mean(axis=1), np.array([0.36111111, 0.25, 0.25]) + ) + with pytest.raises(ValueError): + learning_curve( + estimator, + X, + y, + cv=cv, + n_jobs=1, + train_sizes=np.linspace(0.3, 1.0, 3), + groups=groups, + error_score="raise", + ) + + train_sizes_inc, train_scores_inc, test_scores_inc = learning_curve( + estimator, + X, + y, + cv=cv, + n_jobs=1, + train_sizes=np.linspace(0.3, 1.0, 3), + groups=groups, + shuffle=True, + random_state=2, + exploit_incremental_learning=True, + ) + assert_array_almost_equal( + train_scores_inc.mean(axis=1), train_scores_batch.mean(axis=1) + ) + assert_array_almost_equal( + test_scores_inc.mean(axis=1), test_scores_batch.mean(axis=1) + ) + + +def test_learning_curve_params(): + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + clf = CheckingClassifier(expected_sample_weight=True) + + err_msg = r"Expected sample_weight to be passed" + with pytest.raises(AssertionError, match=err_msg): + learning_curve(clf, X, y, error_score="raise") + + err_msg = r"sample_weight.shape == \(1,\), expected \(2,\)!" + with pytest.raises(ValueError, match=err_msg): + learning_curve( + clf, X, y, error_score="raise", params={"sample_weight": np.ones(1)} + ) + learning_curve( + clf, X, y, error_score="raise", params={"sample_weight": np.ones(10)} + ) + + +def test_learning_curve_incremental_learning_params(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockIncrementalImprovingEstimator(20, ["sample_weight"]) + err_msg = r"Expected fit parameter\(s\) \['sample_weight'\] not seen." 
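+ # MockIncrementalImprovingEstimator asserts inside partial_fit that every
+ # expected fit parameter actually reached it; error_score="raise" lets that
+ # AssertionError propagate out of learning_curve instead of being recorded.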
+ with pytest.raises(AssertionError, match=err_msg): + learning_curve( + estimator, + X, + y, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + error_score="raise", + ) + + err_msg = "Fit parameter sample_weight has length 3; expected" + with pytest.raises(AssertionError, match=err_msg): + learning_curve( + estimator, + X, + y, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + error_score="raise", + params={"sample_weight": np.ones(3)}, + ) + + learning_curve( + estimator, + X, + y, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + error_score="raise", + params={"sample_weight": np.ones(2)}, + ) + + +def test_validation_curve(): + X, y = make_classification( + n_samples=2, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + param_range = np.linspace(0, 1, 10) + with warnings.catch_warnings(record=True) as w: + train_scores, test_scores = validation_curve( + MockEstimatorWithParameter(), + X, + y, + param_name="param", + param_range=param_range, + cv=2, + ) + if len(w) > 0: + raise RuntimeError("Unexpected warning: %r" % w[0].message) + + assert_array_almost_equal(train_scores.mean(axis=1), param_range) + assert_array_almost_equal(test_scores.mean(axis=1), 1 - param_range) + + +def test_validation_curve_clone_estimator(): + X, y = make_classification( + n_samples=2, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + + param_range = np.linspace(1, 0, 10) + _, _ = validation_curve( + MockEstimatorWithSingleFitCallAllowed(), + X, + y, + param_name="param", + param_range=param_range, + cv=2, + ) + + +def test_validation_curve_cv_splits_consistency(): + n_samples = 100 + n_splits = 5 + X, y = make_classification(n_samples=100, random_state=0) + + scores1 = validation_curve( + SVC(kernel="linear", random_state=0), + X, + y, + param_name="C", + param_range=[0.1, 0.1, 0.2, 0.2], + cv=OneTimeSplitter(n_splits=n_splits, n_samples=n_samples), + ) + # The OneTimeSplitter is a non-reentrant cv splitter. Since the splits are + # materialized once rather than `split` being called anew for each parameter + # setting, param settings 1 and 2 (which share the same C value) must + # produce identical results. + assert_array_almost_equal(*np.vsplit(np.hstack(scores1)[(0, 2, 1, 3), :], 2)) + + scores2 = validation_curve( + SVC(kernel="linear", random_state=0), + X, + y, + param_name="C", + param_range=[0.1, 0.1, 0.2, 0.2], + cv=KFold(n_splits=n_splits, shuffle=True), + ) + + # For scores2, compare the scores of the 1st and 2nd parameter settings + # (since the C value for the first two settings is 0.1, they must be + # consistent unless the train/test folds differ between the settings) + assert_array_almost_equal(*np.vsplit(np.hstack(scores2)[(0, 2, 1, 3), :], 2)) + + scores3 = validation_curve( + SVC(kernel="linear", random_state=0), + X, + y, + param_name="C", + param_range=[0.1, 0.1, 0.2, 0.2], + cv=KFold(n_splits=n_splits), + ) + + # OneTimeSplitter is basically unshuffled KFold(n_splits=5). Sanity check. 
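+ # (Unshuffled KFold produces exactly the same folds as OneTimeSplitter, so
+ # the train and test scores of scores3 must match those of scores1.)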
+ assert_array_almost_equal(np.array(scores3), np.array(scores1)) + + +def test_validation_curve_params(): + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + clf = CheckingClassifier(expected_sample_weight=True) + + err_msg = r"Expected sample_weight to be passed" + with pytest.raises(AssertionError, match=err_msg): + validation_curve( + clf, + X, + y, + param_name="foo_param", + param_range=[1, 2, 3], + error_score="raise", + ) + + err_msg = r"sample_weight.shape == \(1,\), expected \(8,\)!" + with pytest.raises(ValueError, match=err_msg): + validation_curve( + clf, + X, + y, + param_name="foo_param", + param_range=[1, 2, 3], + error_score="raise", + params={"sample_weight": np.ones(1)}, + ) + validation_curve( + clf, + X, + y, + param_name="foo_param", + param_range=[1, 2, 3], + error_score="raise", + params={"sample_weight": np.ones(10)}, + ) + + +def test_check_is_permutation(): + rng = np.random.RandomState(0) + p = np.arange(100) + rng.shuffle(p) + assert _check_is_permutation(p, 100) + assert not _check_is_permutation(np.delete(p, 23), 100) + + p[0] = 23 + assert not _check_is_permutation(p, 100) + + # Check if the additional duplicate indices are caught + assert not _check_is_permutation(np.hstack((p, 0)), 100) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_cross_val_predict_sparse_prediction(csr_container): + # check that cross_val_predict gives same result for sparse and dense input + X, y = make_multilabel_classification( + n_classes=2, + n_labels=1, + allow_unlabeled=False, + return_indicator=True, + random_state=1, + ) + X_sparse = csr_container(X) + y_sparse = csr_container(y) + classif = OneVsRestClassifier(SVC(kernel="linear")) + preds = cross_val_predict(classif, X, y, cv=10) + preds_sparse = cross_val_predict(classif, X_sparse, y_sparse, cv=10) + preds_sparse = preds_sparse.toarray() + assert_array_almost_equal(preds_sparse, preds) + + +def check_cross_val_predict_binary(est, X, y, method): + """Helper for tests of cross_val_predict with binary classification""" + cv = KFold(n_splits=3, shuffle=False) + + # Generate expected outputs + if y.ndim == 1: + exp_shape = (len(X),) if method == "decision_function" else (len(X), 2) + else: + exp_shape = y.shape + expected_predictions = np.zeros(exp_shape) + for train, test in cv.split(X, y): + est = clone(est).fit(X[train], y[train]) + expected_predictions[test] = getattr(est, method)(X[test]) + + # Check actual outputs for several representations of y + for tg in [y, y + 1, y - 2, y.astype("str")]: + assert_allclose( + cross_val_predict(est, X, tg, method=method, cv=cv), expected_predictions + ) + + +def check_cross_val_predict_multiclass(est, X, y, method): + """Helper for tests of cross_val_predict with multiclass classification""" + cv = KFold(n_splits=3, shuffle=False) + + # Generate expected outputs + float_min = np.finfo(np.float64).min + default_values = { + "decision_function": float_min, + "predict_log_proba": float_min, + "predict_proba": 0, + } + expected_predictions = np.full( + (len(X), len(set(y))), default_values[method], dtype=np.float64 + ) + _, y_enc = np.unique(y, return_inverse=True) + for train, test in cv.split(X, y_enc): + est = clone(est).fit(X[train], y_enc[train]) + fold_preds = getattr(est, method)(X[test]) + i_cols_fit = np.unique(y_enc[train]) + expected_predictions[np.ix_(test, i_cols_fit)] = fold_preds + + # Check actual outputs for several representations of y + for tg in [y, y + 1, y - 2, y.astype("str")]: + assert_allclose( + cross_val_predict(est, 
X, tg, method=method, cv=cv), expected_predictions + ) + + +def check_cross_val_predict_multilabel(est, X, y, method): + """Check the output of cross_val_predict for 2D targets using + estimators that provide predictions as a list with one + element per class. + """ + cv = KFold(n_splits=3, shuffle=False) + + # Create empty arrays of the correct size to hold outputs + float_min = np.finfo(np.float64).min + default_values = { + "decision_function": float_min, + "predict_log_proba": float_min, + "predict_proba": 0, + } + n_targets = y.shape[1] + expected_preds = [] + for i_col in range(n_targets): + n_classes_in_label = len(set(y[:, i_col])) + if n_classes_in_label == 2 and method == "decision_function": + exp_shape = (len(X),) + else: + exp_shape = (len(X), n_classes_in_label) + expected_preds.append( + np.full(exp_shape, default_values[method], dtype=np.float64) + ) + + # Generate expected outputs + y_enc_cols = [ + np.unique(y[:, i], return_inverse=True)[1][:, np.newaxis] + for i in range(y.shape[1]) + ] + y_enc = np.concatenate(y_enc_cols, axis=1) + for train, test in cv.split(X, y_enc): + est = clone(est).fit(X[train], y_enc[train]) + fold_preds = getattr(est, method)(X[test]) + for i_col in range(n_targets): + fold_cols = np.unique(y_enc[train][:, i_col]) + if expected_preds[i_col].ndim == 1: + # Decision function with <=2 classes + expected_preds[i_col][test] = fold_preds[i_col] + else: + idx = np.ix_(test, fold_cols) + expected_preds[i_col][idx] = fold_preds[i_col] + + # Check actual outputs for several representations of y + for tg in [y, y + 1, y - 2, y.astype("str")]: + cv_predict_output = cross_val_predict(est, X, tg, method=method, cv=cv) + assert len(cv_predict_output) == len(expected_preds) + for i in range(len(cv_predict_output)): + assert_allclose(cv_predict_output[i], expected_preds[i]) + + +def check_cross_val_predict_with_method_binary(est): + # This test includes the decision_function with two classes. + # This is a special case: it has only one column of output. + X, y = make_classification(n_classes=2, random_state=0) + for method in ["decision_function", "predict_proba", "predict_log_proba"]: + check_cross_val_predict_binary(est, X, y, method) + + +def check_cross_val_predict_with_method_multiclass(est): + iris = load_iris() + X, y = iris.data, iris.target + X, y = shuffle(X, y, random_state=0) + for method in ["decision_function", "predict_proba", "predict_log_proba"]: + check_cross_val_predict_multiclass(est, X, y, method) + + +def test_cross_val_predict_with_method(): + check_cross_val_predict_with_method_binary(LogisticRegression(solver="liblinear")) + check_cross_val_predict_with_method_multiclass( + LogisticRegression(solver="liblinear") + ) + + +def test_cross_val_predict_method_checking(): + # Regression test for issue #9639. Tests that cross_val_predict does not + # check estimator methods (e.g. 
predict_proba) before fitting + iris = load_iris() + X, y = iris.data, iris.target + X, y = shuffle(X, y, random_state=0) + for method in ["decision_function", "predict_proba", "predict_log_proba"]: + est = SGDClassifier(loss="log_loss", random_state=2) + check_cross_val_predict_multiclass(est, X, y, method) + + +def test_gridsearchcv_cross_val_predict_with_method(): + iris = load_iris() + X, y = iris.data, iris.target + X, y = shuffle(X, y, random_state=0) + est = GridSearchCV( + LogisticRegression(random_state=42, solver="liblinear"), {"C": [0.1, 1]}, cv=2 + ) + for method in ["decision_function", "predict_proba", "predict_log_proba"]: + check_cross_val_predict_multiclass(est, X, y, method) + + +def test_cross_val_predict_with_method_multilabel_ovr(): + # OVR does multilabel predictions, but only arrays of + # binary indicator columns. The output of predict_proba + # is a 2D array with shape (n_samples, n_classes). + n_samp = 100 + n_classes = 4 + X, y = make_multilabel_classification( + n_samples=n_samp, n_labels=3, n_classes=n_classes, n_features=5, random_state=42 + ) + est = OneVsRestClassifier(LogisticRegression(solver="liblinear", random_state=0)) + for method in ["predict_proba", "decision_function"]: + check_cross_val_predict_binary(est, X, y, method=method) + + +class RFWithDecisionFunction(RandomForestClassifier): + # None of the current multioutput-multiclass estimators have + # decision function methods. Create a mock decision function + # to test the cross_val_predict function's handling of this case. + def decision_function(self, X): + probs = self.predict_proba(X) + msg = "This helper should only be used on multioutput-multiclass tasks" + assert isinstance(probs, list), msg + probs = [p[:, -1] if p.shape[1] == 2 else p for p in probs] + return probs + + +def test_cross_val_predict_with_method_multilabel_rf(): + # The RandomForest allows multiple classes in each label. + # Output of predict_proba is a list of outputs of predict_proba + # for each individual label. + n_classes = 4 + X, y = make_multilabel_classification( + n_samples=100, n_labels=3, n_classes=n_classes, n_features=5, random_state=42 + ) + y[:, 0] += y[:, 1] # Put three classes in the first column + for method in ["predict_proba", "predict_log_proba", "decision_function"]: + est = RFWithDecisionFunction(n_estimators=5, random_state=0) + with warnings.catch_warnings(): + # Suppress "RuntimeWarning: divide by zero encountered in log" + warnings.simplefilter("ignore") + check_cross_val_predict_multilabel(est, X, y, method=method) + + +def test_cross_val_predict_with_method_rare_class(): + # Test a multiclass problem where one class will be missing from + # one of the CV training sets. + rng = np.random.RandomState(0) + X = rng.normal(0, 1, size=(14, 10)) + y = np.array([0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 3]) + est = LogisticRegression(solver="liblinear") + for method in ["predict_proba", "predict_log_proba", "decision_function"]: + with warnings.catch_warnings(): + # Suppress warning about too few examples of a class + warnings.simplefilter("ignore") + check_cross_val_predict_multiclass(est, X, y, method) + + +def test_cross_val_predict_with_method_multilabel_rf_rare_class(): + # The RandomForest allows anything for the contents of the labels. + # Output of predict_proba is a list of outputs of predict_proba + # for each individual label. + # In this test, the first label has a class with a single example. + # We'll have one CV fold where the training data don't include it. 
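+ # (For the fold that never saw the rare class, cross_val_predict pads that
+ # class's column with the method's default value: 0 for predict_proba and
+ # the float64 minimum for predict_log_proba, matching the defaults encoded
+ # in check_cross_val_predict_multilabel.)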
+ rng = np.random.RandomState(0) + X = rng.normal(0, 1, size=(5, 10)) + y = np.array([[0, 0], [1, 1], [2, 1], [0, 1], [1, 0]]) + for method in ["predict_proba", "predict_log_proba"]: + est = RFWithDecisionFunction(n_estimators=5, random_state=0) + with warnings.catch_warnings(): + # Suppress "RuntimeWarning: divide by zero encountered in log" + warnings.simplefilter("ignore") + check_cross_val_predict_multilabel(est, X, y, method=method) + + +def get_expected_predictions(X, y, cv, classes, est, method): + expected_predictions = np.zeros([len(y), classes]) + func = getattr(est, method) + + for train, test in cv.split(X, y): + est.fit(X[train], y[train]) + expected_predictions_ = func(X[test]) + # To avoid 2 dimensional indexing + if method == "predict_proba": + exp_pred_test = np.zeros((len(test), classes)) + else: + exp_pred_test = np.full( + (len(test), classes), np.finfo(expected_predictions.dtype).min + ) + exp_pred_test[:, est.classes_] = expected_predictions_ + expected_predictions[test] = exp_pred_test + + return expected_predictions + + +def test_cross_val_predict_class_subset(): + X = np.arange(200).reshape(100, 2) + y = np.array([x // 10 for x in range(100)]) + classes = 10 + + kfold3 = KFold(n_splits=3) + kfold4 = KFold(n_splits=4) + + le = LabelEncoder() + + methods = ["decision_function", "predict_proba", "predict_log_proba"] + for method in methods: + est = LogisticRegression(solver="liblinear") + + # Test with n_splits=3 + predictions = cross_val_predict(est, X, y, method=method, cv=kfold3) + + # Runs a naive loop (should be same as cross_val_predict): + expected_predictions = get_expected_predictions( + X, y, kfold3, classes, est, method + ) + assert_array_almost_equal(expected_predictions, predictions) + + # Test with n_splits=4 + predictions = cross_val_predict(est, X, y, method=method, cv=kfold4) + expected_predictions = get_expected_predictions( + X, y, kfold4, classes, est, method + ) + assert_array_almost_equal(expected_predictions, predictions) + + # Testing unordered labels + y = shuffle(np.repeat(range(10), 10), random_state=0) + predictions = cross_val_predict(est, X, y, method=method, cv=kfold3) + y = le.fit_transform(y) + expected_predictions = get_expected_predictions( + X, y, kfold3, classes, est, method + ) + assert_array_almost_equal(expected_predictions, predictions) + + +def test_score_memmap(): + # Ensure a scalar score of memmap type is accepted + iris = load_iris() + X, y = iris.data, iris.target + clf = MockClassifier() + tf = tempfile.NamedTemporaryFile(mode="wb", delete=False) + tf.write(b"Hello world!!!!!") + tf.close() + scores = np.memmap(tf.name, dtype=np.float64) + score = np.memmap(tf.name, shape=(), mode="r", dtype=np.float64) + try: + cross_val_score(clf, X, y, scoring=lambda est, X, y: score) + with pytest.raises(ValueError): + cross_val_score(clf, X, y, scoring=lambda est, X, y: scores) + finally: + # Best effort to release the mmap file handles before deleting the + # backing file under Windows + scores, score = None, None + for _ in range(3): + try: + os.unlink(tf.name) + break + except OSError: + sleep(1.0) + + +def test_permutation_test_score_pandas(): + # check permutation_test_score doesn't destroy pandas dataframe + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((Series, DataFrame)) + except ImportError: + pass + for TargetType, InputFeatureType in types: + # X dataframe, y series + iris = load_iris() + X, y = iris.data, iris.target + X_df, y_ser = InputFeatureType(X), 
TargetType(y) + check_df = lambda x: isinstance(x, InputFeatureType) + check_series = lambda x: isinstance(x, TargetType) + clf = CheckingClassifier(check_X=check_df, check_y=check_series) + permutation_test_score(clf, X_df, y_ser) + + +def test_fit_and_score_failing(): + # Create a failing classifier to deliberately fail + failing_clf = FailingClassifier(FailingClassifier.FAILING_PARAMETER) + # dummy X data + X = np.arange(1, 10) + train, test = np.arange(0, 5), np.arange(5, 9) + fit_and_score_args = dict( + estimator=failing_clf, + X=X, + y=None, + scorer=dict(), + train=train, + test=test, + verbose=0, + parameters=None, + fit_params=None, + score_params=None, + ) + # passing error score to trigger the warning message + fit_and_score_args["error_score"] = "raise" + # check if exception was raised, with default error_score='raise' + with pytest.raises(ValueError, match="Failing classifier failed as required"): + _fit_and_score(**fit_and_score_args) + + assert failing_clf.score() == 0.0 # FailingClassifier coverage + + +def test_fit_and_score_working(): + X, y = make_classification(n_samples=30, random_state=0) + clf = SVC(kernel="linear", random_state=0) + train, test = next(ShuffleSplit().split(X)) + # Test return_parameters option + fit_and_score_args = dict( + estimator=clf, + X=X, + y=y, + scorer=dict(), + train=train, + test=test, + verbose=0, + parameters={"max_iter": 100, "tol": 0.1}, + fit_params=None, + score_params=None, + return_parameters=True, + ) + result = _fit_and_score(**fit_and_score_args) + assert result["parameters"] == fit_and_score_args["parameters"] + + +class DataDependentFailingClassifier(BaseEstimator): + def __init__(self, max_x_value=None): + self.max_x_value = max_x_value + + def fit(self, X, y=None): + num_values_too_high = (X > self.max_x_value).sum() + if num_values_too_high: + raise ValueError( + f"Classifier fit failed with {num_values_too_high} values too high" + ) + + def score(self, X=None, Y=None): + return 0.0 + + +@pytest.mark.parametrize("error_score", [np.nan, 0]) +def test_cross_validate_some_failing_fits_warning(error_score): + # Create a failing classifier to deliberately fail + failing_clf = DataDependentFailingClassifier(max_x_value=8) + # dummy X data + X = np.arange(1, 10) + y = np.ones(9) + # passing error score to trigger the warning message + cross_validate_args = [failing_clf, X, y] + cross_validate_kwargs = {"cv": 3, "error_score": error_score} + # check if the warning message type is as expected + + individual_fit_error_message = ( + "ValueError: Classifier fit failed with 1 values too high" + ) + warning_message = re.compile( + ( + "2 fits failed.+total of 3.+The score on these" + " train-test partitions for these parameters will be set to" + f" {cross_validate_kwargs['error_score']}.+{individual_fit_error_message}" + ), + flags=re.DOTALL, + ) + + with pytest.warns(FitFailedWarning, match=warning_message): + cross_validate(*cross_validate_args, **cross_validate_kwargs) + + +@pytest.mark.parametrize("error_score", [np.nan, 0]) +def test_cross_validate_all_failing_fits_error(error_score): + # Create a failing classifier to deliberately fail + failing_clf = FailingClassifier(FailingClassifier.FAILING_PARAMETER) + # dummy X data + X = np.arange(1, 10) + y = np.ones(9) + + cross_validate_args = [failing_clf, X, y] + cross_validate_kwargs = {"cv": 7, "error_score": error_score} + + individual_fit_error_message = "ValueError: Failing classifier failed as required" + error_message = re.compile( + ( + "All the 7 fits failed.+your model is 
misconfigured.+" + f"{individual_fit_error_message}" + ), + flags=re.DOTALL, + ) + + with pytest.raises(ValueError, match=error_message): + cross_validate(*cross_validate_args, **cross_validate_kwargs) + + +def _failing_scorer(estimator, X, y, error_msg): + raise ValueError(error_msg) + + +@pytest.mark.filterwarnings("ignore:lbfgs failed to converge") +@pytest.mark.parametrize("error_score", [np.nan, 0, "raise"]) +def test_cross_val_score_failing_scorer(error_score): + # check that an estimator can fail during scoring in `cross_val_score` and + # that we can optionally replaced it with `error_score` + X, y = load_iris(return_X_y=True) + clf = LogisticRegression(max_iter=5).fit(X, y) + + error_msg = "This scorer is supposed to fail!!!" + failing_scorer = partial(_failing_scorer, error_msg=error_msg) + + if error_score == "raise": + with pytest.raises(ValueError, match=error_msg): + cross_val_score( + clf, X, y, cv=3, scoring=failing_scorer, error_score=error_score + ) + else: + warning_msg = ( + "Scoring failed. The score on this train-test partition for " + f"these parameters will be set to {error_score}" + ) + with pytest.warns(UserWarning, match=warning_msg): + scores = cross_val_score( + clf, X, y, cv=3, scoring=failing_scorer, error_score=error_score + ) + assert_allclose(scores, error_score) + + +@pytest.mark.filterwarnings("ignore:lbfgs failed to converge") +@pytest.mark.parametrize("error_score", [np.nan, 0, "raise"]) +@pytest.mark.parametrize("return_train_score", [True, False]) +@pytest.mark.parametrize("with_multimetric", [False, True]) +def test_cross_validate_failing_scorer( + error_score, return_train_score, with_multimetric +): + # Check that an estimator can fail during scoring in `cross_validate` and + # that we can optionally replace it with `error_score`. In the multimetric + # case also check the result of a non-failing scorer where the other scorers + # are failing. + X, y = load_iris(return_X_y=True) + clf = LogisticRegression(max_iter=5).fit(X, y) + + error_msg = "This scorer is supposed to fail!!!" + failing_scorer = partial(_failing_scorer, error_msg=error_msg) + if with_multimetric: + non_failing_scorer = make_scorer(mean_squared_error) + scoring = { + "score_1": failing_scorer, + "score_2": non_failing_scorer, + "score_3": failing_scorer, + } + else: + scoring = failing_scorer + + if error_score == "raise": + with pytest.raises(ValueError, match=error_msg): + cross_validate( + clf, + X, + y, + cv=3, + scoring=scoring, + return_train_score=return_train_score, + error_score=error_score, + ) + else: + warning_msg = ( + "Scoring failed. The score on this train-test partition for " + f"these parameters will be set to {error_score}" + ) + with pytest.warns(UserWarning, match=warning_msg): + results = cross_validate( + clf, + X, + y, + cv=3, + scoring=scoring, + return_train_score=return_train_score, + error_score=error_score, + ) + for key in results: + if "_score" in key: + if "_score_2" in key: + # check the test (and optionally train) score for the + # scorer that should be non-failing + for i in results[key]: + assert isinstance(i, float) + else: + # check the test (and optionally train) score for all + # scorers that should be assigned to `error_score`. 
+ assert_allclose(results[key], error_score) + + +def three_params_scorer(i, j, k): + return 3.4213 + + +@pytest.mark.parametrize( + "train_score, scorer, verbose, split_prg, cdt_prg, expected", + [ + ( + False, + three_params_scorer, + 2, + (1, 3), + (0, 1), + r"\[CV\] END ...................................................." + r" total time= 0.\ds", + ), + ( + True, + _MultimetricScorer( + scorers={"sc1": three_params_scorer, "sc2": three_params_scorer} + ), + 3, + (1, 3), + (0, 1), + r"\[CV 2/3\] END sc1: \(train=3.421, test=3.421\) sc2: " + r"\(train=3.421, test=3.421\) total time= 0.\ds", + ), + ( + False, + _MultimetricScorer( + scorers={"sc1": three_params_scorer, "sc2": three_params_scorer} + ), + 10, + (1, 3), + (0, 1), + r"\[CV 2/3; 1/1\] END ....... sc1: \(test=3.421\) sc2: \(test=3.421\)" + r" total time= 0.\ds", + ), + ], +) +def test_fit_and_score_verbosity( + capsys, train_score, scorer, verbose, split_prg, cdt_prg, expected +): + X, y = make_classification(n_samples=30, random_state=0) + clf = SVC(kernel="linear", random_state=0) + train, test = next(ShuffleSplit().split(X)) + + # test print without train score + fit_and_score_args = dict( + estimator=clf, + X=X, + y=y, + scorer=scorer, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=None, + score_params=None, + return_train_score=train_score, + split_progress=split_prg, + candidate_progress=cdt_prg, + ) + _fit_and_score(**fit_and_score_args) + out, _ = capsys.readouterr() + outlines = out.split("\n") + if len(outlines) > 2: + assert re.match(expected, outlines[1]) + else: + assert re.match(expected, outlines[0]) + + +def test_score(): + error_message = "scoring must return a number, got None" + + def two_params_scorer(estimator, X_test): + return None + + with pytest.raises(ValueError, match=error_message): + _score( + estimator=None, + X_test=None, + y_test=None, + scorer=two_params_scorer, + score_params=None, + error_score=np.nan, + ) + + +def test_callable_multimetric_confusion_matrix_cross_validate(): + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + cm = confusion_matrix(y, y_pred) + return {"tn": cm[0, 0], "fp": cm[0, 1], "fn": cm[1, 0], "tp": cm[1, 1]} + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + est = LinearSVC(random_state=42) + est.fit(X, y) + cv_results = cross_validate(est, X, y, cv=5, scoring=custom_scorer) + + score_names = ["tn", "fp", "fn", "tp"] + for name in score_names: + assert "test_{}".format(name) in cv_results + + +def test_learning_curve_partial_fit_regressors(): + """Check that regressors with partial_fit is supported. + + Non-regression test for #22981. 
+    """
+    X, y = make_regression(random_state=42)
+
+    # Does not error
+    learning_curve(MLPRegressor(), X, y, exploit_incremental_learning=True, cv=2)
+
+
+def test_learning_curve_some_failing_fits_warning(global_random_seed):
+    """Check that fit failures in `learning_curve` raise the required warning."""
+
+    X, y = make_classification(
+        n_samples=30,
+        n_classes=3,
+        n_informative=6,
+        shuffle=False,
+        random_state=global_random_seed,
+    )
+    # sort the target to trigger an SVC error on the first 2 splits, because a
+    # single class is present there
+    sorted_idx = np.argsort(y)
+    X, y = X[sorted_idx], y[sorted_idx]
+
+    svc = SVC()
+    warning_message = "10 fits failed out of a total of 25"
+
+    with pytest.warns(FitFailedWarning, match=warning_message):
+        _, train_score, test_score, *_ = learning_curve(
+            svc, X, y, cv=5, error_score=np.nan
+        )
+
+    # the first 2 splits should lead to warnings and thus np.nan scores
+    for idx in range(2):
+        assert np.isnan(train_score[idx]).all()
+        assert np.isnan(test_score[idx]).all()
+
+    for idx in range(2, train_score.shape[0]):
+        assert not np.isnan(train_score[idx]).any()
+        assert not np.isnan(test_score[idx]).any()
+
+
+def test_cross_validate_return_indices(global_random_seed):
+    """Check the behaviour of `return_indices` in `cross_validate`."""
+    X, y = load_iris(return_X_y=True)
+    X = scale(X)  # scale features for better convergence
+    estimator = LogisticRegression()
+
+    cv = KFold(n_splits=3, shuffle=True, random_state=global_random_seed)
+    cv_results = cross_validate(estimator, X, y, cv=cv, n_jobs=2, return_indices=False)
+    assert "indices" not in cv_results
+
+    cv_results = cross_validate(estimator, X, y, cv=cv, n_jobs=2, return_indices=True)
+    assert "indices" in cv_results
+    train_indices = cv_results["indices"]["train"]
+    test_indices = cv_results["indices"]["test"]
+    assert len(train_indices) == cv.n_splits
+    assert len(test_indices) == cv.n_splits
+
+    assert_array_equal([indices.size for indices in train_indices], 100)
+    assert_array_equal([indices.size for indices in test_indices], 50)
+
+    for split_idx, (expected_train_idx, expected_test_idx) in enumerate(cv.split(X, y)):
+        assert_array_equal(train_indices[split_idx], expected_train_idx)
+        assert_array_equal(test_indices[split_idx], expected_test_idx)
+
+
+# Tests for metadata routing in cross_val* and in *curve
+# ======================================================
+
+
+# TODO(1.8): remove `learning_curve`, `validation_curve` and `permutation_test_score`.
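+# Illustrative sketch (not part of the test suite; `est` and `sw` are
+# placeholder names): after this deprecation, metadata moves from `fit_params`
+# to the `params` argument:
+#
+#     learning_curve(est, X, y, fit_params={"sample_weight": sw})  # FutureWarning
+#     learning_curve(est, X, y, params={"sample_weight": sw})      # replacement
+#
+# Passing both arguments at once raises a ValueError, as checked below.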
+@pytest.mark.parametrize( + "func, extra_args", + [ + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +def test_fit_param_deprecation(func, extra_args): + """Check that we warn about deprecating `fit_params`.""" + with pytest.warns(FutureWarning, match="`fit_params` is deprecated"): + func( + estimator=ConsumingClassifier(), X=X, y=y, cv=2, fit_params={}, **extra_args + ) + + with pytest.raises( + ValueError, match="`params` and `fit_params` cannot both be provided" + ): + func( + estimator=ConsumingClassifier(), + X=X, + y=y, + fit_params={}, + params={}, + **extra_args, + ) + + +@pytest.mark.parametrize( + "func, extra_args", + [ + (cross_validate, {}), + (cross_val_score, {}), + (cross_val_predict, {}), + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +@config_context(enable_metadata_routing=True) +def test_groups_with_routing_validation(func, extra_args): + """Check that we raise an error if `groups` are passed to the cv method instead + of `params` when metadata routing is enabled. + """ + with pytest.raises(ValueError, match="`groups` can only be passed if"): + func( + estimator=ConsumingClassifier(), + X=X, + y=y, + groups=[], + **extra_args, + ) + + +@pytest.mark.parametrize( + "func, extra_args", + [ + (cross_validate, {}), + (cross_val_score, {}), + (cross_val_predict, {}), + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +@config_context(enable_metadata_routing=True) +def test_passed_unrequested_metadata(func, extra_args): + """Check that we raise an error when passing metadata that is not + requested.""" + err_msg = re.escape("but are not explicitly set as requested or not requested") + with pytest.raises(ValueError, match=err_msg): + func( + estimator=ConsumingClassifier(), + X=X, + y=y, + params=dict(metadata=[]), + **extra_args, + ) + + +@pytest.mark.parametrize( + "func, extra_args", + [ + (cross_validate, {}), + (cross_val_score, {}), + (cross_val_predict, {}), + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +@config_context(enable_metadata_routing=True) +def test_validation_functions_routing(func, extra_args): + """Check that the respective cv method is properly dispatching the metadata + to the consumer.""" + scorer_registry = _Registry() + scorer = ConsumingScorer(registry=scorer_registry).set_score_request( + sample_weight="score_weights", metadata="score_metadata" + ) + splitter_registry = _Registry() + splitter = ConsumingSplitter(registry=splitter_registry).set_split_request( + groups="split_groups", metadata="split_metadata" + ) + estimator_registry = _Registry() + estimator = ConsumingClassifier(registry=estimator_registry).set_fit_request( + sample_weight="fit_sample_weight", metadata="fit_metadata" + ) + + n_samples = _num_samples(X) + rng = np.random.RandomState(0) + score_weights = rng.rand(n_samples) + score_metadata = rng.rand(n_samples) + split_groups = rng.randint(0, 3, n_samples) + split_metadata = rng.rand(n_samples) + fit_sample_weight = rng.rand(n_samples) + fit_metadata = rng.rand(n_samples) + + scoring_args = { + cross_validate: dict(scoring=dict(my_scorer=scorer, accuracy="accuracy")), + cross_val_score: dict(scoring=scorer), + learning_curve: dict(scoring=scorer), + validation_curve: 
dict(scoring=scorer), + permutation_test_score: dict(scoring=scorer), + cross_val_predict: dict(), + } + + params = dict( + split_groups=split_groups, + split_metadata=split_metadata, + fit_sample_weight=fit_sample_weight, + fit_metadata=fit_metadata, + ) + + if func is not cross_val_predict: + params.update( + score_weights=score_weights, + score_metadata=score_metadata, + ) + + func( + estimator, + X=X, + y=y, + cv=splitter, + **scoring_args[func], + **extra_args, + params=params, + ) + + if func is not cross_val_predict: + # cross_val_predict doesn't need a scorer + assert len(scorer_registry) + for _scorer in scorer_registry: + check_recorded_metadata( + obj=_scorer, + method="score", + parent=func.__name__, + split_params=("sample_weight", "metadata"), + sample_weight=score_weights, + metadata=score_metadata, + ) + + assert len(splitter_registry) + for _splitter in splitter_registry: + check_recorded_metadata( + obj=_splitter, + method="split", + parent=func.__name__, + groups=split_groups, + metadata=split_metadata, + ) + + assert len(estimator_registry) + for _estimator in estimator_registry: + check_recorded_metadata( + obj=_estimator, + method="fit", + parent=func.__name__, + split_params=("sample_weight", "metadata"), + sample_weight=fit_sample_weight, + metadata=fit_metadata, + ) + + +@config_context(enable_metadata_routing=True) +def test_learning_curve_exploit_incremental_learning_routing(): + """Test that learning_curve routes metadata to the estimator correctly while + partial_fitting it with `exploit_incremental_learning=True`.""" + + n_samples = _num_samples(X) + rng = np.random.RandomState(0) + fit_sample_weight = rng.rand(n_samples) + fit_metadata = rng.rand(n_samples) + + estimator_registry = _Registry() + estimator = ConsumingClassifier( + registry=estimator_registry + ).set_partial_fit_request( + sample_weight="fit_sample_weight", metadata="fit_metadata" + ) + + learning_curve( + estimator, + X=X, + y=y, + cv=ConsumingSplitter(), + exploit_incremental_learning=True, + params=dict(fit_sample_weight=fit_sample_weight, fit_metadata=fit_metadata), + ) + + assert len(estimator_registry) + for _estimator in estimator_registry: + check_recorded_metadata( + obj=_estimator, + method="partial_fit", + parent="learning_curve", + split_params=("sample_weight", "metadata"), + sample_weight=fit_sample_weight, + metadata=fit_metadata, + ) + + +# End of metadata routing tests +# ============================= diff --git a/.venv/Lib/site-packages/sklearn/neighbors/__init__.py b/.venv/Lib/site-packages/sklearn/neighbors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2fd0b2ea9663bdd7905ed771de61fccb227bd62c --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/__init__.py @@ -0,0 +1,42 @@ +"""The k-nearest neighbors algorithms.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._ball_tree import BallTree +from ._base import VALID_METRICS, VALID_METRICS_SPARSE, sort_graph_by_row_values +from ._classification import KNeighborsClassifier, RadiusNeighborsClassifier +from ._graph import ( + KNeighborsTransformer, + RadiusNeighborsTransformer, + kneighbors_graph, + radius_neighbors_graph, +) +from ._kd_tree import KDTree +from ._kde import KernelDensity +from ._lof import LocalOutlierFactor +from ._nca import NeighborhoodComponentsAnalysis +from ._nearest_centroid import NearestCentroid +from ._regression import KNeighborsRegressor, RadiusNeighborsRegressor +from ._unsupervised import NearestNeighbors + 
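+# A minimal usage sketch of this module's public API (illustrative only;
+# the sample data and the query point are placeholders):
+#
+#     from sklearn.neighbors import NearestNeighbors
+#     nn = NearestNeighbors(n_neighbors=2).fit([[0.0], [1.0], [3.0]])
+#     dist, ind = nn.kneighbors([[1.1]])  # distances to / indices of neighbors
+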
+__all__ = [
+    "BallTree",
+    "KDTree",
+    "KNeighborsClassifier",
+    "KNeighborsRegressor",
+    "KNeighborsTransformer",
+    "NearestCentroid",
+    "NearestNeighbors",
+    "RadiusNeighborsClassifier",
+    "RadiusNeighborsRegressor",
+    "RadiusNeighborsTransformer",
+    "kneighbors_graph",
+    "radius_neighbors_graph",
+    "KernelDensity",
+    "LocalOutlierFactor",
+    "NeighborhoodComponentsAnalysis",
+    "sort_graph_by_row_values",
+    "VALID_METRICS",
+    "VALID_METRICS_SPARSE",
+]
diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.cp39-win_amd64.lib b/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.cp39-win_amd64.lib
new file mode 100644
index 0000000000000000000000000000000000000000..39689881f1436f44d1ecdfc104a676a25a40bcc4
Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.cp39-win_amd64.lib differ
diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.cp39-win_amd64.pyd b/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.cp39-win_amd64.pyd
new file mode 100644
index 0000000000000000000000000000000000000000..45d28aa5e3d717c6386baf90c580c2df711f963e
Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.cp39-win_amd64.pyd differ
diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.pyx.tp b/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.pyx.tp
new file mode 100644
index 0000000000000000000000000000000000000000..e0132c10facbc11ea1353d0d7848f9ff8fd0a17c
--- /dev/null
+++ b/.venv/Lib/site-packages/sklearn/neighbors/_ball_tree.pyx.tp
@@ -0,0 +1,284 @@
+{{py:
+
+# Generated file: _ball_tree.pyx
+
+implementation_specific_values = [
+    # The values are arranged as follows:
+    #
+    # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE
+    #
+    ('64', 'float64_t', 'np.float64'),
+    ('32', 'float32_t', 'np.float32')
+]
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+}}
+
+
+__all__ = ['BallTree', 'BallTree64', 'BallTree32']
+
+{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
+
+DOC_DICT{{name_suffix}} = {
+    'BinaryTree': 'BallTree{{name_suffix}}',
+    'binary_tree': 'ball_tree{{name_suffix}}',
+}
+
+VALID_METRICS{{name_suffix}} = [
+    'BrayCurtisDistance{{name_suffix}}',
+    'CanberraDistance{{name_suffix}}',
+    'ChebyshevDistance{{name_suffix}}',
+    'DiceDistance{{name_suffix}}',
+    'EuclideanDistance{{name_suffix}}',
+    'HammingDistance{{name_suffix}}',
+    'HaversineDistance{{name_suffix}}',
+    'JaccardDistance{{name_suffix}}',
+    'MahalanobisDistance{{name_suffix}}',
+    'ManhattanDistance{{name_suffix}}',
+    'MinkowskiDistance{{name_suffix}}',
+    'PyFuncDistance{{name_suffix}}',
+    'RogersTanimotoDistance{{name_suffix}}',
+    'RussellRaoDistance{{name_suffix}}',
+    'SEuclideanDistance{{name_suffix}}',
+    'SokalMichenerDistance{{name_suffix}}',
+    'SokalSneathDistance{{name_suffix}}',
+    'WMinkowskiDistance{{name_suffix}}',
+]
+
+{{endfor}}
+
+include "_binary_tree.pxi"
+
+{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
+
+# Inherit BallTree{{name_suffix}} from BinaryTree{{name_suffix}}
+cdef class BallTree{{name_suffix}}(BinaryTree{{name_suffix}}):
+    __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}})
+    pass
+
+{{endfor}}
+
+
+#----------------------------------------------------------------------
+# The functions below specialize the Binary Tree as a Ball Tree
+#
+# Note that these functions use the concept of "reduced distance".
+# The reduced distance, defined for some metrics, is a quantity which +# is more efficient to compute than the distance, but preserves the +# relative rankings of the true distance. For example, the reduced +# distance for the Euclidean metric is the squared-euclidean distance. +# For some metrics, the reduced distance is simply the distance. + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t n_nodes, + intp_t n_features, +) except -1: + """Allocate arrays needed for the KD Tree""" + tree.node_bounds = np.zeros((1, n_nodes, n_features), dtype={{INPUT_DTYPE}}) + return 0 + + +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + intp_t i_node, + intp_t idx_start, + intp_t idx_end, +) except -1: + """Initialize the node for the dataset stored in tree.data""" + cdef intp_t n_features = tree.data.shape[1] + cdef intp_t n_points = idx_end - idx_start + + cdef intp_t i, j + cdef float64_t radius + cdef const {{INPUT_DTYPE_t}} *this_pt + + cdef intp_t* idx_array = &tree.idx_array[0] + cdef const {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] + cdef {{INPUT_DTYPE_t}}* centroid = &tree.node_bounds[0, i_node, 0] + + cdef bint with_sample_weight = tree.sample_weight is not None + cdef const {{INPUT_DTYPE_t}}* sample_weight + cdef float64_t sum_weight_node + if with_sample_weight: + sample_weight = &tree.sample_weight[0] + + # determine Node centroid + for j in range(n_features): + centroid[j] = 0 + + if with_sample_weight: + sum_weight_node = 0 + for i in range(idx_start, idx_end): + sum_weight_node += sample_weight[idx_array[i]] + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] * sample_weight[idx_array[i]] + + for j in range(n_features): + centroid[j] /= sum_weight_node + else: + for i in range(idx_start, idx_end): + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] + + for j in range(n_features): + centroid[j] /= n_points + + # determine Node radius + radius = 0 + for i in range(idx_start, idx_end): + radius = fmax(radius, + tree.rdist(centroid, + data + n_features * idx_array[i], + n_features)) + + node_data[i_node].radius = tree.dist_metric._rdist_to_dist(radius) + node_data[i_node].idx_start = idx_start + node_data[i_node].idx_end = idx_end + return 0 + + +cdef inline float64_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return fmax(0, dist_pt - tree.node_data[i_node].radius) + + +cdef inline float64_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return dist_pt + tree.node_data[i_node].radius + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + float64_t* min_dist, + float64_t* max_dist, +) except -1 nogil: + """Compute the minimum and maximum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + 
tree.data.shape[1]) + cdef float64_t rad = tree.node_data[i_node].radius + min_dist[0] = fmax(0, dist_pt - rad) + max_dist[0] = dist_pt + rad + return 0 + + +cdef inline float64_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum reduced-distance between a point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline float64_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum reduced-distance between a point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline float64_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the minimum distance between two nodes""" + cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return fmax(0, (dist_pt - tree1.node_data[i_node1].radius + - tree2.node_data[i_node2].radius)) + + +cdef inline float64_t max_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the maximum distance between two nodes""" + cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return (dist_pt + tree1.node_data[i_node1].radius + + tree2.node_data[i_node2].radius) + + +cdef inline float64_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the minimum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + + +cdef inline float64_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the maximum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} + + +class BallTree(BallTree64): + __doc__ = CLASS_DOC.format(BinaryTree="BallTree") + pass diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_base.py b/.venv/Lib/site-packages/sklearn/neighbors/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..419f84ebacda869f3f62f93432c8f9e9a3d039dc --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_base.py @@ -0,0 +1,1403 @@ +"""Base and mixin classes for nearest neighbors.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import numbers +import 
warnings +from abc import ABCMeta, abstractmethod +from functools import partial +from numbers import Integral, Real + +import numpy as np +from joblib import effective_n_jobs +from scipy.sparse import csr_matrix, issparse + +from ..base import BaseEstimator, MultiOutputMixin, is_classifier +from ..exceptions import DataConversionWarning, EfficiencyWarning +from ..metrics import DistanceMetric, pairwise_distances_chunked +from ..metrics._pairwise_distances_reduction import ( + ArgKmin, + RadiusNeighbors, +) +from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS +from ..utils import ( + check_array, + gen_even_slices, + get_tags, +) +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.fixes import parse_version, sp_base_version +from ..utils.multiclass import check_classification_targets +from ..utils.parallel import Parallel, delayed +from ..utils.validation import _to_object_array, check_is_fitted, validate_data +from ._ball_tree import BallTree +from ._kd_tree import KDTree + +SCIPY_METRICS = [ + "braycurtis", + "canberra", + "chebyshev", + "correlation", + "cosine", + "dice", + "hamming", + "jaccard", + "mahalanobis", + "minkowski", + "rogerstanimoto", + "russellrao", + "seuclidean", + "sokalsneath", + "sqeuclidean", + "yule", +] +if sp_base_version < parse_version("1.17"): + # Deprecated in SciPy 1.15 and removed in SciPy 1.17 + SCIPY_METRICS += ["sokalmichener"] +if sp_base_version < parse_version("1.11"): + # Deprecated in SciPy 1.9 and removed in SciPy 1.11 + SCIPY_METRICS += ["kulsinski"] +if sp_base_version < parse_version("1.9"): + # Deprecated in SciPy 1.0 and removed in SciPy 1.9 + SCIPY_METRICS += ["matching"] + +VALID_METRICS = dict( + ball_tree=BallTree.valid_metrics, + kd_tree=KDTree.valid_metrics, + # The following list comes from the + # sklearn.metrics.pairwise doc string + brute=sorted(set(PAIRWISE_DISTANCE_FUNCTIONS).union(SCIPY_METRICS)), +) + +VALID_METRICS_SPARSE = dict( + ball_tree=[], + kd_tree=[], + brute=(PAIRWISE_DISTANCE_FUNCTIONS.keys() - {"haversine", "nan_euclidean"}), +) + + +def _get_weights(dist, weights): + """Get the weights from an array of distances and a parameter ``weights``. + + Assume weights have already been validated. + + Parameters + ---------- + dist : ndarray + The input distances. + + weights : {'uniform', 'distance'}, callable or None + The kind of weighting used. + + Returns + ------- + weights_arr : array of the same shape as ``dist`` + If ``weights == 'uniform'``, then returns None. + """ + if weights in (None, "uniform"): + return None + + if weights == "distance": + # if user attempts to classify a point that was zero distance from one + # or more training points, those training points are weighted as 1.0 + # and the other points as 0.0 + if dist.dtype is np.dtype(object): + for point_dist_i, point_dist in enumerate(dist): + # check if point_dist is iterable + # (ex: RadiusNeighborClassifier.predict may set an element of + # dist to 1e-6 to represent an 'outlier') + if hasattr(point_dist, "__contains__") and 0.0 in point_dist: + dist[point_dist_i] = point_dist == 0.0 + else: + dist[point_dist_i] = 1.0 / point_dist + else: + with np.errstate(divide="ignore"): + dist = 1.0 / dist + inf_mask = np.isinf(dist) + inf_row = np.any(inf_mask, axis=1) + dist[inf_row] = inf_mask[inf_row] + return dist + + if callable(weights): + return weights(dist) + + +def _is_sorted_by_data(graph): + """Return whether the graph's non-zero entries are sorted by data. 
+ + The non-zero entries are stored in graph.data and graph.indices. + For each row (or sample), the non-zero entries can be either: + - sorted by indices, as after graph.sort_indices(); + - sorted by data, as after _check_precomputed(graph); + - not sorted. + + Parameters + ---------- + graph : sparse matrix of shape (n_samples, n_samples) + Neighbors graph as given by `kneighbors_graph` or + `radius_neighbors_graph`. Matrix should be of format CSR format. + + Returns + ------- + res : bool + Whether input graph is sorted by data. + """ + assert graph.format == "csr" + out_of_order = graph.data[:-1] > graph.data[1:] + line_change = np.unique(graph.indptr[1:-1] - 1) + line_change = line_change[line_change < out_of_order.shape[0]] + return out_of_order.sum() == out_of_order[line_change].sum() + + +def _check_precomputed(X): + """Check precomputed distance matrix. + + If the precomputed distance matrix is sparse, it checks that the non-zero + entries are sorted by distances. If not, the matrix is copied and sorted. + + Parameters + ---------- + X : {sparse matrix, array-like}, (n_samples, n_samples) + Distance matrix to other samples. X may be a sparse matrix, in which + case only non-zero elements may be considered neighbors. + + Returns + ------- + X : {sparse matrix, array-like}, (n_samples, n_samples) + Distance matrix to other samples. X may be a sparse matrix, in which + case only non-zero elements may be considered neighbors. + """ + if not issparse(X): + X = check_array(X, ensure_non_negative=True, input_name="X") + return X + else: + graph = X + + if graph.format not in ("csr", "csc", "coo", "lil"): + raise TypeError( + "Sparse matrix in {!r} format is not supported due to " + "its handling of explicit zeros".format(graph.format) + ) + copied = graph.format != "csr" + graph = check_array( + graph, + accept_sparse="csr", + ensure_non_negative=True, + input_name="precomputed distance matrix", + ) + graph = sort_graph_by_row_values(graph, copy=not copied, warn_when_not_sorted=True) + + return graph + + +@validate_params( + { + "graph": ["sparse matrix"], + "copy": ["boolean"], + "warn_when_not_sorted": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def sort_graph_by_row_values(graph, copy=False, warn_when_not_sorted=True): + """Sort a sparse graph such that each row is stored with increasing values. + + .. versionadded:: 1.2 + + Parameters + ---------- + graph : sparse matrix of shape (n_samples, n_samples) + Distance matrix to other samples, where only non-zero elements are + considered neighbors. Matrix is converted to CSR format if not already. + + copy : bool, default=False + If True, the graph is copied before sorting. If False, the sorting is + performed inplace. If the graph is not of CSR format, `copy` must be + True to allow the conversion to CSR format, otherwise an error is + raised. + + warn_when_not_sorted : bool, default=True + If True, a :class:`~sklearn.exceptions.EfficiencyWarning` is raised + when the input graph is not sorted by row values. + + Returns + ------- + graph : sparse matrix of shape (n_samples, n_samples) + Distance matrix to other samples, where only non-zero elements are + considered neighbors. Matrix is in CSR format. + + Examples + -------- + >>> from scipy.sparse import csr_matrix + >>> from sklearn.neighbors import sort_graph_by_row_values + >>> X = csr_matrix( + ... [[0., 3., 1.], + ... [3., 0., 2.], + ... 
[1., 2., 0.]]) + >>> X.data + array([3., 1., 3., 2., 1., 2.]) + >>> X_ = sort_graph_by_row_values(X) + >>> X_.data + array([1., 3., 2., 3., 1., 2.]) + """ + if graph.format == "csr" and _is_sorted_by_data(graph): + return graph + + if warn_when_not_sorted: + warnings.warn( + ( + "Precomputed sparse input was not sorted by row values. Use the" + " function sklearn.neighbors.sort_graph_by_row_values to sort the input" + " by row values, with warn_when_not_sorted=False to remove this" + " warning." + ), + EfficiencyWarning, + ) + + if graph.format not in ("csr", "csc", "coo", "lil"): + raise TypeError( + f"Sparse matrix in {graph.format!r} format is not supported due to " + "its handling of explicit zeros" + ) + elif graph.format != "csr": + if not copy: + raise ValueError( + "The input graph is not in CSR format. Use copy=True to allow " + "the conversion to CSR format." + ) + graph = graph.asformat("csr") + elif copy: # csr format with copy=True + graph = graph.copy() + + row_nnz = np.diff(graph.indptr) + if row_nnz.max() == row_nnz.min(): + # if each sample has the same number of provided neighbors + n_samples = graph.shape[0] + distances = graph.data.reshape(n_samples, -1) + + order = np.argsort(distances, kind="mergesort") + order += np.arange(n_samples)[:, None] * row_nnz[0] + order = order.ravel() + graph.data = graph.data[order] + graph.indices = graph.indices[order] + + else: + for start, stop in zip(graph.indptr, graph.indptr[1:]): + order = np.argsort(graph.data[start:stop], kind="mergesort") + graph.data[start:stop] = graph.data[start:stop][order] + graph.indices[start:stop] = graph.indices[start:stop][order] + + return graph + + +def _kneighbors_from_graph(graph, n_neighbors, return_distance): + """Decompose a nearest neighbors sparse graph into distances and indices. + + Parameters + ---------- + graph : sparse matrix of shape (n_samples, n_samples) + Neighbors graph as given by `kneighbors_graph` or + `radius_neighbors_graph`. Matrix should be of format CSR format. + + n_neighbors : int + Number of neighbors required for each sample. + + return_distance : bool + Whether or not to return the distances. + + Returns + ------- + neigh_dist : ndarray of shape (n_samples, n_neighbors) + Distances to nearest neighbors. Only present if `return_distance=True`. + + neigh_ind : ndarray of shape (n_samples, n_neighbors) + Indices of nearest neighbors. + """ + n_samples = graph.shape[0] + assert graph.format == "csr" + + # number of neighbors by samples + row_nnz = np.diff(graph.indptr) + row_nnz_min = row_nnz.min() + if n_neighbors is not None and row_nnz_min < n_neighbors: + raise ValueError( + "%d neighbors per samples are required, but some samples have only" + " %d neighbors in precomputed graph matrix. Decrease number of " + "neighbors used or recompute the graph with more neighbors." + % (n_neighbors, row_nnz_min) + ) + + def extract(a): + # if each sample has the same number of provided neighbors + if row_nnz.max() == row_nnz_min: + return a.reshape(n_samples, -1)[:, :n_neighbors] + else: + idx = np.tile(np.arange(n_neighbors), (n_samples, 1)) + idx += graph.indptr[:-1, None] + return a.take(idx, mode="clip").reshape(n_samples, n_neighbors) + + if return_distance: + return extract(graph.data), extract(graph.indices) + else: + return extract(graph.indices) + + +def _radius_neighbors_from_graph(graph, radius, return_distance): + """Decompose a nearest neighbors sparse graph into distances and indices. 
+ + Parameters + ---------- + graph : sparse matrix of shape (n_samples, n_samples) + Neighbors graph as given by `kneighbors_graph` or + `radius_neighbors_graph`. Matrix should be of format CSR format. + + radius : float + Radius of neighborhoods which should be strictly positive. + + return_distance : bool + Whether or not to return the distances. + + Returns + ------- + neigh_dist : ndarray of shape (n_samples,) of arrays + Distances to nearest neighbors. Only present if `return_distance=True`. + + neigh_ind : ndarray of shape (n_samples,) of arrays + Indices of nearest neighbors. + """ + assert graph.format == "csr" + + no_filter_needed = bool(graph.data.max() <= radius) + + if no_filter_needed: + data, indices, indptr = graph.data, graph.indices, graph.indptr + else: + mask = graph.data <= radius + if return_distance: + data = np.compress(mask, graph.data) + indices = np.compress(mask, graph.indices) + indptr = np.concatenate(([0], np.cumsum(mask)))[graph.indptr] + + indices = indices.astype(np.intp, copy=no_filter_needed) + + if return_distance: + neigh_dist = _to_object_array(np.split(data, indptr[1:-1])) + neigh_ind = _to_object_array(np.split(indices, indptr[1:-1])) + + if return_distance: + return neigh_dist, neigh_ind + else: + return neigh_ind + + +class NeighborsBase(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for nearest neighbors estimators.""" + + _parameter_constraints: dict = { + "n_neighbors": [Interval(Integral, 1, None, closed="left"), None], + "radius": [Interval(Real, 0, None, closed="both"), None], + "algorithm": [StrOptions({"auto", "ball_tree", "kd_tree", "brute"})], + "leaf_size": [Interval(Integral, 1, None, closed="left")], + "p": [Interval(Real, 0, None, closed="right"), None], + "metric": [StrOptions(set(itertools.chain(*VALID_METRICS.values()))), callable], + "metric_params": [dict, None], + "n_jobs": [Integral, None], + } + + @abstractmethod + def __init__( + self, + n_neighbors=None, + radius=None, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + self.n_neighbors = n_neighbors + self.radius = radius + self.algorithm = algorithm + self.leaf_size = leaf_size + self.metric = metric + self.metric_params = metric_params + self.p = p + self.n_jobs = n_jobs + + def _check_algorithm_metric(self): + if self.algorithm == "auto": + if self.metric == "precomputed": + alg_check = "brute" + elif ( + callable(self.metric) + or self.metric in VALID_METRICS["ball_tree"] + or isinstance(self.metric, DistanceMetric) + ): + alg_check = "ball_tree" + else: + alg_check = "brute" + else: + alg_check = self.algorithm + + if callable(self.metric): + if self.algorithm == "kd_tree": + # callable metric is only valid for brute force and ball_tree + raise ValueError( + "kd_tree does not support callable metric '%s'" + "Function call overhead will result" + "in very poor performance." % self.metric + ) + elif self.metric not in VALID_METRICS[alg_check] and not isinstance( + self.metric, DistanceMetric + ): + raise ValueError( + "Metric '%s' not valid. Use " + "sorted(sklearn.neighbors.VALID_METRICS['%s']) " + "to get valid options. " + "Metric can also be a callable function." % (self.metric, alg_check) + ) + + if self.metric_params is not None and "p" in self.metric_params: + if self.p is not None: + warnings.warn( + ( + "Parameter p is found in metric_params. " + "The corresponding parameter from __init__ " + "is ignored." 
+ ), + SyntaxWarning, + stacklevel=3, + ) + + def _fit(self, X, y=None): + ensure_all_finite = "allow-nan" if get_tags(self).input_tags.allow_nan else True + if self.__sklearn_tags__().target_tags.required: + if not isinstance(X, (KDTree, BallTree, NeighborsBase)): + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + multi_output=True, + order="C", + ensure_all_finite=ensure_all_finite, + ) + + if is_classifier(self): + # Classification targets require a specific format + if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1: + if y.ndim != 1: + warnings.warn( + ( + "A column-vector y was passed when a " + "1d array was expected. Please change " + "the shape of y to (n_samples,), for " + "example using ravel()." + ), + DataConversionWarning, + stacklevel=2, + ) + + self.outputs_2d_ = False + y = y.reshape((-1, 1)) + else: + self.outputs_2d_ = True + + check_classification_targets(y) + self.classes_ = [] + # Using `dtype=np.intp` is necessary since `np.bincount` + # (called in _classification.py) fails when dealing + # with a float64 array on 32bit systems. + self._y = np.empty(y.shape, dtype=np.intp) + for k in range(self._y.shape[1]): + classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True) + self.classes_.append(classes) + + if not self.outputs_2d_: + self.classes_ = self.classes_[0] + self._y = self._y.ravel() + else: + self._y = y + + else: + if not isinstance(X, (KDTree, BallTree, NeighborsBase)): + X = validate_data( + self, + X, + ensure_all_finite=ensure_all_finite, + accept_sparse="csr", + order="C", + ) + + self._check_algorithm_metric() + if self.metric_params is None: + self.effective_metric_params_ = {} + else: + self.effective_metric_params_ = self.metric_params.copy() + + effective_p = self.effective_metric_params_.get("p", self.p) + if self.metric == "minkowski": + self.effective_metric_params_["p"] = effective_p + + self.effective_metric_ = self.metric + # For minkowski distance, use more efficient methods where available + if self.metric == "minkowski": + p = self.effective_metric_params_.pop("p", 2) + w = self.effective_metric_params_.pop("w", None) + + if p == 1 and w is None: + self.effective_metric_ = "manhattan" + elif p == 2 and w is None: + self.effective_metric_ = "euclidean" + elif p == np.inf and w is None: + self.effective_metric_ = "chebyshev" + else: + # Use the generic minkowski metric, possibly weighted. + self.effective_metric_params_["p"] = p + self.effective_metric_params_["w"] = w + + if isinstance(X, NeighborsBase): + self._fit_X = X._fit_X + self._tree = X._tree + self._fit_method = X._fit_method + self.n_samples_fit_ = X.n_samples_fit_ + return self + + elif isinstance(X, BallTree): + self._fit_X = X.data + self._tree = X + self._fit_method = "ball_tree" + self.n_samples_fit_ = X.data.shape[0] + return self + + elif isinstance(X, KDTree): + self._fit_X = X.data + self._tree = X + self._fit_method = "kd_tree" + self.n_samples_fit_ = X.data.shape[0] + return self + + if self.metric == "precomputed": + X = _check_precomputed(X) + # Precomputed matrix X must be squared + if X.shape[0] != X.shape[1]: + raise ValueError( + "Precomputed matrix must be square." 
+ " Input is a {}x{} matrix.".format(X.shape[0], X.shape[1]) + ) + self.n_features_in_ = X.shape[1] + + n_samples = X.shape[0] + if n_samples == 0: + raise ValueError("n_samples must be greater than 0") + + if issparse(X): + if self.algorithm not in ("auto", "brute"): + warnings.warn("cannot use tree with sparse input: using brute force") + + if ( + self.effective_metric_ not in VALID_METRICS_SPARSE["brute"] + and not callable(self.effective_metric_) + and not isinstance(self.effective_metric_, DistanceMetric) + ): + raise ValueError( + "Metric '%s' not valid for sparse input. " + "Use sorted(sklearn.neighbors." + "VALID_METRICS_SPARSE['brute']) " + "to get valid options. " + "Metric can also be a callable function." % (self.effective_metric_) + ) + self._fit_X = X.copy() + self._tree = None + self._fit_method = "brute" + self.n_samples_fit_ = X.shape[0] + return self + + self._fit_method = self.algorithm + self._fit_X = X + self.n_samples_fit_ = X.shape[0] + + if self._fit_method == "auto": + # A tree approach is better for small number of neighbors or small + # number of features, with KDTree generally faster when available + if ( + self.metric == "precomputed" + or self._fit_X.shape[1] > 15 + or ( + self.n_neighbors is not None + and self.n_neighbors >= self._fit_X.shape[0] // 2 + ) + ): + self._fit_method = "brute" + else: + if ( + self.effective_metric_ == "minkowski" + and self.effective_metric_params_["p"] < 1 + ): + self._fit_method = "brute" + elif ( + self.effective_metric_ == "minkowski" + and self.effective_metric_params_.get("w") is not None + ): + # 'minkowski' with weights is not supported by KDTree but is + # supported byBallTree. + self._fit_method = "ball_tree" + elif self.effective_metric_ in VALID_METRICS["kd_tree"]: + self._fit_method = "kd_tree" + elif ( + callable(self.effective_metric_) + or self.effective_metric_ in VALID_METRICS["ball_tree"] + ): + self._fit_method = "ball_tree" + else: + self._fit_method = "brute" + + if ( + self.effective_metric_ == "minkowski" + and self.effective_metric_params_["p"] < 1 + ): + # For 0 < p < 1 Minkowski distances aren't valid distance + # metric as they do not satisfy triangular inequality: + # they are semi-metrics. + # algorithm="kd_tree" and algorithm="ball_tree" can't be used because + # KDTree and BallTree require a proper distance metric to work properly. + # However, the brute-force algorithm supports semi-metrics. + if self._fit_method == "brute": + warnings.warn( + "Mind that for 0 < p < 1, Minkowski metrics are not distance" + " metrics. Continuing the execution with `algorithm='brute'`." + ) + else: # self._fit_method in ("kd_tree", "ball_tree") + raise ValueError( + f'algorithm="{self._fit_method}" does not support 0 < p < 1 for ' + "the Minkowski metric. To resolve this problem either " + 'set p >= 1 or algorithm="brute".' + ) + + if self._fit_method == "ball_tree": + self._tree = BallTree( + X, + self.leaf_size, + metric=self.effective_metric_, + **self.effective_metric_params_, + ) + elif self._fit_method == "kd_tree": + if ( + self.effective_metric_ == "minkowski" + and self.effective_metric_params_.get("w") is not None + ): + raise ValueError( + "algorithm='kd_tree' is not valid for " + "metric='minkowski' with a weight parameter 'w': " + "try algorithm='ball_tree' " + "or algorithm='brute' instead." 
+ ) + self._tree = KDTree( + X, + self.leaf_size, + metric=self.effective_metric_, + **self.effective_metric_params_, + ) + elif self._fit_method == "brute": + self._tree = None + + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # For cross-validation routines to split data correctly + tags.input_tags.pairwise = self.metric == "precomputed" + # when input is precomputed metric values, all those values need to be positive + tags.input_tags.positive_only = tags.input_tags.pairwise + tags.input_tags.allow_nan = self.metric == "nan_euclidean" + return tags + + +class KNeighborsMixin: + """Mixin for k-neighbors searches.""" + + def _kneighbors_reduce_func(self, dist, start, n_neighbors, return_distance): + """Reduce a chunk of distances to the nearest neighbors. + + Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked` + + Parameters + ---------- + dist : ndarray of shape (n_samples_chunk, n_samples) + The distance matrix. + + start : int + The index in X which the first row of dist corresponds to. + + n_neighbors : int + Number of neighbors required for each sample. + + return_distance : bool + Whether or not to return the distances. + + Returns + ------- + dist : array of shape (n_samples_chunk, n_neighbors) + Returned only if `return_distance=True`. + + neigh : array of shape (n_samples_chunk, n_neighbors) + The neighbors indices. + """ + sample_range = np.arange(dist.shape[0])[:, None] + neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1) + neigh_ind = neigh_ind[:, :n_neighbors] + # argpartition doesn't guarantee sorted order, so we sort again + neigh_ind = neigh_ind[sample_range, np.argsort(dist[sample_range, neigh_ind])] + if return_distance: + if self.effective_metric_ == "euclidean": + result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind + else: + result = dist[sample_range, neigh_ind], neigh_ind + else: + result = neigh_ind + return result + + def kneighbors(self, X=None, n_neighbors=None, return_distance=True): + """Find the K-neighbors of a point. + + Returns indices of and distances to the neighbors of each point. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', default=None + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + n_neighbors : int, default=None + Number of neighbors required for each sample. The default is the + value passed to the constructor. + + return_distance : bool, default=True + Whether or not to return the distances. + + Returns + ------- + neigh_dist : ndarray of shape (n_queries, n_neighbors) + Array representing the lengths to points, only present if + return_distance=True. + + neigh_ind : ndarray of shape (n_queries, n_neighbors) + Indices of the nearest points in the population matrix. 
+ + Examples + -------- + In the following example, we construct a NearestNeighbors + class from an array representing our data set and ask who's + the closest point to [1,1,1] + + >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(n_neighbors=1) + >>> neigh.fit(samples) + NearestNeighbors(n_neighbors=1) + >>> print(neigh.kneighbors([[1., 1., 1.]])) + (array([[0.5]]), array([[2]])) + + As you can see, it returns [[0.5]], and [[2]], which means that the + element is at distance 0.5 and is the third element of samples + (indexes start at 0). You can also query for multiple points: + + >>> X = [[0., 1., 0.], [1., 0., 1.]] + >>> neigh.kneighbors(X, return_distance=False) + array([[1], + [2]]...) + """ + check_is_fitted(self) + + if n_neighbors is None: + n_neighbors = self.n_neighbors + elif n_neighbors <= 0: + raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors) + elif not isinstance(n_neighbors, numbers.Integral): + raise TypeError( + "n_neighbors does not take %s value, enter integer value" + % type(n_neighbors) + ) + + ensure_all_finite = "allow-nan" if get_tags(self).input_tags.allow_nan else True + query_is_train = X is None + if query_is_train: + X = self._fit_X + # Include an extra neighbor to account for the sample itself being + # returned, which is removed later + n_neighbors += 1 + else: + if self.metric == "precomputed": + X = _check_precomputed(X) + else: + X = validate_data( + self, + X, + ensure_all_finite=ensure_all_finite, + accept_sparse="csr", + reset=False, + order="C", + ) + + n_samples_fit = self.n_samples_fit_ + if n_neighbors > n_samples_fit: + if query_is_train: + n_neighbors -= 1 # ok to modify inplace because an error is raised + inequality_str = "n_neighbors < n_samples_fit" + else: + inequality_str = "n_neighbors <= n_samples_fit" + raise ValueError( + f"Expected {inequality_str}, but " + f"n_neighbors = {n_neighbors}, n_samples_fit = {n_samples_fit}, " + f"n_samples = {X.shape[0]}" # include n_samples for common tests + ) + + n_jobs = effective_n_jobs(self.n_jobs) + chunked_results = None + use_pairwise_distances_reductions = ( + self._fit_method == "brute" + and ArgKmin.is_usable_for( + X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ + ) + ) + if use_pairwise_distances_reductions: + results = ArgKmin.compute( + X=X, + Y=self._fit_X, + k=n_neighbors, + metric=self.effective_metric_, + metric_kwargs=self.effective_metric_params_, + strategy="auto", + return_distance=return_distance, + ) + + elif ( + self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) + ): + results = _kneighbors_from_graph( + X, n_neighbors=n_neighbors, return_distance=return_distance + ) + + elif self._fit_method == "brute": + # Joblib-based backend, which is used when user-defined callable + # are passed for metric. + + # This won't be used in the future once PairwiseDistancesReductions + # support: + # - DistanceMetrics which work on supposedly binary data + # - CSR-dense and dense-CSR case if 'euclidean' in metric. 
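+            # Each chunk of the pairwise distance matrix produced by
+            # `pairwise_distances_chunked` is reduced on the fly to its
+            # `n_neighbors` smallest entries, keeping peak memory bounded.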
+ reduce_func = partial( + self._kneighbors_reduce_func, + n_neighbors=n_neighbors, + return_distance=return_distance, + ) + + # for efficiency, use squared euclidean distances + if self.effective_metric_ == "euclidean": + kwds = {"squared": True} + else: + kwds = self.effective_metric_params_ + + chunked_results = list( + pairwise_distances_chunked( + X, + self._fit_X, + reduce_func=reduce_func, + metric=self.effective_metric_, + n_jobs=n_jobs, + **kwds, + ) + ) + + elif self._fit_method in ["ball_tree", "kd_tree"]: + if issparse(X): + raise ValueError( + "%s does not work with sparse matrices. Densify the data, " + "or set algorithm='brute'" % self._fit_method + ) + chunked_results = Parallel(n_jobs, prefer="threads")( + delayed(self._tree.query)(X[s], n_neighbors, return_distance) + for s in gen_even_slices(X.shape[0], n_jobs) + ) + else: + raise ValueError("internal: _fit_method not recognized") + + if chunked_results is not None: + if return_distance: + neigh_dist, neigh_ind = zip(*chunked_results) + results = np.vstack(neigh_dist), np.vstack(neigh_ind) + else: + results = np.vstack(chunked_results) + + if not query_is_train: + return results + else: + # If the query data is the same as the indexed data, we would like + # to ignore the first nearest neighbor of every sample, i.e + # the sample itself. + if return_distance: + neigh_dist, neigh_ind = results + else: + neigh_ind = results + + n_queries, _ = X.shape + sample_range = np.arange(n_queries)[:, None] + sample_mask = neigh_ind != sample_range + + # Corner case: When the number of duplicates are more + # than the number of neighbors, the first NN will not + # be the sample, but a duplicate. + # In that case mask the first duplicate. + dup_gr_nbrs = np.all(sample_mask, axis=1) + sample_mask[:, 0][dup_gr_nbrs] = False + neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) + + if return_distance: + neigh_dist = np.reshape( + neigh_dist[sample_mask], (n_queries, n_neighbors - 1) + ) + return neigh_dist, neigh_ind + return neigh_ind + + def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): + """Compute the (weighted) graph of k-Neighbors for points in X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', default=None + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + For ``metric='precomputed'`` the shape should be + (n_queries, n_indexed). Otherwise the shape should be + (n_queries, n_features). + + n_neighbors : int, default=None + Number of neighbors for each sample. The default is the value + passed to the constructor. + + mode : {'connectivity', 'distance'}, default='connectivity' + Type of returned matrix: 'connectivity' will return the + connectivity matrix with ones and zeros, in 'distance' the + edges are distances between points, type of distance + depends on the selected metric parameter in + NearestNeighbors class. + + Returns + ------- + A : sparse-matrix of shape (n_queries, n_samples_fit) + `n_samples_fit` is the number of samples in the fitted data. + `A[i, j]` gives the weight of the edge connecting `i` to `j`. + The matrix is of CSR format. + + See Also + -------- + NearestNeighbors.radius_neighbors_graph : Compute the (weighted) graph + of Neighbors for points in X. 
+ + Examples + -------- + >>> X = [[0], [3], [1]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(n_neighbors=2) + >>> neigh.fit(X) + NearestNeighbors(n_neighbors=2) + >>> A = neigh.kneighbors_graph(X) + >>> A.toarray() + array([[1., 0., 1.], + [0., 1., 1.], + [1., 0., 1.]]) + """ + check_is_fitted(self) + if n_neighbors is None: + n_neighbors = self.n_neighbors + + # check the input only in self.kneighbors + + # construct CSR matrix representation of the k-NN graph + if mode == "connectivity": + A_ind = self.kneighbors(X, n_neighbors, return_distance=False) + n_queries = A_ind.shape[0] + A_data = np.ones(n_queries * n_neighbors) + + elif mode == "distance": + A_data, A_ind = self.kneighbors(X, n_neighbors, return_distance=True) + A_data = np.ravel(A_data) + + else: + raise ValueError( + 'Unsupported mode, must be one of "connectivity", ' + f'or "distance" but got "{mode}" instead' + ) + + n_queries = A_ind.shape[0] + n_samples_fit = self.n_samples_fit_ + n_nonzero = n_queries * n_neighbors + A_indptr = np.arange(0, n_nonzero + 1, n_neighbors) + + kneighbors_graph = csr_matrix( + (A_data, A_ind.ravel(), A_indptr), shape=(n_queries, n_samples_fit) + ) + + return kneighbors_graph + + +class RadiusNeighborsMixin: + """Mixin for radius-based neighbors searches.""" + + def _radius_neighbors_reduce_func(self, dist, start, radius, return_distance): + """Reduce a chunk of distances to the nearest neighbors. + + Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked` + + Parameters + ---------- + dist : ndarray of shape (n_samples_chunk, n_samples) + The distance matrix. + + start : int + The index in X which the first row of dist corresponds to. + + radius : float + The radius considered when making the nearest neighbors search. + + return_distance : bool + Whether or not to return the distances. + + Returns + ------- + dist : list of ndarray of shape (n_samples_chunk,) + Returned only if `return_distance=True`. + + neigh : list of ndarray of shape (n_samples_chunk,) + The neighbors indices. + """ + neigh_ind = [np.where(d <= radius)[0] for d in dist] + + if return_distance: + if self.effective_metric_ == "euclidean": + dist = [np.sqrt(d[neigh_ind[i]]) for i, d in enumerate(dist)] + else: + dist = [d[neigh_ind[i]] for i, d in enumerate(dist)] + results = dist, neigh_ind + else: + results = neigh_ind + return results + + def radius_neighbors( + self, X=None, radius=None, return_distance=True, sort_results=False + ): + """Find the neighbors within a given radius of a point or points. + + Return the indices and distances of each point from the dataset + lying in a ball with size ``radius`` around the points of the query + array. Points lying on the boundary are included in the results. + + The result points are *not* necessarily sorted by distance to their + query point. + + Parameters + ---------- + X : {array-like, sparse matrix} of (n_samples, n_features), default=None + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + radius : float, default=None + Limiting distance of neighbors to return. The default is the value + passed to the constructor. + + return_distance : bool, default=True + Whether or not to return the distances. + + sort_results : bool, default=False + If True, the distances and indices will be sorted by increasing + distances before being returned. If False, the results may not + be sorted. 
If `return_distance=False`, setting `sort_results=True` + will result in an error. + + .. versionadded:: 0.22 + + Returns + ------- + neigh_dist : ndarray of shape (n_samples,) of arrays + Array representing the distances to each point, only present if + `return_distance=True`. The distance values are computed according + to the ``metric`` constructor parameter. + + neigh_ind : ndarray of shape (n_samples,) of arrays + An array of arrays of indices of the approximate nearest points + from the population matrix that lie within a ball of size + ``radius`` around the query points. + + Notes + ----- + Because the number of neighbors of each point is not necessarily + equal, the results for multiple query points cannot be fit in a + standard data array. + For efficiency, `radius_neighbors` returns arrays of objects, where + each object is a 1D array of indices or distances. + + Examples + -------- + In the following example, we construct a NeighborsClassifier + class from an array representing our data set and ask who's + the closest point to [1, 1, 1]: + + >>> import numpy as np + >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(radius=1.6) + >>> neigh.fit(samples) + NearestNeighbors(radius=1.6) + >>> rng = neigh.radius_neighbors([[1., 1., 1.]]) + >>> print(np.asarray(rng[0][0])) + [1.5 0.5] + >>> print(np.asarray(rng[1][0])) + [1 2] + + The first array returned contains the distances to all points which + are closer than 1.6, while the second array returned contains their + indices. In general, multiple points can be queried at the same time. + """ + check_is_fitted(self) + + if sort_results and not return_distance: + raise ValueError("return_distance must be True if sort_results is True.") + + ensure_all_finite = "allow-nan" if get_tags(self).input_tags.allow_nan else True + query_is_train = X is None + if query_is_train: + X = self._fit_X + else: + if self.metric == "precomputed": + X = _check_precomputed(X) + else: + X = validate_data( + self, + X, + ensure_all_finite=ensure_all_finite, + accept_sparse="csr", + reset=False, + order="C", + ) + + if radius is None: + radius = self.radius + + use_pairwise_distances_reductions = ( + self._fit_method == "brute" + and RadiusNeighbors.is_usable_for( + X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ + ) + ) + + if use_pairwise_distances_reductions: + results = RadiusNeighbors.compute( + X=X, + Y=self._fit_X, + radius=radius, + metric=self.effective_metric_, + metric_kwargs=self.effective_metric_params_, + strategy="auto", + return_distance=return_distance, + sort_results=sort_results, + ) + + elif ( + self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) + ): + results = _radius_neighbors_from_graph( + X, radius=radius, return_distance=return_distance + ) + + elif self._fit_method == "brute": + # Joblib-based backend, which is used when user-defined callable + # are passed for metric. + + # This won't be used in the future once PairwiseDistancesReductions + # support: + # - DistanceMetrics which work on supposedly binary data + # - CSR-dense and dense-CSR case if 'euclidean' in metric. 
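+            # As in `kneighbors`, chunks of the distance matrix are reduced on
+            # the fly, here to the set of entries falling within `radius`.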
+ + # for efficiency, use squared euclidean distances + if self.effective_metric_ == "euclidean": + radius *= radius + kwds = {"squared": True} + else: + kwds = self.effective_metric_params_ + + reduce_func = partial( + self._radius_neighbors_reduce_func, + radius=radius, + return_distance=return_distance, + ) + + chunked_results = pairwise_distances_chunked( + X, + self._fit_X, + reduce_func=reduce_func, + metric=self.effective_metric_, + n_jobs=self.n_jobs, + **kwds, + ) + if return_distance: + neigh_dist_chunks, neigh_ind_chunks = zip(*chunked_results) + neigh_dist_list = sum(neigh_dist_chunks, []) + neigh_ind_list = sum(neigh_ind_chunks, []) + neigh_dist = _to_object_array(neigh_dist_list) + neigh_ind = _to_object_array(neigh_ind_list) + results = neigh_dist, neigh_ind + else: + neigh_ind_list = sum(chunked_results, []) + results = _to_object_array(neigh_ind_list) + + if sort_results: + for ii in range(len(neigh_dist)): + order = np.argsort(neigh_dist[ii], kind="mergesort") + neigh_ind[ii] = neigh_ind[ii][order] + neigh_dist[ii] = neigh_dist[ii][order] + results = neigh_dist, neigh_ind + + elif self._fit_method in ["ball_tree", "kd_tree"]: + if issparse(X): + raise ValueError( + "%s does not work with sparse matrices. Densify the data, " + "or set algorithm='brute'" % self._fit_method + ) + + n_jobs = effective_n_jobs(self.n_jobs) + delayed_query = delayed(self._tree.query_radius) + chunked_results = Parallel(n_jobs, prefer="threads")( + delayed_query(X[s], radius, return_distance, sort_results=sort_results) + for s in gen_even_slices(X.shape[0], n_jobs) + ) + if return_distance: + neigh_ind, neigh_dist = tuple(zip(*chunked_results)) + results = np.hstack(neigh_dist), np.hstack(neigh_ind) + else: + results = np.hstack(chunked_results) + else: + raise ValueError("internal: _fit_method not recognized") + + if not query_is_train: + return results + else: + # If the query data is the same as the indexed data, we would like + # to ignore the first nearest neighbor of every sample, i.e + # the sample itself. + if return_distance: + neigh_dist, neigh_ind = results + else: + neigh_ind = results + + for ind, ind_neighbor in enumerate(neigh_ind): + mask = ind_neighbor != ind + + neigh_ind[ind] = ind_neighbor[mask] + if return_distance: + neigh_dist[ind] = neigh_dist[ind][mask] + + if return_distance: + return neigh_dist, neigh_ind + return neigh_ind + + def radius_neighbors_graph( + self, X=None, radius=None, mode="connectivity", sort_results=False + ): + """Compute the (weighted) graph of Neighbors for points in X. + + Neighborhoods are restricted the points at a distance lower than + radius. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + radius : float, default=None + Radius of neighborhoods. The default is the value passed to the + constructor. + + mode : {'connectivity', 'distance'}, default='connectivity' + Type of returned matrix: 'connectivity' will return the + connectivity matrix with ones and zeros, in 'distance' the + edges are distances between points, type of distance + depends on the selected metric parameter in + NearestNeighbors class. + + sort_results : bool, default=False + If True, in each row of the result, the non-zero entries will be + sorted by increasing distances. If False, the non-zero entries may + not be sorted. Only used with mode='distance'. 
+ + .. versionadded:: 0.22 + + Returns + ------- + A : sparse-matrix of shape (n_queries, n_samples_fit) + `n_samples_fit` is the number of samples in the fitted data. + `A[i, j]` gives the weight of the edge connecting `i` to `j`. + The matrix is of CSR format. + + See Also + -------- + kneighbors_graph : Compute the (weighted) graph of k-Neighbors for + points in X. + + Examples + -------- + >>> X = [[0], [3], [1]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(radius=1.5) + >>> neigh.fit(X) + NearestNeighbors(radius=1.5) + >>> A = neigh.radius_neighbors_graph(X) + >>> A.toarray() + array([[1., 0., 1.], + [0., 1., 0.], + [1., 0., 1.]]) + """ + check_is_fitted(self) + + # check the input only in self.radius_neighbors + + if radius is None: + radius = self.radius + + # construct CSR matrix representation of the NN graph + if mode == "connectivity": + A_ind = self.radius_neighbors(X, radius, return_distance=False) + A_data = None + elif mode == "distance": + dist, A_ind = self.radius_neighbors( + X, radius, return_distance=True, sort_results=sort_results + ) + A_data = np.concatenate(list(dist)) + else: + raise ValueError( + 'Unsupported mode, must be one of "connectivity", ' + f'or "distance" but got "{mode}" instead' + ) + + n_queries = A_ind.shape[0] + n_samples_fit = self.n_samples_fit_ + n_neighbors = np.array([len(a) for a in A_ind]) + A_ind = np.concatenate(list(A_ind)) + if A_data is None: + A_data = np.ones(len(A_ind)) + A_indptr = np.concatenate((np.zeros(1, dtype=int), np.cumsum(n_neighbors))) + + return csr_matrix((A_data, A_ind, A_indptr), shape=(n_queries, n_samples_fit)) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = self.metric == "nan_euclidean" + return tags diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_binary_tree.pxi.tp b/.venv/Lib/site-packages/sklearn/neighbors/_binary_tree.pxi.tp new file mode 100644 index 0000000000000000000000000000000000000000..a7fe50f0aa94e19203cc16612fc7c83dc7c113ef --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_binary_tree.pxi.tp @@ -0,0 +1,2481 @@ +{{py: + +# Generated file: _binary_tree.pxi + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE + # + ('64', 'float64_t', 'np.float64', 'cnp.NPY_DOUBLE'), + ('32', 'float32_t', 'np.float32', 'cnp.NPY_FLOAT') +] + +# KD Tree and Ball Tree +# ===================== +# +# Author: Jake Vanderplas , 2012-2013 +# Omar Salman +# +# License: BSD +# +# _binary_tree.pxi is generated and is then literally Cython included in +# ball_tree.pyx and kd_tree.pyx. See ball_tree.pyx.tp and kd_tree.pyx.tp. + +}} + + +# KD Tree and Ball Tree +# ===================== +# +# The routines here are the core algorithms of the KDTree and BallTree +# structures. If Cython supported polymorphism, we would be able to +# create a subclass and derive KDTree and BallTree from it. Because +# polymorphism is not an option, we use this single BinaryTree class +# as a literal include to avoid duplicating the entire file. +# +# A series of functions are implemented in kd_tree.pyx and ball_tree.pyx +# which use the information here to calculate the lower and upper bounds +# between a node and a point, and between two nodes. These functions are +# used here, and are all that are needed to differentiate between the two +# tree types. 
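+#
+# As a rough sketch (not the verbatim contents of those files), a
+# specialization is produced by defining the tree-specific helpers and then
+# literally including the generated pxi; the names below are illustrative
+# only:
+#
+#     # in ball_tree.pyx (generated from ball_tree.pyx.tp)
+#     VALID_METRICS64 = [...]            # metrics this tree supports
+#     include "_binary_tree.pxi"         # pulls in BinaryTree64/BinaryTree32
+#
+#     cdef class BallTree64(BinaryTree64):
+#         __doc__ = CLASS_DOC.format(**DOC_DICT64)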
+#
+# Description of Binary Tree Algorithms
+# -------------------------------------
+# A binary tree can be thought of as a collection of nodes. The top node
+# contains all the points. The next level consists of two nodes with half
+# the points in each, and this continues recursively. Each node contains
+# metadata which allows fast computation of distance bounds: in the case of
+# a ball tree, the metadata is a center and a radius. In the case of a
+# KD tree, the metadata is the minimum and maximum bound along each dimension.
+#
+# In a typical KD Tree or Ball Tree implementation, the nodes are implemented
+# as dynamically allocated structures with pointers linking them. Here we
+# take a different approach, storing all relevant data in a set of arrays
+# so that the entire tree object can be saved in a pickle file. For efficiency,
+# the data can be stored in such a way that explicit pointers are not
+# necessary: for node data stored at index i, the two child nodes are at
+# index (2 * i + 1) and (2 * i + 2); the parent node is (i - 1) // 2
+# (where // indicates integer division).
+#
+# The data arrays used here are as follows:
+#   data : the [n_samples x n_features] array of data from which the tree
+#          is built
+#   idx_array : the length n_samples array used to keep track of the indices
+#          of data within each node.  Each node has values idx_start and
+#          idx_end: the points within the node are given by (using numpy
+#          syntax) data[idx_array[idx_start:idx_end]].
+#   node_data : the length n_nodes array of structures which store the node
+#          indices, node radii, and leaf information for each node.
+#   node_bounds : the [* x n_nodes x n_features] array containing the node
+#          bound information.  For ball tree, the first dimension is 1, and
+#          each row contains the centroid of the node.  For kd tree, the first
+#          dimension is 2 and the rows for each point contain the arrays of
+#          lower bounds and upper bounds in each direction.
+#
+# The lack of dynamic allocation means the number of nodes must be computed
+# before the tree is built.  This can be done assuming the points are
+# divided equally between child nodes at each step; although this removes
+# some flexibility in tree creation, it ensures a balanced tree and ensures
+# that the number of nodes required can be computed beforehand.  Given a
+# specified leaf_size (the minimum number of points in any node), it is
+# possible to show that a balanced tree will have
+#
+#     n_levels = 1 + max(0, floor(log2((n_samples - 1) / leaf_size)))
+#
+# levels in order to satisfy
+#
+#     leaf_size <= min(n_points) <= 2 * leaf_size
+#
+# with the exception of the special case where n_samples < leaf_size.
+# For a given number of levels, the number of nodes in the tree is given by
+#
+#     n_nodes = 2 ** n_levels - 1
+#
+# Both of these results can be shown straightforwardly by induction.  The
+# following code uses these values in the construction of the tree.
+#
+# Distance Metrics
+# ----------------
+# For flexibility, the trees can be built using a variety of distance metrics.
+# The metrics are described in the DistanceMetric class: the standard
+# Euclidean distance is the default, and is inlined to be faster than other
+# metrics.  In addition, each metric defines both a distance and a
+# "reduced distance", which is often faster to compute, and is therefore
+# used in the query architecture whenever possible.  (For example, in the
+# case of the standard Euclidean distance, the reduced distance is the
+# squared-distance.)
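+#
+# As a small illustrative sketch of the implicit layout and the two formulas
+# above (pure Python, not part of this module; the names are hypothetical):
+#
+#     import numpy as np
+#
+#     def tree_shape(n_samples, leaf_size):
+#         # number of levels so that leaves hold at most 2 * leaf_size points
+#         n_levels = 1 + max(0, int(np.log2(max(1, (n_samples - 1) / leaf_size))))
+#         n_nodes = 2 ** n_levels - 1
+#         return n_levels, n_nodes
+#
+#     def children(i):
+#         return 2 * i + 1, 2 * i + 2   # implicit "pointers" into the node arrays
+#
+#     def parent(i):
+#         return (i - 1) // 2
+#
+#     assert tree_shape(n_samples=1000, leaf_size=40) == (5, 31)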
+# +# Implementation Notes +# -------------------- +# This implementation uses the common object-oriented approach of having an +# abstract base class which is extended by the KDTree and BallTree +# specializations. +# +# The BinaryTree "base class" is defined here and then subclassed in the BallTree +# and KDTree pyx files. These files include implementations of the +# "abstract" methods. + +# Necessary Helper Functions +# -------------------------- +# These are the names and descriptions of the "abstract" functions which are +# defined in kd_tree.pyx and ball_tree.pyx: + +# cdef int allocate_data(BinaryTree tree, intp_t n_nodes, intp_t n_features): +# """Allocate arrays needed for the KD Tree""" + +# cdef int init_node(BinaryTree tree, intp_t i_node, +# intp_t idx_start, intp_t idx_end): +# """Initialize the node for the dataset stored in tree.data""" + +# cdef float64_t min_rdist(BinaryTree tree, intp_t i_node, float64_t* pt): +# """Compute the minimum reduced-distance between a point and a node""" + +# cdef float64_t min_dist(BinaryTree tree, intp_t i_node, float64_t* pt): +# """Compute the minimum distance between a point and a node""" + +# cdef float64_t max_rdist(BinaryTree tree, intp_t i_node, float64_t* pt): +# """Compute the maximum reduced-distance between a point and a node""" + +# cdef float64_t max_dist(BinaryTree tree, intp_t i_node, float64_t* pt): +# """Compute the maximum distance between a point and a node""" + +# cdef inline int min_max_dist(BinaryTree tree, intp_t i_node, float64_t* pt, +# float64_t* min_dist, float64_t* max_dist): +# """Compute the minimum and maximum distance between a point and a node""" + +# cdef inline float64_t min_rdist_dual(BinaryTree tree1, intp_t i_node1, +# BinaryTree tree2, intp_t i_node2): +# """Compute the minimum reduced distance between two nodes""" + +# cdef inline float64_t min_dist_dual(BinaryTree tree1, intp_t i_node1, +# BinaryTree tree2, intp_t i_node2): +# """Compute the minimum distance between two nodes""" + +# cdef inline float64_t max_rdist_dual(BinaryTree tree1, intp_t i_node1, +# BinaryTree tree2, intp_t i_node2): +# """Compute the maximum reduced distance between two nodes""" + +# cdef inline float64_t max_dist_dual(BinaryTree tree1, intp_t i_node1, +# BinaryTree tree2, intp_t i_node2): +# """Compute the maximum distance between two nodes""" + +cimport numpy as cnp +from cython cimport floating +from libc.math cimport fabs, sqrt, exp, cos, pow, log, lgamma +from libc.math cimport fmin, fmax +from libc.stdlib cimport calloc, malloc, free +from libc.string cimport memcpy + +import numpy as np +import warnings + +from ..metrics._dist_metrics cimport ( + DistanceMetric, + DistanceMetric64, + DistanceMetric32, + euclidean_dist64, + euclidean_dist32, + euclidean_rdist64, + euclidean_rdist32, + euclidean_dist_to_rdist64, + euclidean_dist_to_rdist32, +) + +from ._partition_nodes cimport partition_node_indices + +from ..utils import check_array +from ..utils._typedefs cimport float32_t, float64_t, intp_t +from ..utils._heap cimport heap_push +from ..utils._sorting cimport simultaneous_sort as _simultaneous_sort + +cnp.import_array() + + +# TODO: use cnp.PyArray_ENABLEFLAGS when Cython>=3.0 is used. 
+cdef extern from "numpy/arrayobject.h":
+    void PyArray_ENABLEFLAGS(cnp.ndarray arr, int flags)
+
+
+# some handy constants
+cdef float64_t INF = np.inf
+cdef float64_t NEG_INF = -np.inf
+cdef float64_t PI = np.pi
+cdef float64_t ROOT_2PI = sqrt(2 * PI)
+cdef float64_t LOG_PI = log(PI)
+cdef float64_t LOG_2PI = log(2 * PI)
+
+
+# Some compound datatypes used below:
+cdef struct NodeHeapData_t:
+    float64_t val
+    intp_t i1
+    intp_t i2
+
+# build the corresponding numpy dtype for NodeHeapData
+cdef NodeHeapData_t nhd_tmp
+NodeHeapData = np.asarray(<NodeHeapData_t[:1]>(&nhd_tmp)).dtype
+
+cdef struct NodeData_t:
+    intp_t idx_start
+    intp_t idx_end
+    intp_t is_leaf
+    float64_t radius
+
+# build the corresponding numpy dtype for NodeData
+cdef NodeData_t nd_tmp
+NodeData = np.asarray(<NodeData_t[:1]>(&nd_tmp)).dtype
+
+
+######################################################################
+# Define doc strings, substituting the appropriate class name using
+# the DOC_DICT variable defined in the pyx files.
+CLASS_DOC = """{BinaryTree} for fast generalized N-point problems
+
+Read more in the :ref:`User Guide <unsupervised_neighbors>`.
+
+Parameters
+----------
+X : array-like of shape (n_samples, n_features)
+    n_samples is the number of points in the data set, and
+    n_features is the dimension of the parameter space.
+    Note: if X is a C-contiguous array of doubles then data will
+    not be copied. Otherwise, an internal copy will be made.
+
+leaf_size : positive int, default=40
+    Number of points at which to switch to brute-force. Changing
+    leaf_size will not affect the results of a query, but can
+    significantly impact the speed of a query and the memory required
+    to store the constructed tree.  The amount of memory needed to
+    store the tree scales as approximately n_samples / leaf_size.
+    For a specified ``leaf_size``, a leaf node is guaranteed to
+    satisfy ``leaf_size <= n_points <= 2 * leaf_size``, except in
+    the case that ``n_samples < leaf_size``.
+
+metric : str or DistanceMetric64 object, default='minkowski'
+    Metric to use for distance computation. Default is "minkowski", which
+    results in the standard Euclidean distance when p = 2.
+    A list of valid metrics for {BinaryTree} is given by the attribute
+    `valid_metrics`.
+    See the documentation of `scipy.spatial.distance
+    <https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
+    the metrics listed in :class:`~sklearn.metrics.pairwise.distance_metrics` for
+    more information on any distance metric.
+
+Additional keywords are passed to the distance metric class.
+Note: Callable functions in the metric parameter are NOT supported for KDTree
+and Ball Tree. Function call overhead will result in very poor performance.
+
+Attributes
+----------
+data : memory view
+    The training data
+valid_metrics : list of str
+    List of valid distance metrics.
+
+Examples
+--------
+Query for k-nearest neighbors
+
+    >>> import numpy as np
+    >>> rng = np.random.RandomState(0)
+    >>> from sklearn.neighbors import {BinaryTree}
+    >>> X = rng.random_sample((10, 3))  # 10 points in 3 dimensions
+    >>> tree = {BinaryTree}(X, leaf_size=2)  # doctest: +SKIP
+    >>> dist, ind = tree.query(X[:1], k=3)  # doctest: +SKIP
+    >>> print(ind)  # indices of 3 closest neighbors
+    [0 3 1]
+    >>> print(dist)  # distances to 3 closest neighbors
+    [ 0.          0.19662693  0.29473397]
+
+Pickle and Unpickle a tree. Note that the state of the tree is saved in the
+pickle operation: the tree need not be rebuilt upon unpickling.
+ + >>> import numpy as np + >>> import pickle + >>> rng = np.random.RandomState(0) + >>> X = rng.random_sample((10, 3)) # 10 points in 3 dimensions + >>> tree = {BinaryTree}(X, leaf_size=2) # doctest: +SKIP + >>> s = pickle.dumps(tree) # doctest: +SKIP + >>> tree_copy = pickle.loads(s) # doctest: +SKIP + >>> dist, ind = tree_copy.query(X[:1], k=3) # doctest: +SKIP + >>> print(ind) # indices of 3 closest neighbors + [0 3 1] + >>> print(dist) # distances to 3 closest neighbors + [ 0. 0.19662693 0.29473397] + +Query for neighbors within a given radius + + >>> import numpy as np + >>> rng = np.random.RandomState(0) + >>> X = rng.random_sample((10, 3)) # 10 points in 3 dimensions + >>> tree = {BinaryTree}(X, leaf_size=2) # doctest: +SKIP + >>> print(tree.query_radius(X[:1], r=0.3, count_only=True)) + 3 + >>> ind = tree.query_radius(X[:1], r=0.3) # doctest: +SKIP + >>> print(ind) # indices of neighbors within distance 0.3 + [3 0 1] + + +Compute a gaussian kernel density estimate: + + >>> import numpy as np + >>> rng = np.random.RandomState(42) + >>> X = rng.random_sample((100, 3)) + >>> tree = {BinaryTree}(X) # doctest: +SKIP + >>> tree.kernel_density(X[:3], h=0.1, kernel='gaussian') + array([ 6.94114649, 7.83281226, 7.2071716 ]) + +Compute a two-point auto-correlation function + + >>> import numpy as np + >>> rng = np.random.RandomState(0) + >>> X = rng.random_sample((30, 3)) + >>> r = np.linspace(0, 1, 5) + >>> tree = {BinaryTree}(X) # doctest: +SKIP + >>> tree.two_point_correlation(X, r) + array([ 30, 62, 278, 580, 820]) + +""" + + +###################################################################### +# Utility functions +cdef float64_t logaddexp(float64_t x1, float64_t x2): + """logaddexp(x1, x2) -> log(exp(x1) + exp(x2))""" + cdef float64_t a = fmax(x1, x2) + if a == NEG_INF: + return NEG_INF + else: + return a + log(exp(x1 - a) + exp(x2 - a)) + +cdef float64_t logsubexp(float64_t x1, float64_t x2): + """logsubexp(x1, x2) -> log(exp(x1) - exp(x2))""" + if x1 <= x2: + return NEG_INF + else: + return x1 + log(1 - exp(x2 - x1)) + + +###################################################################### +# Kernel functions +# +# Note: Kernels assume dist is non-negative and h is positive +# All kernel functions are normalized such that K(0, h) = 1. +# The fully normalized kernel is: +# K = exp[kernel_norm(h, d, kernel) + compute_kernel(dist, h, kernel)] +# The code only works with non-negative kernels: i.e. K(d, h) >= 0 +# for all valid d and h. Note that for precision, the log of both +# the kernel and kernel norm is returned. 
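+#
+# For example, the fully normalized log-contribution of a single training
+# point at distance dist, for a gaussian kernel of bandwidth h in d
+# dimensions, would be assembled from the helpers defined below as
+# (illustrative pseudocode, not executed here):
+#
+#     log_k = (_log_kernel_norm(h, d, GAUSSIAN_KERNEL)
+#              + log_gaussian_kernel(dist, h))
+#     k = exp(log_k)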
+cdef enum KernelType:
+    GAUSSIAN_KERNEL = 1
+    TOPHAT_KERNEL = 2
+    EPANECHNIKOV_KERNEL = 3
+    EXPONENTIAL_KERNEL = 4
+    LINEAR_KERNEL = 5
+    COSINE_KERNEL = 6
+
+
+cdef inline float64_t log_gaussian_kernel(float64_t dist, float64_t h):
+    """log of the gaussian kernel for bandwidth h (unnormalized)"""
+    return -0.5 * (dist * dist) / (h * h)
+
+
+cdef inline float64_t log_tophat_kernel(float64_t dist, float64_t h):
+    """log of the tophat kernel for bandwidth h (unnormalized)"""
+    if dist < h:
+        return 0.0
+    else:
+        return NEG_INF
+
+
+cdef inline float64_t log_epanechnikov_kernel(float64_t dist, float64_t h):
+    """log of the epanechnikov kernel for bandwidth h (unnormalized)"""
+    if dist < h:
+        return log(1.0 - (dist * dist) / (h * h))
+    else:
+        return NEG_INF
+
+
+cdef inline float64_t log_exponential_kernel(float64_t dist, float64_t h):
+    """log of the exponential kernel for bandwidth h (unnormalized)"""
+    return -dist / h
+
+
+cdef inline float64_t log_linear_kernel(float64_t dist, float64_t h):
+    """log of the linear kernel for bandwidth h (unnormalized)"""
+    if dist < h:
+        return log(1 - dist / h)
+    else:
+        return NEG_INF
+
+
+cdef inline float64_t log_cosine_kernel(float64_t dist, float64_t h):
+    """log of the cosine kernel for bandwidth h (unnormalized)"""
+    if dist < h:
+        return log(cos(0.5 * PI * dist / h))
+    else:
+        return NEG_INF
+
+
+cdef inline float64_t compute_log_kernel(float64_t dist, float64_t h,
+                                         KernelType kernel):
+    """Given a KernelType enumeration, compute the appropriate log-kernel"""
+    if kernel == GAUSSIAN_KERNEL:
+        return log_gaussian_kernel(dist, h)
+    elif kernel == TOPHAT_KERNEL:
+        return log_tophat_kernel(dist, h)
+    elif kernel == EPANECHNIKOV_KERNEL:
+        return log_epanechnikov_kernel(dist, h)
+    elif kernel == EXPONENTIAL_KERNEL:
+        return log_exponential_kernel(dist, h)
+    elif kernel == LINEAR_KERNEL:
+        return log_linear_kernel(dist, h)
+    elif kernel == COSINE_KERNEL:
+        return log_cosine_kernel(dist, h)
+
+
+# ------------------------------------------------------------
+# Kernel norms are defined via the volume element V_n
+# and surface element S_(n-1) of an n-sphere.
+cdef float64_t logVn(intp_t n):
+    """V_n = pi^(n/2) / gamma(n/2 + 1)"""
+    return 0.5 * n * LOG_PI - lgamma(0.5 * n + 1)
+
+
+cdef float64_t logSn(intp_t n):
+    """V_(n+1) = int_0^1 S_n r^n dr"""
+    return LOG_2PI + logVn(n - 1)
+
+
+cdef float64_t _log_kernel_norm(float64_t h, intp_t d,
+                                KernelType kernel) except -1:
+    """Given a KernelType enumeration, compute the kernel normalization.
+
+    h is the bandwidth, d is the dimension.
+    """
+    cdef float64_t tmp, factor = 0
+    cdef intp_t k
+    if kernel == GAUSSIAN_KERNEL:
+        factor = 0.5 * d * LOG_2PI
+    elif kernel == TOPHAT_KERNEL:
+        factor = logVn(d)
+    elif kernel == EPANECHNIKOV_KERNEL:
+        factor = logVn(d) + log(2. / (d + 2.))
+    elif kernel == EXPONENTIAL_KERNEL:
+        factor = logSn(d - 1) + lgamma(d)
+    elif kernel == LINEAR_KERNEL:
+        factor = logVn(d) - log(d + 1.)
+    elif kernel == COSINE_KERNEL:
+        # this is derived from a chain rule integration
+        factor = 0
+        tmp = 2. / PI
+        for k in range(1, d + 1, 2):
+            factor += tmp
+            tmp *= -(d - k) * (d - k - 1) * (2. / PI) ** 2
+        factor = log(factor) + logSn(d - 1)
+    else:
+        raise ValueError("Kernel code not recognized")
+    return -factor - d * log(h)
+
+
+def kernel_norm(h, d, kernel, return_log=False):
+    """Given a string specification of a kernel, compute the normalization.
+
+    Parameters
+    ----------
+    h : float
+        The bandwidth of the kernel.
+ d : int + The dimension of the space in which the kernel norm is computed. + kernel : str + The kernel identifier. Must be one of + ['gaussian'|'tophat'|'epanechnikov'| + 'exponential'|'linear'|'cosine'] + return_log : bool, default=False + If True, return the log of the kernel norm. Otherwise, return the + kernel norm. + Returns + ------- + knorm or log_knorm : float + the kernel norm or logarithm of the kernel norm. + """ + if kernel == 'gaussian': + result = _log_kernel_norm(h, d, GAUSSIAN_KERNEL) + elif kernel == 'tophat': + result = _log_kernel_norm(h, d, TOPHAT_KERNEL) + elif kernel == 'epanechnikov': + result = _log_kernel_norm(h, d, EPANECHNIKOV_KERNEL) + elif kernel == 'exponential': + result = _log_kernel_norm(h, d, EXPONENTIAL_KERNEL) + elif kernel == 'linear': + result = _log_kernel_norm(h, d, LINEAR_KERNEL) + elif kernel == 'cosine': + result = _log_kernel_norm(h, d, COSINE_KERNEL) + else: + raise ValueError('kernel not recognized') + + if return_log: + return result + else: + return np.exp(result) + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} + +cdef class NeighborsHeap{{name_suffix}}: + """A max-heap structure to keep track of distances/indices of neighbors + + This implements an efficient pre-allocated set of fixed-size heaps + for chasing neighbors, holding both an index and a distance. + When any row of the heap is full, adding an additional point will push + the furthest point off the heap. + + Parameters + ---------- + n_pts : int + the number of heaps to use + n_nbrs : int + the size of each heap. + """ + cdef {{INPUT_DTYPE_t}}[:, ::1] distances + cdef intp_t[:, ::1] indices + + def __cinit__(self): + # One-element arrays are used as placeholders to prevent + # any problem due to potential access to those attributes + # (e.g. assigning to NULL or a to value in another segment). + self.distances = np.zeros((1, 1), dtype={{INPUT_DTYPE}}, order='C') + self.indices = np.zeros((1, 1), dtype=np.intp, order='C') + + def __init__(self, n_pts, n_nbrs): + self.distances = np.full( + (n_pts, n_nbrs), np.inf, dtype={{INPUT_DTYPE}}, order='C' + ) + self.indices = np.zeros((n_pts, n_nbrs), dtype=np.intp, order='C') + + def get_arrays(self, sort=True): + """Get the arrays of distances and indices within the heap. + + If sort=True, then simultaneously sort the indices and distances, + so the closer points are listed first. 
+ """ + if sort: + self._sort() + return self.distances.base, self.indices.base + + cdef inline float64_t largest(self, intp_t row) except -1 nogil: + """Return the largest distance in the given row""" + return self.distances[row, 0] + + def push(self, intp_t row, float64_t val, intp_t i_val): + return self._push(row, val, i_val) + + cdef int _push(self, intp_t row, float64_t val, + intp_t i_val) except -1 nogil: + """push (val, i_val) into the given row""" + return heap_push( + values=&self.distances[row, 0], + indices=&self.indices[row, 0], + size=self.distances.shape[1], + val=val, + val_idx=i_val, + ) + + cdef int _sort(self) except -1: + """simultaneously sort the distances and indices""" + cdef intp_t row + for row in range(self.distances.shape[0]): + _simultaneous_sort( + dist=&self.distances[row, 0], + idx=&self.indices[row, 0], + size=self.distances.shape[1], + ) + return 0 + +{{endfor}} + +#------------------------------------------------------------ +# find_node_split_dim: +# this computes the equivalent of +# j_max = np.argmax(np.max(data, 0) - np.min(data, 0)) +cdef intp_t find_node_split_dim(const floating* data, + const intp_t* node_indices, + intp_t n_features, + intp_t n_points) except -1: + """Find the dimension with the largest spread. + + Parameters + ---------- + data : double pointer + Pointer to a 2D array of the training data, of shape [N, n_features]. + N must be greater than any of the values in node_indices. + node_indices : int pointer + Pointer to a 1D array of length n_points. This lists the indices of + each of the points within the current node. + + Returns + ------- + i_max : int + The index of the feature (dimension) within the node that has the + largest spread. + + Notes + ----- + In numpy, this operation is equivalent to + + def find_node_split_dim(data, node_indices): + return np.argmax(data[node_indices].max(0) - data[node_indices].min(0)) + + The cython version is much more efficient in both computation and memory. + """ + cdef float64_t min_val, max_val, val, spread, max_spread + cdef intp_t i, j, j_max + + j_max = 0 + max_spread = 0 + + for j in range(n_features): + max_val = data[node_indices[0] * n_features + j] + min_val = max_val + for i in range(1, n_points): + val = data[node_indices[i] * n_features + j] + max_val = fmax(max_val, val) + min_val = fmin(min_val, val) + spread = max_val - min_val + if spread > max_spread: + max_spread = spread + j_max = j + return j_max + + +###################################################################### +# NodeHeap : min-heap used to keep track of nodes during +# breadth-first query +cdef inline void swap_nodes(NodeHeapData_t* arr, intp_t i1, intp_t i2): + cdef NodeHeapData_t tmp = arr[i1] + arr[i1] = arr[i2] + arr[i2] = tmp + + +cdef class NodeHeap: + """NodeHeap + + This is a min-heap implementation for keeping track of nodes + during a breadth-first search. Unlike the NeighborsHeap above, + the NodeHeap does not have a fixed size and must be able to grow + as elements are added. + + Internally, the data is stored in a simple binary heap which meets + the min heap condition: + + heap[i].val < min(heap[2 * i + 1].val, heap[2 * i + 2].val) + """ + cdef NodeHeapData_t[:] data + cdef intp_t n + + def __cinit__(self): + # A one-elements array is used as a placeholder to prevent + # any problem due to potential access to this attribute + # (e.g. assigning to NULL or a to value in another segment). 
+ self.data = np.zeros(1, dtype=NodeHeapData, order='C') + + def __init__(self, size_guess=100): + size_guess = max(size_guess, 1) # need space for at least one item + self.data = np.zeros(size_guess, dtype=NodeHeapData, order='C') + self.n = size_guess + self.clear() + + cdef int resize(self, intp_t new_size) except -1: + """Resize the heap to be either larger or smaller""" + cdef: + NodeHeapData_t *data_ptr + NodeHeapData_t *new_data_ptr + intp_t i + intp_t size = self.data.shape[0] + NodeHeapData_t[:] new_data = np.zeros( + new_size, + dtype=NodeHeapData, + ) + + if size > 0 and new_size > 0: + data_ptr = &self.data[0] + new_data_ptr = &new_data[0] + for i in range(min(size, new_size)): + new_data_ptr[i] = data_ptr[i] + + if new_size < size: + self.n = new_size + + self.data = new_data + return 0 + + cdef int push(self, NodeHeapData_t data) except -1: + """Push a new item onto the heap""" + cdef intp_t i, i_parent + cdef NodeHeapData_t* data_arr + self.n += 1 + if self.n > self.data.shape[0]: + self.resize(2 * self.n) + + # put the new element at the end, + # and then perform swaps until the heap is in order + data_arr = &self.data[0] + i = self.n - 1 + data_arr[i] = data + + while i > 0: + i_parent = (i - 1) // 2 + if data_arr[i_parent].val <= data_arr[i].val: + break + else: + swap_nodes(data_arr, i, i_parent) + i = i_parent + return 0 + + cdef NodeHeapData_t peek(self): + """Peek at the root of the heap, without removing it""" + return self.data[0] + + cdef NodeHeapData_t pop(self): + """Remove the root of the heap, and update the remaining nodes""" + if self.n == 0: + raise ValueError('cannot pop on empty heap') + + cdef intp_t i, i_child1, i_child2, i_swap + cdef NodeHeapData_t* data_arr = &self.data[0] + cdef NodeHeapData_t popped_element = data_arr[0] + + # pop off the first element, move the last element to the front, + # and then perform swaps until the heap is back in order + data_arr[0] = data_arr[self.n - 1] + self.n -= 1 + + i = 0 + + while (i < self.n): + i_child1 = 2 * i + 1 + i_child2 = 2 * i + 2 + i_swap = 0 + + if i_child2 < self.n: + if data_arr[i_child1].val <= data_arr[i_child2].val: + i_swap = i_child1 + else: + i_swap = i_child2 + elif i_child1 < self.n: + i_swap = i_child1 + else: + break + + if (i_swap > 0) and (data_arr[i_swap].val <= data_arr[i].val): + swap_nodes(data_arr, i, i_swap) + i = i_swap + else: + break + + return popped_element + + cdef void clear(self): + """Clear the heap""" + self.n = 0 + + +###################################################################### +# newObj function +# this is a helper function for pickling +def newObj(obj): + return obj.__new__(obj) + + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} + +###################################################################### +# define the reverse mapping of VALID_METRICS{{name_suffix}} +from sklearn.metrics._dist_metrics import get_valid_metric_ids +VALID_METRIC_IDS{{name_suffix}} = get_valid_metric_ids(VALID_METRICS{{name_suffix}}) + + +###################################################################### +# Binary Tree class +cdef class BinaryTree{{name_suffix}}: + + cdef readonly const {{INPUT_DTYPE_t}}[:, ::1] data + cdef readonly const {{INPUT_DTYPE_t}}[::1] sample_weight + cdef public float64_t sum_weight + + # TODO: idx_array and node_bounds must not be const, but this change needs + # to happen in a way which preserves pickling + # See also: https://github.com/cython/cython/issues/5639 + cdef public const intp_t[::1] idx_array + 
cdef public const NodeData_t[::1] node_data
+    cdef public const {{INPUT_DTYPE_t}}[:, :, ::1] node_bounds
+
+    cdef intp_t leaf_size
+    cdef intp_t n_levels
+    cdef intp_t n_nodes
+
+    cdef DistanceMetric{{name_suffix}} dist_metric
+    cdef int euclidean
+
+    # variables to keep track of building & querying stats
+    cdef int n_trims
+    cdef int n_leaves
+    cdef int n_splits
+    cdef int n_calls
+
+    valid_metrics = VALID_METRIC_IDS{{name_suffix}}
+
+    # Use cinit to initialize all arrays to empty: this will prevent memory
+    # errors and seg-faults in rare cases where __init__ is not called.
+    # A one-element array is used as a placeholder to prevent
+    # any problem due to potential access to this attribute
+    # (e.g. assigning to NULL or to a value in another segment).
+    def __cinit__(self):
+        self.data = np.empty((1, 1), dtype={{INPUT_DTYPE}}, order='C')
+        self.sample_weight = np.empty(1, dtype={{INPUT_DTYPE}}, order='C')
+        self.idx_array = np.empty(1, dtype=np.intp, order='C')
+        self.node_data = np.empty(1, dtype=NodeData, order='C')
+        self.node_bounds = np.empty((1, 1, 1), dtype={{INPUT_DTYPE}})
+
+        self.leaf_size = 0
+        self.n_levels = 0
+        self.n_nodes = 0
+
+        self.euclidean = False
+
+        self.n_trims = 0
+        self.n_leaves = 0
+        self.n_splits = 0
+        self.n_calls = 0
+
+    def __init__(self, data,
+                 leaf_size=40, metric='minkowski', sample_weight=None, **kwargs):
+        # validate data
+        self.data = check_array(data, dtype={{INPUT_DTYPE}}, order='C')
+        if self.data.size == 0:
+            raise ValueError("X is an empty array")
+
+        n_samples = self.data.shape[0]
+        n_features = self.data.shape[1]
+
+        if leaf_size < 1:
+            raise ValueError("leaf_size must be greater than or equal to 1")
+        self.leaf_size = leaf_size
+
+        self.dist_metric = DistanceMetric.get_metric(metric, dtype={{INPUT_DTYPE}}, **kwargs)
+        self.euclidean = (self.dist_metric.__class__.__name__
+                          == 'EuclideanDistance{{name_suffix}}')
+
+        metric = self.dist_metric.__class__.__name__
+        if metric not in VALID_METRICS{{name_suffix}}:
+            raise ValueError('metric {metric} is not valid for '
+                             '{BinaryTree}'.format(metric=metric,
+                                                   **DOC_DICT{{name_suffix}}))
+        self.dist_metric._validate_data(self.data)
+
+        # determine number of levels in the tree, and from this
+        # the number of nodes in the tree.
This results in leaf nodes + # with numbers of points between leaf_size and 2 * leaf_size + self.n_levels = int( + np.log2(fmax(1, (n_samples - 1) / self.leaf_size)) + 1) + self.n_nodes = (2 ** self.n_levels) - 1 + + # allocate arrays for storage + self.idx_array = np.arange(n_samples, dtype=np.intp) + self.node_data = np.zeros(self.n_nodes, dtype=NodeData) + + self._update_sample_weight(n_samples, sample_weight) + + # Allocate tree-specific data + allocate_data{{name_suffix}}(self, self.n_nodes, n_features) + self._recursive_build( + node_data=self.node_data.base, + i_node=0, + idx_start=0, + idx_end=n_samples + ) + + def _update_sample_weight(self, n_samples, sample_weight): + if sample_weight is not None: + self.sample_weight = np.asarray( + sample_weight, dtype={{INPUT_DTYPE}}, order='C') + self.sum_weight = np.sum(self.sample_weight) + else: + self.sample_weight = None + self.sum_weight = n_samples + + def __reduce__(self): + """ + reduce method used for pickling + """ + return (newObj, (type(self),), self.__getstate__()) + + def __getstate__(self): + """ + get state for pickling + """ + if self.sample_weight is not None: + # pass the numpy array + sample_weight = self.sample_weight.base + else: + # pass None to avoid confusion with the empty place holder + # of size 1 from __cinit__ + sample_weight = None + return (self.data.base, + self.idx_array.base, + self.node_data.base, + self.node_bounds.base, + int(self.leaf_size), + int(self.n_levels), + int(self.n_nodes), + int(self.n_trims), + int(self.n_leaves), + int(self.n_splits), + int(self.n_calls), + self.dist_metric, + sample_weight) + + def __setstate__(self, state): + """ + set state for pickling + """ + self.data = state[0] + self.idx_array = state[1] + self.node_data = state[2] + self.node_bounds = state[3] + self.leaf_size = state[4] + self.n_levels = state[5] + self.n_nodes = state[6] + self.n_trims = state[7] + self.n_leaves = state[8] + self.n_splits = state[9] + self.n_calls = state[10] + self.dist_metric = state[11] + sample_weight = state[12] + + self.euclidean = (self.dist_metric.__class__.__name__ + == 'EuclideanDistance64') + n_samples = self.data.shape[0] + self._update_sample_weight(n_samples, sample_weight) + + def get_tree_stats(self): + """ + get_tree_stats() + + Get tree status. + + Returns + ------- + tree_stats: tuple of int + (number of trims, number of leaves, number of splits) + """ + return (self.n_trims, self.n_leaves, self.n_splits) + + def reset_n_calls(self): + """ + reset_n_calls() + + Reset number of calls to 0. + """ + self.n_calls = 0 + + def get_n_calls(self): + """ + get_n_calls() + + Get number of calls. + + Returns + ------- + n_calls: int + number of distance computation calls + """ + return self.n_calls + + def get_arrays(self): + """ + get_arrays() + + Get data and node arrays. + + Returns + ------- + arrays: tuple of array + Arrays for storing tree data, index, node data and node bounds. 
+
+        """
+        return (
+            self.data.base,
+            self.idx_array.base,
+            self.node_data.base,
+            self.node_bounds.base,
+        )
+
+    cdef inline float64_t dist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2,
+                               intp_t size) except -1 nogil:
+        """Compute the distance between arrays x1 and x2"""
+        self.n_calls += 1
+        if self.euclidean:
+            return euclidean_dist{{name_suffix}}(x1, x2, size)
+        else:
+            return self.dist_metric.dist(x1, x2, size)
+
+    cdef inline float64_t rdist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2,
+                                intp_t size) except -1 nogil:
+        """Compute the reduced distance between arrays x1 and x2.
+
+        The reduced distance, defined for some metrics, is a quantity which
+        is more efficient to compute than the distance, but preserves the
+        relative rankings of the true distance. For example, the reduced
+        distance for the Euclidean metric is the squared-euclidean distance.
+        """
+        self.n_calls += 1
+        if self.euclidean:
+            return euclidean_rdist{{name_suffix}}(x1, x2, size)
+        else:
+            return self.dist_metric.rdist(x1, x2, size)
+
+    cdef int _recursive_build(self, NodeData_t[::1] node_data, intp_t i_node, intp_t idx_start,
+                              intp_t idx_end) except -1:
+        """Recursively build the tree.
+
+        Parameters
+        ----------
+        node_data : array of NodeData_t
+            the array of node metadata to fill in during the build
+        i_node : int
+            the node for the current step
+        idx_start, idx_end : int
+            the bounding indices in the idx_array which define the points that
+            belong to this node.
+        """
+        cdef intp_t i_max
+        cdef intp_t n_features = self.data.shape[1]
+        cdef intp_t n_points = idx_end - idx_start
+        cdef intp_t n_mid = n_points / 2
+        cdef intp_t* idx_array = &self.idx_array[idx_start]
+        cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0]
+
+        # initialize node data
+        init_node{{name_suffix}}(self, node_data, i_node, idx_start, idx_end)
+
+        if 2 * i_node + 1 >= self.n_nodes:
+            node_data[i_node].is_leaf = True
+            if idx_end - idx_start > 2 * self.leaf_size:
+                # this shouldn't happen if our memory allocation is correct
+                # we'll proactively prevent memory errors, but raise a
+                # warning saying we're doing so.
+                import warnings
+                warnings.warn("Internal: memory layout is flawed: "
+                              "not enough nodes allocated")
+
+        elif idx_end - idx_start < 2:
+            # again, this shouldn't happen if our memory allocation
+            # is correct.  Raise a warning.
+            import warnings
+            warnings.warn("Internal: memory layout is flawed: "
+                          "too many nodes allocated")
+            node_data[i_node].is_leaf = True
+
+        else:
+            # split node and recursively construct child nodes.
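+            # (the split dimension is chosen as the one with the largest
+            # spread via find_node_split_dim, and partition_node_indices
+            # moves the n_mid smallest entries along that dimension to the
+            # front, so each child receives roughly half of the points)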
+ node_data[i_node].is_leaf = False + i_max = find_node_split_dim(data, idx_array, + n_features, n_points) + partition_node_indices(data, idx_array, i_max, n_mid, + n_features, n_points) + self._recursive_build(node_data, 2 * i_node + 1, + idx_start, idx_start + n_mid) + self._recursive_build(node_data, 2 * i_node + 2, + idx_start + n_mid, idx_end) + + def query(self, X, k=1, return_distance=True, + dualtree=False, breadth_first=False, + sort_results=True): + """ + query(X, k=1, return_distance=True, + dualtree=False, breadth_first=False) + + query the tree for the k nearest neighbors + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query + k : int, default=1 + The number of nearest neighbors to return + return_distance : bool, default=True + if True, return a tuple (d, i) of distances and indices + if False, return array i + dualtree : bool, default=False + if True, use the dual tree formalism for the query: a tree is + built for the query points, and the pair of trees is used to + efficiently search this space. This can lead to better + performance as the number of points grows large. + breadth_first : bool, default=False + if True, then query the nodes in a breadth-first manner. + Otherwise, query the nodes in a depth-first manner. + sort_results : bool, default=True + if True, then distances and indices of each point are sorted + on return, so that the first column contains the closest points. + Otherwise, neighbors are returned in an arbitrary order. + + Returns + ------- + i : if return_distance == False + (d,i) : if return_distance == True + + d : ndarray of shape X.shape[:-1] + (k,), dtype=double + Each entry gives the list of distances to the neighbors of the + corresponding point. + + i : ndarray of shape X.shape[:-1] + (k,), dtype=int + Each entry gives the list of indices of neighbors of the + corresponding point. + """ + # XXX: we should allow X to be a pre-built tree. 
+ X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') + + if X.shape[X.ndim - 1] != self.data.shape[1]: + raise ValueError("query data dimension must " + "match training data dimension") + + if self.data.shape[0] < k: + raise ValueError("k must be less than or equal " + "to the number of training points") + + # flatten X, and save original shape information + np_Xarr = X.reshape((-1, self.data.shape[1])) + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr + cdef float64_t reduced_dist_LB + cdef intp_t i + cdef const {{INPUT_DTYPE_t}}* pt + + # initialize heap for neighbors + cdef NeighborsHeap{{name_suffix}} heap = NeighborsHeap{{name_suffix}}(Xarr.shape[0], k) + + # node heap for breadth-first queries + cdef NodeHeap nodeheap + if breadth_first: + nodeheap = NodeHeap(self.data.shape[0] // self.leaf_size) + + # bounds is needed for the dual tree algorithm + cdef float64_t[::1] bounds + + self.n_trims = 0 + self.n_leaves = 0 + self.n_splits = 0 + + if dualtree: + other = self.__class__(np_Xarr, metric=self.dist_metric, + leaf_size=self.leaf_size) + if breadth_first: + self._query_dual_breadthfirst(other, heap, nodeheap) + else: + reduced_dist_LB = min_rdist_dual{{name_suffix}}(self, 0, other, 0) + bounds = np.full(other.node_data.shape[0], np.inf) + self._query_dual_depthfirst(0, other, 0, bounds, + heap, reduced_dist_LB) + + else: + pt = &Xarr[0, 0] + if breadth_first: + for i in range(Xarr.shape[0]): + self._query_single_breadthfirst(pt, i, heap, nodeheap) + pt += Xarr.shape[1] + else: + with nogil: + for i in range(Xarr.shape[0]): + reduced_dist_LB = min_rdist{{name_suffix}}(self, 0, pt) + self._query_single_depthfirst(0, pt, i, heap, + reduced_dist_LB) + pt += Xarr.shape[1] + + distances, indices = heap.get_arrays(sort=sort_results) + distances = self.dist_metric.rdist_to_dist(distances) + + # deflatten results + if return_distance: + return (distances.reshape(X.shape[:X.ndim - 1] + (k,)), + indices.reshape(X.shape[:X.ndim - 1] + (k,))) + else: + return indices.reshape(X.shape[:X.ndim - 1] + (k,)) + + def query_radius(self, X, r, int return_distance=False, + int count_only=False, int sort_results=False): + """ + query_radius(X, r, return_distance=False, + count_only=False, sort_results=False) + + query the tree for neighbors within a radius r + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query + r : distance within which neighbors are returned + r can be a single value, or an array of values of shape + x.shape[:-1] if different radii are desired for each point. + return_distance : bool, default=False + if True, return distances to neighbors of each point + if False, return only neighbors + Note that unlike the query() method, setting return_distance=True + here adds to the computation time. Not all distances need to be + calculated explicitly for return_distance=False. Results are + not sorted by default: see ``sort_results`` keyword. + count_only : bool, default=False + if True, return only the count of points within distance r + if False, return the indices of all points within distance r + If return_distance==True, setting count_only=True will + result in an error. + sort_results : bool, default=False + if True, the distances and indices will be sorted before being + returned. If False, the results will not be sorted. If + return_distance == False, setting sort_results = True will + result in an error. 
+
+        Returns
+        -------
+        count       : if count_only == True
+        ind         : if count_only == False and return_distance == False
+        (ind, dist) : if count_only == False and return_distance == True
+
+        count : ndarray of shape X.shape[:-1], dtype=int
+            Each entry gives the number of neighbors within a distance r of the
+            corresponding point.
+
+        ind : ndarray of shape X.shape[:-1], dtype=object
+            Each element is a numpy integer array listing the indices of
+            neighbors of the corresponding point. Note that unlike
+            the results of a k-neighbors query, the returned neighbors
+            are not sorted by distance by default.
+
+        dist : ndarray of shape X.shape[:-1], dtype=object
+            Each element is a numpy double array listing the distances
+            corresponding to indices in ind.
+        """
+        if count_only and return_distance:
+            raise ValueError("count_only and return_distance "
+                             "cannot both be true")
+
+        if sort_results and not return_distance:
+            raise ValueError("return_distance must be True "
+                             "if sort_results is True")
+
+        cdef intp_t i, count_i = 0
+        cdef intp_t n_features = self.data.shape[1]
+        cdef {{INPUT_DTYPE_t}}[::1] dist_arr_i
+        cdef intp_t[::1] idx_arr_i, counts
+        cdef const {{INPUT_DTYPE_t}}* pt
+        cdef intp_t** indices = NULL
+        cdef {{INPUT_DTYPE_t}}** distances = NULL
+
+        # validate X and prepare for query
+        X = check_array(X, dtype={{INPUT_DTYPE}}, order='C')
+
+        if X.shape[X.ndim - 1] != self.data.shape[1]:
+            raise ValueError("query data dimension must "
+                             "match training data dimension")
+
+        cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = X.reshape((-1, self.data.shape[1]))
+
+        # prepare r for query
+        r = np.asarray(r, dtype=np.float64, order='C')
+        r = np.atleast_1d(r)
+        if r.shape == (1,):
+            r = np.full(X.shape[:X.ndim - 1], r[0], dtype=np.float64)
+        else:
+            if r.shape != X.shape[:X.ndim - 1]:
+                raise ValueError("r must be broadcastable to X.shape")
+
+        rarr_np = r.reshape(-1)  # store explicitly to keep in scope
+        cdef float64_t[::1] rarr = rarr_np
+
+        if not count_only:
+            indices = <intp_t**>calloc(Xarr.shape[0], sizeof(intp_t*))
+            if indices == NULL:
+                raise MemoryError()
+            if return_distance:
+                distances = <{{INPUT_DTYPE_t}}**>calloc(Xarr.shape[0], sizeof({{INPUT_DTYPE_t}}*))
+                if distances == NULL:
+                    free(indices)
+                    raise MemoryError()
+
+        np_idx_arr = np.zeros(self.data.shape[0], dtype=np.intp)
+        idx_arr_i = np_idx_arr
+
+        np_dist_arr = np.zeros(self.data.shape[0], dtype={{INPUT_DTYPE}})
+        dist_arr_i = np_dist_arr
+
+        counts_arr = np.zeros(Xarr.shape[0], dtype=np.intp)
+        counts = counts_arr
+
+        pt = &Xarr[0, 0]
+        memory_error = False
+        with nogil:
+            for i in range(Xarr.shape[0]):
+                counts[i] = self._query_radius_single(0, pt, rarr[i],
+                                                      &idx_arr_i[0],
+                                                      &dist_arr_i[0],
+                                                      0, count_only,
+                                                      return_distance)
+                pt += n_features
+
+                if count_only:
+                    continue
+
+                if sort_results:
+                    _simultaneous_sort(&dist_arr_i[0], &idx_arr_i[0],
+                                       counts[i])
+
+                # equivalent to: indices[i] = np_idx_arr[:counts[i]].copy()
+                indices[i] = <intp_t*>malloc(counts[i] * sizeof(intp_t))
+                if indices[i] == NULL:
+                    memory_error = True
+                    break
+                memcpy(indices[i], &idx_arr_i[0], counts[i] * sizeof(intp_t))
+
+                if return_distance:
+                    # equivalent to: distances[i] = np_dist_arr[:counts[i]].copy()
+                    distances[i] = <{{INPUT_DTYPE_t}}*>malloc(counts[i] * sizeof({{INPUT_DTYPE_t}}))
+                    if distances[i] == NULL:
+                        memory_error = True
+                        break
+                    memcpy(distances[i], &dist_arr_i[0], counts[i] * sizeof({{INPUT_DTYPE_t}}))
+
+        try:
+            if memory_error:
+                raise MemoryError()
+
+            if count_only:
+                # deflatten results
+                return counts_arr.reshape(X.shape[:X.ndim - 1])
+            elif
return_distance:
+                indices_npy = np.zeros(Xarr.shape[0], dtype='object')
+                distances_npy = np.zeros(Xarr.shape[0], dtype='object')
+                for i in range(Xarr.shape[0]):
+                    # make a new numpy array that wraps the existing data
+                    # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0
+                    indices_npy[i] = cnp.PyArray_SimpleNewFromData(1, <cnp.intp_t*>&counts[i], cnp.NPY_INTP, indices[i])
+                    # make sure the data will be freed when the numpy array is garbage collected
+                    PyArray_ENABLEFLAGS(indices_npy[i], cnp.NPY_ARRAY_OWNDATA)
+                    # make sure the data is not freed twice
+                    indices[i] = NULL
+
+                    # make a new numpy array that wraps the existing data
+                    # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0
+                    distances_npy[i] = cnp.PyArray_SimpleNewFromData(1, <cnp.intp_t*>&counts[i], {{NPY_TYPE}}, distances[i])
+                    # make sure the data will be freed when the numpy array is garbage collected
+                    PyArray_ENABLEFLAGS(distances_npy[i], cnp.NPY_ARRAY_OWNDATA)
+                    # make sure the data is not freed twice
+                    distances[i] = NULL
+
+                # deflatten results
+                return (indices_npy.reshape(X.shape[:X.ndim - 1]),
+                        distances_npy.reshape(X.shape[:X.ndim - 1]))
+            else:
+                indices_npy = np.zeros(Xarr.shape[0], dtype='object')
+                for i in range(Xarr.shape[0]):
+                    # make a new numpy array that wraps the existing data
+                    # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0
+                    indices_npy[i] = cnp.PyArray_SimpleNewFromData(1, <cnp.intp_t*>&counts[i], cnp.NPY_INTP, indices[i])
+                    # make sure the data will be freed when the numpy array is garbage collected
+                    PyArray_ENABLEFLAGS(indices_npy[i], cnp.NPY_ARRAY_OWNDATA)
+                    # make sure the data is not freed twice
+                    indices[i] = NULL
+
+                # deflatten results
+                return indices_npy.reshape(X.shape[:X.ndim - 1])
+        except MemoryError:
+            # free any buffer that is not owned by a numpy array
+            for i in range(Xarr.shape[0]):
+                free(indices[i])
+                if return_distance:
+                    free(distances[i])
+            raise
+        finally:
+            free(indices)
+            free(distances)
+
+    def kernel_density(self, X, h, kernel='gaussian',
+                       atol=0, rtol=1E-8,
+                       breadth_first=True, return_log=False):
+        """
+        kernel_density(X, h, kernel='gaussian', atol=0, rtol=1E-8,
+                       breadth_first=True, return_log=False)
+
+        Compute the kernel density estimate at points X with the given kernel,
+        using the distance metric specified at tree creation.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            An array of points to query.  Last dimension should match dimension
+            of training data.
+        h : float
+            The bandwidth of the kernel.
+        kernel : str, default="gaussian"
+            Specify the kernel to use.  Options are
+            - 'gaussian'
+            - 'tophat'
+            - 'epanechnikov'
+            - 'exponential'
+            - 'linear'
+            - 'cosine'
+            Default is kernel = 'gaussian'
+        atol : float, default=0
+            Specify the desired absolute tolerance of the result.
+            If the true result is `K_true`, then the returned result `K_ret`
+            satisfies ``abs(K_true - K_ret) < atol + rtol * K_ret``
+            The default is 0.
+        rtol : float, default=1e-8
+            Specify the desired relative tolerance of the result.
+            If the true result is `K_true`, then the returned result `K_ret`
+            satisfies ``abs(K_true - K_ret) < atol + rtol * K_ret``
+            The default is `1e-8`.
+        breadth_first : bool, default=True
+            If True (default), use a breadth-first search.  If False, use a
+            depth-first search.  Breadth-first is generally faster for
+            compact kernels and/or high tolerances.
+        return_log : bool, default=False
+            Return the logarithm of the result.
This can be more accurate + than returning the result itself for narrow kernels. + + Returns + ------- + density : ndarray of shape X.shape[:-1] + The array of (log)-density evaluations + """ + cdef float64_t h_c = h + cdef float64_t log_atol = log(atol) + cdef float64_t log_rtol = log(rtol) + cdef float64_t log_min_bound, log_max_bound, log_bound_spread + cdef float64_t dist_LB = 0, dist_UB = 0 + + cdef intp_t n_samples = self.data.shape[0] + cdef intp_t n_features = self.data.shape[1] + cdef intp_t i + cdef KernelType kernel_c + + # validate kernel + if kernel == 'gaussian': + kernel_c = GAUSSIAN_KERNEL + elif kernel == 'tophat': + kernel_c = TOPHAT_KERNEL + elif kernel == 'epanechnikov': + kernel_c = EPANECHNIKOV_KERNEL + elif kernel == 'exponential': + kernel_c = EXPONENTIAL_KERNEL + elif kernel == 'linear': + kernel_c = LINEAR_KERNEL + elif kernel == 'cosine': + kernel_c = COSINE_KERNEL + else: + raise ValueError("kernel = '%s' not recognized" % kernel) + + cdef float64_t log_knorm = _log_kernel_norm(h_c, n_features, kernel_c) + + # validate X and prepare for query + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') + + if X.shape[X.ndim - 1] != n_features: + raise ValueError("query data dimension must " + "match training data dimension") + Xarr_np = X.reshape((-1, n_features)) + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = Xarr_np + + log_density_arr = np.zeros(Xarr.shape[0], dtype={{INPUT_DTYPE}}) + cdef {{INPUT_DTYPE_t}}[::1] log_density = log_density_arr + + cdef const {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] + + cdef NodeHeap nodeheap + if breadth_first: + nodeheap = NodeHeap(self.data.shape[0] // self.leaf_size) + cdef float64_t[::1] node_log_min_bounds + cdef float64_t[::1] node_bound_widths + # TODO: implement dual tree approach. + # this is difficult because of the need to cache values + # computed between node pairs. + if breadth_first: + node_log_min_bounds_arr = np.full(self.n_nodes, -np.inf) + node_log_min_bounds = node_log_min_bounds_arr + node_bound_widths_arr = np.zeros(self.n_nodes) + node_bound_widths = node_bound_widths_arr + for i in range(Xarr.shape[0]): + log_density[i] = self._kde_single_breadthfirst( + pt, kernel_c, h_c, + log_knorm, log_atol, log_rtol, + nodeheap, + &node_log_min_bounds[0], + &node_bound_widths[0]) + pt += n_features + else: + for i in range(Xarr.shape[0]): + min_max_dist{{name_suffix}}(self, 0, pt, &dist_LB, &dist_UB) + # compute max & min bounds on density within top node + log_min_bound = (log(self.sum_weight) + + compute_log_kernel(dist_UB, + h_c, kernel_c)) + log_max_bound = (log(self.sum_weight) + + compute_log_kernel(dist_LB, + h_c, kernel_c)) + log_bound_spread = logsubexp(log_max_bound, log_min_bound) + self._kde_single_depthfirst(0, pt, kernel_c, h_c, + log_knorm, log_atol, log_rtol, + log_min_bound, + log_bound_spread, + &log_min_bound, + &log_bound_spread) + log_density[i] = logaddexp(log_min_bound, + log_bound_spread - log(2)) + pt += n_features + + # normalize the results + for i in range(log_density.shape[0]): + log_density[i] += log_knorm + + log_density_arr = log_density_arr.reshape(X.shape[:X.ndim - 1]) + + if return_log: + return log_density_arr + else: + return np.exp(log_density_arr) + + def two_point_correlation(self, X, r, dualtree=False): + """ + two_point_correlation(X, r, dualtree=False) + + Compute the two-point correlation function + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query. Last dimension should match dimension + of training data. 
+ r : array-like + A one-dimensional array of distances + dualtree : bool, default=False + If True, use a dualtree algorithm. Otherwise, use a single-tree + algorithm. Dual tree algorithms can have better scaling for + large N. + + Returns + ------- + counts : ndarray + counts[i] contains the number of pairs of points with distance + less than or equal to r[i] + """ + cdef intp_t n_features = self.data.shape[1] + cdef intp_t i + + # validate X and prepare for query + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') + + if X.shape[X.ndim - 1] != self.data.shape[1]: + raise ValueError("query data dimension must " + "match training data dimension") + + np_Xarr = X.reshape((-1, self.data.shape[1])) + cdef {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr + + # prepare r for query + r = np.asarray(r, dtype=np.float64, order='C') + r = np.atleast_1d(r) + if r.ndim != 1: + raise ValueError("r must be a 1-dimensional array") + i_rsort = np.argsort(r) + rarr_np = r[i_rsort] # needed to keep memory in scope + cdef float64_t[::1] rarr = rarr_np + + # create array to hold counts + count = np.zeros(r.shape[0], dtype=np.intp) + cdef intp_t[::1] carr = count + + cdef const {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] + + if dualtree: + other = self.__class__(Xarr, metric=self.dist_metric, + leaf_size=self.leaf_size) + self._two_point_dual(0, other, 0, &rarr[0], &carr[0], + 0, rarr.shape[0]) + else: + for i in range(Xarr.shape[0]): + self._two_point_single(0, pt, &rarr[0], &carr[0], + 0, rarr.shape[0]) + pt += n_features + + return count + + cdef int _query_single_depthfirst( + self, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1 nogil: + """Recursive Single-tree k-neighbors query, depth-first approach""" + cdef NodeData_t node_info = self.node_data[i_node] + + cdef float64_t dist_pt, reduced_dist_LB_1, reduced_dist_LB_2 + cdef intp_t i, i1, i2 + + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + + # ------------------------------------------------------------ + # Case 1: query point is outside node radius: + # trim it from the query + if reduced_dist_LB > heap.largest(i_pt): + self.n_trims += 1 + + # ------------------------------------------------------------ + # Case 2: this is a leaf node. Update set of nearby points + elif node_info.is_leaf: + self.n_leaves += 1 + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.rdist(pt, + &self.data[self.idx_array[i], 0], + self.data.shape[1]) + heap._push(i_pt, dist_pt, self.idx_array[i]) + + # ------------------------------------------------------------ + # Case 3: Node is not a leaf. 
Recursively query subnodes + # starting with the closest + else: + self.n_splits += 1 + i1 = 2 * i_node + 1 + i2 = i1 + 1 + reduced_dist_LB_1 = min_rdist{{name_suffix}}(self, i1, pt) + reduced_dist_LB_2 = min_rdist{{name_suffix}}(self, i2, pt) + + # recursively query subnodes + if reduced_dist_LB_1 <= reduced_dist_LB_2: + self._query_single_depthfirst(i1, pt, i_pt, heap, + reduced_dist_LB_1) + self._query_single_depthfirst(i2, pt, i_pt, heap, + reduced_dist_LB_2) + else: + self._query_single_depthfirst(i2, pt, i_pt, heap, + reduced_dist_LB_2) + self._query_single_depthfirst(i1, pt, i_pt, heap, + reduced_dist_LB_1) + return 0 + + cdef int _query_single_breadthfirst( + self, + const {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: + """Non-recursive single-tree k-neighbors query, breadth-first search""" + cdef intp_t i, i_node + cdef float64_t dist_pt, reduced_dist_LB + cdef const NodeData_t* node_data = &self.node_data[0] + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + + # Set up the node heap and push the head node onto it + cdef NodeHeapData_t nodeheap_item + nodeheap_item.val = min_rdist{{name_suffix}}(self, 0, pt) + nodeheap_item.i1 = 0 + nodeheap.push(nodeheap_item) + + while nodeheap.n > 0: + nodeheap_item = nodeheap.pop() + reduced_dist_LB = nodeheap_item.val + i_node = nodeheap_item.i1 + node_info = node_data[i_node] + + # ------------------------------------------------------------ + # Case 1: query point is outside node radius: + # trim it from the query + if reduced_dist_LB > heap.largest(i_pt): + self.n_trims += 1 + + # ------------------------------------------------------------ + # Case 2: this is a leaf node. Update set of nearby points + elif node_data[i_node].is_leaf: + self.n_leaves += 1 + for i in range(node_data[i_node].idx_start, + node_data[i_node].idx_end): + dist_pt = self.rdist(pt, + &self.data[self.idx_array[i], 0], + self.data.shape[1]) + heap._push(i_pt, dist_pt, self.idx_array[i]) + + # ------------------------------------------------------------ + # Case 3: Node is not a leaf. Add subnodes to the node heap + else: + self.n_splits += 1 + for i in range(2 * i_node + 1, 2 * i_node + 3): + nodeheap_item.i1 = i + nodeheap_item.val = min_rdist{{name_suffix}}(self, i, pt) + nodeheap.push(nodeheap_item) + return 0 + + cdef int _query_dual_depthfirst( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t i_node2, + float64_t[::1] bounds, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1: + """Recursive dual-tree k-neighbors query, depth-first""" + # note that the array `bounds` is maintained such that + # bounds[i] is the largest distance among any of the + # current neighbors in node i of the other tree. 
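+        # (so if the smallest possible reduced distance between the two
+        # nodes already exceeds bounds[i_node2], no point in node i_node2
+        # can gain a closer neighbor from node i_node1, and the pair is
+        # pruned in Case 1 below)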
+ cdef NodeData_t node_info1 = self.node_data[i_node1] + cdef NodeData_t node_info2 = other.node_data[i_node2] + + cdef const {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef const {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] + cdef intp_t n_features = self.data.shape[1] + + cdef float64_t bound_max, dist_pt, reduced_dist_LB1, reduced_dist_LB2 + cdef intp_t i1, i2, i_pt, i_parent + + # ------------------------------------------------------------ + # Case 1: nodes are further apart than the current bound: + # trim both from the query + if reduced_dist_LB > bounds[i_node2]: + pass + + # ------------------------------------------------------------ + # Case 2: both nodes are leaves: + # do a brute-force search comparing all pairs + elif node_info1.is_leaf and node_info2.is_leaf: + bounds[i_node2] = 0 + + for i2 in range(node_info2.idx_start, node_info2.idx_end): + i_pt = other.idx_array[i2] + + if heap.largest(i_pt) <= reduced_dist_LB: + continue + + for i1 in range(node_info1.idx_start, node_info1.idx_end): + dist_pt = self.rdist( + data1 + n_features * self.idx_array[i1], + data2 + n_features * i_pt, + n_features) + heap._push(i_pt, dist_pt, self.idx_array[i1]) + + # keep track of node bound + bounds[i_node2] = fmax(bounds[i_node2], + heap.largest(i_pt)) + + # update bounds up the tree + while i_node2 > 0: + i_parent = (i_node2 - 1) // 2 + bound_max = fmax(bounds[2 * i_parent + 1], + bounds[2 * i_parent + 2]) + if bound_max < bounds[i_parent]: + bounds[i_parent] = bound_max + i_node2 = i_parent + else: + break + + # ------------------------------------------------------------ + # Case 3a: node 1 is a leaf or is smaller: split node 2 and + # recursively query, starting with the nearest subnode + elif node_info1.is_leaf or (not node_info2.is_leaf + and node_info2.radius > node_info1.radius): + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, i_node1, + other, 2 * i_node2 + 1) + reduced_dist_LB2 = min_rdist_dual{{name_suffix}}(self, i_node1, + other, 2 * i_node2 + 2) + + if reduced_dist_LB1 < reduced_dist_LB2: + self._query_dual_depthfirst(i_node1, other, 2 * i_node2 + 1, + bounds, heap, reduced_dist_LB1) + self._query_dual_depthfirst(i_node1, other, 2 * i_node2 + 2, + bounds, heap, reduced_dist_LB2) + else: + self._query_dual_depthfirst(i_node1, other, 2 * i_node2 + 2, + bounds, heap, reduced_dist_LB2) + self._query_dual_depthfirst(i_node1, other, 2 * i_node2 + 1, + bounds, heap, reduced_dist_LB1) + + # ------------------------------------------------------------ + # Case 3b: node 2 is a leaf or is smaller: split node 1 and + # recursively query, starting with the nearest subnode + else: + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 1, + other, i_node2) + reduced_dist_LB2 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 2, + other, i_node2) + + if reduced_dist_LB1 < reduced_dist_LB2: + self._query_dual_depthfirst(2 * i_node1 + 1, other, i_node2, + bounds, heap, reduced_dist_LB1) + self._query_dual_depthfirst(2 * i_node1 + 2, other, i_node2, + bounds, heap, reduced_dist_LB2) + else: + self._query_dual_depthfirst(2 * i_node1 + 2, other, i_node2, + bounds, heap, reduced_dist_LB2) + self._query_dual_depthfirst(2 * i_node1 + 1, other, i_node2, + bounds, heap, reduced_dist_LB1) + return 0 + + cdef int _query_dual_breadthfirst( + self, + BinaryTree{{name_suffix}} other, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: + """Non-recursive dual-tree k-neighbors query, breadth-first""" + cdef intp_t i, i1, i2, i_node1, i_node2, i_pt + cdef 
float64_t dist_pt, reduced_dist_LB + cdef float64_t[::1] bounds = np.full(other.node_data.shape[0], np.inf) + cdef const NodeData_t* node_data1 = &self.node_data[0] + cdef const NodeData_t* node_data2 = &other.node_data[0] + cdef NodeData_t node_info1, node_info2 + cdef const {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef const {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] + cdef intp_t n_features = self.data.shape[1] + + # Set up the node heap and push the head nodes onto it + cdef NodeHeapData_t nodeheap_item + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, 0, other, 0) + nodeheap_item.i1 = 0 + nodeheap_item.i2 = 0 + nodeheap.push(nodeheap_item) + + while nodeheap.n > 0: + nodeheap_item = nodeheap.pop() + reduced_dist_LB = nodeheap_item.val + i_node1 = nodeheap_item.i1 + i_node2 = nodeheap_item.i2 + + node_info1 = node_data1[i_node1] + node_info2 = node_data2[i_node2] + + # ------------------------------------------------------------ + # Case 1: nodes are further apart than the current bound: + # trim both from the query + if reduced_dist_LB > bounds[i_node2]: + pass + + # ------------------------------------------------------------ + # Case 2: both nodes are leaves: + # do a brute-force search comparing all pairs + elif node_info1.is_leaf and node_info2.is_leaf: + bounds[i_node2] = -1 + + for i2 in range(node_info2.idx_start, node_info2.idx_end): + i_pt = other.idx_array[i2] + + if heap.largest(i_pt) <= reduced_dist_LB: + continue + + for i1 in range(node_info1.idx_start, node_info1.idx_end): + dist_pt = self.rdist( + data1 + n_features * self.idx_array[i1], + data2 + n_features * i_pt, + n_features) + heap._push(i_pt, dist_pt, self.idx_array[i1]) + + # keep track of node bound + bounds[i_node2] = fmax(bounds[i_node2], + heap.largest(i_pt)) + + # ------------------------------------------------------------ + # Case 3a: node 1 is a leaf or is smaller: split node 2 and + # recursively query, starting with the nearest subnode + elif node_info1.is_leaf or (not node_info2.is_leaf + and (node_info2.radius + > node_info1.radius)): + nodeheap_item.i1 = i_node1 + for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): + nodeheap_item.i2 = i2 + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i_node1, + other, i2) + nodeheap.push(nodeheap_item) + + # ------------------------------------------------------------ + # Case 3b: node 2 is a leaf or is smaller: split node 1 and + # recursively query, starting with the nearest subnode + else: + nodeheap_item.i2 = i_node2 + for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): + nodeheap_item.i1 = i1 + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i1, + other, i_node2) + nodeheap.push(nodeheap_item) + return 0 + + cdef intp_t _query_radius_single( + self, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + float64_t r, + intp_t* indices, + {{INPUT_DTYPE_t}}* distances, + intp_t count, + int count_only, + int return_distance, + ) noexcept nogil: + """recursive single-tree radius query, depth-first""" + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + cdef intp_t* idx_array = &self.idx_array[0] + cdef intp_t n_features = self.data.shape[1] + cdef NodeData_t node_info = self.node_data[i_node] + + cdef intp_t i + cdef float64_t reduced_r + + cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) + + # ------------------------------------------------------------ + # Case 1: all node points are outside distance r. + # prune this branch. 
+ if dist_LB > r: + pass + + # ------------------------------------------------------------ + # Case 2: all node points are within distance r + # add all points to neighbors + elif dist_UB <= r: + if count_only: + count += (node_info.idx_end - node_info.idx_start) + else: + for i in range(node_info.idx_start, node_info.idx_end): + if (count < 0) or (count >= self.data.shape[0]): + return -1 + indices[count] = idx_array[i] + if return_distance: + distances[count] = self.dist(pt, (data + n_features + * idx_array[i]), + n_features) + count += 1 + + # ------------------------------------------------------------ + # Case 3: this is a leaf node. Go through all points to + # determine if they fall within radius + elif node_info.is_leaf: + reduced_r = self.dist_metric._dist_to_rdist(r) + + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.rdist(pt, (data + n_features * idx_array[i]), + n_features) + if dist_pt <= reduced_r: + if (count < 0) or (count >= self.data.shape[0]): + return -1 + if count_only: + pass + else: + indices[count] = idx_array[i] + if return_distance: + distances[count] =\ + self.dist_metric._rdist_to_dist(dist_pt) + count += 1 + + # ------------------------------------------------------------ + # Case 4: Node is not a leaf. Recursively query subnodes + else: + count = self._query_radius_single(2 * i_node + 1, pt, r, + indices, distances, count, + count_only, return_distance) + count = self._query_radius_single(2 * i_node + 2, pt, r, + indices, distances, count, + count_only, return_distance) + + return count + + cdef float64_t _kde_single_breadthfirst( + self, const {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + NodeHeap nodeheap, + float64_t* node_log_min_bounds, + float64_t* node_log_bound_spreads, + ): + """non-recursive single-tree kernel density estimation""" + # For the given point, node_log_min_bounds and node_log_bound_spreads + # will encode the current bounds on the density between the point + # and the associated node. + # The variables global_log_min_bound and global_log_bound_spread + # keep track of the global bounds on density. The procedure here is + # to split nodes, updating these bounds, until the bounds are within + # atol & rtol. 
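+        # In linear space, the global stopping test below reads
+        # (illustrative paraphrase of the log-space comparison):
+        #     knorm * bound_spread <= atol + rtol * knorm * min_bound
+        # i.e. we stop once the remaining uncertainty on the density is
+        # small relative to the requested tolerances.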
+ cdef intp_t i, i1, i2, i_node + cdef float64_t N1, N2 + cdef float64_t global_log_min_bound, global_log_bound_spread + cdef float64_t global_log_max_bound + + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + cdef bint with_sample_weight = self.sample_weight is not None + cdef const {{INPUT_DTYPE_t}}* sample_weight + if with_sample_weight: + sample_weight = &self.sample_weight[0] + cdef intp_t* idx_array = &self.idx_array[0] + cdef const NodeData_t* node_data = &self.node_data[0] + cdef float64_t N + cdef float64_t log_weight + if with_sample_weight: + N = self.sum_weight + else: + N = self.data.shape[0] + cdef intp_t n_features = self.data.shape[1] + + cdef NodeData_t node_info + cdef float64_t dist_pt, log_density + cdef float64_t dist_LB_1 = 0, dist_LB_2 = 0 + cdef float64_t dist_UB_1 = 0, dist_UB_2 = 0 + + cdef float64_t dist_UB, dist_LB + + # push the top node to the heap + cdef NodeHeapData_t nodeheap_item + nodeheap_item.val = min_dist{{name_suffix}}(self, 0, pt) + nodeheap_item.i1 = 0 + nodeheap.push(nodeheap_item) + + global_log_min_bound = log(N) + compute_log_kernel( + max_dist{{name_suffix}}(self, 0, pt), h, kernel + ) + global_log_max_bound = log(N) + compute_log_kernel(nodeheap_item.val, + h, kernel) + global_log_bound_spread = logsubexp(global_log_max_bound, + global_log_min_bound) + + node_log_min_bounds[0] = global_log_min_bound + node_log_bound_spreads[0] = global_log_bound_spread + + while nodeheap.n > 0: + nodeheap_item = nodeheap.pop() + i_node = nodeheap_item.i1 + + node_info = node_data[i_node] + if with_sample_weight: + N1 = _total_node_weight(node_data, sample_weight, + idx_array, i_node) + else: + N1 = node_info.idx_end - node_info.idx_start + + # ------------------------------------------------------------ + # Case 1: local bounds are equal to within per-point tolerance. + if (log_knorm + node_log_bound_spreads[i_node] - log(N1) + log(N) + <= logaddexp(log_atol, (log_rtol + log_knorm + + node_log_min_bounds[i_node]))): + pass + + # ------------------------------------------------------------ + # Case 2: global bounds are within rtol & atol. + elif (log_knorm + global_log_bound_spread + <= logaddexp(log_atol, + log_rtol + log_knorm + global_log_min_bound)): + break + + # ------------------------------------------------------------ + # Case 3: node is a leaf. Count contributions from all points + elif node_info.is_leaf: + global_log_min_bound =\ + logsubexp(global_log_min_bound, + node_log_min_bounds[i_node]) + global_log_bound_spread =\ + logsubexp(global_log_bound_spread, + node_log_bound_spreads[i_node]) + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.dist(pt, data + n_features * idx_array[i], + n_features) + log_density = compute_log_kernel(dist_pt, h, kernel) + if with_sample_weight: + log_weight = np.log(sample_weight[idx_array[i]]) + else: + log_weight = 0. 
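+                    # Fold this point's exact (weighted) kernel
+                    # contribution into the running lower bound.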
+ global_log_min_bound = logaddexp(global_log_min_bound, + log_density + log_weight) + + # ------------------------------------------------------------ + # Case 4: split node and query subnodes + else: + i1 = 2 * i_node + 1 + i2 = 2 * i_node + 2 + + if with_sample_weight: + N1 = _total_node_weight(node_data, sample_weight, + idx_array, i1) + N2 = _total_node_weight(node_data, sample_weight, + idx_array, i2) + else: + N1 = node_data[i1].idx_end - node_data[i1].idx_start + N2 = node_data[i2].idx_end - node_data[i2].idx_start + + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB_1, &dist_UB_1) + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB_2, &dist_UB_2) + + node_log_min_bounds[i1] = (log(N1) + + compute_log_kernel(dist_UB_1, + h, kernel)) + node_log_bound_spreads[i1] = (log(N1) + + compute_log_kernel(dist_LB_1, + h, kernel)) + + node_log_min_bounds[i2] = (log(N2) + + compute_log_kernel(dist_UB_2, + h, kernel)) + node_log_bound_spreads[i2] = (log(N2) + + compute_log_kernel(dist_LB_2, + h, kernel)) + + global_log_min_bound = logsubexp(global_log_min_bound, + node_log_min_bounds[i_node]) + global_log_min_bound = logaddexp(global_log_min_bound, + node_log_min_bounds[i1]) + global_log_min_bound = logaddexp(global_log_min_bound, + node_log_min_bounds[i2]) + + global_log_bound_spread =\ + logsubexp(global_log_bound_spread, + node_log_bound_spreads[i_node]) + global_log_bound_spread = logaddexp(global_log_bound_spread, + node_log_bound_spreads[i1]) + global_log_bound_spread = logaddexp(global_log_bound_spread, + node_log_bound_spreads[i2]) + + # TODO: rank by the spread rather than the distance? + nodeheap_item.val = dist_LB_1 + nodeheap_item.i1 = i1 + nodeheap.push(nodeheap_item) + + nodeheap_item.val = dist_LB_2 + nodeheap_item.i1 = i2 + nodeheap.push(nodeheap_item) + + nodeheap.clear() + return logaddexp(global_log_min_bound, + global_log_bound_spread - log(2)) + + cdef int _kde_single_depthfirst( + self, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + float64_t local_log_min_bound, + float64_t local_log_bound_spread, + float64_t* global_log_min_bound, + float64_t* global_log_bound_spread, + ) except -1: + """recursive single-tree kernel density estimate, depth-first""" + # For the given point, local_min_bound and local_max_bound give the + # minimum and maximum density for the current node, while + # global_min_bound and global_max_bound give the minimum and maximum + # density over the entire tree. We recurse down until global_min_bound + # and global_max_bound are within rtol and atol. 
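+        # These bounds back the tree's public kernel_density method and,
+        # through it, the sklearn.neighbors.KernelDensity estimator; an
+        # illustrative use of that public API would be
+        #     KernelDensity(bandwidth=0.5).fit(X).score_samples(X)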
+ cdef intp_t i, i1, i2, iw, start, end + cdef float64_t N1, N2 + + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + cdef const NodeData_t* node_data = &self.node_data[0] + cdef bint with_sample_weight = self.sample_weight is not None + cdef const {{INPUT_DTYPE_t}}* sample_weight + cdef float64_t log_weight + if with_sample_weight: + sample_weight = &self.sample_weight[0] + cdef intp_t* idx_array = &self.idx_array[0] + cdef intp_t n_features = self.data.shape[1] + + cdef NodeData_t node_info = self.node_data[i_node] + cdef float64_t dist_pt, log_dens_contribution + + cdef float64_t child1_log_min_bound, child2_log_min_bound + cdef float64_t child1_log_bound_spread, child2_log_bound_spread + cdef float64_t dist_UB = 0, dist_LB = 0 + + if with_sample_weight: + N1 = _total_node_weight(node_data, sample_weight, + idx_array, i_node) + N2 = self.sum_weight + else: + N1 = (node_info.idx_end - node_info.idx_start) + N2 = self.data.shape[0] + + # ------------------------------------------------------------ + # Case 1: local bounds are equal to within errors. Return + if ( + log_knorm + local_log_bound_spread - log(N1) + log(N2) + <= logaddexp(log_atol, (log_rtol + log_knorm + local_log_min_bound)) + ): + pass + + # ------------------------------------------------------------ + # Case 2: global bounds are within rtol & atol. Return + elif ( + log_knorm + global_log_bound_spread[0] + <= logaddexp(log_atol, (log_rtol + log_knorm + global_log_min_bound[0])) + ): + pass + + # ------------------------------------------------------------ + # Case 3: node is a leaf. Count contributions from all points + elif node_info.is_leaf: + global_log_min_bound[0] = logsubexp(global_log_min_bound[0], + local_log_min_bound) + global_log_bound_spread[0] = logsubexp(global_log_bound_spread[0], + local_log_bound_spread) + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.dist(pt, (data + n_features * idx_array[i]), + n_features) + log_dens_contribution = compute_log_kernel(dist_pt, h, kernel) + if with_sample_weight: + log_weight = np.log(sample_weight[idx_array[i]]) + else: + log_weight = 0. 
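+            # Fold this point's exact (weighted) kernel contribution
+            # into the running lower bound held by the caller.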
+ global_log_min_bound[0] = logaddexp(global_log_min_bound[0], + (log_dens_contribution + + log_weight)) + + # ------------------------------------------------------------ + # Case 4: split node and query subnodes + else: + i1 = 2 * i_node + 1 + i2 = 2 * i_node + 2 + + if with_sample_weight: + N1 = _total_node_weight(node_data, sample_weight, + idx_array, i1) + N2 = _total_node_weight(node_data, sample_weight, + idx_array, i2) + else: + N1 = (self.node_data[i1].idx_end - self.node_data[i1].idx_start) + N2 = (self.node_data[i2].idx_end - self.node_data[i2].idx_start) + + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB, &dist_UB) + child1_log_min_bound = log(N1) + compute_log_kernel(dist_UB, h, + kernel) + child1_log_bound_spread = logsubexp(log(N1) + + compute_log_kernel(dist_LB, h, + kernel), + child1_log_min_bound) + + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB, &dist_UB) + child2_log_min_bound = log(N2) + compute_log_kernel(dist_UB, h, + kernel) + child2_log_bound_spread = logsubexp(log(N2) + + compute_log_kernel(dist_LB, h, + kernel), + child2_log_min_bound) + + global_log_min_bound[0] = logsubexp(global_log_min_bound[0], + local_log_min_bound) + global_log_min_bound[0] = logaddexp(global_log_min_bound[0], + child1_log_min_bound) + global_log_min_bound[0] = logaddexp(global_log_min_bound[0], + child2_log_min_bound) + + global_log_bound_spread[0] = logsubexp(global_log_bound_spread[0], + local_log_bound_spread) + global_log_bound_spread[0] = logaddexp(global_log_bound_spread[0], + child1_log_bound_spread) + global_log_bound_spread[0] = logaddexp(global_log_bound_spread[0], + child2_log_bound_spread) + + self._kde_single_depthfirst(i1, pt, kernel, h, log_knorm, + log_atol, log_rtol, + child1_log_min_bound, + child1_log_bound_spread, + global_log_min_bound, + global_log_bound_spread) + self._kde_single_depthfirst(i2, pt, kernel, h, log_knorm, + log_atol, log_rtol, + child2_log_min_bound, + child2_log_bound_spread, + global_log_min_bound, + global_log_bound_spread) + return 0 + + cdef int _two_point_single( + self, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: + """recursive single-tree two-point correlation function query""" + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + cdef intp_t* idx_array = &self.idx_array[0] + cdef intp_t n_features = self.data.shape[1] + cdef NodeData_t node_info = self.node_data[i_node] + + cdef intp_t i, j, Npts + cdef float64_t reduced_r + + cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) + + # ------------------------------------------------------------ + # Go through bounds and check for cuts + while i_min < i_max: + if dist_LB > r[i_min]: + i_min += 1 + else: + break + + while i_max > i_min: + Npts = (node_info.idx_end - node_info.idx_start) + if dist_UB <= r[i_max - 1]: + count[i_max - 1] += Npts + i_max -= 1 + else: + break + + if i_min < i_max: + # If node is a leaf, go through all points + if node_info.is_leaf: + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.dist(pt, (data + n_features * idx_array[i]), + n_features) + j = i_max - 1 + while (j >= i_min) and (dist_pt <= r[j]): + count[j] += 1 + j -= 1 + + else: + self._two_point_single(2 * i_node + 1, pt, r, + count, i_min, i_max) + self._two_point_single(2 * i_node + 2, pt, r, + count, i_min, i_max) + return 0 + + cdef int _two_point_dual( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t 
i_node2, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: + """recursive dual-tree two-point correlation function query""" + cdef const {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef const {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] + cdef intp_t* idx_array1 = &self.idx_array[0] + cdef intp_t* idx_array2 = &other.idx_array[0] + cdef NodeData_t node_info1 = self.node_data[i_node1] + cdef NodeData_t node_info2 = other.node_data[i_node2] + + cdef intp_t n_features = self.data.shape[1] + + cdef intp_t i1, i2, j, Npts + cdef float64_t reduced_r + + cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 + dist_LB = min_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) + dist_UB = max_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) + + # ------------------------------------------------------------ + # Go through bounds and check for cuts + while i_min < i_max: + if dist_LB > r[i_min]: + i_min += 1 + else: + break + + while i_max > i_min: + Npts = ((node_info1.idx_end - node_info1.idx_start) + * (node_info2.idx_end - node_info2.idx_start)) + if dist_UB <= r[i_max - 1]: + count[i_max - 1] += Npts + i_max -= 1 + else: + break + + if i_min < i_max: + if node_info1.is_leaf and node_info2.is_leaf: + # If both nodes are leaves, go through all points + for i1 in range(node_info1.idx_start, node_info1.idx_end): + for i2 in range(node_info2.idx_start, node_info2.idx_end): + dist_pt = self.dist((data1 + n_features + * idx_array1[i1]), + (data2 + n_features + * idx_array2[i2]), + n_features) + j = i_max - 1 + while (j >= i_min) and (dist_pt <= r[j]): + count[j] += 1 + j -= 1 + + elif node_info1.is_leaf: + # If only one is a leaf, split the other + for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): + self._two_point_dual(i_node1, other, i2, + r, count, i_min, i_max) + + elif node_info2.is_leaf: + for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): + self._two_point_dual(i1, other, i_node2, + r, count, i_min, i_max) + + else: + # neither is a leaf: split & query both + for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): + for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): + self._two_point_dual(i1, other, i2, + r, count, i_min, i_max) + return 0 + +{{endfor}} + +###################################################################### +# Python functions for benchmarking and testing C implementations + +def simultaneous_sort(float64_t[:, ::1] distances, intp_t[:, ::1] indices): + """In-place simultaneous sort the given row of the arrays + + This python wrapper exists primarily to enable unit testing + of the _simultaneous_sort C routine. 
+ """ + assert distances.shape[0] == indices.shape[0] + assert distances.shape[1] == indices.shape[1] + cdef intp_t row + for row in range(distances.shape[0]): + _simultaneous_sort(&distances[row, 0], + &indices[row, 0], + distances.shape[1]) + + +def nodeheap_sort(float64_t[::1] vals): + """In-place reverse sort of vals using NodeHeap""" + cdef intp_t[::1] indices = np.zeros(vals.shape[0], dtype=np.intp) + cdef float64_t[::1] vals_sorted = np.zeros_like(vals) + + # use initial size 0 to check corner case + cdef NodeHeap heap = NodeHeap(0) + cdef NodeHeapData_t data + cdef intp_t i + for i in range(vals.shape[0]): + data.val = vals[i] + data.i1 = i + data.i2 = i + 1 + heap.push(data) + + for i in range(vals.shape[0]): + data = heap.pop() + vals_sorted[i] = data.val + indices[i] = data.i1 + + return np.asarray(vals_sorted), np.asarray(indices) + + +cdef inline float64_t _total_node_weight( + const NodeData_t* node_data, + const floating* sample_weight, + const intp_t* idx_array, + intp_t i_node, +): + cdef intp_t i + cdef float64_t N = 0.0 + for i in range(node_data[i_node].idx_start, node_data[i_node].idx_end): + N += sample_weight[idx_array[i]] + return N diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_classification.py b/.venv/Lib/site-packages/sklearn/neighbors/_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..ae514004f1fd2cd6986796c29e258f4293fcaed7 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_classification.py @@ -0,0 +1,919 @@ +"""Nearest Neighbor Classification""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral + +import numpy as np + +from sklearn.neighbors._base import _check_precomputed + +from ..base import ClassifierMixin, _fit_context +from ..metrics._pairwise_distances_reduction import ( + ArgKminClassMode, + RadiusNeighborsClassMode, +) +from ..utils._param_validation import StrOptions +from ..utils.arrayfuncs import _all_with_any_reduction_axis_1 +from ..utils.extmath import weighted_mode +from ..utils.fixes import _mode +from ..utils.validation import ( + _is_arraylike, + _num_samples, + check_is_fitted, + validate_data, +) +from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin, _get_weights + + +def _adjusted_metric(metric, metric_kwargs, p=None): + metric_kwargs = metric_kwargs or {} + if metric == "minkowski": + metric_kwargs["p"] = p + if p == 2: + metric = "euclidean" + return metric, metric_kwargs + + +class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase): + """Classifier implementing the k-nearest neighbors vote. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_neighbors : int, default=5 + Number of neighbors to use by default for :meth:`kneighbors` queries. + + weights : {'uniform', 'distance'}, callable or None, default='uniform' + Weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood + are weighted equally. + - 'distance' : weight points by the inverse of their distance. + in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable] : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. 
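+
+        For example, a callable such as ``lambda dist: 1.0 / (1.0 + dist)``
+        (an illustrative choice, not a library default) would weight
+        neighbors by a smoothed inverse of their distance.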
+
+        Refer to the example entitled
+        :ref:`sphx_glr_auto_examples_neighbors_plot_classification.py`
+        showing the impact of the `weights` parameter on the decision
+        boundary.
+
+    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
+        Algorithm used to compute the nearest neighbors:
+
+        - 'ball_tree' will use :class:`BallTree`
+        - 'kd_tree' will use :class:`KDTree`
+        - 'brute' will use a brute-force search.
+        - 'auto' will attempt to decide the most appropriate algorithm
+          based on the values passed to :meth:`fit` method.
+
+        Note: fitting on sparse input will override the setting of
+        this parameter, using brute force.
+
+    leaf_size : int, default=30
+        Leaf size passed to BallTree or KDTree. This can affect the
+        speed of the construction and query, as well as the memory
+        required to store the tree. The optimal value depends on the
+        nature of the problem.
+
+    p : float, default=2
+        Power parameter for the Minkowski metric. When p = 1, this is equivalent
+        to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2.
+        For arbitrary p, minkowski_distance (l_p) is used. This parameter is expected
+        to be positive.
+
+    metric : str or callable, default='minkowski'
+        Metric to use for distance computation. Default is "minkowski", which
+        results in the standard Euclidean distance when p = 2. See the
+        documentation of `scipy.spatial.distance `_ and
+        the metrics listed in
+        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
+        values.
+
+        If metric is "precomputed", X is assumed to be a distance matrix and
+        must be square during fit. X may be a :term:`sparse graph`, in which
+        case only "nonzero" elements may be considered neighbors.
+
+        If metric is a callable function, it takes two arrays representing 1D
+        vectors as inputs and must return one value indicating the distance
+        between those vectors. This works for Scipy's metrics, but is less
+        efficient than passing the metric name as a string.
+
+    metric_params : dict, default=None
+        Additional keyword arguments for the metric function.
+
+    n_jobs : int, default=None
+        The number of parallel jobs to run for neighbors search.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary `
+        for more details.
+        Doesn't affect :meth:`fit` method.
+
+    Attributes
+    ----------
+    classes_ : array of shape (n_classes,)
+        Class labels known to the classifier.
+
+    effective_metric_ : str or callable
+        The distance metric used. It will be the same as the `metric` parameter
+        or a synonym of it, e.g. 'euclidean' if the `metric` parameter is set to
+        'minkowski' and the `p` parameter to 2.
+
+    effective_metric_params_ : dict
+        Additional keyword arguments for the metric function. For most metrics
+        it will be the same as the `metric_params` parameter, but it may also
+        contain the `p` parameter value if the `effective_metric_` attribute is
+        set to 'minkowski'.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    n_samples_fit_ : int
+        Number of samples in the fitted data.
+
+    outputs_2d_ : bool
+        False when `y`'s shape is (n_samples,) or (n_samples, 1) during fit,
+        otherwise True.
+
+    See Also
+    --------
+    RadiusNeighborsClassifier: Classifier based on neighbors within a fixed radius.
+ KNeighborsRegressor: Regression based on k-nearest neighbors. + RadiusNeighborsRegressor: Regression based on neighbors within a fixed radius. + NearestNeighbors: Unsupervised learner for implementing neighbor searches. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + .. warning:: + + Regarding the Nearest Neighbors algorithms, if it is found that two + neighbors, neighbor `k+1` and `k`, have identical distances + but different labels, the results will depend on the ordering of the + training data. + + https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm + + Examples + -------- + >>> X = [[0], [1], [2], [3]] + >>> y = [0, 0, 1, 1] + >>> from sklearn.neighbors import KNeighborsClassifier + >>> neigh = KNeighborsClassifier(n_neighbors=3) + >>> neigh.fit(X, y) + KNeighborsClassifier(...) + >>> print(neigh.predict([[1.1]])) + [0] + >>> print(neigh.predict_proba([[0.9]])) + [[0.666... 0.333...]] + """ + + _parameter_constraints: dict = {**NeighborsBase._parameter_constraints} + _parameter_constraints.pop("radius") + _parameter_constraints.update( + {"weights": [StrOptions({"uniform", "distance"}), callable, None]} + ) + + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.weights = weights + + @_fit_context( + # KNeighborsClassifier.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y): + """Fit the k-nearest neighbors classifier from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_outputs) + Target values. + + Returns + ------- + self : KNeighborsClassifier + The fitted k-nearest neighbors classifier. + """ + return self._fit(X, y) + + def predict(self, X): + """Predict the class labels for the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + y : ndarray of shape (n_queries,) or (n_queries, n_outputs) + Class labels for each data sample. + """ + check_is_fitted(self, "_fit_method") + if self.weights == "uniform": + if self._fit_method == "brute" and ArgKminClassMode.is_usable_for( + X, self._fit_X, self.metric + ): + probabilities = self.predict_proba(X) + if self.outputs_2d_: + return np.stack( + [ + self.classes_[idx][np.argmax(probas, axis=1)] + for idx, probas in enumerate(probabilities) + ], + axis=1, + ) + return self.classes_[np.argmax(probabilities, axis=1)] + # In that case, we do not need the distances to perform + # the weighting so we do not compute them. 
+            neigh_ind = self.kneighbors(X, return_distance=False)
+            neigh_dist = None
+        else:
+            neigh_dist, neigh_ind = self.kneighbors(X)
+
+        classes_ = self.classes_
+        _y = self._y
+        if not self.outputs_2d_:
+            _y = self._y.reshape((-1, 1))
+            classes_ = [self.classes_]
+
+        n_outputs = len(classes_)
+        n_queries = _num_samples(self._fit_X if X is None else X)
+        weights = _get_weights(neigh_dist, self.weights)
+        if weights is not None and _all_with_any_reduction_axis_1(weights, value=0):
+            raise ValueError(
+                "All neighbors of some sample are getting zero weights. "
+                "Please modify 'weights' to avoid this case if you are "
+                "using a user-defined function."
+            )
+
+        y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)
+        for k, classes_k in enumerate(classes_):
+            if weights is None:
+                mode, _ = _mode(_y[neigh_ind, k], axis=1)
+            else:
+                mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)
+
+            mode = np.asarray(mode.ravel(), dtype=np.intp)
+            y_pred[:, k] = classes_k.take(mode)
+
+        if not self.outputs_2d_:
+            y_pred = y_pred.ravel()
+
+        return y_pred
+
+    def predict_proba(self, X):
+        """Return probability estimates for the test data X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_queries, n_features), \
+                or (n_queries, n_indexed) if metric == 'precomputed', or None
+            Test samples. If `None`, predictions for all indexed points are
+            returned; in this case, points are not considered their own
+            neighbors.
+
+        Returns
+        -------
+        p : ndarray of shape (n_queries, n_classes), or a list of n_outputs \
+                of such arrays if n_outputs > 1.
+            The class probabilities of the input samples. Classes are ordered
+            by lexicographic order.
+        """
+        check_is_fitted(self, "_fit_method")
+        if self.weights == "uniform":
+            # TODO: systematize this mapping of metric for
+            # PairwiseDistancesReductions.
+            metric, metric_kwargs = _adjusted_metric(
+                metric=self.metric, metric_kwargs=self.metric_params, p=self.p
+            )
+            if (
+                self._fit_method == "brute"
+                and ArgKminClassMode.is_usable_for(X, self._fit_X, metric)
+                # TODO: Implement efficient multi-output solution
+                and not self.outputs_2d_
+            ):
+                if self.metric == "precomputed":
+                    X = _check_precomputed(X)
+                else:
+                    X = validate_data(
+                        self, X, accept_sparse="csr", reset=False, order="C"
+                    )
+
+                probabilities = ArgKminClassMode.compute(
+                    X,
+                    self._fit_X,
+                    k=self.n_neighbors,
+                    weights=self.weights,
+                    Y_labels=self._y,
+                    unique_Y_labels=self.classes_,
+                    metric=metric,
+                    metric_kwargs=metric_kwargs,
+                    # `strategy="parallel_on_X"` has in practice been shown
+                    # to be more efficient than `strategy="parallel_on_Y"`
+                    # on many combinations of datasets.
+                    # Hence, we choose to enforce it here.
+                    # For more information, see:
+                    # https://github.com/scikit-learn/scikit-learn/pull/24076#issuecomment-1445258342 # noqa
+                    # TODO: adapt the heuristic for `strategy="auto"` for
+                    # `ArgKminClassMode` and use `strategy="auto"`.
+                    strategy="parallel_on_X",
+                )
+                return probabilities
+
+            # In that case, we do not need the distances to perform
+            # the weighting so we do not compute them.
+            neigh_ind = self.kneighbors(X, return_distance=False)
+            neigh_dist = None
+        else:
+            neigh_dist, neigh_ind = self.kneighbors(X)
+
+        classes_ = self.classes_
+        _y = self._y
+        if not self.outputs_2d_:
+            _y = self._y.reshape((-1, 1))
+            classes_ = [self.classes_]
+
+        n_queries = _num_samples(self._fit_X if X is None else X)
+
+        weights = _get_weights(neigh_dist, self.weights)
+        if weights is None:
+            weights = np.ones_like(neigh_ind)
+        elif _all_with_any_reduction_axis_1(weights, value=0):
+            raise ValueError(
+                "All neighbors of some sample are getting zero weights. "
+                "Please modify 'weights' to avoid this case if you are "
+                "using a user-defined function."
+            )
+
+        all_rows = np.arange(n_queries)
+        probabilities = []
+        for k, classes_k in enumerate(classes_):
+            pred_labels = _y[:, k][neigh_ind]
+            proba_k = np.zeros((n_queries, classes_k.size))
+
+            # a simple ':' index doesn't work right
+            for i, idx in enumerate(pred_labels.T):  # loop is O(n_neighbors)
+                proba_k[all_rows, idx] += weights[:, i]
+
+            # normalize 'votes' into real [0,1] probabilities
+            normalizer = proba_k.sum(axis=1)[:, np.newaxis]
+            proba_k /= normalizer
+
+            probabilities.append(proba_k)
+
+        if not self.outputs_2d_:
+            probabilities = probabilities[0]
+
+        return probabilities
+
+    # This function is defined here only to modify the parent docstring
+    # and add information about X=None
+    def score(self, X, y, sample_weight=None):
+        """
+        Return the mean accuracy on the given test data and labels.
+
+        In multi-label classification, this is the subset accuracy
+        which is a harsh metric since you require for each sample that
+        each label set be correctly predicted.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features), or None
+            Test samples. If `None`, predictions for all indexed points are
+            used; in this case, points are not considered their own
+            neighbors. This means that `knn.fit(X, y).score(None, y)`
+            implicitly performs a leave-one-out cross-validation procedure
+            and is equivalent to `cross_val_score(knn, X, y, cv=LeaveOneOut())`
+            but typically much faster.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
+            True labels for `X`.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights.
+
+        Returns
+        -------
+        score : float
+            Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
+        """
+        return super().score(X, y, sample_weight)
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.classifier_tags.multi_label = True
+        tags.input_tags.pairwise = self.metric == "precomputed"
+        return tags
+
+
+class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, NeighborsBase):
+    """Classifier implementing a vote among neighbors within a given radius.
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    radius : float, default=1.0
+        Range of parameter space to use by default for :meth:`radius_neighbors`
+        queries.
+
+    weights : {'uniform', 'distance'}, callable or None, default='uniform'
+        Weight function used in prediction. Possible values:
+
+        - 'uniform' : uniform weights. All points in each neighborhood
+          are weighted equally.
+        - 'distance' : weight points by the inverse of their distance.
+          In this case, closer neighbors of a query point will have a
+          greater influence than neighbors which are further away.
+        - [callable] : a user-defined function which accepts an
+          array of distances, and returns an array of the same shape
+          containing the weights.
+
+        Uniform weights are used by default.
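+        An illustrative (non-default) callable would be
+        ``lambda dist: 1.0 / (1.0 + dist)``, which weights neighbors by a
+        smoothed inverse of their distance.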
+
+    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
+        Algorithm used to compute the nearest neighbors:
+
+        - 'ball_tree' will use :class:`BallTree`
+        - 'kd_tree' will use :class:`KDTree`
+        - 'brute' will use a brute-force search.
+        - 'auto' will attempt to decide the most appropriate algorithm
+          based on the values passed to :meth:`fit` method.
+
+        Note: fitting on sparse input will override the setting of
+        this parameter, using brute force.
+
+    leaf_size : int, default=30
+        Leaf size passed to BallTree or KDTree. This can affect the
+        speed of the construction and query, as well as the memory
+        required to store the tree. The optimal value depends on the
+        nature of the problem.
+
+    p : float, default=2
+        Power parameter for the Minkowski metric. When p = 1, this is
+        equivalent to using manhattan_distance (l1), and euclidean_distance
+        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
+        This parameter is expected to be positive.
+
+    metric : str or callable, default='minkowski'
+        Metric to use for distance computation. Default is "minkowski", which
+        results in the standard Euclidean distance when p = 2. See the
+        documentation of `scipy.spatial.distance `_ and
+        the metrics listed in
+        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
+        values.
+
+        If metric is "precomputed", X is assumed to be a distance matrix and
+        must be square during fit. X may be a :term:`sparse graph`, in which
+        case only "nonzero" elements may be considered neighbors.
+
+        If metric is a callable function, it takes two arrays representing 1D
+        vectors as inputs and must return one value indicating the distance
+        between those vectors. This works for Scipy's metrics, but is less
+        efficient than passing the metric name as a string.
+
+    outlier_label : {manual label, 'most_frequent'}, default=None
+        Label for outlier samples (samples with no neighbors in given radius).
+
+        - manual label: str or int label (should be the same type as y)
+          or list of manual labels if multi-output is used.
+        - 'most_frequent' : assign the most frequent label of y to outliers.
+        - None : when any outlier is detected, ValueError will be raised.
+
+        The outlier label should be selected from among the unique 'Y' labels.
+        If it is specified with a different value, a warning will be raised
+        and all class probabilities of outliers will be set to 0.
+
+    metric_params : dict, default=None
+        Additional keyword arguments for the metric function.
+
+    n_jobs : int, default=None
+        The number of parallel jobs to run for neighbors search.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary `
+        for more details.
+
+    Attributes
+    ----------
+    classes_ : ndarray of shape (n_classes,)
+        Class labels known to the classifier.
+
+    effective_metric_ : str or callable
+        The distance metric used. It will be the same as the `metric` parameter
+        or a synonym of it, e.g. 'euclidean' if the `metric` parameter is set to
+        'minkowski' and the `p` parameter to 2.
+
+    effective_metric_params_ : dict
+        Additional keyword arguments for the metric function. For most metrics
+        it will be the same as the `metric_params` parameter, but it may also
+        contain the `p` parameter value if the `effective_metric_` attribute is
+        set to 'minkowski'.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`.
Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + outlier_label_ : int or array-like of shape (n_class,) + Label which is given for outlier samples (samples with no neighbors + on given radius). + + outputs_2d_ : bool + False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit + otherwise True. + + See Also + -------- + KNeighborsClassifier : Classifier implementing the k-nearest neighbors + vote. + RadiusNeighborsRegressor : Regression based on neighbors within a + fixed radius. + KNeighborsRegressor : Regression based on k-nearest neighbors. + NearestNeighbors : Unsupervised learner for implementing neighbor + searches. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm + + Examples + -------- + >>> X = [[0], [1], [2], [3]] + >>> y = [0, 0, 1, 1] + >>> from sklearn.neighbors import RadiusNeighborsClassifier + >>> neigh = RadiusNeighborsClassifier(radius=1.0) + >>> neigh.fit(X, y) + RadiusNeighborsClassifier(...) + >>> print(neigh.predict([[1.5]])) + [0] + >>> print(neigh.predict_proba([[1.0]])) + [[0.66666667 0.33333333]] + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "weights": [StrOptions({"uniform", "distance"}), callable, None], + "outlier_label": [Integral, str, "array-like", None], + } + _parameter_constraints.pop("n_neighbors") + + def __init__( + self, + radius=1.0, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + outlier_label=None, + metric_params=None, + n_jobs=None, + ): + super().__init__( + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.weights = weights + self.outlier_label = outlier_label + + @_fit_context( + # RadiusNeighborsClassifier.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y): + """Fit the radius neighbors classifier from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_outputs) + Target values. + + Returns + ------- + self : RadiusNeighborsClassifier + The fitted radius neighbors classifier. + """ + self._fit(X, y) + + classes_ = self.classes_ + _y = self._y + if not self.outputs_2d_: + _y = self._y.reshape((-1, 1)) + classes_ = [self.classes_] + + if self.outlier_label is None: + outlier_label_ = None + + elif self.outlier_label == "most_frequent": + outlier_label_ = [] + # iterate over multi-output, get the most frequent label for each + # output. 
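+            # (np.bincount counts occurrences of each encoded class label;
+            # argmax then selects the modal class for each output.)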
+ for k, classes_k in enumerate(classes_): + label_count = np.bincount(_y[:, k]) + outlier_label_.append(classes_k[label_count.argmax()]) + + else: + if _is_arraylike(self.outlier_label) and not isinstance( + self.outlier_label, str + ): + if len(self.outlier_label) != len(classes_): + raise ValueError( + "The length of outlier_label: {} is " + "inconsistent with the output " + "length: {}".format(self.outlier_label, len(classes_)) + ) + outlier_label_ = self.outlier_label + else: + outlier_label_ = [self.outlier_label] * len(classes_) + + for classes, label in zip(classes_, outlier_label_): + if _is_arraylike(label) and not isinstance(label, str): + # ensure the outlier label for each output is a scalar. + raise TypeError( + "The outlier_label of classes {} is " + "supposed to be a scalar, got " + "{}.".format(classes, label) + ) + if np.append(classes, label).dtype != classes.dtype: + # ensure the dtype of outlier label is consistent with y. + raise TypeError( + "The dtype of outlier_label {} is " + "inconsistent with classes {} in " + "y.".format(label, classes) + ) + + self.outlier_label_ = outlier_label_ + + return self + + def predict(self, X): + """Predict the class labels for the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + y : ndarray of shape (n_queries,) or (n_queries, n_outputs) + Class labels for each data sample. + """ + + probs = self.predict_proba(X) + classes_ = self.classes_ + + if not self.outputs_2d_: + probs = [probs] + classes_ = [self.classes_] + + n_outputs = len(classes_) + n_queries = probs[0].shape[0] + y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype) + + for k, prob in enumerate(probs): + # iterate over multi-output, assign labels based on probabilities + # of each output. + max_prob_index = prob.argmax(axis=1) + y_pred[:, k] = classes_[k].take(max_prob_index) + + outlier_zero_probs = (prob == 0).all(axis=1) + if outlier_zero_probs.any(): + zero_prob_index = np.flatnonzero(outlier_zero_probs) + y_pred[zero_prob_index, k] = self.outlier_label_[k] + + if not self.outputs_2d_: + y_pred = y_pred.ravel() + + return y_pred + + def predict_proba(self, X): + """Return probability estimates for the test data X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + p : ndarray of shape (n_queries, n_classes), or a list of \ + n_outputs of such arrays if n_outputs > 1. + The class probabilities of the input samples. Classes are ordered + by lexicographic order. 
+        """
+        check_is_fitted(self, "_fit_method")
+        n_queries = _num_samples(self._fit_X if X is None else X)
+
+        metric, metric_kwargs = _adjusted_metric(
+            metric=self.metric, metric_kwargs=self.metric_params, p=self.p
+        )
+
+        if (
+            self.weights == "uniform"
+            and self._fit_method == "brute"
+            and not self.outputs_2d_
+            and RadiusNeighborsClassMode.is_usable_for(X, self._fit_X, metric)
+        ):
+            probabilities = RadiusNeighborsClassMode.compute(
+                X=X,
+                Y=self._fit_X,
+                radius=self.radius,
+                weights=self.weights,
+                Y_labels=self._y,
+                unique_Y_labels=self.classes_,
+                outlier_label=self.outlier_label,
+                metric=metric,
+                metric_kwargs=metric_kwargs,
+                strategy="parallel_on_X",
+                # `strategy="parallel_on_X"` has in practice been shown
+                # to be more efficient than `strategy="parallel_on_Y"`
+                # on many combinations of datasets.
+                # Hence, we choose to enforce it here.
+                # For more information, see:
+                # https://github.com/scikit-learn/scikit-learn/pull/26828/files#r1282398471 # noqa
+            )
+            return probabilities
+
+        neigh_dist, neigh_ind = self.radius_neighbors(X)
+        outlier_mask = np.zeros(n_queries, dtype=bool)
+        outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind]
+        outliers = np.flatnonzero(outlier_mask)
+        inliers = np.flatnonzero(~outlier_mask)
+
+        classes_ = self.classes_
+        _y = self._y
+        if not self.outputs_2d_:
+            _y = self._y.reshape((-1, 1))
+            classes_ = [self.classes_]
+
+        if self.outlier_label_ is None and outliers.size > 0:
+            raise ValueError(
+                "No neighbors found for test samples %r, "
+                "you can try using a larger radius, "
+                "giving a label for outliers, "
+                "or considering removing them from your dataset." % outliers
+            )
+
+        weights = _get_weights(neigh_dist, self.weights)
+        if weights is not None:
+            weights = weights[inliers]
+
+        probabilities = []
+        # iterate over multi-output, measure probabilities of the k-th output.
+        for k, classes_k in enumerate(classes_):
+            pred_labels = np.zeros(len(neigh_ind), dtype=object)
+            pred_labels[:] = [_y[ind, k] for ind in neigh_ind]
+
+            proba_k = np.zeros((n_queries, classes_k.size))
+            proba_inl = np.zeros((len(inliers), classes_k.size))
+
+            # samples may have different numbers of neighbors within the
+            # same radius
+            if weights is None:
+                for i, idx in enumerate(pred_labels[inliers]):
+                    proba_inl[i, :] = np.bincount(idx, minlength=classes_k.size)
+            else:
+                for i, idx in enumerate(pred_labels[inliers]):
+                    proba_inl[i, :] = np.bincount(
+                        idx, weights[i], minlength=classes_k.size
+                    )
+            proba_k[inliers, :] = proba_inl
+
+            if outliers.size > 0:
+                _outlier_label = self.outlier_label_[k]
+                label_index = np.flatnonzero(classes_k == _outlier_label)
+                if label_index.size == 1:
+                    proba_k[outliers, label_index[0]] = 1.0
+                else:
+                    warnings.warn(
+                        "Outlier label {} is not in training "
+                        "classes. All class probabilities of "
+                        "outliers will be set to 0."
+                        "".format(self.outlier_label_[k])
+                    )
+
+            # normalize 'votes' into real [0,1] probabilities
+            normalizer = proba_k.sum(axis=1)[:, np.newaxis]
+            normalizer[normalizer == 0.0] = 1.0
+            proba_k /= normalizer
+
+            probabilities.append(proba_k)
+
+        if not self.outputs_2d_:
+            probabilities = probabilities[0]
+
+        return probabilities
+
+    # This function is defined here only to modify the parent docstring
+    # and add information about X=None
+    def score(self, X, y, sample_weight=None):
+        """
+        Return the mean accuracy on the given test data and labels.
+
+        In multi-label classification, this is the subset accuracy
+        which is a harsh metric since you require for each sample that
+        each label set be correctly predicted.
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features), or None + Test samples. If `None`, predictions for all indexed points are + used; in this case, points are not considered their own + neighbors. This means that `knn.fit(X, y).score(None, y)` + implicitly performs a leave-one-out cross-validation procedure + and is equivalent to `cross_val_score(knn, X, y, cv=LeaveOneOut())` + but typically much faster. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + True labels for `X`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float + Mean accuracy of ``self.predict(X)`` w.r.t. `y`. + """ + return super().score(X, y, sample_weight) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + return tags diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_graph.py b/.venv/Lib/site-packages/sklearn/neighbors/_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..e3b3325f06e37bacfbf8804be3f8b5bd5e9c52e7 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_graph.py @@ -0,0 +1,704 @@ +"""Nearest Neighbors graph functions""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools + +from ..base import ClassNamePrefixFeaturesOutMixin, TransformerMixin, _fit_context +from ..utils._param_validation import ( + Integral, + Interval, + Real, + StrOptions, + validate_params, +) +from ..utils.validation import check_is_fitted +from ._base import VALID_METRICS, KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin +from ._unsupervised import NearestNeighbors + + +def _check_params(X, metric, p, metric_params): + """Check the validity of the input parameters""" + params = zip(["metric", "p", "metric_params"], [metric, p, metric_params]) + est_params = X.get_params() + for param_name, func_param in params: + if func_param != est_params[param_name]: + raise ValueError( + "Got %s for %s, while the estimator has %s for the same parameter." + % (func_param, param_name, est_params[param_name]) + ) + + +def _query_include_self(X, include_self, mode): + """Return the query based on include_self param""" + if include_self == "auto": + include_self = mode == "connectivity" + + # it does not include each sample as its own neighbors + if not include_self: + X = None + + return X + + +@validate_params( + { + "X": ["array-like", "sparse matrix", KNeighborsMixin], + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "mode": [StrOptions({"connectivity", "distance"})], + "metric": [StrOptions(set(itertools.chain(*VALID_METRICS.values()))), callable], + "p": [Interval(Real, 0, None, closed="right"), None], + "metric_params": [dict, None], + "include_self": ["boolean", StrOptions({"auto"})], + "n_jobs": [Integral, None], + }, + prefer_skip_nested_validation=False, # metric is not validated yet +) +def kneighbors_graph( + X, + n_neighbors, + *, + mode="connectivity", + metric="minkowski", + p=2, + metric_params=None, + include_self=False, + n_jobs=None, +): + """Compute the (weighted) graph of k-Neighbors for points in X. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Sample data. + + n_neighbors : int + Number of neighbors for each sample. 
+
+    mode : {'connectivity', 'distance'}, default='connectivity'
+        Type of returned matrix: 'connectivity' will return the connectivity
+        matrix with ones and zeros, and 'distance' will return the distances
+        between neighbors according to the given metric.
+
+    metric : str, default='minkowski'
+        Metric to use for distance computation. Default is "minkowski", which
+        results in the standard Euclidean distance when p = 2. See the
+        documentation of `scipy.spatial.distance `_ and
+        the metrics listed in
+        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
+        values.
+
+    p : float, default=2
+        Power parameter for the Minkowski metric. When p = 1, this is equivalent
+        to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2.
+        For arbitrary p, minkowski_distance (l_p) is used. This parameter is expected
+        to be positive.
+
+    metric_params : dict, default=None
+        Additional keyword arguments for the metric function.
+
+    include_self : bool or 'auto', default=False
+        Whether or not to mark each sample as the first nearest neighbor to
+        itself. If 'auto', then True is used for mode='connectivity' and False
+        for mode='distance'.
+
+    n_jobs : int, default=None
+        The number of parallel jobs to run for neighbors search.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary `
+        for more details.
+
+    Returns
+    -------
+    A : sparse matrix of shape (n_samples, n_samples)
+        Graph where A[i, j] is assigned the weight of edge that
+        connects i to j. The matrix is of CSR format.
+
+    See Also
+    --------
+    radius_neighbors_graph: Compute the (weighted) graph of Neighbors for points in X.
+
+    Examples
+    --------
+    >>> X = [[0], [3], [1]]
+    >>> from sklearn.neighbors import kneighbors_graph
+    >>> A = kneighbors_graph(X, 2, mode='connectivity', include_self=True)
+    >>> A.toarray()
+    array([[1., 0., 1.],
+           [0., 1., 1.],
+           [1., 0., 1.]])
+    """
+    if not isinstance(X, KNeighborsMixin):
+        X = NearestNeighbors(
+            n_neighbors=n_neighbors,
+            metric=metric,
+            p=p,
+            metric_params=metric_params,
+            n_jobs=n_jobs,
+        ).fit(X)
+    else:
+        _check_params(X, metric, p, metric_params)
+
+    query = _query_include_self(X._fit_X, include_self, mode)
+    return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode)
+
+
+@validate_params(
+    {
+        "X": ["array-like", "sparse matrix", RadiusNeighborsMixin],
+        "radius": [Interval(Real, 0, None, closed="both")],
+        "mode": [StrOptions({"connectivity", "distance"})],
+        "metric": [StrOptions(set(itertools.chain(*VALID_METRICS.values()))), callable],
+        "p": [Interval(Real, 0, None, closed="right"), None],
+        "metric_params": [dict, None],
+        "include_self": ["boolean", StrOptions({"auto"})],
+        "n_jobs": [Integral, None],
+    },
+    prefer_skip_nested_validation=False,  # metric is not validated yet
+)
+def radius_neighbors_graph(
+    X,
+    radius,
+    *,
+    mode="connectivity",
+    metric="minkowski",
+    p=2,
+    metric_params=None,
+    include_self=False,
+    n_jobs=None,
+):
+    """Compute the (weighted) graph of Neighbors for points in X.
+
+    Neighborhoods are restricted to the points at a distance lower than
+    radius.
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        Sample data.
+
+    radius : float
+        Radius of neighborhoods.
+ + mode : {'connectivity', 'distance'}, default='connectivity' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. + + metric : str, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + p : float, default=2 + Power parameter for the Minkowski metric. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + include_self : bool or 'auto', default=False + Whether or not to mark each sample as the first nearest neighbor to + itself. If 'auto', then True is used for mode='connectivity' and False + for mode='distance'. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Returns + ------- + A : sparse matrix of shape (n_samples, n_samples) + Graph where A[i, j] is assigned the weight of edge that connects + i to j. The matrix is of CSR format. + + See Also + -------- + kneighbors_graph: Compute the weighted graph of k-neighbors for points in X. + + Examples + -------- + >>> X = [[0], [3], [1]] + >>> from sklearn.neighbors import radius_neighbors_graph + >>> A = radius_neighbors_graph(X, 1.5, mode='connectivity', + ... include_self=True) + >>> A.toarray() + array([[1., 0., 1.], + [0., 1., 0.], + [1., 0., 1.]]) + """ + if not isinstance(X, RadiusNeighborsMixin): + X = NearestNeighbors( + radius=radius, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ).fit(X) + else: + _check_params(X, metric, p, metric_params) + + query = _query_include_self(X._fit_X, include_self, mode) + return X.radius_neighbors_graph(query, radius, mode) + + +class KNeighborsTransformer( + ClassNamePrefixFeaturesOutMixin, KNeighborsMixin, TransformerMixin, NeighborsBase +): + """Transform X into a (weighted) graph of k nearest neighbors. + + The transformed data is a sparse graph as returned by kneighbors_graph. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + mode : {'distance', 'connectivity'}, default='distance' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. + + n_neighbors : int, default=5 + Number of neighbors for each sample in the transformed sparse graph. + For compatibility reasons, as each sample is considered as its own + neighbor, one extra neighbor will be computed when mode == 'distance'. + In this case, the sparse graph contains (n_neighbors + 1) neighbors. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. 
+        - 'auto' will attempt to decide the most appropriate algorithm
+          based on the values passed to the :meth:`fit` method.
+
+        Note: fitting on sparse input will override the setting of
+        this parameter, using brute force.
+
+    leaf_size : int, default=30
+        Leaf size passed to BallTree or KDTree. This can affect the
+        speed of the construction and query, as well as the memory
+        required to store the tree. The optimal value depends on the
+        nature of the problem.
+
+    metric : str or callable, default='minkowski'
+        Metric to use for distance computation. Default is "minkowski", which
+        results in the standard Euclidean distance when p = 2. See the
+        documentation of `scipy.spatial.distance
+        `_ and
+        the metrics listed in
+        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
+        values.
+
+        If metric is a callable function, it takes two arrays representing 1D
+        vectors as inputs and must return one value indicating the distance
+        between those vectors. This works for Scipy's metrics, but is less
+        efficient than passing the metric name as a string.
+
+        Distance matrices are not supported.
+
+    p : float, default=2
+        Parameter for the Minkowski metric from
+        sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
+        equivalent to using manhattan_distance (l1), and euclidean_distance
+        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
+        This parameter is expected to be positive.
+
+    metric_params : dict, default=None
+        Additional keyword arguments for the metric function.
+
+    n_jobs : int, default=None
+        The number of parallel jobs to run for neighbors search.
+        If ``-1``, then the number of jobs is set to the number of CPU cores.
+
+    Attributes
+    ----------
+    effective_metric_ : str or callable
+        The distance metric used. It will be the same as the `metric`
+        parameter or a synonym of it, e.g. 'euclidean' if the `metric`
+        parameter is set to 'minkowski' and the `p` parameter to 2.
+
+    effective_metric_params_ : dict
+        Additional keyword arguments for the metric function. For most
+        metrics it will be the same as the `metric_params` parameter, but it
+        may also contain the `p` parameter value if the `effective_metric_`
+        attribute is set to 'minkowski'.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    n_samples_fit_ : int
+        Number of samples in the fitted data.
+
+    See Also
+    --------
+    kneighbors_graph : Compute the weighted graph of k-neighbors for
+        points in X.
+    RadiusNeighborsTransformer : Transform X into a weighted graph of
+        neighbors nearer than a radius.
+
+    Notes
+    -----
+    For an example of using :class:`~sklearn.neighbors.KNeighborsTransformer`
+    in combination with :class:`~sklearn.manifold.TSNE` see
+    :ref:`sphx_glr_auto_examples_neighbors_approximate_nearest_neighbors.py`.
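+
+    A sketch of a typical composition (assuming only the standard
+    scikit-learn pipeline API; the classifier settings are illustrative):
+    the precomputed sparse graph is reused by a downstream estimator that
+    accepts ``metric='precomputed'``, and with ``mode='distance'`` one extra
+    neighbor is computed, as explained above.
+
+    >>> from sklearn.neighbors import KNeighborsClassifier
+    >>> from sklearn.pipeline import make_pipeline
+    >>> pipe = make_pipeline(
+    ...     KNeighborsTransformer(n_neighbors=5, mode='distance'),
+    ...     KNeighborsClassifier(metric='precomputed', n_neighbors=5),
+    ... )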
+ + Examples + -------- + >>> from sklearn.datasets import load_wine + >>> from sklearn.neighbors import KNeighborsTransformer + >>> X, _ = load_wine(return_X_y=True) + >>> X.shape + (178, 13) + >>> transformer = KNeighborsTransformer(n_neighbors=5, mode='distance') + >>> X_dist_graph = transformer.fit_transform(X) + >>> X_dist_graph.shape + (178, 178) + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "mode": [StrOptions({"distance", "connectivity"})], + } + _parameter_constraints.pop("radius") + + def __init__( + self, + *, + mode="distance", + n_neighbors=5, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + super(KNeighborsTransformer, self).__init__( + n_neighbors=n_neighbors, + radius=None, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.mode = mode + + @_fit_context( + # KNeighborsTransformer.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit the k-nearest neighbors transformer from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : KNeighborsTransformer + The fitted k-nearest neighbors transformer. + """ + self._fit(X) + self._n_features_out = self.n_samples_fit_ + return self + + def transform(self, X): + """Compute the (weighted) graph of Neighbors for points in X. + + Parameters + ---------- + X : array-like of shape (n_samples_transform, n_features) + Sample data. + + Returns + ------- + Xt : sparse matrix of shape (n_samples_transform, n_samples_fit) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + The matrix is of CSR format. + """ + check_is_fitted(self) + add_one = self.mode == "distance" + return self.kneighbors_graph( + X, mode=self.mode, n_neighbors=self.n_neighbors + add_one + ) + + def fit_transform(self, X, y=None): + """Fit to data, then transform it. + + Fits transformer to X and y with optional parameters fit_params + and returns a transformed version of X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + Xt : sparse matrix of shape (n_samples, n_samples) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + The matrix is of CSR format. + """ + return self.fit(X).transform(X) + + +class RadiusNeighborsTransformer( + ClassNamePrefixFeaturesOutMixin, + RadiusNeighborsMixin, + TransformerMixin, + NeighborsBase, +): + """Transform X into a (weighted) graph of neighbors nearer than a radius. + + The transformed data is a sparse graph as returned by + `radius_neighbors_graph`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + mode : {'distance', 'connectivity'}, default='distance' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. 
+
+    radius : float, default=1.0
+        Radius of neighborhood in the transformed sparse graph.
+
+    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
+        Algorithm used to compute the nearest neighbors:
+
+        - 'ball_tree' will use :class:`BallTree`
+        - 'kd_tree' will use :class:`KDTree`
+        - 'brute' will use a brute-force search.
+        - 'auto' will attempt to decide the most appropriate algorithm
+          based on the values passed to the :meth:`fit` method.
+
+        Note: fitting on sparse input will override the setting of
+        this parameter, using brute force.
+
+    leaf_size : int, default=30
+        Leaf size passed to BallTree or KDTree. This can affect the
+        speed of the construction and query, as well as the memory
+        required to store the tree. The optimal value depends on the
+        nature of the problem.
+
+    metric : str or callable, default='minkowski'
+        Metric to use for distance computation. Default is "minkowski", which
+        results in the standard Euclidean distance when p = 2. See the
+        documentation of `scipy.spatial.distance
+        `_ and
+        the metrics listed in
+        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
+        values.
+
+        If metric is a callable function, it takes two arrays representing 1D
+        vectors as inputs and must return one value indicating the distance
+        between those vectors. This works for Scipy's metrics, but is less
+        efficient than passing the metric name as a string.
+
+        Distance matrices are not supported.
+
+    p : float, default=2
+        Parameter for the Minkowski metric from
+        sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
+        equivalent to using manhattan_distance (l1), and euclidean_distance
+        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
+        This parameter is expected to be positive.
+
+    metric_params : dict, default=None
+        Additional keyword arguments for the metric function.
+
+    n_jobs : int, default=None
+        The number of parallel jobs to run for neighbors search.
+        If ``-1``, then the number of jobs is set to the number of CPU cores.
+
+    Attributes
+    ----------
+    effective_metric_ : str or callable
+        The distance metric used. It will be the same as the `metric`
+        parameter or a synonym of it, e.g. 'euclidean' if the `metric`
+        parameter is set to 'minkowski' and the `p` parameter to 2.
+
+    effective_metric_params_ : dict
+        Additional keyword arguments for the metric function. For most
+        metrics it will be the same as the `metric_params` parameter, but it
+        may also contain the `p` parameter value if the `effective_metric_`
+        attribute is set to 'minkowski'.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    n_samples_fit_ : int
+        Number of samples in the fitted data.
+
+    See Also
+    --------
+    kneighbors_graph : Compute the weighted graph of k-neighbors for
+        points in X.
+    KNeighborsTransformer : Transform X into a weighted graph of k
+        nearest neighbors.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.datasets import load_wine
+    >>> from sklearn.cluster import DBSCAN
+    >>> from sklearn.neighbors import RadiusNeighborsTransformer
+    >>> from sklearn.pipeline import make_pipeline
+    >>> X, _ = load_wine(return_X_y=True)
+    >>> estimator = make_pipeline(
+    ...     RadiusNeighborsTransformer(radius=42.0, mode='distance'),
+    ...
DBSCAN(eps=25.0, metric='precomputed')) + >>> X_clustered = estimator.fit_predict(X) + >>> clusters, counts = np.unique(X_clustered, return_counts=True) + >>> print(counts) + [ 29 15 111 11 12] + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "mode": [StrOptions({"distance", "connectivity"})], + } + _parameter_constraints.pop("n_neighbors") + + def __init__( + self, + *, + mode="distance", + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + super(RadiusNeighborsTransformer, self).__init__( + n_neighbors=None, + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.mode = mode + + @_fit_context( + # RadiusNeighborsTransformer.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit the radius neighbors transformer from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : RadiusNeighborsTransformer + The fitted radius neighbors transformer. + """ + self._fit(X) + self._n_features_out = self.n_samples_fit_ + return self + + def transform(self, X): + """Compute the (weighted) graph of Neighbors for points in X. + + Parameters + ---------- + X : array-like of shape (n_samples_transform, n_features) + Sample data. + + Returns + ------- + Xt : sparse matrix of shape (n_samples_transform, n_samples_fit) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + The matrix is of CSR format. + """ + check_is_fitted(self) + return self.radius_neighbors_graph(X, mode=self.mode, sort_results=True) + + def fit_transform(self, X, y=None): + """Fit to data, then transform it. + + Fits transformer to X and y with optional parameters fit_params + and returns a transformed version of X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + Xt : sparse matrix of shape (n_samples, n_samples) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + The matrix is of CSR format. 
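+
+        Examples
+        --------
+        A minimal sketch with synthetic one-dimensional data (the output
+        shape follows the Returns description above):
+
+        >>> import numpy as np
+        >>> from sklearn.neighbors import RadiusNeighborsTransformer
+        >>> X = np.array([[0.0], [1.0], [4.0]])
+        >>> RadiusNeighborsTransformer(radius=1.5).fit_transform(X).shape
+        (3, 3)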
+ """ + return self.fit(X).transform(X) diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.cp39-win_amd64.lib b/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.cp39-win_amd64.lib new file mode 100644 index 0000000000000000000000000000000000000000..6eaa79ab27018028c9090f4c78f747306184eddb Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.cp39-win_amd64.lib differ diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.cp39-win_amd64.pyd b/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.cp39-win_amd64.pyd new file mode 100644 index 0000000000000000000000000000000000000000..fe0c5cfc0a30ec22b0b3d915cf359b5aaa2464ac Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.cp39-win_amd64.pyd differ diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.pyx.tp b/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..4d182a3644e4bb73f344e0e15e4bc0299e4ea785 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_kd_tree.pyx.tp @@ -0,0 +1,336 @@ +{{py: + +# Generated file: _kd_tree.pyx + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # + ('64', 'float64_t', 'np.float64'), + ('32', 'float32_t', 'np.float32') +] + +# By Jake Vanderplas (2013) +# written for the scikit-learn project +# SPDX-License-Identifier: BSD-3-Clause + +}} + + +__all__ = ['KDTree', 'KDTree64', 'KDTree32'] + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +DOC_DICT{{name_suffix}} = { + 'BinaryTree': 'KDTree{{name_suffix}}', + 'binary_tree': 'kd_tree{{name_suffix}}', +} + +VALID_METRICS{{name_suffix}} = [ + 'EuclideanDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'ChebyshevDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}' +] + +{{endfor}} + +include "_binary_tree.pxi" + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +# Inherit KDTree{{name_suffix}} from BinaryTree{{name_suffix}} +cdef class KDTree{{name_suffix}}(BinaryTree{{name_suffix}}): + __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}}) + pass + +{{endfor}} + + +# ---------------------------------------------------------------------- +# The functions below specialized the Binary Tree as a KD Tree +# +# Note that these functions use the concept of "reduced distance". +# The reduced distance, defined for some metrics, is a quantity which +# is more efficient to compute than the distance, but preserves the +# relative rankings of the true distance. For example, the reduced +# distance for the Euclidean metric is the squared-euclidean distance. +# For some metrics, the reduced distance is simply the distance. 
+ +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t n_nodes, + intp_t n_features, +) except -1: + """Allocate arrays needed for the KD Tree""" + tree.node_bounds = np.zeros((2, n_nodes, n_features), dtype={{INPUT_DTYPE}}) + return 0 + + +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + intp_t i_node, + intp_t idx_start, + intp_t idx_end, +) except -1: + """Initialize the node for the dataset stored in tree.data""" + cdef intp_t n_features = tree.data.shape[1] + cdef intp_t i, j + cdef float64_t rad = 0 + + cdef {{INPUT_DTYPE_t}}* lower_bounds = &tree.node_bounds[0, i_node, 0] + cdef {{INPUT_DTYPE_t}}* upper_bounds = &tree.node_bounds[1, i_node, 0] + cdef const {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] + cdef const intp_t* idx_array = &tree.idx_array[0] + + cdef const {{INPUT_DTYPE_t}}* data_row + + # determine Node bounds + for j in range(n_features): + lower_bounds[j] = INF + upper_bounds[j] = -INF + + # Compute the actual data range. At build time, this is slightly + # slower than using the previously-computed bounds of the parent node, + # but leads to more compact trees and thus faster queries. + for i in range(idx_start, idx_end): + data_row = data + idx_array[i] * n_features + for j in range(n_features): + lower_bounds[j] = fmin(lower_bounds[j], data_row[j]) + upper_bounds[j] = fmax(upper_bounds[j], data_row[j]) + + for j in range(n_features): + if tree.dist_metric.p == INF: + rad = fmax(rad, 0.5 * (upper_bounds[j] - lower_bounds[j])) + else: + rad += pow(0.5 * abs(upper_bounds[j] - lower_bounds[j]), + tree.dist_metric.p) + + node_data[i_node].idx_start = idx_start + node_data[i_node].idx_end = idx_end + + # The radius will hold the size of the circumscribed hypersphere measured + # with the specified metric: in querying, this is used as a measure of the + # size of each node when deciding which nodes to split. + node_data[i_node].radius = pow(rad, 1. / tree.dist_metric.p) + return 0 + + +cdef float64_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum reduced-distance between a point and a node""" + cdef intp_t n_features = tree.data.shape[1] + cdef float64_t d, d_lo, d_hi, rdist=0.0 + cdef intp_t j + + if tree.dist_metric.p == INF: + for j in range(n_features): + d_lo = tree.node_bounds[0, i_node, j] - pt[j] + d_hi = pt[j] - tree.node_bounds[1, i_node, j] + d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi)) + rdist = fmax(rdist, 0.5 * d) + else: + # here we'll use the fact that x + abs(x) = 2 * max(x, 0) + for j in range(n_features): + d_lo = tree.node_bounds[0, i_node, j] - pt[j] + d_hi = pt[j] - tree.node_bounds[1, i_node, j] + d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi)) + rdist += pow(0.5 * d, tree.dist_metric.p) + + return rdist + + +cdef float64_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the minimum distance between a point and a node""" + if tree.dist_metric.p == INF: + return min_rdist{{name_suffix}}(tree, i_node, pt) + else: + return pow( + min_rdist{{name_suffix}}(tree, i_node, pt), + 1. 
/ tree.dist_metric.p + ) + + +cdef float64_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum reduced-distance between a point and a node""" + cdef intp_t n_features = tree.data.shape[1] + + cdef float64_t d_lo, d_hi, rdist=0.0 + cdef intp_t j + + if tree.dist_metric.p == INF: + for j in range(n_features): + rdist = fmax(rdist, fabs(pt[j] - tree.node_bounds[0, i_node, j])) + rdist = fmax(rdist, fabs(pt[j] - tree.node_bounds[1, i_node, j])) + else: + for j in range(n_features): + d_lo = fabs(pt[j] - tree.node_bounds[0, i_node, j]) + d_hi = fabs(pt[j] - tree.node_bounds[1, i_node, j]) + rdist += pow(fmax(d_lo, d_hi), tree.dist_metric.p) + + return rdist + + +cdef float64_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum distance between a point and a node""" + if tree.dist_metric.p == INF: + return max_rdist{{name_suffix}}(tree, i_node, pt) + else: + return pow( + max_rdist{{name_suffix}}(tree, i_node, pt), + 1. / tree.dist_metric.p + ) + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + float64_t* min_dist, + float64_t* max_dist, +) except -1 nogil: + """Compute the minimum and maximum distance between a point and a node""" + cdef intp_t n_features = tree.data.shape[1] + + cdef float64_t d, d_lo, d_hi + cdef intp_t j + + min_dist[0] = 0.0 + max_dist[0] = 0.0 + + if tree.dist_metric.p == INF: + for j in range(n_features): + d_lo = tree.node_bounds[0, i_node, j] - pt[j] + d_hi = pt[j] - tree.node_bounds[1, i_node, j] + d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi)) + min_dist[0] = fmax(min_dist[0], 0.5 * d) + max_dist[0] = fmax(max_dist[0], fabs(d_lo)) + max_dist[0] = fmax(max_dist[0], fabs(d_hi)) + else: + # as above, use the fact that x + abs(x) = 2 * max(x, 0) + for j in range(n_features): + d_lo = tree.node_bounds[0, i_node, j] - pt[j] + d_hi = pt[j] - tree.node_bounds[1, i_node, j] + d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi)) + min_dist[0] += pow(0.5 * d, tree.dist_metric.p) + max_dist[0] += pow(fmax(fabs(d_lo), fabs(d_hi)), + tree.dist_metric.p) + + min_dist[0] = pow(min_dist[0], 1. / tree.dist_metric.p) + max_dist[0] = pow(max_dist[0], 1. 
/ tree.dist_metric.p) + + return 0 + + +cdef inline float64_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """Compute the minimum reduced distance between two nodes""" + cdef intp_t n_features = tree1.data.shape[1] + + cdef float64_t d, d1, d2, rdist=0.0 + cdef intp_t j + + if tree1.dist_metric.p == INF: + for j in range(n_features): + d1 = (tree1.node_bounds[0, i_node1, j] + - tree2.node_bounds[1, i_node2, j]) + d2 = (tree2.node_bounds[0, i_node2, j] + - tree1.node_bounds[1, i_node1, j]) + d = (d1 + fabs(d1)) + (d2 + fabs(d2)) + + rdist = fmax(rdist, 0.5 * d) + else: + # here we'll use the fact that x + abs(x) = 2 * max(x, 0) + for j in range(n_features): + d1 = (tree1.node_bounds[0, i_node1, j] + - tree2.node_bounds[1, i_node2, j]) + d2 = (tree2.node_bounds[0, i_node2, j] + - tree1.node_bounds[1, i_node1, j]) + d = (d1 + fabs(d1)) + (d2 + fabs(d2)) + + rdist += pow(0.5 * d, tree1.dist_metric.p) + + return rdist + + +cdef inline float64_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """Compute the minimum distance between two nodes""" + return tree1.dist_metric._rdist_to_dist( + min_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + + +cdef inline float64_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """Compute the maximum reduced distance between two nodes""" + cdef intp_t n_features = tree1.data.shape[1] + + cdef float64_t d1, d2, rdist=0.0 + cdef intp_t j + + if tree1.dist_metric.p == INF: + for j in range(n_features): + rdist = fmax(rdist, fabs(tree1.node_bounds[0, i_node1, j] + - tree2.node_bounds[1, i_node2, j])) + rdist = fmax(rdist, fabs(tree1.node_bounds[1, i_node1, j] + - tree2.node_bounds[0, i_node2, j])) + else: + for j in range(n_features): + d1 = fabs(tree1.node_bounds[0, i_node1, j] + - tree2.node_bounds[1, i_node2, j]) + d2 = fabs(tree1.node_bounds[1, i_node1, j] + - tree2.node_bounds[0, i_node2, j]) + rdist += pow(fmax(d1, d2), tree1.dist_metric.p) + + return rdist + + +cdef inline float64_t max_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """Compute the maximum distance between two nodes""" + return tree1.dist_metric._rdist_to_dist( + max_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} + + +class KDTree(KDTree64): + __doc__ = CLASS_DOC.format(BinaryTree="KDTree") + pass diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_kde.py b/.venv/Lib/site-packages/sklearn/neighbors/_kde.py new file mode 100644 index 0000000000000000000000000000000000000000..9ee37a639f31e2fb4753119fece1a9de837e650b --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_kde.py @@ -0,0 +1,359 @@ +""" +Kernel Density Estimation +------------------------- +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +from numbers import Integral, Real + +import numpy as np +from scipy.special import gammainc + +from ..base import BaseEstimator, _fit_context +from ..neighbors._base import VALID_METRICS +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import row_norms +from ..utils.validation import 
_check_sample_weight, check_is_fitted, validate_data +from ._ball_tree import BallTree +from ._kd_tree import KDTree + +VALID_KERNELS = [ + "gaussian", + "tophat", + "epanechnikov", + "exponential", + "linear", + "cosine", +] + +TREE_DICT = {"ball_tree": BallTree, "kd_tree": KDTree} + + +# TODO: implement a brute force version for testing purposes +# TODO: create a density estimation base class? +class KernelDensity(BaseEstimator): + """Kernel Density Estimation. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + bandwidth : float or {"scott", "silverman"}, default=1.0 + The bandwidth of the kernel. If bandwidth is a float, it defines the + bandwidth of the kernel. If bandwidth is a string, one of the estimation + methods is implemented. + + algorithm : {'kd_tree', 'ball_tree', 'auto'}, default='auto' + The tree algorithm to use. + + kernel : {'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', \ + 'cosine'}, default='gaussian' + The kernel to use. + + metric : str, default='euclidean' + Metric to use for distance computation. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + Not all metrics are valid with all algorithms: refer to the + documentation of :class:`BallTree` and :class:`KDTree`. Note that the + normalization of the density output is correct only for the Euclidean + distance metric. + + atol : float, default=0 + The desired absolute tolerance of the result. A larger tolerance will + generally lead to faster execution. + + rtol : float, default=0 + The desired relative tolerance of the result. A larger tolerance will + generally lead to faster execution. + + breadth_first : bool, default=True + If true (default), use a breadth-first approach to the problem. + Otherwise use a depth-first approach. + + leaf_size : int, default=40 + Specify the leaf size of the underlying tree. See :class:`BallTree` + or :class:`KDTree` for details. + + metric_params : dict, default=None + Additional parameters to be passed to the tree for use with the + metric. For more information, see the documentation of + :class:`BallTree` or :class:`KDTree`. + + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + tree_ : ``BinaryTree`` instance + The tree algorithm for fast generalized N-point problems. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + bandwidth_ : float + Value of the bandwidth, given directly by the bandwidth parameter or + estimated using the 'scott' or 'silverman' method. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.neighbors.KDTree : K-dimensional tree for fast generalized N-point + problems. + sklearn.neighbors.BallTree : Ball tree for fast generalized N-point + problems. + + Examples + -------- + Compute a gaussian kernel density estimate with a fixed bandwidth. 
+ + >>> from sklearn.neighbors import KernelDensity + >>> import numpy as np + >>> rng = np.random.RandomState(42) + >>> X = rng.random_sample((100, 3)) + >>> kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(X) + >>> log_density = kde.score_samples(X[:3]) + >>> log_density + array([-1.52955942, -1.51462041, -1.60244657]) + """ + + _parameter_constraints: dict = { + "bandwidth": [ + Interval(Real, 0, None, closed="neither"), + StrOptions({"scott", "silverman"}), + ], + "algorithm": [StrOptions(set(TREE_DICT.keys()) | {"auto"})], + "kernel": [StrOptions(set(VALID_KERNELS))], + "metric": [ + StrOptions( + set(itertools.chain(*[VALID_METRICS[alg] for alg in TREE_DICT.keys()])) + ) + ], + "atol": [Interval(Real, 0, None, closed="left")], + "rtol": [Interval(Real, 0, None, closed="left")], + "breadth_first": ["boolean"], + "leaf_size": [Interval(Integral, 1, None, closed="left")], + "metric_params": [None, dict], + } + + def __init__( + self, + *, + bandwidth=1.0, + algorithm="auto", + kernel="gaussian", + metric="euclidean", + atol=0, + rtol=0, + breadth_first=True, + leaf_size=40, + metric_params=None, + ): + self.algorithm = algorithm + self.bandwidth = bandwidth + self.kernel = kernel + self.metric = metric + self.atol = atol + self.rtol = rtol + self.breadth_first = breadth_first + self.leaf_size = leaf_size + self.metric_params = metric_params + + def _choose_algorithm(self, algorithm, metric): + # given the algorithm string + metric string, choose the optimal + # algorithm to compute the result. + if algorithm == "auto": + # use KD Tree if possible + if metric in KDTree.valid_metrics: + return "kd_tree" + elif metric in BallTree.valid_metrics: + return "ball_tree" + else: # kd_tree or ball_tree + if metric not in TREE_DICT[algorithm].valid_metrics: + raise ValueError( + "invalid metric for {0}: '{1}'".format(TREE_DICT[algorithm], metric) + ) + return algorithm + + @_fit_context( + # KernelDensity.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None, sample_weight=None): + """Fit the Kernel Density model on the data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + sample_weight : array-like of shape (n_samples,), default=None + List of sample weights attached to the data X. + + .. versionadded:: 0.20 + + Returns + ------- + self : object + Returns the instance itself. + """ + algorithm = self._choose_algorithm(self.algorithm, self.metric) + + if isinstance(self.bandwidth, str): + if self.bandwidth == "scott": + self.bandwidth_ = X.shape[0] ** (-1 / (X.shape[1] + 4)) + elif self.bandwidth == "silverman": + self.bandwidth_ = (X.shape[0] * (X.shape[1] + 2) / 4) ** ( + -1 / (X.shape[1] + 4) + ) + else: + self.bandwidth_ = self.bandwidth + + X = validate_data(self, X, order="C", dtype=np.float64) + + if sample_weight is not None: + sample_weight = _check_sample_weight( + sample_weight, X, dtype=np.float64, ensure_non_negative=True + ) + + kwargs = self.metric_params + if kwargs is None: + kwargs = {} + self.tree_ = TREE_DICT[algorithm]( + X, + metric=self.metric, + leaf_size=self.leaf_size, + sample_weight=sample_weight, + **kwargs, + ) + return self + + def score_samples(self, X): + """Compute the log-likelihood of each sample under the model. 
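+
+        In sketch form (the standard kernel density estimate, up to each
+        kernel's own normalization constant), the value returned for a query
+        point ``x``, given training points ``x_1, ..., x_N`` and bandwidth
+        ``h``, is ``log((1 / N) * sum_i K_h(x - x_i))``; with sample weights
+        the sum is weighted and ``N`` is replaced by the total weight.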
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query. Last dimension should match dimension + of training data (n_features). + + Returns + ------- + density : ndarray of shape (n_samples,) + Log-likelihood of each sample in `X`. These are normalized to be + probability densities, so values will be low for high-dimensional + data. + """ + check_is_fitted(self) + # The returned density is normalized to the number of points. + # For it to be a probability, we must scale it. For this reason + # we'll also scale atol. + X = validate_data(self, X, order="C", dtype=np.float64, reset=False) + if self.tree_.sample_weight is None: + N = self.tree_.data.shape[0] + else: + N = self.tree_.sum_weight + atol_N = self.atol * N + log_density = self.tree_.kernel_density( + X, + h=self.bandwidth_, + kernel=self.kernel, + atol=atol_N, + rtol=self.rtol, + breadth_first=self.breadth_first, + return_log=True, + ) + log_density -= np.log(N) + return log_density + + def score(self, X, y=None): + """Compute the total log-likelihood under the model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + Returns + ------- + logprob : float + Total log-likelihood of the data in X. This is normalized to be a + probability density, so the value will be low for high-dimensional + data. + """ + return np.sum(self.score_samples(X)) + + def sample(self, n_samples=1, random_state=None): + """Generate random samples from the model. + + Currently, this is implemented only for gaussian and tophat kernels. + + Parameters + ---------- + n_samples : int, default=1 + Number of samples to generate. + + random_state : int, RandomState instance or None, default=None + Determines random number generation used to generate + random samples. Pass an int for reproducible results + across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : array-like of shape (n_samples, n_features) + List of samples. + """ + check_is_fitted(self) + # TODO: implement sampling for other valid kernel shapes + if self.kernel not in ["gaussian", "tophat"]: + raise NotImplementedError() + + data = np.asarray(self.tree_.data) + + rng = check_random_state(random_state) + u = rng.uniform(0, 1, size=n_samples) + if self.tree_.sample_weight is None: + i = (u * data.shape[0]).astype(np.int64) + else: + cumsum_weight = np.cumsum(np.asarray(self.tree_.sample_weight)) + sum_weight = cumsum_weight[-1] + i = np.searchsorted(cumsum_weight, u * sum_weight) + if self.kernel == "gaussian": + return np.atleast_2d(rng.normal(data[i], self.bandwidth_)) + + elif self.kernel == "tophat": + # we first draw points from a d-dimensional normal distribution, + # then use an incomplete gamma function to map them to a uniform + # d-dimensional tophat distribution. 
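+            # A sketch of why this mapping works: X / ||X|| is uniform on the
+            # unit sphere because the standard normal is rotation-invariant;
+            # s_sq = ||X||^2 follows a chi-squared distribution with `dim`
+            # degrees of freedom, whose CDF at s_sq is gammainc(dim / 2,
+            # s_sq / 2). That CDF value is uniform on (0, 1), and raising it
+            # to the power 1 / dim gives a radius (as a fraction of the
+            # bandwidth) distributed like the radius of a uniform draw from
+            # a d-dimensional ball.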
+
+            dim = data.shape[1]
+            X = rng.normal(size=(n_samples, dim))
+            s_sq = row_norms(X, squared=True)
+            correction = (
+                gammainc(0.5 * dim, 0.5 * s_sq) ** (1.0 / dim)
+                * self.bandwidth_
+                / np.sqrt(s_sq)
+            )
+            return data[i] + X * correction[:, np.newaxis]
diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_lof.py b/.venv/Lib/site-packages/sklearn/neighbors/_lof.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6cf21587a1f37ebc996682806ce8ded64e97ea9
--- /dev/null
+++ b/.venv/Lib/site-packages/sklearn/neighbors/_lof.py
@@ -0,0 +1,518 @@
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+import warnings
+from numbers import Real
+
+import numpy as np
+
+from ..base import OutlierMixin, _fit_context
+from ..utils import check_array
+from ..utils._param_validation import Interval, StrOptions
+from ..utils.metaestimators import available_if
+from ..utils.validation import check_is_fitted
+from ._base import KNeighborsMixin, NeighborsBase
+
+__all__ = ["LocalOutlierFactor"]
+
+
+class LocalOutlierFactor(KNeighborsMixin, OutlierMixin, NeighborsBase):
+    """Unsupervised Outlier Detection using the Local Outlier Factor (LOF).
+
+    The anomaly score of each sample is called the Local Outlier Factor.
+    It measures the local deviation of the density of a given sample with
+    respect to its neighbors.
+    It is local in that the anomaly score depends on how isolated the object
+    is with respect to the surrounding neighborhood.
+    More precisely, locality is given by k-nearest neighbors, whose distance
+    is used to estimate the local density.
+    By comparing the local density of a sample to the local densities of its
+    neighbors, one can identify samples that have a substantially lower
+    density than their neighbors. These are considered outliers.
+
+    .. versionadded:: 0.19
+
+    Parameters
+    ----------
+    n_neighbors : int, default=20
+        Number of neighbors to use by default for :meth:`kneighbors` queries.
+        If n_neighbors is larger than the number of samples provided,
+        all samples will be used.
+
+    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
+        Algorithm used to compute the nearest neighbors:
+
+        - 'ball_tree' will use :class:`BallTree`
+        - 'kd_tree' will use :class:`KDTree`
+        - 'brute' will use a brute-force search.
+        - 'auto' will attempt to decide the most appropriate algorithm
+          based on the values passed to the :meth:`fit` method.
+
+        Note: fitting on sparse input will override the setting of
+        this parameter, using brute force.
+
+    leaf_size : int, default=30
+        Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can
+        affect the speed of the construction and query, as well as the memory
+        required to store the tree. The optimal value depends on the
+        nature of the problem.
+
+    metric : str or callable, default='minkowski'
+        Metric to use for distance computation. Default is "minkowski", which
+        results in the standard Euclidean distance when p = 2. See the
+        documentation of `scipy.spatial.distance
+        `_ and
+        the metrics listed in
+        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
+        values.
+
+        If metric is "precomputed", X is assumed to be a distance matrix and
+        must be square during fit. X may be a :term:`sparse graph`, in which
+        case only "nonzero" elements may be considered neighbors.
+
+        If metric is a callable function, it takes two arrays representing 1D
+        vectors as inputs and must return one value indicating the distance
+        between those vectors.
This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + p : float, default=2 + Parameter for the Minkowski metric from + :func:`sklearn.metrics.pairwise_distances`. When p = 1, this + is equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + contamination : 'auto' or float, default='auto' + The amount of contamination of the data set, i.e. the proportion + of outliers in the data set. When fitting this is used to define the + threshold on the scores of the samples. + + - if 'auto', the threshold is determined as in the + original paper, + - if a float, the contamination should be in the range (0, 0.5]. + + .. versionchanged:: 0.22 + The default value of ``contamination`` changed from 0.1 + to ``'auto'``. + + novelty : bool, default=False + By default, LocalOutlierFactor is only meant to be used for outlier + detection (novelty=False). Set novelty to True if you want to use + LocalOutlierFactor for novelty detection. In this case be aware that + you should only use predict, decision_function and score_samples + on new unseen data and not on the training set; and note that the + results obtained this way may differ from the standard LOF results. + + .. versionadded:: 0.20 + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + negative_outlier_factor_ : ndarray of shape (n_samples,) + The opposite LOF of the training samples. The higher, the more normal. + Inliers tend to have a LOF score close to 1 + (``negative_outlier_factor_`` close to -1), while outliers tend to have + a larger LOF score. + + The local outlier factor (LOF) of a sample captures its + supposed 'degree of abnormality'. + It is the average of the ratio of the local reachability density of + a sample and those of its k-nearest neighbors. + + n_neighbors_ : int + The actual number of neighbors used for :meth:`kneighbors` queries. + + offset_ : float + Offset used to obtain binary labels from the raw scores. + Observations having a negative_outlier_factor smaller than `offset_` + are detected as abnormal. + The offset is set to -1.5 (inliers score around -1), except when a + contamination parameter different than "auto" is provided. In that + case, the offset is defined in such a way we obtain the expected + number of outliers in training. + + .. versionadded:: 0.20 + + effective_metric_ : str + The effective metric used for the distance computation. + + effective_metric_params_ : dict + The effective additional keyword arguments for the metric function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + It is the number of samples in the fitted data. + + See Also + -------- + sklearn.svm.OneClassSVM: Unsupervised Outlier Detection using + Support Vector Machine. + + References + ---------- + .. [1] Breunig, M. M., Kriegel, H. P., Ng, R. T., & Sander, J. (2000, May). + LOF: identifying density-based local outliers. 
In ACM sigmod record. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.neighbors import LocalOutlierFactor + >>> X = [[-1.1], [0.2], [101.1], [0.3]] + >>> clf = LocalOutlierFactor(n_neighbors=2) + >>> clf.fit_predict(X) + array([ 1, 1, -1, 1]) + >>> clf.negative_outlier_factor_ + array([ -0.9821..., -1.0370..., -73.3697..., -0.9821...]) + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "contamination": [ + StrOptions({"auto"}), + Interval(Real, 0, 0.5, closed="right"), + ], + "novelty": ["boolean"], + } + _parameter_constraints.pop("radius") + + def __init__( + self, + n_neighbors=20, + *, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + contamination="auto", + novelty=False, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.contamination = contamination + self.novelty = novelty + + def _check_novelty_fit_predict(self): + if self.novelty: + msg = ( + "fit_predict is not available when novelty=True. Use " + "novelty=False if you want to predict on the training set." + ) + raise AttributeError(msg) + return True + + @available_if(_check_novelty_fit_predict) + def fit_predict(self, X, y=None): + """Fit the model to the training set X and return the labels. + + **Not available for novelty detection (when novelty is set to True).** + Label is 1 for an inlier and -1 for an outlier according to the LOF + score and the contamination parameter. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None + The query sample or samples to compute the Local Outlier Factor + w.r.t. the training samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + is_inlier : ndarray of shape (n_samples,) + Returns -1 for anomalies/outliers and 1 for inliers. + """ + + # As fit_predict would be different from fit.predict, fit_predict is + # only available for outlier detection (novelty=False) + + return self.fit(X)._predict() + + @_fit_context( + # LocalOutlierFactor.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit the local outlier factor detector from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : LocalOutlierFactor + The fitted local outlier factor detector. + """ + self._fit(X) + + n_samples = self.n_samples_fit_ + if self.n_neighbors > n_samples: + warnings.warn( + "n_neighbors (%s) is greater than the " + "total number of samples (%s). n_neighbors " + "will be set to (n_samples - 1) for estimation." 
+                % (self.n_neighbors, n_samples)
+            )
+        self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1))
+
+        self._distances_fit_X_, _neighbors_indices_fit_X_ = self.kneighbors(
+            n_neighbors=self.n_neighbors_
+        )
+
+        if self._fit_X.dtype == np.float32:
+            self._distances_fit_X_ = self._distances_fit_X_.astype(
+                self._fit_X.dtype,
+                copy=False,
+            )
+
+        self._lrd = self._local_reachability_density(
+            self._distances_fit_X_, _neighbors_indices_fit_X_
+        )
+
+        # Compute lof score over training samples to define offset_:
+        lrd_ratios_array = (
+            self._lrd[_neighbors_indices_fit_X_] / self._lrd[:, np.newaxis]
+        )
+
+        self.negative_outlier_factor_ = -np.mean(lrd_ratios_array, axis=1)
+
+        if self.contamination == "auto":
+            # inliers score around -1 (the higher, the less abnormal).
+            self.offset_ = -1.5
+        else:
+            self.offset_ = np.percentile(
+                self.negative_outlier_factor_, 100.0 * self.contamination
+            )
+
+        # Verify if negative_outlier_factor_ values are within acceptable range.
+        # Novelty must also be false to detect outliers
+        if np.min(self.negative_outlier_factor_) < -1e7 and not self.novelty:
+            warnings.warn(
+                "Duplicate values are leading to incorrect results. "
+                "Increase the number of neighbors for more accurate results."
+            )
+
+        return self
+
+    def _check_novelty_predict(self):
+        if not self.novelty:
+            msg = (
+                "predict is not available when novelty=False, use "
+                "fit_predict if you want to predict on training data. Use "
+                "novelty=True if you want to use LOF for novelty detection "
+                "and predict on new unseen data."
+            )
+            raise AttributeError(msg)
+        return True
+
+    @available_if(_check_novelty_predict)
+    def predict(self, X=None):
+        """Predict the labels (1 inlier, -1 outlier) of X according to LOF.
+
+        **Only available for novelty detection (when novelty is set to True).**
+        This method makes it possible to generalize prediction to *new
+        observations* (not in the training set). Note that the result of
+        ``clf.fit(X)`` then ``clf.predict(X)`` with ``novelty=True`` may
+        differ from the result obtained by ``clf.fit_predict(X)`` with
+        ``novelty=False``.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The query sample or samples to compute the Local Outlier Factor
+            w.r.t. the training samples.
+
+        Returns
+        -------
+        is_inlier : ndarray of shape (n_samples,)
+            Returns -1 for anomalies/outliers and +1 for inliers.
+        """
+        return self._predict(X)
+
+    def _predict(self, X=None):
+        """Predict the labels (1 inlier, -1 outlier) of X according to LOF.
+
+        If X is None, returns the same as fit_predict(X_train).
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None
+            The query sample or samples to compute the Local Outlier Factor
+            w.r.t. the training samples. If None, makes prediction on the
+            training data without considering them as their own neighbors.
+
+        Returns
+        -------
+        is_inlier : ndarray of shape (n_samples,)
+            Returns -1 for anomalies/outliers and +1 for inliers.
+        """
+        check_is_fitted(self)
+
+        if X is not None:
+            shifted_opposite_lof_scores = self.decision_function(X)
+            is_inlier = np.ones(shifted_opposite_lof_scores.shape[0], dtype=int)
+            is_inlier[shifted_opposite_lof_scores < 0] = -1
+        else:
+            is_inlier = np.ones(self.n_samples_fit_, dtype=int)
+            is_inlier[self.negative_outlier_factor_ < self.offset_] = -1
+
+        return is_inlier
+
+    def _check_novelty_decision_function(self):
+        if not self.novelty:
+            msg = (
+                "decision_function is not available when novelty=False. "
+                "Use novelty=True if you want to use LOF for novelty "
+                "detection and compute decision_function for new unseen "
+                "data. Note that the opposite LOF of the training samples "
+                "is always available by considering the "
+                "negative_outlier_factor_ attribute."
+            )
+            raise AttributeError(msg)
+        return True
+
+    @available_if(_check_novelty_decision_function)
+    def decision_function(self, X):
+        """Shifted opposite of the Local Outlier Factor of X.
+
+        Bigger is better, i.e. large values correspond to inliers.
+
+        **Only available for novelty detection (when novelty is set to True).**
+        The shift offset allows a zero threshold for being an outlier.
+        The argument X is supposed to contain *new data*: if X contains a
+        point from training, it considers the latter in its own neighborhood.
+        Also, the samples in X are not considered in the neighborhood of any
+        point.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The query sample or samples to compute the Local Outlier Factor
+            w.r.t. the training samples.
+
+        Returns
+        -------
+        shifted_opposite_lof_scores : ndarray of shape (n_samples,)
+            The shifted opposite of the Local Outlier Factor of each input
+            sample. The lower, the more abnormal. Negative scores represent
+            outliers, positive scores represent inliers.
+        """
+        return self.score_samples(X) - self.offset_
+
+    def _check_novelty_score_samples(self):
+        if not self.novelty:
+            msg = (
+                "score_samples is not available when novelty=False. The "
+                "scores of the training samples are always available "
+                "through the negative_outlier_factor_ attribute. Use "
+                "novelty=True if you want to use LOF for novelty detection "
+                "and compute score_samples for new unseen data."
+            )
+            raise AttributeError(msg)
+        return True
+
+    @available_if(_check_novelty_score_samples)
+    def score_samples(self, X):
+        """Opposite of the Local Outlier Factor of X.
+
+        The opposite is returned so that bigger is better, i.e. large values
+        correspond to inliers.
+
+        **Only available for novelty detection (when novelty is set to True).**
+        The argument X is supposed to contain *new data*: if X contains a
+        point from training, it considers the latter in its own neighborhood.
+        Also, the samples in X are not considered in the neighborhood of any
+        point. Because of this, the scores obtained via ``score_samples`` may
+        differ from the standard LOF scores.
+        The standard LOF scores for the training data are available via the
+        ``negative_outlier_factor_`` attribute.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The query sample or samples to compute the Local Outlier Factor
+            w.r.t. the training samples.
+
+        Returns
+        -------
+        opposite_lof_scores : ndarray of shape (n_samples,)
+            The opposite of the Local Outlier Factor of each input sample.
+            The lower, the more abnormal.
+ """ + check_is_fitted(self) + X = check_array(X, accept_sparse="csr") + + distances_X, neighbors_indices_X = self.kneighbors( + X, n_neighbors=self.n_neighbors_ + ) + + if X.dtype == np.float32: + distances_X = distances_X.astype(X.dtype, copy=False) + + X_lrd = self._local_reachability_density( + distances_X, + neighbors_indices_X, + ) + + lrd_ratios_array = self._lrd[neighbors_indices_X] / X_lrd[:, np.newaxis] + + # as bigger is better: + return -np.mean(lrd_ratios_array, axis=1) + + def _local_reachability_density(self, distances_X, neighbors_indices): + """The local reachability density (LRD) + + The LRD of a sample is the inverse of the average reachability + distance of its k-nearest neighbors. + + Parameters + ---------- + distances_X : ndarray of shape (n_queries, self.n_neighbors) + Distances to the neighbors (in the training samples `self._fit_X`) + of each query point to compute the LRD. + + neighbors_indices : ndarray of shape (n_queries, self.n_neighbors) + Neighbors indices (of each query point) among training samples + self._fit_X. + + Returns + ------- + local_reachability_density : ndarray of shape (n_queries,) + The local reachability density of each sample. + """ + dist_k = self._distances_fit_X_[neighbors_indices, self.n_neighbors_ - 1] + reach_dist_array = np.maximum(distances_X, dist_k) + + # 1e-10 to avoid `nan' when nb of duplicates > n_neighbors_: + return 1.0 / (np.mean(reach_dist_array, axis=1) + 1e-10) diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_nca.py b/.venv/Lib/site-packages/sklearn/neighbors/_nca.py new file mode 100644 index 0000000000000000000000000000000000000000..a8153f388b7718bc9f2ec6ae775bc67a50c717ae --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_nca.py @@ -0,0 +1,530 @@ +""" +Neighborhood Component Analysis +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import sys +import time +from numbers import Integral, Real +from warnings import warn + +import numpy as np +from scipy.optimize import minimize + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..decomposition import PCA +from ..exceptions import ConvergenceWarning +from ..metrics import pairwise_distances +from ..preprocessing import LabelEncoder +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import softmax +from ..utils.multiclass import check_classification_targets +from ..utils.random import check_random_state +from ..utils.validation import check_array, check_is_fitted, validate_data + + +class NeighborhoodComponentsAnalysis( + ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator +): + """Neighborhood Components Analysis. + + Neighborhood Component Analysis (NCA) is a machine learning algorithm for + metric learning. It learns a linear transformation in a supervised fashion + to improve the classification accuracy of a stochastic nearest neighbors + rule in the transformed space. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=None + Preferred dimensionality of the projected space. + If None it will be set to `n_features`. + + init : {'auto', 'pca', 'lda', 'identity', 'random'} or ndarray of shape \ + (n_features_a, n_features_b), default='auto' + Initialization of the linear transformation. Possible options are + `'auto'`, `'pca'`, `'lda'`, `'identity'`, `'random'`, and a numpy + array of shape `(n_features_a, n_features_b)`. 
+ + - `'auto'` + Depending on `n_components`, the most reasonable initialization + is chosen. If `n_components <= min(n_features, n_classes - 1)` + we use `'lda'`, as it uses labels information. If not, but + `n_components < min(n_features, n_samples)`, we use `'pca'`, as + it projects data in meaningful directions (those of higher + variance). Otherwise, we just use `'identity'`. + + - `'pca'` + `n_components` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See :class:`~sklearn.decomposition.PCA`) + + - `'lda'` + `min(n_components, n_classes)` most discriminative + components of the inputs passed to :meth:`fit` will be used to + initialize the transformation. (If `n_components > n_classes`, + the rest of the components will be zero.) (See + :class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) + + - `'identity'` + If `n_components` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first `n_components` rows. + + - `'random'` + The initial transformation will be a random array of shape + `(n_components, n_features)`. Each value is sampled from the + standard normal distribution. + + - numpy array + `n_features_b` must match the dimensionality of the inputs passed + to :meth:`fit` and n_features_a must be less than or equal to that. + If `n_components` is not `None`, `n_features_a` must match it. + + warm_start : bool, default=False + If `True` and :meth:`fit` has been called before, the solution of the + previous call to :meth:`fit` is used as the initial linear + transformation (`n_components` and `init` will be ignored). + + max_iter : int, default=50 + Maximum number of iterations in the optimization. + + tol : float, default=1e-5 + Convergence tolerance for the optimization. + + callback : callable, default=None + If not `None`, this function is called after every iteration of the + optimizer, taking as arguments the current solution (flattened + transformation matrix) and the number of iterations. This might be + useful in case one wants to examine or store the transformation + found after each iteration. + + verbose : int, default=0 + If 0, no progress messages will be printed. + If 1, progress messages will be printed to stdout. + If > 1, progress messages will be printed and the `disp` + parameter of :func:`scipy.optimize.minimize` will be set to + `verbose - 2`. + + random_state : int or numpy.RandomState, default=None + A pseudo random number generator object or a seed for it if int. If + `init='random'`, `random_state` is used to initialize the random + transformation. If `init='pca'`, `random_state` is passed as an + argument to PCA when initializing the transformation. Pass an int + for reproducible results across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + The linear transformation learned during fitting. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + n_iter_ : int + Counts the number of iterations performed by the optimizer. + + random_state_ : numpy.RandomState + Pseudo random number generator object used during initialization. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. 
versionadded:: 1.0 + + See Also + -------- + sklearn.discriminant_analysis.LinearDiscriminantAnalysis : Linear + Discriminant Analysis. + sklearn.decomposition.PCA : Principal component analysis (PCA). + + References + ---------- + .. [1] J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov. + "Neighbourhood Components Analysis". Advances in Neural Information + Processing Systems. 17, 513-520, 2005. + http://www.cs.nyu.edu/~roweis/papers/ncanips.pdf + + .. [2] Wikipedia entry on Neighborhood Components Analysis + https://en.wikipedia.org/wiki/Neighbourhood_components_analysis + + Examples + -------- + >>> from sklearn.neighbors import NeighborhoodComponentsAnalysis + >>> from sklearn.neighbors import KNeighborsClassifier + >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import train_test_split + >>> X, y = load_iris(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... stratify=y, test_size=0.7, random_state=42) + >>> nca = NeighborhoodComponentsAnalysis(random_state=42) + >>> nca.fit(X_train, y_train) + NeighborhoodComponentsAnalysis(...) + >>> knn = KNeighborsClassifier(n_neighbors=3) + >>> knn.fit(X_train, y_train) + KNeighborsClassifier(...) + >>> print(knn.score(X_test, y_test)) + 0.933333... + >>> knn.fit(nca.transform(X_train), y_train) + KNeighborsClassifier(...) + >>> print(knn.score(nca.transform(X_test), y_test)) + 0.961904... + """ + + _parameter_constraints: dict = { + "n_components": [ + Interval(Integral, 1, None, closed="left"), + None, + ], + "init": [ + StrOptions({"auto", "pca", "lda", "identity", "random"}), + np.ndarray, + ], + "warm_start": ["boolean"], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left")], + "callback": [callable, None], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + def __init__( + self, + n_components=None, + *, + init="auto", + warm_start=False, + max_iter=50, + tol=1e-5, + callback=None, + verbose=0, + random_state=None, + ): + self.n_components = n_components + self.init = init + self.warm_start = warm_start + self.max_iter = max_iter + self.tol = tol + self.callback = callback + self.verbose = verbose + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit the model according to the given training data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The training samples. + + y : array-like of shape (n_samples,) + The corresponding training labels. + + Returns + ------- + self : object + Fitted estimator. + """ + # Validate the inputs X and y, and converts y to numerical classes. + X, y = validate_data(self, X, y, ensure_min_samples=2) + check_classification_targets(y) + y = LabelEncoder().fit_transform(y) + + # Check the preferred dimensionality of the projected space + if self.n_components is not None and self.n_components > X.shape[1]: + raise ValueError( + "The preferred dimensionality of the " + f"projected space `n_components` ({self.n_components}) cannot " + "be greater than the given data " + f"dimensionality ({X.shape[1]})!" + ) + # If warm_start is enabled, check that the inputs are consistent + if ( + self.warm_start + and hasattr(self, "components_") + and self.components_.shape[1] != X.shape[1] + ): + raise ValueError( + f"The new inputs dimensionality ({X.shape[1]}) does not " + "match the input dimensionality of the " + f"previously learned transformation ({self.components_.shape[1]})." 
+ ) + # Check how the linear transformation should be initialized + init = self.init + if isinstance(init, np.ndarray): + init = check_array(init) + # Assert that init.shape[1] = X.shape[1] + if init.shape[1] != X.shape[1]: + raise ValueError( + f"The input dimensionality ({init.shape[1]}) of the given " + "linear transformation `init` must match the " + f"dimensionality of the given inputs `X` ({X.shape[1]})." + ) + # Assert that init.shape[0] <= init.shape[1] + if init.shape[0] > init.shape[1]: + raise ValueError( + f"The output dimensionality ({init.shape[0]}) of the given " + "linear transformation `init` cannot be " + f"greater than its input dimensionality ({init.shape[1]})." + ) + # Assert that self.n_components = init.shape[0] + if self.n_components is not None and self.n_components != init.shape[0]: + raise ValueError( + "The preferred dimensionality of the " + f"projected space `n_components` ({self.n_components}) does" + " not match the output dimensionality of " + "the given linear transformation " + f"`init` ({init.shape[0]})!" + ) + + # Initialize the random generator + self.random_state_ = check_random_state(self.random_state) + + # Measure the total training time + t_train = time.time() + + # Compute a mask that stays fixed during optimization: + same_class_mask = y[:, np.newaxis] == y[np.newaxis, :] + # (n_samples, n_samples) + + # Initialize the transformation + transformation = np.ravel(self._initialize(X, y, init)) + + # Create a dictionary of parameters to be passed to the optimizer + disp = self.verbose - 2 if self.verbose > 1 else -1 + optimizer_params = { + "method": "L-BFGS-B", + "fun": self._loss_grad_lbfgs, + "args": (X, same_class_mask, -1.0), + "jac": True, + "x0": transformation, + "tol": self.tol, + "options": dict(maxiter=self.max_iter, disp=disp), + "callback": self._callback, + } + + # Call the optimizer + self.n_iter_ = 0 + opt_result = minimize(**optimizer_params) + + # Reshape the solution found by the optimizer + self.components_ = opt_result.x.reshape(-1, X.shape[1]) + + # Stop timer + t_train = time.time() - t_train + if self.verbose: + cls_name = self.__class__.__name__ + + # Warn the user if the algorithm did not converge + if not opt_result.success: + warn( + "[{}] NCA did not converge: {}".format( + cls_name, opt_result.message + ), + ConvergenceWarning, + ) + + print("[{}] Training took {:8.2f}s.".format(cls_name, t_train)) + + return self + + def transform(self, X): + """Apply the learned transformation to the given data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data samples. + + Returns + ------- + X_embedded: ndarray of shape (n_samples, n_components) + The data samples transformed. + + Raises + ------ + NotFittedError + If :meth:`fit` has not been called before. + """ + + check_is_fitted(self) + X = validate_data(self, X, reset=False) + + return np.dot(X, self.components_.T) + + def _initialize(self, X, y, init): + """Initialize the transformation. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The training samples. + + y : array-like of shape (n_samples,) + The training labels. + + init : str or ndarray of shape (n_features_a, n_features_b) + The validated initialization of the linear transformation. + + Returns + ------- + transformation : ndarray of shape (n_components, n_features) + The initialized linear transformation. 
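+
+        For illustration (editor's note, arbitrary values): with
+        ``n_features=4``, ``n_classes=3`` and ``n_components=2``,
+        ``init='auto'`` resolves to ``'lda'`` because ``2 <= min(4, 3 - 1)``;
+        with ``n_components=4`` it falls through to ``'identity'``
+        (assuming ``n_samples >= 4``).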
+ + """ + + transformation = init + if self.warm_start and hasattr(self, "components_"): + transformation = self.components_ + elif isinstance(init, np.ndarray): + pass + else: + n_samples, n_features = X.shape + n_components = self.n_components or n_features + if init == "auto": + n_classes = len(np.unique(y)) + if n_components <= min(n_features, n_classes - 1): + init = "lda" + elif n_components < min(n_features, n_samples): + init = "pca" + else: + init = "identity" + if init == "identity": + transformation = np.eye(n_components, X.shape[1]) + elif init == "random": + transformation = self.random_state_.standard_normal( + size=(n_components, X.shape[1]) + ) + elif init in {"pca", "lda"}: + init_time = time.time() + if init == "pca": + pca = PCA( + n_components=n_components, random_state=self.random_state_ + ) + if self.verbose: + print("Finding principal components... ", end="") + sys.stdout.flush() + pca.fit(X) + transformation = pca.components_ + elif init == "lda": + from ..discriminant_analysis import LinearDiscriminantAnalysis + + lda = LinearDiscriminantAnalysis(n_components=n_components) + if self.verbose: + print("Finding most discriminative components... ", end="") + sys.stdout.flush() + lda.fit(X, y) + transformation = lda.scalings_.T[:n_components] + if self.verbose: + print("done in {:5.2f}s".format(time.time() - init_time)) + return transformation + + def _callback(self, transformation): + """Called after each iteration of the optimizer. + + Parameters + ---------- + transformation : ndarray of shape (n_components * n_features,) + The solution computed by the optimizer in this iteration. + """ + if self.callback is not None: + self.callback(transformation, self.n_iter_) + + self.n_iter_ += 1 + + def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): + """Compute the loss and the loss gradient w.r.t. `transformation`. + + Parameters + ---------- + transformation : ndarray of shape (n_components * n_features,) + The raveled linear transformation on which to compute loss and + evaluate gradient. + + X : ndarray of shape (n_samples, n_features) + The training samples. + + same_class_mask : ndarray of shape (n_samples, n_samples) + A mask where `mask[i, j] == 1` if `X[i]` and `X[j]` belong + to the same class, and `0` otherwise. + + Returns + ------- + loss : float + The loss computed for the given transformation. + + gradient : ndarray of shape (n_components * n_features,) + The new (flattened) gradient of the loss. + """ + + if self.n_iter_ == 0: + self.n_iter_ += 1 + if self.verbose: + header_fields = ["Iteration", "Objective Value", "Time(s)"] + header_fmt = "{:>10} {:>20} {:>10}" + header = header_fmt.format(*header_fields) + cls_name = self.__class__.__name__ + print("[{}]".format(cls_name)) + print( + "[{}] {}\n[{}] {}".format( + cls_name, header, cls_name, "-" * len(header) + ) + ) + + t_funcall = time.time() + + transformation = transformation.reshape(-1, X.shape[1]) + X_embedded = np.dot(X, transformation.T) # (n_samples, n_components) + + # Compute softmax distances + p_ij = pairwise_distances(X_embedded, squared=True) + np.fill_diagonal(p_ij, np.inf) + p_ij = softmax(-p_ij) # (n_samples, n_samples) + + # Compute loss + masked_p_ij = p_ij * same_class_mask + p = np.sum(masked_p_ij, axis=1, keepdims=True) # (n_samples, 1) + loss = np.sum(p) + + # Compute gradient of loss w.r.t. 
`transform`
+        weighted_p_ij = masked_p_ij - p_ij * p
+        weighted_p_ij_sym = weighted_p_ij + weighted_p_ij.T
+        np.fill_diagonal(weighted_p_ij_sym, -weighted_p_ij.sum(axis=0))
+        gradient = 2 * X_embedded.T.dot(weighted_p_ij_sym).dot(X)
+        # time complexity of the gradient: O(n_components x n_samples x (
+        # n_samples + n_features))
+
+        if self.verbose:
+            t_funcall = time.time() - t_funcall
+            values_fmt = "[{}] {:>10} {:>20.6e} {:>10.2f}"
+            print(
+                values_fmt.format(
+                    self.__class__.__name__, self.n_iter_, loss, t_funcall
+                )
+            )
+            sys.stdout.flush()
+
+        return sign * loss, sign * gradient.ravel()
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.target_tags.required = True
+        return tags
+
+    @property
+    def _n_features_out(self):
+        """Number of transformed output features."""
+        return self.components_.shape[0]
diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_nearest_centroid.py b/.venv/Lib/site-packages/sklearn/neighbors/_nearest_centroid.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9556cea19ad326f4c3a77b9ee19e488e9fcac93
--- /dev/null
+++ b/.venv/Lib/site-packages/sklearn/neighbors/_nearest_centroid.py
@@ -0,0 +1,358 @@
+"""
+Nearest Centroid Classification
+"""
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+import warnings
+from numbers import Real
+
+import numpy as np
+from scipy import sparse as sp
+
+from ..base import BaseEstimator, ClassifierMixin, _fit_context
+from ..discriminant_analysis import DiscriminantAnalysisPredictionMixin
+from ..metrics.pairwise import (
+    pairwise_distances,
+    pairwise_distances_argmin,
+)
+from ..preprocessing import LabelEncoder
+from ..utils import get_tags
+from ..utils._available_if import available_if
+from ..utils._param_validation import Interval, StrOptions
+from ..utils.multiclass import check_classification_targets
+from ..utils.sparsefuncs import csc_median_axis_0
+from ..utils.validation import check_is_fitted, validate_data
+
+
+class NearestCentroid(
+    DiscriminantAnalysisPredictionMixin, ClassifierMixin, BaseEstimator
+):
+    """Nearest centroid classifier.
+
+    Each class is represented by its centroid, with test samples classified to
+    the class with the nearest centroid.
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    metric : {"euclidean", "manhattan"}, default="euclidean"
+        Metric to use for distance computation.
+
+        If `metric="euclidean"`, the centroid for the samples corresponding to
+        each class is the arithmetic mean, which minimizes the sum of squared
+        L2 distances. If `metric="manhattan"`, the centroid is the
+        feature-wise median, which minimizes the sum of L1 distances.
+
+        .. versionchanged:: 1.5
+            All metrics but `"euclidean"` and `"manhattan"` were deprecated and
+            now raise an error.
+
+        .. versionchanged:: 0.19
+            `metric='precomputed'` was deprecated and now raises an error.
+
+    shrink_threshold : float, default=None
+        Threshold for shrinking centroids to remove features.
+
+    priors : {"uniform", "empirical"} or array-like of shape (n_classes,), \
+            default="uniform"
+        The class prior probabilities. With the default `"uniform"`, every
+        class is assumed equally probable; with `"empirical"`, the class
+        proportions are inferred from the training data.
+
+        .. versionadded:: 1.6
+
+    Attributes
+    ----------
+    centroids_ : array-like of shape (n_classes, n_features)
+        Centroid of each class.
+
+    classes_ : array of shape (n_classes,)
+        The unique class labels.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    deviations_ : ndarray of shape (n_classes, n_features)
+        Deviations (or shrinkages) of the centroids of each class from the
+        overall centroid. Equal to eq. (18.4) if `shrink_threshold=None`,
+        else (18.5) p. 653 of [2]. Can be used to identify features used
+        for classification.
+
+        .. versionadded:: 1.6
+
+    within_class_std_dev_ : ndarray of shape (n_features,)
+        Pooled or within-class standard deviation of input data.
+
+        .. versionadded:: 1.6
+
+    class_prior_ : ndarray of shape (n_classes,)
+        The class prior probabilities.
+
+        .. versionadded:: 1.6
+
+    See Also
+    --------
+    KNeighborsClassifier : Nearest neighbors classifier.
+
+    Notes
+    -----
+    When used for text classification with tf-idf vectors, this classifier is
+    also known as the Rocchio classifier.
+
+    References
+    ----------
+    [1] Tibshirani, R., Hastie, T., Narasimhan, B., & Chu, G. (2002). Diagnosis of
+    multiple cancer types by shrunken centroids of gene expression. Proceedings
+    of the National Academy of Sciences of the United States of America,
+    99(10), 6567-6572. The National Academy of Sciences.
+
+    [2] Hastie, T., Tibshirani, R., Friedman, J. (2009). The Elements of Statistical
+    Learning: Data Mining, Inference, and Prediction. 2nd Edition. New York, Springer.
+
+    Examples
+    --------
+    >>> from sklearn.neighbors import NearestCentroid
+    >>> import numpy as np
+    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
+    >>> y = np.array([1, 1, 1, 2, 2, 2])
+    >>> clf = NearestCentroid()
+    >>> clf.fit(X, y)
+    NearestCentroid()
+    >>> print(clf.predict([[-0.8, -1]]))
+    [1]
+    """
+
+    _parameter_constraints: dict = {
+        "metric": [StrOptions({"manhattan", "euclidean"})],
+        "shrink_threshold": [Interval(Real, 0, None, closed="neither"), None],
+        "priors": ["array-like", StrOptions({"empirical", "uniform"})],
+    }
+
+    def __init__(
+        self,
+        metric="euclidean",
+        *,
+        shrink_threshold=None,
+        priors="uniform",
+    ):
+        self.metric = metric
+        self.shrink_threshold = shrink_threshold
+        self.priors = priors
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y):
+        """
+        Fit the NearestCentroid model according to the given training data.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training vector, where `n_samples` is the number of samples and
+            `n_features` is the number of features.
+            Note that centroid shrinking cannot be used with sparse matrices.
+        y : array-like of shape (n_samples,)
+            Target values.
+
+        Returns
+        -------
+        self : object
+            Fitted estimator.
+        """
+        # If X is sparse and the metric is "manhattan", store it in CSC
+        # format, which makes it easier to calculate the median.
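+        # Editor's illustration for the centroid computation further down
+        # (arbitrary toy values, not from the original source): for a class
+        # with samples [[0], [1], [10]], the "euclidean" centroid is the
+        # mean 11/3 ~= 3.67, while the "manhattan" centroid is the median
+        # 1.0, which is far less sensitive to the outlying 10.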
+        if self.metric == "manhattan":
+            X, y = validate_data(self, X, y, accept_sparse=["csc"])
+        else:
+            ensure_all_finite = (
+                "allow-nan" if get_tags(self).input_tags.allow_nan else True
+            )
+            X, y = validate_data(
+                self,
+                X,
+                y,
+                ensure_all_finite=ensure_all_finite,
+                accept_sparse=["csr", "csc"],
+            )
+        is_X_sparse = sp.issparse(X)
+        check_classification_targets(y)
+
+        n_samples, n_features = X.shape
+        le = LabelEncoder()
+        y_ind = le.fit_transform(y)
+        self.classes_ = classes = le.classes_
+        n_classes = classes.size
+        if n_classes < 2:
+            raise ValueError(
+                "The number of classes has to be greater than one; got %d class"
+                % (n_classes)
+            )
+
+        if self.priors == "empirical":  # estimate priors from sample
+            _, class_counts = np.unique(y, return_inverse=True)  # class index per sample
+            self.class_prior_ = np.bincount(class_counts) / float(len(y))
+        elif self.priors == "uniform":
+            self.class_prior_ = np.asarray([1 / n_classes] * n_classes)
+        else:
+            self.class_prior_ = np.asarray(self.priors)
+
+        if (self.class_prior_ < 0).any():
+            raise ValueError("priors must be non-negative")
+        if not np.isclose(self.class_prior_.sum(), 1.0):
+            warnings.warn(
+                "The priors do not sum to 1. Normalizing such that it sums to one.",
+                UserWarning,
+            )
+            self.class_prior_ = self.class_prior_ / self.class_prior_.sum()
+
+        # Centroid of each class, filled in below.
+        self.centroids_ = np.empty((n_classes, n_features), dtype=np.float64)
+
+        # Number of samples in each class.
+        nk = np.zeros(n_classes)
+
+        for cur_class in range(n_classes):
+            center_mask = y_ind == cur_class
+            nk[cur_class] = np.sum(center_mask)
+            if is_X_sparse:
+                center_mask = np.where(center_mask)[0]
+
+            if self.metric == "manhattan":
+                # NumPy does not calculate median of sparse matrices.
+                if not is_X_sparse:
+                    self.centroids_[cur_class] = np.median(X[center_mask], axis=0)
+                else:
+                    self.centroids_[cur_class] = csc_median_axis_0(X[center_mask])
+            else:  # metric == "euclidean"
+                self.centroids_[cur_class] = X[center_mask].mean(axis=0)
+
+        # Compute within-class std_dev with unshrunken centroids
+        variance = np.array(X - self.centroids_[y_ind], copy=False) ** 2
+        self.within_class_std_dev_ = np.array(
+            np.sqrt(variance.sum(axis=0) / (n_samples - n_classes)), copy=False
+        )
+        if any(self.within_class_std_dev_ == 0):
+            warnings.warn(
+                "self.within_class_std_dev_ has at least 1 zero standard deviation. "
+                "Inputs within the same classes for at least 1 feature are identical."
+            )
+
+        err_msg = "All features have zero variance. Division by zero."
+        if is_X_sparse and np.all((X.max(axis=0) - X.min(axis=0)).toarray() == 0):
+            raise ValueError(err_msg)
+        elif not is_X_sparse and np.all(np.ptp(X, axis=0) == 0):
+            raise ValueError(err_msg)
+
+        dataset_centroid_ = X.mean(axis=0)
+        # m parameter for determining deviation
+        m = np.sqrt((1.0 / nk) - (1.0 / n_samples))
+        # Calculate deviation using the standard deviation of centroids,
+        # regularized with the median to deter outliers from affecting the
+        # results.
+        s = self.within_class_std_dev_ + np.median(self.within_class_std_dev_)
+        mm = m.reshape(len(m), 1)  # Reshape to allow broadcasting.
+        ms = mm * s
+        self.deviations_ = np.array(
+            (self.centroids_ - dataset_centroid_) / ms, copy=False
+        )
+        # Soft thresholding: if the deviation crosses 0 during shrinking,
+        # it becomes zero.
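+        # Editor's worked example (arbitrary numbers, not from the original
+        # source): with shrink_threshold=0.5, a deviation of -1.4 becomes
+        # -(1.4 - 0.5) = -0.9, while a deviation of 0.3 shrinks through
+        # zero (0.3 - 0.5 < 0 is clipped to 0) and drops out entirely.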
+ if self.shrink_threshold: + signs = np.sign(self.deviations_) + self.deviations_ = np.abs(self.deviations_) - self.shrink_threshold + np.clip(self.deviations_, 0, None, out=self.deviations_) + self.deviations_ *= signs + # Now adjust the centroids using the deviation + msd = ms * self.deviations_ + self.centroids_ = np.array(dataset_centroid_ + msd, copy=False) + return self + + def predict(self, X): + """Perform classification on an array of test vectors `X`. + + The predicted class `C` for each sample in `X` is returned. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + The predicted classes. + """ + check_is_fitted(self) + if np.isclose(self.class_prior_, 1 / len(self.classes_)).all(): + # `validate_data` is called here since we are not calling `super()` + ensure_all_finite = ( + "allow-nan" if get_tags(self).input_tags.allow_nan else True + ) + X = validate_data( + self, + X, + ensure_all_finite=ensure_all_finite, + accept_sparse="csr", + reset=False, + ) + return self.classes_[ + pairwise_distances_argmin(X, self.centroids_, metric=self.metric) + ] + else: + return super().predict(X) + + def _decision_function(self, X): + # return discriminant scores, see eq. (18.2) p. 652 of the ESL. + check_is_fitted(self, "centroids_") + + X_normalized = validate_data( + self, X, copy=True, reset=False, accept_sparse="csr", dtype=np.float64 + ) + + discriminant_score = np.empty( + (X_normalized.shape[0], self.classes_.size), dtype=np.float64 + ) + + mask = self.within_class_std_dev_ != 0 + X_normalized[:, mask] /= self.within_class_std_dev_[mask] + centroids_normalized = self.centroids_.copy() + centroids_normalized[:, mask] /= self.within_class_std_dev_[mask] + + for class_idx in range(self.classes_.size): + distances = pairwise_distances( + X_normalized, centroids_normalized[[class_idx]], metric=self.metric + ).ravel() + distances **= 2 + discriminant_score[:, class_idx] = np.squeeze( + -distances + 2.0 * np.log(self.class_prior_[class_idx]) + ) + + return discriminant_score + + def _check_euclidean_metric(self): + return self.metric == "euclidean" + + decision_function = available_if(_check_euclidean_metric)( + DiscriminantAnalysisPredictionMixin.decision_function + ) + + predict_proba = available_if(_check_euclidean_metric)( + DiscriminantAnalysisPredictionMixin.predict_proba + ) + + predict_log_proba = available_if(_check_euclidean_metric)( + DiscriminantAnalysisPredictionMixin.predict_log_proba + ) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = self.metric == "nan_euclidean" + return tags diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.cp39-win_amd64.lib b/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.cp39-win_amd64.lib new file mode 100644 index 0000000000000000000000000000000000000000..57b9d941d022044094ea821d04eb6f15f4c22bc8 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.cp39-win_amd64.lib differ diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.cp39-win_amd64.pyd b/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.cp39-win_amd64.pyd new file mode 100644 index 0000000000000000000000000000000000000000..81893531c59578de69d0ac23f901b229efe2c7ec Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.cp39-win_amd64.pyd differ diff --git 
a/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.pxd b/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.pxd new file mode 100644 index 0000000000000000000000000000000000000000..8a749db257a114322447b705ab557af2765a80cd --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.pxd @@ -0,0 +1,10 @@ +from cython cimport floating +from ..utils._typedefs cimport float64_t, intp_t + +cdef int partition_node_indices( + const floating *data, + intp_t *node_indices, + intp_t split_dim, + intp_t split_index, + intp_t n_features, + intp_t n_points) except -1 diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.pyx b/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.pyx new file mode 100644 index 0000000000000000000000000000000000000000..8c35d57dfc49550ac10dd98324bd177982f6568b --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_partition_nodes.pyx @@ -0,0 +1,122 @@ +# BinaryTrees rely on partial sorts to partition their nodes during their +# initialisation. +# +# The C++ std library exposes nth_element, an efficient partial sort for this +# situation which has a linear time complexity as well as the best performances. +# +# To use std::algorithm::nth_element, a few fixture are defined using Cython: +# - partition_node_indices, a Cython function used in BinaryTrees, that calls +# - partition_node_indices_inner, a C++ function that wraps nth_element and uses +# - an IndexComparator to state how to compare KDTrees' indices +# +# IndexComparator has been defined so that partial sorts are stable with +# respect to the nodes initial indices. +# +# See for reference: +# - https://en.cppreference.com/w/cpp/algorithm/nth_element. +# - https://github.com/scikit-learn/scikit-learn/pull/11103 +# - https://github.com/scikit-learn/scikit-learn/pull/19473 +from cython cimport floating + + +cdef extern from *: + """ + #include + + template + class IndexComparator { + private: + const D *data; + I split_dim, n_features; + public: + IndexComparator(const D *data, const I &split_dim, const I &n_features): + data(data), split_dim(split_dim), n_features(n_features) {} + + bool operator()(const I &a, const I &b) const { + D a_value = data[a * n_features + split_dim]; + D b_value = data[b * n_features + split_dim]; + return a_value == b_value ? a < b : a_value < b_value; + } + }; + + template + void partition_node_indices_inner( + const D *data, + I *node_indices, + const I &split_dim, + const I &split_index, + const I &n_features, + const I &n_points) { + IndexComparator index_comparator(data, split_dim, n_features); + std::nth_element( + node_indices, + node_indices + split_index, + node_indices + n_points, + index_comparator); + } + """ + void partition_node_indices_inner[D, I]( + const D *data, + I *node_indices, + I split_dim, + I split_index, + I n_features, + I n_points) except + + + +cdef int partition_node_indices( + const floating *data, + intp_t *node_indices, + intp_t split_dim, + intp_t split_index, + intp_t n_features, + intp_t n_points) except -1: + """Partition points in the node into two equal-sized groups. + + Upon return, the values in node_indices will be rearranged such that + (assuming numpy-style indexing): + + data[node_indices[0:split_index], split_dim] + <= data[node_indices[split_index], split_dim] + + and + + data[node_indices[split_index], split_dim] + <= data[node_indices[split_index:n_points], split_dim] + + The algorithm is essentially a partial in-place quicksort around a + set pivot. 
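+
+    A small illustrative example (editor's addition, arbitrary values):
+    with values [5, 1, 4, 2, 3] along split_dim and split_index=2, one
+    valid rearrangement orders the values as [2, 1, 3, 5, 4]: everything
+    left of position 2 is <= 3 and everything right of it is >= 3.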
+ + Parameters + ---------- + data : double pointer + Pointer to a 2D array of the training data, of shape [N, n_features]. + N must be greater than any of the values in node_indices. + node_indices : int pointer + Pointer to a 1D array of length n_points. This lists the indices of + each of the points within the current node. This will be modified + in-place. + split_dim : int + the dimension on which to split. This will usually be computed via + the routine ``find_node_split_dim``. + split_index : int + the index within node_indices around which to split the points. + n_features: int + the number of features (i.e columns) in the 2D array pointed by data. + n_points : int + the length of node_indices. This is also the number of points in + the original dataset. + Returns + ------- + status : int + integer exit status. On return, the contents of node_indices are + modified as noted above. + """ + partition_node_indices_inner( + data, + node_indices, + split_dim, + split_index, + n_features, + n_points) + return 0 diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.cp39-win_amd64.lib b/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.cp39-win_amd64.lib new file mode 100644 index 0000000000000000000000000000000000000000..de5779031bbaebb1d07da8d740618503b476def6 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.cp39-win_amd64.lib differ diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.cp39-win_amd64.pyd b/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.cp39-win_amd64.pyd new file mode 100644 index 0000000000000000000000000000000000000000..8289cc1cb71fbe3ed3a7d816797740509e8a66de Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.cp39-win_amd64.pyd differ diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.pxd b/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.pxd new file mode 100644 index 0000000000000000000000000000000000000000..a8ca8d7d4f684241d6c6617d1a692419f2eb8731 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.pxd @@ -0,0 +1,92 @@ +# Author: Thomas Moreau +# Author: Olivier Grisel + +# See quad_tree.pyx for details. + +cimport numpy as cnp +from ..utils._typedefs cimport float32_t, intp_t + +# This is effectively an ifdef statement in Cython +# It allows us to write printf debugging lines +# and remove them at compile time +cdef enum: + DEBUGFLAG = 0 + +cdef float EPSILON = 1e-6 + +# XXX: Careful to not change the order of the arguments. It is important to +# have is_leaf and max_width consecutive as it permits to avoid padding by +# the compiler and keep the size coherent for both C and numpy data structures. +cdef struct Cell: + # Base storage structure for cells in a QuadTree object + + # Tree structure + intp_t parent # Parent cell of this cell + intp_t[8] children # Array pointing to children of this cell + + # Cell description + intp_t cell_id # Id of the cell in the cells array in the Tree + intp_t point_index # Index of the point at this cell (only defined + # # in non empty leaf) + bint is_leaf # Does this cell have children? + float32_t squared_max_width # Squared value of the maximum width w + intp_t depth # Depth of the cell in the tree + intp_t cumulative_size # Number of points included in the subtree with + # # this cell as a root. 
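+
+    # Editor's note (inferred from the insertion logic in _quad_tree.pyx;
+    # not part of the original header): a leaf holding k duplicates of a
+    # single point has cumulative_size == k, while point_index keeps the
+    # index of the first inserted duplicate.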
+ + # Internal constants + float32_t[3] center # Store the center for quick split of cells + float32_t[3] barycenter # Keep track of the center of mass of the cell + + # Cell boundaries + float32_t[3] min_bounds # Inferior boundaries of this cell (inclusive) + float32_t[3] max_bounds # Superior boundaries of this cell (exclusive) + + +cdef class _QuadTree: + # The QuadTree object is a quad tree structure constructed by inserting + # recursively points in the tree and splitting cells in 4 so that each + # leaf cell contains at most one point. + # This structure also handle 3D data, inserted in trees with 8 children + # for each node. + + # Parameters of the tree + cdef public int n_dimensions # Number of dimensions in X + cdef public int verbose # Verbosity of the output + cdef intp_t n_cells_per_cell # Number of children per node. (2 ** n_dimension) + + # Tree inner structure + cdef public intp_t max_depth # Max depth of the tree + cdef public intp_t cell_count # Counter for node IDs + cdef public intp_t capacity # Capacity of tree, in terms of nodes + cdef public intp_t n_points # Total number of points + cdef Cell* cells # Array of nodes + + # Point insertion methods + cdef int insert_point(self, float32_t[3] point, intp_t point_index, + intp_t cell_id=*) except -1 nogil + cdef intp_t _insert_point_in_new_child(self, float32_t[3] point, Cell* cell, + intp_t point_index, intp_t size=* + ) noexcept nogil + cdef intp_t _select_child(self, float32_t[3] point, Cell* cell) noexcept nogil + cdef bint _is_duplicate(self, float32_t[3] point1, float32_t[3] point2) noexcept nogil + + # Create a summary of the Tree compare to a query point + cdef long summarize(self, float32_t[3] point, float32_t* results, + float squared_theta=*, intp_t cell_id=*, long idx=* + ) noexcept nogil + + # Internal cell initialization methods + cdef void _init_cell(self, Cell* cell, intp_t parent, intp_t depth) noexcept nogil + cdef void _init_root(self, float32_t[3] min_bounds, float32_t[3] max_bounds + ) noexcept nogil + + # Private methods + cdef int _check_point_in_cell(self, float32_t[3] point, Cell* cell + ) except -1 nogil + + # Private array manipulation to manage the ``cells`` array + cdef int _resize(self, intp_t capacity) except -1 nogil + cdef int _resize_c(self, intp_t capacity=*) except -1 nogil + cdef int _get_cell(self, float32_t[3] point, intp_t cell_id=*) except -1 nogil + cdef Cell[:] _get_cell_ndarray(self) diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.pyx b/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.pyx new file mode 100644 index 0000000000000000000000000000000000000000..ddc79b695685287c455561a7ba6d6c4e861fc1e5 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_quad_tree.pyx @@ -0,0 +1,609 @@ +# Author: Thomas Moreau +# Author: Olivier Grisel + + +from cpython cimport Py_INCREF, PyObject, PyTypeObject + +from libc.math cimport fabsf +from libc.stdlib cimport free +from libc.string cimport memcpy +from libc.stdio cimport printf +from libc.stdint cimport SIZE_MAX + +from ..tree._utils cimport safe_realloc + +import numpy as np +cimport numpy as cnp +cnp.import_array() + +cdef extern from "numpy/arrayobject.h": + object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr, + int nd, cnp.npy_intp* dims, + cnp.npy_intp* strides, + void* data, int flags, object obj) + int PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj) + +# Build the corresponding numpy dtype for Cell. 
+# This works by casting `dummy` to an array of Cell of length 1, which numpy +# can construct a `dtype`-object for. See https://stackoverflow.com/q/62448946 +# for a more detailed explanation. +cdef Cell dummy +CELL_DTYPE = np.asarray((&dummy)).dtype + +assert CELL_DTYPE.itemsize == sizeof(Cell) + + +cdef class _QuadTree: + """Array-based representation of a QuadTree. + + This class is currently working for indexing 2D data (regular QuadTree) and + for indexing 3D data (OcTree). It is planned to split the 2 implementations + using `Cython.Tempita` to save some memory for QuadTree. + + Note that this code is currently internally used only by the Barnes-Hut + method in `sklearn.manifold.TSNE`. It is planned to be refactored and + generalized in the future to be compatible with nearest neighbors API of + `sklearn.neighbors` with 2D and 3D data. + """ + def __cinit__(self, int n_dimensions, int verbose): + """Constructor.""" + # Parameters of the tree + self.n_dimensions = n_dimensions + self.verbose = verbose + self.n_cells_per_cell = (2 ** self.n_dimensions) + + # Inner structures + self.max_depth = 0 + self.cell_count = 0 + self.capacity = 0 + self.n_points = 0 + self.cells = NULL + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.cells) + + @property + def cumulative_size(self): + cdef Cell[:] cell_mem_view = self._get_cell_ndarray() + return cell_mem_view.base['cumulative_size'][:self.cell_count] + + @property + def leafs(self): + cdef Cell[:] cell_mem_view = self._get_cell_ndarray() + return cell_mem_view.base['is_leaf'][:self.cell_count] + + def build_tree(self, X): + """Build a tree from an array of points X.""" + cdef: + int i + float32_t[3] pt + float32_t[3] min_bounds, max_bounds + + # validate X and prepare for query + # X = check_array(X, dtype=float32_t, order='C') + n_samples = X.shape[0] + + capacity = 100 + self._resize(capacity) + m = np.min(X, axis=0) + M = np.max(X, axis=0) + # Scale the maximum to get all points strictly in the tree bounding box + # The 3 bounds are for positive, negative and small values + M = np.maximum(M * (1. 
+ 1e-3 * np.sign(M)), M + 1e-3) + for i in range(self.n_dimensions): + min_bounds[i] = m[i] + max_bounds[i] = M[i] + + if self.verbose > 10: + printf("[QuadTree] bounding box axis %i : [%f, %f]\n", + i, min_bounds[i], max_bounds[i]) + + # Create the initial node with boundaries from the dataset + self._init_root(min_bounds, max_bounds) + + for i in range(n_samples): + for j in range(self.n_dimensions): + pt[j] = X[i, j] + self.insert_point(pt, i) + + # Shrink the cells array to reduce memory usage + self._resize(capacity=self.cell_count) + + cdef int insert_point(self, float32_t[3] point, intp_t point_index, + intp_t cell_id=0) except -1 nogil: + """Insert a point in the QuadTree.""" + cdef int ax + cdef intp_t selected_child + cdef Cell* cell = &self.cells[cell_id] + cdef intp_t n_point = cell.cumulative_size + + if self.verbose > 10: + printf("[QuadTree] Inserting depth %li\n", cell.depth) + + # Assert that the point is in the right range + if DEBUGFLAG: + self._check_point_in_cell(point, cell) + + # If the cell is an empty leaf, insert the point in it + if cell.cumulative_size == 0: + cell.cumulative_size = 1 + self.n_points += 1 + for i in range(self.n_dimensions): + cell.barycenter[i] = point[i] + cell.point_index = point_index + if self.verbose > 10: + printf("[QuadTree] inserted point %li in cell %li\n", + point_index, cell_id) + return cell_id + + # If the cell is not a leaf, update cell internals and + # recurse in selected child + if not cell.is_leaf: + for ax in range(self.n_dimensions): + # barycenter update using a weighted mean + cell.barycenter[ax] = ( + n_point * cell.barycenter[ax] + point[ax]) / (n_point + 1) + + # Increase the size of the subtree starting from this cell + cell.cumulative_size += 1 + + # Insert child in the correct subtree + selected_child = self._select_child(point, cell) + if self.verbose > 49: + printf("[QuadTree] selected child %li\n", selected_child) + if selected_child == -1: + self.n_points += 1 + return self._insert_point_in_new_child(point, cell, point_index) + return self.insert_point(point, point_index, selected_child) + + # Finally, if the cell is a leaf with a point already inserted, + # split the cell in n_cells_per_cell if the point is not a duplicate. + # If it is a duplicate, increase the size of the leaf and return. + if self._is_duplicate(point, cell.barycenter): + if self.verbose > 10: + printf("[QuadTree] found a duplicate!\n") + cell.cumulative_size += 1 + self.n_points += 1 + return cell_id + + # In a leaf, the barycenter correspond to the only point included + # in it. 
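+        # Editor's note (describing the two calls below; not in the original
+        # source): the existing point is pushed down into a newly created
+        # child, turning this leaf into an internal cell, and the new point
+        # is then re-inserted from the same cell.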
+        self._insert_point_in_new_child(cell.barycenter, cell, cell.point_index,
+                                        cell.cumulative_size)
+        return self.insert_point(point, point_index, cell_id)
+
+    # XXX: This operation is not thread safe
+    cdef intp_t _insert_point_in_new_child(
+        self, float32_t[3] point, Cell* cell, intp_t point_index, intp_t size=1
+    ) noexcept nogil:
+        """Create a child of cell which will contain point."""
+
+        # Local variable definition
+        cdef:
+            intp_t cell_id, cell_child_id, parent_id
+            float32_t[3] save_point
+            float32_t width
+            Cell* child
+            int i
+
+        # If the maximal capacity of the Tree has been reached, double the
+        # capacity. We need to save the current cell id and the current point
+        # to retrieve them in case of reallocation.
+        if self.cell_count + 1 > self.capacity:
+            parent_id = cell.cell_id
+            for i in range(self.n_dimensions):
+                save_point[i] = point[i]
+            self._resize(SIZE_MAX)
+            cell = &self.cells[parent_id]
+            point = save_point
+
+        # Get an empty cell and initialize it
+        cell_id = self.cell_count
+        self.cell_count += 1
+        child = &self.cells[cell_id]
+
+        self._init_cell(child, cell.cell_id, cell.depth + 1)
+        child.cell_id = cell_id
+
+        # Set the cell as an inner cell of the Tree
+        cell.is_leaf = False
+        cell.point_index = -1
+
+        # Set the correct boundary for the cell, store the point in the cell
+        # and compute its index in the children array.
+        cell_child_id = 0
+        for i in range(self.n_dimensions):
+            cell_child_id *= 2
+            if point[i] >= cell.center[i]:
+                cell_child_id += 1
+                child.min_bounds[i] = cell.center[i]
+                child.max_bounds[i] = cell.max_bounds[i]
+            else:
+                child.min_bounds[i] = cell.min_bounds[i]
+                child.max_bounds[i] = cell.center[i]
+            child.center[i] = (child.min_bounds[i] + child.max_bounds[i]) / 2.
+            width = child.max_bounds[i] - child.min_bounds[i]
+
+            child.barycenter[i] = point[i]
+            child.squared_max_width = max(child.squared_max_width, width*width)
+
+        # Store the point info and the size to account for duplicated points
+        child.point_index = point_index
+        child.cumulative_size = size
+
+        # Store the child cell in the correct place in children
+        cell.children[cell_child_id] = child.cell_id
+
+        if DEBUGFLAG:
+            # Assert that the point is in the right range
+            self._check_point_in_cell(point, child)
+        if self.verbose > 10:
+            printf("[QuadTree] inserted point %li in new child %li\n",
+                   point_index, cell_id)
+
+        return cell_id
+
+    cdef bint _is_duplicate(self, float32_t[3] point1, float32_t[3] point2) noexcept nogil:
+        """Check if the two given points are equal."""
+        cdef int i
+        cdef bint res = True
+        for i in range(self.n_dimensions):
+            # Use EPSILON to avoid numerical errors that would overgrow the tree
+            res &= fabsf(point1[i] - point2[i]) <= EPSILON
+        return res
+
+    cdef intp_t _select_child(self, float32_t[3] point, Cell* cell) noexcept nogil:
+        """Select the child of cell which contains the given query point."""
+        cdef:
+            int i
+            intp_t selected_child = 0
+
+        for i in range(self.n_dimensions):
+            # Select the correct child cell to insert the point by comparing
+            # it to the borders of the cells using the precomputed center.
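+            # Editor's worked example (2D, arbitrary values; not in the
+            # original source): x >= center.x contributes bit 1 and
+            # y < center.y bit 0, so (0 * 2 + 1) * 2 + 0 = 2, i.e.
+            # children[2] is selected.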
+            selected_child *= 2
+            if point[i] >= cell.center[i]:
+                selected_child += 1
+        return cell.children[selected_child]
+
+    cdef void _init_cell(self, Cell* cell, intp_t parent, intp_t depth) noexcept nogil:
+        """Initialize a cell structure with some constants."""
+        cell.parent = parent
+        cell.is_leaf = True
+        cell.depth = depth
+        cell.squared_max_width = 0
+        cell.cumulative_size = 0
+        for i in range(self.n_cells_per_cell):
+            cell.children[i] = SIZE_MAX
+
+    cdef void _init_root(self, float32_t[3] min_bounds, float32_t[3] max_bounds
+                         ) noexcept nogil:
+        """Initialize the root node with the given space boundaries."""
+        cdef:
+            int i
+            float32_t width
+            Cell* root = &self.cells[0]
+
+        self._init_cell(root, -1, 0)
+        for i in range(self.n_dimensions):
+            root.min_bounds[i] = min_bounds[i]
+            root.max_bounds[i] = max_bounds[i]
+            root.center[i] = (max_bounds[i] + min_bounds[i]) / 2.
+            width = max_bounds[i] - min_bounds[i]
+            root.squared_max_width = max(root.squared_max_width, width*width)
+        root.cell_id = 0
+
+        self.cell_count += 1
+
+    cdef int _check_point_in_cell(self, float32_t[3] point, Cell* cell
+                                  ) except -1 nogil:
+        """Check that the given point is in the cell boundaries."""
+
+        if self.verbose >= 50:
+            if self.n_dimensions == 3:
+                printf("[QuadTree] Checking point (%f, %f, %f) in cell %li "
+                       "([%f/%f, %f/%f, %f/%f], size %li)\n",
+                       point[0], point[1], point[2], cell.cell_id,
+                       cell.min_bounds[0], cell.max_bounds[0], cell.min_bounds[1],
+                       cell.max_bounds[1], cell.min_bounds[2], cell.max_bounds[2],
+                       cell.cumulative_size)
+            else:
+                printf("[QuadTree] Checking point (%f, %f) in cell %li "
+                       "([%f/%f, %f/%f], size %li)\n",
+                       point[0], point[1], cell.cell_id, cell.min_bounds[0],
+                       cell.max_bounds[0], cell.min_bounds[1],
+                       cell.max_bounds[1], cell.cumulative_size)
+
+        for i in range(self.n_dimensions):
+            if (cell.min_bounds[i] > point[i] or
+                    cell.max_bounds[i] <= point[i]):
+                with gil:
+                    msg = "[QuadTree] InsertionError: point out of cell "
+                    msg += "boundary.\nAxis %li: cell [%f, %f]; point %f\n"
+
+                    msg %= i, cell.min_bounds[i], cell.max_bounds[i], point[i]
+                    raise ValueError(msg)
+
+    def _check_coherence(self):
+        """Check the coherence of the cells of the tree.
+
+        Check that the info stored in each cell is compatible with the info
+        stored in descendant and sibling cells. Raise a ValueError if this
+        fails.
+        """
+        for cell in self.cells[:self.cell_count]:
+            # Check that the barycenter of the inserted point is within the
+            # cell boundaries
+            self._check_point_in_cell(cell.barycenter, &cell)
+
+            if not cell.is_leaf:
+                # Compute the number of points in the children and compare
+                # it with the cell's cumulative_size.
+                n_points = 0
+                for idx in range(self.n_cells_per_cell):
+                    child_id = cell.children[idx]
+                    if child_id != -1:
+                        child = self.cells[child_id]
+                        n_points += child.cumulative_size
+                        assert child.cell_id == child_id, (
+                            "Cell id not correctly initialized.")
+                if n_points != cell.cumulative_size:
+                    raise ValueError(
+                        "Cell {} is incoherent. Size={} but found {} points "
+                        "in children. ({})"
+                        .format(cell.cell_id, cell.cumulative_size,
+                                n_points, cell.children))
+
+        # Make sure that the number of points in the tree corresponds to the
+        # cumulative size of the root cell.
+        if self.n_points != self.cells[0].cumulative_size:
+            raise ValueError(
+                "QuadTree is incoherent. Size={} but found {} points "
+                "in children."
+                .format(self.n_points, self.cells[0].cumulative_size))
+
+    cdef long summarize(self, float32_t[3] point, float32_t* results,
+                        float squared_theta=.5, intp_t cell_id=0, long idx=0
+                        ) noexcept nogil:
+        """Summarize the tree compared to a query point.
+
+        Input arguments
+        ---------------
+        point : array (n_dimensions)
+            query point to construct the summary.
+        cell_id : integer, optional (default: 0)
+            current cell of the tree summarized. This should be set to 0 for
+            external calls.
+        idx : integer, optional (default: 0)
+            current index in the result array. This should be set to 0 for
+            external calls.
+        squared_theta: float, optional (default: .5)
+            threshold to decide whether the node is sufficiently far
+            from the query point to be a good summary. The formula is such that
+            the node is a summary if
+            node_width^2 / dist_node_point^2 < squared_theta.
+            Note that the argument should be passed as theta^2 to avoid
+            computing square roots of the distances.
+
+        Output arguments
+        ----------------
+        results : array (n_samples * (n_dimensions+2))
+            result will contain a summary of the tree information compared to
+            the query point:
+            - results[idx:idx+n_dimensions] contains the coordinate-wise
+              difference between the query point and the summary cell idx.
+              This is useful in t-SNE to compute the negative forces.
+            - results[idx+n_dimensions] contains the squared euclidean
+              distance to the summary cell idx.
+            - results[idx+n_dimensions+1] contains the number of points of
+              the tree contained in the summary cell idx.
+
+        Return
+        ------
+        idx : integer
+            number of elements in the results array.
+        """
+        cdef:
+            int i, idx_d = idx + self.n_dimensions
+            bint duplicate = True
+            Cell* cell = &self.cells[cell_id]
+
+        results[idx_d] = 0.
+        for i in range(self.n_dimensions):
+            results[idx + i] = point[i] - cell.barycenter[i]
+            results[idx_d] += results[idx + i] * results[idx + i]
+            duplicate &= fabsf(results[idx + i]) <= EPSILON
+
+        # Do not compute self interactions
+        if duplicate and cell.is_leaf:
+            return idx
+
+        # Check whether we can use this node as a summary
+        # It's a summary node if the angular size as measured from the point
+        # is relatively small (w.r.t. theta) or if it is a leaf node.
+        # If it can be summarized, we use the cell center of mass
+        # Otherwise, we go to a higher level of resolution and into the leaves.
+        if cell.is_leaf or (
+                (cell.squared_max_width / results[idx_d]) < squared_theta):
+            results[idx_d + 1] = cell.cumulative_size
+            return idx + self.n_dimensions + 2
+
+        else:
+            # Recursively compute the summary in nodes
+            for c in range(self.n_cells_per_cell):
+                child_id = cell.children[c]
+                if child_id != -1:
+                    idx = self.summarize(point, results, squared_theta,
+                                         child_id, idx)
+
+        return idx
+
+    def get_cell(self, point):
+        """Return the id of the cell containing the query point, or raise a
+        ValueError if the point is not in the tree.
+        """
+        cdef float32_t[3] query_pt
+        cdef int i
+
+        assert len(point) == self.n_dimensions, (
+            "Query point should be a point in dimension {}."
+            .format(self.n_dimensions))
+
+        for i in range(self.n_dimensions):
+            query_pt[i] = point[i]
+
+        return self._get_cell(query_pt, 0)
+
+    cdef int _get_cell(self, float32_t[3] point, intp_t cell_id=0
+                       ) except -1 nogil:
+        """Guts of get_cell.
+ + Return the id of the cell containing the query point or raise ValueError + if the point is not in the tree""" + cdef: + intp_t selected_child + Cell* cell = &self.cells[cell_id] + + if cell.is_leaf: + if self._is_duplicate(cell.barycenter, point): + if self.verbose > 99: + printf("[QuadTree] Found point in cell: %li\n", + cell.cell_id) + return cell_id + with gil: + raise ValueError("Query point not in the Tree.") + + selected_child = self._select_child(point, cell) + if selected_child > 0: + if self.verbose > 99: + printf("[QuadTree] Selected_child: %li\n", selected_child) + return self._get_cell(point, selected_child) + with gil: + raise ValueError("Query point not in the Tree.") + + # Pickling primitives + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return (_QuadTree, (self.n_dimensions, self.verbose), self.__getstate__()) + + def __getstate__(self): + """Getstate re-implementation, for pickling.""" + d = {} + # capacity is inferred during the __setstate__ using nodes + d["max_depth"] = self.max_depth + d["cell_count"] = self.cell_count + d["capacity"] = self.capacity + d["n_points"] = self.n_points + d["cells"] = self._get_cell_ndarray().base + return d + + def __setstate__(self, d): + """Setstate re-implementation, for unpickling.""" + self.max_depth = d["max_depth"] + self.cell_count = d["cell_count"] + self.capacity = d["capacity"] + self.n_points = d["n_points"] + + if 'cells' not in d: + raise ValueError('You have loaded Tree version which ' + 'cannot be imported') + + cell_ndarray = d['cells'] + + if (cell_ndarray.ndim != 1 or + cell_ndarray.dtype != CELL_DTYPE or + not cell_ndarray.flags.c_contiguous): + raise ValueError('Did not recognise loaded array layout') + + self.capacity = cell_ndarray.shape[0] + if self._resize_c(self.capacity) != 0: + raise MemoryError("resizing tree to %d" % self.capacity) + + cdef Cell[:] cell_mem_view = cell_ndarray + memcpy( + pto=self.cells, + pfrom=&cell_mem_view[0], + size=self.capacity * sizeof(Cell), + ) + + # Array manipulation methods, to convert it to numpy or to resize + # self.cells array + + cdef Cell[:] _get_cell_ndarray(self): + """Wraps nodes as a NumPy struct array. + + The array keeps a reference to this Tree, which manages the underlying + memory. Individual fields are publicly accessible as properties of the + Tree. + """ + cdef cnp.npy_intp shape[1] + shape[0] = self.cell_count + cdef cnp.npy_intp strides[1] + strides[0] = sizeof(Cell) + cdef Cell[:] arr + Py_INCREF(CELL_DTYPE) + arr = PyArray_NewFromDescr( + subtype= np.ndarray, + descr=CELL_DTYPE, + nd=1, + dims=shape, + strides=strides, + data= self.cells, + flags=cnp.NPY_ARRAY_DEFAULT, + obj=None, + ) + Py_INCREF(self) + if PyArray_SetBaseObject(arr.base, self) < 0: + raise ValueError("Can't initialize array!") + return arr + + cdef int _resize(self, intp_t capacity) except -1 nogil: + """Resize all inner arrays to `capacity`, if `capacity` == -1, then + double the size of the inner arrays. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if self._resize_c(capacity) != 0: + # Acquire gil only if we need to raise + with gil: + raise MemoryError() + + cdef int _resize_c(self, intp_t capacity=SIZE_MAX) except -1 nogil: + """Guts of _resize + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. 
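+
+        Illustrative growth pattern (editor's note, derived from the body
+        below): repeated calls with the default capacity=SIZE_MAX grow the
+        cells array 0 -> 9 -> 18 -> 36 -> ..., i.e. a small fixed initial
+        allocation followed by doubling.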
+ """ + if capacity == self.capacity and self.cells != NULL: + return 0 + + if capacity == SIZE_MAX: + if self.capacity == 0: + capacity = 9 # default initial value to min + else: + capacity = 2 * self.capacity + + safe_realloc(&self.cells, capacity) + + # if capacity smaller than cell_count, adjust the counter + if capacity < self.cell_count: + self.cell_count = capacity + + self.capacity = capacity + return 0 + + def _py_summarize(self, float32_t[:] query_pt, float32_t[:, :] X, float angle): + # Used for testing summarize + cdef: + float32_t[:] summary + int n_samples + + n_samples = X.shape[0] + summary = np.empty(4 * n_samples, dtype=np.float32) + + idx = self.summarize(&query_pt[0], &summary[0], angle * angle) + return idx, summary diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_regression.py b/.venv/Lib/site-packages/sklearn/neighbors/_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..c28b40a243cf2218fad226c69f78812072760d7b --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_regression.py @@ -0,0 +1,513 @@ +"""Nearest Neighbor Regression.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings + +import numpy as np + +from ..base import RegressorMixin, _fit_context +from ..metrics import DistanceMetric +from ..utils._param_validation import StrOptions +from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin, _get_weights + + +class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): + """Regression based on k-nearest neighbors. + + The target is predicted by local interpolation of the targets + associated of the nearest neighbors in the training set. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.9 + + Parameters + ---------- + n_neighbors : int, default=5 + Number of neighbors to use by default for :meth:`kneighbors` queries. + + weights : {'uniform', 'distance'}, callable or None, default='uniform' + Weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood + are weighted equally. + - 'distance' : weight points by the inverse of their distance. + in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable] : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. + + Uniform weights are used by default. + + See the following example for a demonstration of the impact of + different weighting schemes on predictions: + :ref:`sphx_glr_auto_examples_neighbors_plot_regression.py`. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + p : float, default=2 + Power parameter for the Minkowski metric. 
When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric : str, DistanceMetric object or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + If metric is a DistanceMetric object, it will be passed directly to + the underlying computation routines. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + Doesn't affect :meth:`fit` method. + + Attributes + ---------- + effective_metric_ : str or callable + The distance metric to use. It will be same as the `metric` parameter + or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to + 'minkowski' and `p` parameter set to 2. + + effective_metric_params_ : dict + Additional keyword arguments for the metric function. For most metrics + will be same with `metric_params` parameter, but may also contain the + `p` parameter value if the `effective_metric_` attribute is set to + 'minkowski'. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + See Also + -------- + NearestNeighbors : Unsupervised learner for implementing neighbor searches. + RadiusNeighborsRegressor : Regression based on neighbors within a fixed radius. + KNeighborsClassifier : Classifier implementing the k-nearest neighbors vote. + RadiusNeighborsClassifier : Classifier implementing + a vote among neighbors within a given radius. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + .. warning:: + + Regarding the Nearest Neighbors algorithms, if it is found that two + neighbors, neighbor `k+1` and `k`, have identical distances but + different labels, the results will depend on the ordering of the + training data. + + https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm + + Examples + -------- + >>> X = [[0], [1], [2], [3]] + >>> y = [0, 0, 1, 1] + >>> from sklearn.neighbors import KNeighborsRegressor + >>> neigh = KNeighborsRegressor(n_neighbors=2) + >>> neigh.fit(X, y) + KNeighborsRegressor(...) 
+ >>> print(neigh.predict([[1.5]])) + [0.5] + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "weights": [StrOptions({"uniform", "distance"}), callable, None], + } + _parameter_constraints["metric"].append(DistanceMetric) + _parameter_constraints.pop("radius") + + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.weights = weights + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # For cross-validation routines to split data correctly + tags.input_tags.pairwise = self.metric == "precomputed" + return tags + + @_fit_context( + # KNeighborsRegressor.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y): + """Fit the k-nearest neighbors regressor from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_outputs) + Target values. + + Returns + ------- + self : KNeighborsRegressor + The fitted k-nearest neighbors regressor. + """ + return self._fit(X, y) + + def predict(self, X): + """Predict the target for the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int + Target values. + """ + if self.weights == "uniform": + # In that case, we do not need the distances to perform + # the weighting so we do not compute them. + neigh_ind = self.kneighbors(X, return_distance=False) + neigh_dist = None + else: + neigh_dist, neigh_ind = self.kneighbors(X) + + weights = _get_weights(neigh_dist, self.weights) + + _y = self._y + if _y.ndim == 1: + _y = _y.reshape((-1, 1)) + + if weights is None: + y_pred = np.mean(_y[neigh_ind], axis=1) + else: + y_pred = np.empty((neigh_dist.shape[0], _y.shape[1]), dtype=np.float64) + denom = np.sum(weights, axis=1) + + for j in range(_y.shape[1]): + num = np.sum(_y[neigh_ind, j] * weights, axis=1) + y_pred[:, j] = num / denom + + if self._y.ndim == 1: + y_pred = y_pred.ravel() + + return y_pred + + +class RadiusNeighborsRegressor(RadiusNeighborsMixin, RegressorMixin, NeighborsBase): + """Regression based on neighbors within a fixed radius. + + The target is predicted by local interpolation of the targets + associated of the nearest neighbors in the training set. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.9 + + Parameters + ---------- + radius : float, default=1.0 + Range of parameter space to use by default for :meth:`radius_neighbors` + queries. + + weights : {'uniform', 'distance'}, callable or None, default='uniform' + Weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood + are weighted equally. + - 'distance' : weight points by the inverse of their distance. 
+ in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable] : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. + + Uniform weights are used by default. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + p : float, default=2 + Power parameter for the Minkowski metric. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + effective_metric_ : str or callable + The distance metric to use. It will be same as the `metric` parameter + or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to + 'minkowski' and `p` parameter set to 2. + + effective_metric_params_ : dict + Additional keyword arguments for the metric function. For most metrics + will be same with `metric_params` parameter, but may also contain the + `p` parameter value if the `effective_metric_` attribute is set to + 'minkowski'. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + See Also + -------- + NearestNeighbors : Unsupervised learner for implementing neighbor searches. + KNeighborsRegressor : Regression based on k-nearest neighbors. + KNeighborsClassifier : Classifier based on the k-nearest neighbors. 
+ RadiusNeighborsClassifier : Classifier based on neighbors within a given radius. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm + + Examples + -------- + >>> X = [[0], [1], [2], [3]] + >>> y = [0, 0, 1, 1] + >>> from sklearn.neighbors import RadiusNeighborsRegressor + >>> neigh = RadiusNeighborsRegressor(radius=1.0) + >>> neigh.fit(X, y) + RadiusNeighborsRegressor(...) + >>> print(neigh.predict([[1.5]])) + [0.5] + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "weights": [StrOptions({"uniform", "distance"}), callable, None], + } + _parameter_constraints.pop("n_neighbors") + + def __init__( + self, + radius=1.0, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + ): + super().__init__( + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + p=p, + metric=metric, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.weights = weights + + @_fit_context( + # RadiusNeighborsRegressor.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y): + """Fit the radius neighbors regressor from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_outputs) + Target values. + + Returns + ------- + self : RadiusNeighborsRegressor + The fitted radius neighbors regressor. + """ + return self._fit(X, y) + + def predict(self, X): + """Predict the target for the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + y : ndarray of shape (n_queries,) or (n_queries, n_outputs), \ + dtype=double + Target values. + """ + neigh_dist, neigh_ind = self.radius_neighbors(X) + + weights = _get_weights(neigh_dist, self.weights) + + _y = self._y + if _y.ndim == 1: + _y = _y.reshape((-1, 1)) + + empty_obs = np.full_like(_y[0], np.nan) + + if weights is None: + y_pred = np.array( + [ + np.mean(_y[ind, :], axis=0) if len(ind) else empty_obs + for (i, ind) in enumerate(neigh_ind) + ] + ) + + else: + y_pred = np.array( + [ + ( + np.average(_y[ind, :], axis=0, weights=weights[i]) + if len(ind) + else empty_obs + ) + for (i, ind) in enumerate(neigh_ind) + ] + ) + + if np.any(np.isnan(y_pred)): + empty_warning_msg = ( + "One or more samples have no neighbors " + "within specified radius; predicting NaN." 
+ ) + warnings.warn(empty_warning_msg) + + if self._y.ndim == 1: + y_pred = y_pred.ravel() + + return y_pred diff --git a/.venv/Lib/site-packages/sklearn/neighbors/_unsupervised.py b/.venv/Lib/site-packages/sklearn/neighbors/_unsupervised.py new file mode 100644 index 0000000000000000000000000000000000000000..b34af2c75367ad73e69b826ca968b2f53982e541 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/_unsupervised.py @@ -0,0 +1,179 @@ +"""Unsupervised nearest neighbors learner""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ..base import _fit_context +from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin + + +class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase): + """Unsupervised learner for implementing neighbor searches. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.9 + + Parameters + ---------- + n_neighbors : int, default=5 + Number of neighbors to use by default for :meth:`kneighbors` queries. + + radius : float, default=1.0 + Range of parameter space to use by default for :meth:`radius_neighbors` + queries. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + p : float (positive), default=2 + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + effective_metric_ : str + Metric used to compute distances to neighbors. + + effective_metric_params_ : dict + Parameters for the metric used to compute distances to neighbors. 
+ + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + See Also + -------- + KNeighborsClassifier : Classifier implementing the k-nearest neighbors + vote. + RadiusNeighborsClassifier : Classifier implementing a vote among neighbors + within a given radius. + KNeighborsRegressor : Regression based on k-nearest neighbors. + RadiusNeighborsRegressor : Regression based on neighbors within a fixed + radius. + BallTree : Space partitioning data structure for organizing points in a + multi-dimensional space, used for nearest neighbor search. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm + + Examples + -------- + >>> import numpy as np + >>> from sklearn.neighbors import NearestNeighbors + >>> samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]] + >>> neigh = NearestNeighbors(n_neighbors=2, radius=0.4) + >>> neigh.fit(samples) + NearestNeighbors(...) + >>> neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False) + array([[2, 0]]...) + >>> nbrs = neigh.radius_neighbors( + ... [[0, 0, 1.3]], 0.4, return_distance=False + ... ) + >>> np.asarray(nbrs[0][0]) + array(2) + """ + + def __init__( + self, + *, + n_neighbors=5, + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + + @_fit_context( + # NearestNeighbors.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit the nearest neighbors estimator from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : NearestNeighbors + The fitted nearest neighbors estimator. + """ + return self._fit(X) diff --git a/.venv/Lib/site-packages/sklearn/neighbors/meson.build b/.venv/Lib/site-packages/sklearn/neighbors/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..06a869bb1fb671398f420e58a066f80ed040533b --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/meson.build @@ -0,0 +1,56 @@ +_binary_tree_pxi = custom_target( + '_binary_tree_pxi', + output: '_binary_tree.pxi', + input: '_binary_tree.pxi.tp', + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], +) + +# .pyx is generated so this is needed to make Cython compilation work. The pxi +# file is included avoid "missing dependency paths" with ninja -t missindeps +neighbors_cython_tree = [ + fs.copyfile('__init__.py'), + fs.copyfile('_partition_nodes.pxd'), + _binary_tree_pxi, +] + +name_list = ['_ball_tree', '_kd_tree'] + +foreach name: name_list + pyx = custom_target( + name + '_pyx', + output: name + '.pyx', + input: name + '.pyx.tp', + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. 
This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [neighbors_cython_tree, utils_cython_tree, metrics_cython_tree], + ) + py.extension_module( + name, + pyx, + dependencies: [np_dep], + cython_args: cython_args, + subdir: 'sklearn/neighbors', + install: true +) +endforeach + +neighbors_extension_metadata = { + '_partition_nodes': + {'sources': ['_partition_nodes.pyx'], + 'override_options': ['cython_language=cpp'], 'dependencies': [np_dep]}, + '_quad_tree': {'sources': ['_quad_tree.pyx'], 'dependencies': [np_dep]}, +} + +foreach ext_name, ext_dict : neighbors_extension_metadata + py.extension_module( + ext_name, + [ext_dict.get('sources'), utils_cython_tree], + dependencies: ext_dict.get('dependencies'), + override_options : ext_dict.get('override_options', []), + cython_args: cython_args, + subdir: 'sklearn/neighbors', + install: true + ) +endforeach diff --git a/.venv/Lib/site-packages/sklearn/neighbors/tests/test_ball_tree.py b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_ball_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..eab2af2d78f3db07f526c77e965c51356af1616c --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_ball_tree.py @@ -0,0 +1,200 @@ +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_equal + +from sklearn.neighbors._ball_tree import BallTree, BallTree32, BallTree64 +from sklearn.utils import check_random_state +from sklearn.utils._testing import _convert_container +from sklearn.utils.validation import check_array + +rng = np.random.RandomState(10) +V_mahalanobis = rng.rand(3, 3) +V_mahalanobis = np.dot(V_mahalanobis, V_mahalanobis.T) + +DIMENSION = 3 + +METRICS = { + "euclidean": {}, + "manhattan": {}, + "minkowski": dict(p=3), + "chebyshev": {}, +} + +DISCRETE_METRICS = ["hamming", "canberra", "braycurtis"] + +BOOLEAN_METRICS = [ + "jaccard", + "dice", + "rogerstanimoto", + "russellrao", + "sokalmichener", + "sokalsneath", +] + +BALL_TREE_CLASSES = [ + BallTree64, + BallTree32, +] + + +def brute_force_neighbors(X, Y, k, metric, **kwargs): + from sklearn.metrics import DistanceMetric + + X, Y = check_array(X), check_array(Y) + D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X) + ind = np.argsort(D, axis=1)[:, :k] + dist = D[np.arange(Y.shape[0])[:, None], ind] + return dist, ind + + +def test_BallTree_is_BallTree64_subclass(): + assert issubclass(BallTree, BallTree64) + + +@pytest.mark.parametrize("metric", itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS)) +@pytest.mark.parametrize("array_type", ["list", "array"]) +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_ball_tree_query_metrics(metric, array_type, BallTreeImplementation): + rng = check_random_state(0) + if metric in BOOLEAN_METRICS: + X = rng.random_sample((40, 10)).round(0) + Y = rng.random_sample((10, 10)).round(0) + elif metric in DISCRETE_METRICS: + X = (4 * rng.random_sample((40, 10))).round(0) + Y = (4 * rng.random_sample((10, 10))).round(0) + X = _convert_container(X, array_type) + Y = _convert_container(Y, array_type) + + k = 5 + + bt = BallTreeImplementation(X, leaf_size=1, metric=metric) + dist1, ind1 = bt.query(Y, k) + dist2, ind2 = brute_force_neighbors(X, Y, k, metric) + assert_array_almost_equal(dist1, dist2) + + +@pytest.mark.parametrize( + "BallTreeImplementation, decimal_tol", zip(BALL_TREE_CLASSES, [6, 5]) +) 
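+# The zipped tolerances above pair each tree class with how many decimal
+# places its haversine distances are checked to: expanded, the cases are
+# roughly
+#
+#     test_query_haversine(BallTree64, decimal_tol=6)
+#     test_query_haversine(BallTree32, decimal_tol=5)
+#
+# float32 gets the looser tolerance since single precision accumulates more
+# rounding error than double precision.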
+def test_query_haversine(BallTreeImplementation, decimal_tol): + rng = check_random_state(0) + X = 2 * np.pi * rng.random_sample((40, 2)) + bt = BallTreeImplementation(X, leaf_size=1, metric="haversine") + dist1, ind1 = bt.query(X, k=5) + dist2, ind2 = brute_force_neighbors(X, X, k=5, metric="haversine") + + assert_array_almost_equal(dist1, dist2, decimal=decimal_tol) + assert_array_almost_equal(ind1, ind2) + + +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_array_object_type(BallTreeImplementation): + """Check that we do not accept object dtype array.""" + X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) + with pytest.raises(ValueError, match="setting an array element with a sequence"): + BallTreeImplementation(X) + + +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_bad_pyfunc_metric(BallTreeImplementation): + def wrong_returned_value(x, y): + return "1" + + def one_arg_func(x): + return 1.0 # pragma: no cover + + X = np.ones((5, 2)) + msg = "Custom distance function must accept two vectors and return a float." + with pytest.raises(TypeError, match=msg): + BallTreeImplementation(X, metric=wrong_returned_value) + + msg = "takes 1 positional argument but 2 were given" + with pytest.raises(TypeError, match=msg): + BallTreeImplementation(X, metric=one_arg_func) + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +def test_ball_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 + ) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 5 + dist_64, ind_64 = bt_64.query(Y_64, k=k) + dist_32, ind_32 = bt_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32, rtol=1e-5) + assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 + + # Test consistency with respect to the `query_radius` method + r = 2.38 + ind_64 = bt_64.query_radius(Y_64, r=r) + ind_32 = bt_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) + + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = bt_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = bt_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +def test_kernel_density_numerical_consistency(global_random_seed, metric): + # Test consistency with respect to the `kernel_density` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + + kernel = "gaussian" + h = 0.1 + density64 = bt_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = 
bt_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 + + +def test_two_point_correlation_numerical_consistency(global_random_seed): + # Test consistency with respect to the `two_point_correlation` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + bt_64 = BallTree64(X_64, leaf_size=10) + bt_32 = BallTree32(X_32, leaf_size=10) + + r = np.linspace(0, 1, 10) + + counts_64 = bt_64.two_point_correlation(Y_64, r=r, dualtree=True) + counts_32 = bt_32.two_point_correlation(Y_32, r=r, dualtree=True) + assert_allclose(counts_64, counts_32) + + +def get_dataset_for_binary_tree(random_seed, features=3): + rng = np.random.RandomState(random_seed) + _X = rng.rand(100, features) + _Y = rng.rand(5, features) + + X_64 = _X.astype(dtype=np.float64, copy=False) + Y_64 = _Y.astype(dtype=np.float64, copy=False) + + X_32 = _X.astype(dtype=np.float32, copy=False) + Y_32 = _Y.astype(dtype=np.float32, copy=False) + + return X_64, X_32, Y_64, Y_32 diff --git a/.venv/Lib/site-packages/sklearn/neighbors/tests/test_graph.py b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..0862ecd748eb1042e0030a3c883e2925f8be8a26 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_graph.py @@ -0,0 +1,101 @@ +import numpy as np +import pytest + +from sklearn.metrics import euclidean_distances +from sklearn.neighbors import KNeighborsTransformer, RadiusNeighborsTransformer +from sklearn.neighbors._base import _is_sorted_by_data +from sklearn.utils._testing import assert_array_equal + + +def test_transformer_result(): + # Test the number of neighbors returned + n_neighbors = 5 + n_samples_fit = 20 + n_queries = 18 + n_features = 10 + + rng = np.random.RandomState(42) + X = rng.randn(n_samples_fit, n_features) + X2 = rng.randn(n_queries, n_features) + radius = np.percentile(euclidean_distances(X), 10) + + # with n_neighbors + for mode in ["distance", "connectivity"]: + add_one = mode == "distance" + nnt = KNeighborsTransformer(n_neighbors=n_neighbors, mode=mode) + Xt = nnt.fit_transform(X) + assert Xt.shape == (n_samples_fit, n_samples_fit) + assert Xt.data.shape == (n_samples_fit * (n_neighbors + add_one),) + assert Xt.format == "csr" + assert _is_sorted_by_data(Xt) + + X2t = nnt.transform(X2) + assert X2t.shape == (n_queries, n_samples_fit) + assert X2t.data.shape == (n_queries * (n_neighbors + add_one),) + assert X2t.format == "csr" + assert _is_sorted_by_data(X2t) + + # with radius + for mode in ["distance", "connectivity"]: + add_one = mode == "distance" + nnt = RadiusNeighborsTransformer(radius=radius, mode=mode) + Xt = nnt.fit_transform(X) + assert Xt.shape == (n_samples_fit, n_samples_fit) + assert not Xt.data.shape == (n_samples_fit * (n_neighbors + add_one),) + assert Xt.format == "csr" + assert _is_sorted_by_data(Xt) + + X2t = nnt.transform(X2) + assert X2t.shape == (n_queries, n_samples_fit) + assert not X2t.data.shape == (n_queries * (n_neighbors + add_one),) + assert X2t.format == "csr" + assert _is_sorted_by_data(X2t) + + +def _has_explicit_diagonal(X): + """Return True if the diagonal is explicitly stored""" + X = X.tocoo() + explicit = X.row[X.row == X.col] + return len(explicit) == X.shape[0] + + +def test_explicit_diagonal(): + # Test that the diagonal is explicitly stored in the sparse graph + n_neighbors = 5 + n_samples_fit, 
n_samples_transform, n_features = 20, 18, 10 + rng = np.random.RandomState(42) + X = rng.randn(n_samples_fit, n_features) + X2 = rng.randn(n_samples_transform, n_features) + + nnt = KNeighborsTransformer(n_neighbors=n_neighbors) + Xt = nnt.fit_transform(X) + assert _has_explicit_diagonal(Xt) + assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0) + + Xt = nnt.transform(X) + assert _has_explicit_diagonal(Xt) + assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0) + + # Using transform on new data should not always have zero diagonal + X2t = nnt.transform(X2) + assert not _has_explicit_diagonal(X2t) + + +@pytest.mark.parametrize("Klass", [KNeighborsTransformer, RadiusNeighborsTransformer]) +def test_graph_feature_names_out(Klass): + """Check `get_feature_names_out` for transformers defined in `_graph.py`.""" + + n_samples_fit = 20 + n_features = 10 + rng = np.random.RandomState(42) + X = rng.randn(n_samples_fit, n_features) + + est = Klass().fit(X) + names_out = est.get_feature_names_out() + + class_name_lower = Klass.__name__.lower() + expected_names_out = np.array( + [f"{class_name_lower}{i}" for i in range(est.n_samples_fit_)], + dtype=object, + ) + assert_array_equal(names_out, expected_names_out) diff --git a/.venv/Lib/site-packages/sklearn/neighbors/tests/test_kd_tree.py b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_kd_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..205c323ca7f13228384b25ff63bbbf2075cba71c --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_kd_tree.py @@ -0,0 +1,100 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_equal + +from sklearn.neighbors._kd_tree import KDTree, KDTree32, KDTree64 +from sklearn.neighbors.tests.test_ball_tree import get_dataset_for_binary_tree +from sklearn.utils.parallel import Parallel, delayed + +DIMENSION = 3 + +METRICS = {"euclidean": {}, "manhattan": {}, "chebyshev": {}, "minkowski": dict(p=3)} + +KD_TREE_CLASSES = [ + KDTree64, + KDTree32, +] + + +def test_KDTree_is_KDTree64_subclass(): + assert issubclass(KDTree, KDTree64) + + +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_array_object_type(BinarySearchTree): + """Check that we do not accept object dtype array.""" + X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) + with pytest.raises(ValueError, match="setting an array element with a sequence"): + BinarySearchTree(X) + + +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_kdtree_picklable_with_joblib(BinarySearchTree): + """Make sure that KDTree queries work when joblib memmaps. + + Non-regression test for #21685 and #21228.""" + rng = np.random.RandomState(0) + X = rng.random_sample((10, 3)) + tree = BinarySearchTree(X, leaf_size=2) + + # Call Parallel with max_nbytes=1 to trigger readonly memory mapping that + # use to raise "ValueError: buffer source array is read-only" in a previous + # version of the Cython code. + Parallel(n_jobs=2, max_nbytes=1)(delayed(tree.query)(data) for data in 2 * [X]) + + +@pytest.mark.parametrize("metric", METRICS) +def test_kd_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. 
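+    # Indices must match exactly between the two builds; distances are only
+    # required to agree to rtol=1e-5, which leaves headroom for float32
+    # rounding (machine epsilon ~1.2e-7) accumulated over the tree traversal.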
+ X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 + ) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 4 + dist_64, ind_64 = kd_64.query(Y_64, k=k) + dist_32, ind_32 = kd_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32, rtol=1e-5) + assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 + + # Test consistency with respect to the `query_radius` method + r = 2.38 + ind_64 = kd_64.query_radius(Y_64, r=r) + ind_32 = kd_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) + + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = kd_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = kd_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 + + +@pytest.mark.parametrize("metric", METRICS) +def test_kernel_density_numerical_consistency(global_random_seed, metric): + # Test consistency with respect to the `kernel_density` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + kernel = "gaussian" + h = 0.1 + density64 = kd_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = kd_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 diff --git a/.venv/Lib/site-packages/sklearn/neighbors/tests/test_kde.py b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_kde.py new file mode 100644 index 0000000000000000000000000000000000000000..d30194b95b34ec9246d3030b4d80573ae6654e8d --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_kde.py @@ -0,0 +1,252 @@ +import joblib +import numpy as np +import pytest + +from sklearn.datasets import make_blobs +from sklearn.exceptions import NotFittedError +from sklearn.model_selection import GridSearchCV +from sklearn.neighbors import KDTree, KernelDensity, NearestNeighbors +from sklearn.neighbors._ball_tree import kernel_norm +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.utils._testing import assert_allclose + + +# XXX Duplicated in test_neighbors_tree, test_kde +def compute_kernel_slow(Y, X, kernel, h): + if h == "scott": + h = X.shape[0] ** (-1 / (X.shape[1] + 4)) + elif h == "silverman": + h = (X.shape[0] * (X.shape[1] + 2) / 4) ** (-1 / (X.shape[1] + 4)) + + d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1)) + norm = kernel_norm(h, X.shape[1], kernel) / X.shape[0] + + if kernel == "gaussian": + return norm * np.exp(-0.5 * (d * d) / (h * h)).sum(-1) + elif kernel == "tophat": + return norm * (d < h).sum(-1) + elif kernel == "epanechnikov": + return norm * ((1.0 - (d * d) / (h * h)) * (d < h)).sum(-1) + elif kernel == "exponential": + return norm * 
(np.exp(-d / h)).sum(-1) + elif kernel == "linear": + return norm * ((1 - d / h) * (d < h)).sum(-1) + elif kernel == "cosine": + return norm * (np.cos(0.5 * np.pi * d / h) * (d < h)).sum(-1) + else: + raise ValueError("kernel not recognized") + + +def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true): + kde = KernelDensity(kernel=kernel, bandwidth=bandwidth, atol=atol, rtol=rtol) + log_dens = kde.fit(X).score_samples(Y) + assert_allclose(np.exp(log_dens), dens_true, atol=atol, rtol=max(1e-7, rtol)) + assert_allclose( + np.exp(kde.score(Y)), np.prod(dens_true), atol=atol, rtol=max(1e-7, rtol) + ) + + +@pytest.mark.parametrize( + "kernel", ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"] +) +@pytest.mark.parametrize("bandwidth", [0.01, 0.1, 1, "scott", "silverman"]) +def test_kernel_density(kernel, bandwidth): + n_samples, n_features = (100, 3) + + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) + Y = rng.randn(n_samples, n_features) + + dens_true = compute_kernel_slow(Y, X, kernel, bandwidth) + + for rtol in [0, 1e-5]: + for atol in [1e-6, 1e-2]: + for breadth_first in (True, False): + check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true) + + +def test_kernel_density_sampling(n_samples=100, n_features=3): + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) + + bandwidth = 0.2 + + for kernel in ["gaussian", "tophat"]: + # draw a tophat sample + kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X) + samp = kde.sample(100) + assert X.shape == samp.shape + + # check that samples are in the right range + nbrs = NearestNeighbors(n_neighbors=1).fit(X) + dist, ind = nbrs.kneighbors(X, return_distance=True) + + if kernel == "tophat": + assert np.all(dist < bandwidth) + elif kernel == "gaussian": + # 5 standard deviations is safe for 100 samples, but there's a + # very small chance this test could fail. + assert np.all(dist < 5 * bandwidth) + + # check unsupported kernels + for kernel in ["epanechnikov", "exponential", "linear", "cosine"]: + kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X) + with pytest.raises(NotImplementedError): + kde.sample(100) + + # non-regression test: used to return a scalar + X = rng.randn(4, 1) + kde = KernelDensity(kernel="gaussian").fit(X) + assert kde.sample().shape == (1, 1) + + +@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree"]) +@pytest.mark.parametrize( + "metric", ["euclidean", "minkowski", "manhattan", "chebyshev", "haversine"] +) +def test_kde_algorithm_metric_choice(algorithm, metric): + # Smoke test for various metrics and algorithms + rng = np.random.RandomState(0) + X = rng.randn(10, 2) # 2 features required for haversine dist. 
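+    # (haversine interprets each row as (latitude, longitude) in radians,
+    # hence exactly two features)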
+ Y = rng.randn(10, 2) + + kde = KernelDensity(algorithm=algorithm, metric=metric) + + if algorithm == "kd_tree" and metric not in KDTree.valid_metrics: + with pytest.raises(ValueError, match="invalid metric"): + kde.fit(X) + else: + kde.fit(X) + y_dens = kde.score_samples(Y) + assert y_dens.shape == Y.shape[:1] + + +def test_kde_score(n_samples=100, n_features=3): + pass + # FIXME + # rng = np.random.RandomState(0) + # X = rng.random_sample((n_samples, n_features)) + # Y = rng.random_sample((n_samples, n_features)) + + +def test_kde_sample_weights_error(): + kde = KernelDensity() + with pytest.raises(ValueError): + kde.fit(np.random.random((200, 10)), sample_weight=np.random.random((200, 10))) + with pytest.raises(ValueError): + kde.fit(np.random.random((200, 10)), sample_weight=-np.random.random(200)) + + +def test_kde_pipeline_gridsearch(): + # test that kde plays nice in pipelines and grid-searches + X, _ = make_blobs(cluster_std=0.1, random_state=1, centers=[[0, 1], [1, 0], [0, 0]]) + pipe1 = make_pipeline( + StandardScaler(with_mean=False, with_std=False), + KernelDensity(kernel="gaussian"), + ) + params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10]) + search = GridSearchCV(pipe1, param_grid=params) + search.fit(X) + assert search.best_params_["kerneldensity__bandwidth"] == 0.1 + + +def test_kde_sample_weights(): + n_samples = 400 + size_test = 20 + weights_neutral = np.full(n_samples, 3.0) + for d in [1, 2, 10]: + rng = np.random.RandomState(0) + X = rng.rand(n_samples, d) + weights = 1 + (10 * X.sum(axis=1)).astype(np.int8) + X_repetitions = np.repeat(X, weights, axis=0) + n_samples_test = size_test // d + test_points = rng.rand(n_samples_test, d) + for algorithm in ["auto", "ball_tree", "kd_tree"]: + for metric in ["euclidean", "minkowski", "manhattan", "chebyshev"]: + if algorithm != "kd_tree" or metric in KDTree.valid_metrics: + kde = KernelDensity(algorithm=algorithm, metric=metric) + + # Test that adding a constant sample weight has no effect + kde.fit(X, sample_weight=weights_neutral) + scores_const_weight = kde.score_samples(test_points) + sample_const_weight = kde.sample(random_state=1234) + kde.fit(X) + scores_no_weight = kde.score_samples(test_points) + sample_no_weight = kde.sample(random_state=1234) + assert_allclose(scores_const_weight, scores_no_weight) + assert_allclose(sample_const_weight, sample_no_weight) + + # Test equivalence between sampling and (integer) weights + kde.fit(X, sample_weight=weights) + scores_weight = kde.score_samples(test_points) + sample_weight = kde.sample(random_state=1234) + kde.fit(X_repetitions) + scores_ref_sampling = kde.score_samples(test_points) + sample_ref_sampling = kde.sample(random_state=1234) + assert_allclose(scores_weight, scores_ref_sampling) + assert_allclose(sample_weight, sample_ref_sampling) + + # Test that sample weights has a non-trivial effect + diff = np.max(np.abs(scores_no_weight - scores_weight)) + assert diff > 0.001 + + # Test invariance with respect to arbitrary scaling + scale_factor = rng.rand() + kde.fit(X, sample_weight=(scale_factor * weights)) + scores_scaled_weight = kde.score_samples(test_points) + assert_allclose(scores_scaled_weight, scores_weight) + + +@pytest.mark.parametrize("sample_weight", [None, [0.1, 0.2, 0.3]]) +def test_pickling(tmpdir, sample_weight): + # Make sure that predictions are the same before and after pickling. Used + # to be a bug because sample_weights wasn't pickled and the resulting tree + # would miss some info. 
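+    # The round-trip below therefore compares score_samples outputs rather
+    # than merely checking that unpickling succeeds: a silently dropped
+    # sample_weight would still unpickle but shift the estimated densities.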
+ + kde = KernelDensity() + data = np.reshape([1.0, 2.0, 3.0], (-1, 1)) + kde.fit(data, sample_weight=sample_weight) + + X = np.reshape([1.1, 2.1], (-1, 1)) + scores = kde.score_samples(X) + + file_path = str(tmpdir.join("dump.pkl")) + joblib.dump(kde, file_path) + kde = joblib.load(file_path) + scores_pickled = kde.score_samples(X) + + assert_allclose(scores, scores_pickled) + + +@pytest.mark.parametrize("method", ["score_samples", "sample"]) +def test_check_is_fitted(method): + # Check that predict raises an exception in an unfitted estimator. + # Unfitted estimators should raise a NotFittedError. + rng = np.random.RandomState(0) + X = rng.randn(10, 2) + kde = KernelDensity() + + with pytest.raises(NotFittedError): + getattr(kde, method)(X) + + +@pytest.mark.parametrize("bandwidth", ["scott", "silverman", 0.1]) +def test_bandwidth(bandwidth): + n_samples, n_features = (100, 3) + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) + kde = KernelDensity(bandwidth=bandwidth).fit(X) + samp = kde.sample(100) + kde_sc = kde.score_samples(X) + assert X.shape == samp.shape + assert kde_sc.shape == (n_samples,) + + # Test that the attribute self.bandwidth_ has the expected value + if bandwidth == "scott": + h = X.shape[0] ** (-1 / (X.shape[1] + 4)) + elif bandwidth == "silverman": + h = (X.shape[0] * (X.shape[1] + 2) / 4) ** (-1 / (X.shape[1] + 4)) + else: + h = bandwidth + assert kde.bandwidth_ == pytest.approx(h) diff --git a/.venv/Lib/site-packages/sklearn/neighbors/tests/test_lof.py b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_lof.py new file mode 100644 index 0000000000000000000000000000000000000000..ba22bbb44f35fe1f57a5ec1ede7dad9c2725f421 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neighbors/tests/test_lof.py @@ -0,0 +1,394 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re +from math import sqrt + +import numpy as np +import pytest + +from sklearn import metrics, neighbors +from sklearn.datasets import load_iris +from sklearn.metrics import roc_auc_score +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.estimator_checks import ( + check_outlier_corruption, + parametrize_with_checks, +) +from sklearn.utils.fixes import CSR_CONTAINERS + +# load the iris dataset +# and randomly permute it +rng = check_random_state(0) +iris = load_iris() +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + + +def test_lof(global_dtype): + # Toy sample (the last two samples are outliers): + X = np.asarray( + [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [5, 3], [-4, 2]], + dtype=global_dtype, + ) + + # Test LocalOutlierFactor: + clf = neighbors.LocalOutlierFactor(n_neighbors=5) + score = clf.fit(X).negative_outlier_factor_ + assert_array_equal(clf._fit_X, X) + + # Assert largest outlier score is smaller than smallest inlier score: + assert np.min(score[:-2]) > np.max(score[-2:]) + + # Assert predict() works: + clf = neighbors.LocalOutlierFactor(contamination=0.25, n_neighbors=5).fit(X) + expected_predictions = 6 * [1] + 2 * [-1] + assert_array_equal(clf._predict(), expected_predictions) + assert_array_equal(clf.fit_predict(X), expected_predictions) + + +def test_lof_performance(global_dtype): + # Generate train/test data + rng = check_random_state(2) + X = 0.3 * rng.randn(120, 2).astype(global_dtype, copy=False) + X_train = X[:100] + + # Generate some abnormal novel observations + 
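+    # (uniform over [-4, 4]^2, i.e. well outside the 0.3-sigma training cloud)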
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2)).astype( + global_dtype, copy=False + ) + X_test = np.r_[X[100:], X_outliers] + y_test = np.array([0] * 20 + [1] * 20) + + # fit the model for novelty detection + clf = neighbors.LocalOutlierFactor(novelty=True).fit(X_train) + + # predict scores (the lower, the more normal) + y_pred = -clf.decision_function(X_test) + + # check that roc_auc is good + assert roc_auc_score(y_test, y_pred) > 0.99 + + +def test_lof_values(global_dtype): + # toy samples: + X_train = np.asarray([[1, 1], [1, 2], [2, 1]], dtype=global_dtype) + clf1 = neighbors.LocalOutlierFactor( + n_neighbors=2, contamination=0.1, novelty=True + ).fit(X_train) + clf2 = neighbors.LocalOutlierFactor(n_neighbors=2, novelty=True).fit(X_train) + s_0 = 2.0 * sqrt(2.0) / (1.0 + sqrt(2.0)) + s_1 = (1.0 + sqrt(2)) * (1.0 / (4.0 * sqrt(2.0)) + 1.0 / (2.0 + 2.0 * sqrt(2))) + # check predict() + assert_allclose(-clf1.negative_outlier_factor_, [s_0, s_1, s_1]) + assert_allclose(-clf2.negative_outlier_factor_, [s_0, s_1, s_1]) + # check predict(one sample not in train) + assert_allclose(-clf1.score_samples([[2.0, 2.0]]), [s_0]) + assert_allclose(-clf2.score_samples([[2.0, 2.0]]), [s_0]) + # check predict(one sample already in train) + assert_allclose(-clf1.score_samples([[1.0, 1.0]]), [s_1]) + assert_allclose(-clf2.score_samples([[1.0, 1.0]]), [s_1]) + + +def test_lof_precomputed(global_dtype, random_state=42): + """Tests LOF with a distance matrix.""" + # Note: smaller samples may result in spurious test success + rng = np.random.RandomState(random_state) + X = rng.random_sample((10, 4)).astype(global_dtype, copy=False) + Y = rng.random_sample((3, 4)).astype(global_dtype, copy=False) + DXX = metrics.pairwise_distances(X, metric="euclidean") + DYX = metrics.pairwise_distances(Y, X, metric="euclidean") + # As a feature matrix (n_samples by n_features) + lof_X = neighbors.LocalOutlierFactor(n_neighbors=3, novelty=True) + lof_X.fit(X) + pred_X_X = lof_X._predict() + pred_X_Y = lof_X.predict(Y) + + # As a dense distance matrix (n_samples by n_samples) + lof_D = neighbors.LocalOutlierFactor( + n_neighbors=3, algorithm="brute", metric="precomputed", novelty=True + ) + lof_D.fit(DXX) + pred_D_X = lof_D._predict() + pred_D_Y = lof_D.predict(DYX) + + assert_allclose(pred_X_X, pred_D_X) + assert_allclose(pred_X_Y, pred_D_Y) + + +def test_n_neighbors_attribute(): + X = iris.data + clf = neighbors.LocalOutlierFactor(n_neighbors=500).fit(X) + assert clf.n_neighbors_ == X.shape[0] - 1 + + clf = neighbors.LocalOutlierFactor(n_neighbors=500) + msg = "n_neighbors will be set to (n_samples - 1)" + with pytest.warns(UserWarning, match=re.escape(msg)): + clf.fit(X) + assert clf.n_neighbors_ == X.shape[0] - 1 + + +def test_score_samples(global_dtype): + X_train = np.asarray([[1, 1], [1, 2], [2, 1]], dtype=global_dtype) + X_test = np.asarray([[2.0, 2.0]], dtype=global_dtype) + clf1 = neighbors.LocalOutlierFactor( + n_neighbors=2, contamination=0.1, novelty=True + ).fit(X_train) + clf2 = neighbors.LocalOutlierFactor(n_neighbors=2, novelty=True).fit(X_train) + + clf1_scores = clf1.score_samples(X_test) + clf1_decisions = clf1.decision_function(X_test) + + clf2_scores = clf2.score_samples(X_test) + clf2_decisions = clf2.decision_function(X_test) + + assert_allclose( + clf1_scores, + clf1_decisions + clf1.offset_, + ) + assert_allclose( + clf2_scores, + clf2_decisions + clf2.offset_, + ) + assert_allclose(clf1_scores, clf2_scores) + + +def test_novelty_errors(): + X = iris.data + + # check errors for novelty=False + 
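+    # With novelty=False only fit_predict is available; the three prediction
+    # methods are surfaced lazily as AttributeError, with the novelty hint
+    # chained as __cause__, which is exactly what the loop below asserts.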
clf = neighbors.LocalOutlierFactor() + clf.fit(X) + # predict, decision_function and score_samples raise ValueError + for method in ["predict", "decision_function", "score_samples"]: + outer_msg = f"'LocalOutlierFactor' has no attribute '{method}'" + inner_msg = "{} is not available when novelty=False".format(method) + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + getattr(clf, method) + + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + # check errors for novelty=True + clf = neighbors.LocalOutlierFactor(novelty=True) + + outer_msg = "'LocalOutlierFactor' has no attribute 'fit_predict'" + inner_msg = "fit_predict is not available when novelty=True" + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + getattr(clf, "fit_predict") + + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + +def test_novelty_training_scores(global_dtype): + # check that the scores of the training samples are still accessible + # when novelty=True through the negative_outlier_factor_ attribute + X = iris.data.astype(global_dtype) + + # fit with novelty=False + clf_1 = neighbors.LocalOutlierFactor() + clf_1.fit(X) + scores_1 = clf_1.negative_outlier_factor_ + + # fit with novelty=True + clf_2 = neighbors.LocalOutlierFactor(novelty=True) + clf_2.fit(X) + scores_2 = clf_2.negative_outlier_factor_ + + assert_allclose(scores_1, scores_2) + + +def test_hasattr_prediction(): + # check availability of prediction methods depending on novelty value. + X = [[1, 1], [1, 2], [2, 1]] + + # when novelty=True + clf = neighbors.LocalOutlierFactor(novelty=True) + clf.fit(X) + assert hasattr(clf, "predict") + assert hasattr(clf, "decision_function") + assert hasattr(clf, "score_samples") + assert not hasattr(clf, "fit_predict") + + # when novelty=False + clf = neighbors.LocalOutlierFactor(novelty=False) + clf.fit(X) + assert hasattr(clf, "fit_predict") + assert not hasattr(clf, "predict") + assert not hasattr(clf, "decision_function") + assert not hasattr(clf, "score_samples") + + +@parametrize_with_checks([neighbors.LocalOutlierFactor(novelty=True)]) +def test_novelty_true_common_tests(estimator, check): + # the common tests are run for the default LOF (novelty=False). + # here we run these common tests for LOF when novelty=True + check(estimator) + + +@pytest.mark.parametrize("expected_outliers", [30, 53]) +def test_predicted_outlier_number(expected_outliers): + # the number of predicted outliers should be equal to the number of + # expected outliers unless there are ties in the abnormality scores. 
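+    # Worked example: expected_outliers=30 on the 150-sample iris data yields
+    # contamination = 30 / 150 = 0.2, so fit_predict should flag 30 points
+    # unless tied scores force a different cutoff.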
+ X = iris.data + n_samples = X.shape[0] + contamination = float(expected_outliers) / n_samples + + clf = neighbors.LocalOutlierFactor(contamination=contamination) + y_pred = clf.fit_predict(X) + + num_outliers = np.sum(y_pred != 1) + if num_outliers != expected_outliers: + y_dec = clf.negative_outlier_factor_ + check_outlier_corruption(num_outliers, expected_outliers, y_dec) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse(csr_container): + # LocalOutlierFactor must support CSR inputs + # TODO: compare results on dense and sparse data as proposed in: + # https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186 + X = csr_container(iris.data) + + lof = neighbors.LocalOutlierFactor(novelty=True) + lof.fit(X) + lof.predict(X) + lof.score_samples(X) + lof.decision_function(X) + + lof = neighbors.LocalOutlierFactor(novelty=False) + lof.fit_predict(X) + + +def test_lof_error_n_neighbors_too_large(): + """Check that we raise a proper error message when n_neighbors == n_samples. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/17207 + """ + X = np.ones((7, 7)) + + msg = ( + "Expected n_neighbors < n_samples_fit, but n_neighbors = 1, " + "n_samples_fit = 1, n_samples = 1" + ) + with pytest.raises(ValueError, match=msg): + lof = neighbors.LocalOutlierFactor(n_neighbors=1).fit(X[:1]) + + lof = neighbors.LocalOutlierFactor(n_neighbors=2).fit(X[:2]) + assert lof.n_samples_fit_ == 2 + + msg = ( + "Expected n_neighbors < n_samples_fit, but n_neighbors = 2, " + "n_samples_fit = 2, n_samples = 2" + ) + with pytest.raises(ValueError, match=msg): + lof.kneighbors(None, n_neighbors=2) + + distances, indices = lof.kneighbors(None, n_neighbors=1) + assert distances.shape == (2, 1) + assert indices.shape == (2, 1) + + msg = ( + "Expected n_neighbors <= n_samples_fit, but n_neighbors = 3, " + "n_samples_fit = 2, n_samples = 7" + ) + with pytest.raises(ValueError, match=msg): + lof.kneighbors(X, n_neighbors=3) + + ( + distances, + indices, + ) = lof.kneighbors(X, n_neighbors=2) + assert distances.shape == (7, 2) + assert indices.shape == (7, 2) + + +@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]) +@pytest.mark.parametrize("novelty", [True, False]) +@pytest.mark.parametrize("contamination", [0.5, "auto"]) +def test_lof_input_dtype_preservation(global_dtype, algorithm, contamination, novelty): + """Check that the fitted attributes are stored using the data type of X.""" + X = iris.data.astype(global_dtype, copy=False) + + iso = neighbors.LocalOutlierFactor( + n_neighbors=5, algorithm=algorithm, contamination=contamination, novelty=novelty + ) + iso.fit(X) + + assert iso.negative_outlier_factor_.dtype == global_dtype + + for method in ("score_samples", "decision_function"): + if hasattr(iso, method): + y_pred = getattr(iso, method)(X) + assert y_pred.dtype == global_dtype + + +@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]) +@pytest.mark.parametrize("novelty", [True, False]) +@pytest.mark.parametrize("contamination", [0.5, "auto"]) +def test_lof_dtype_equivalence(algorithm, novelty, contamination): + """Check the equivalence of the results with 32 and 64 bits input.""" + + inliers = iris.data[:50] # setosa iris are really distinct from others + outliers = iris.data[-5:] # virginica will be considered as outliers + # lower the precision of the input data to check that we have an equivalence when + # making the computation in 32 and 64 bits. 
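+    # Building X in float32 first means X_32 and X_64 start from the same
+    # already-rounded values, so any remaining discrepancy is attributable to
+    # the computation itself rather than to the inputs.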
+ X = np.concatenate([inliers, outliers], axis=0).astype(np.float32) + + lof_32 = neighbors.LocalOutlierFactor( + algorithm=algorithm, novelty=novelty, contamination=contamination + ) + X_32 = X.astype(np.float32, copy=True) + lof_32.fit(X_32) + + lof_64 = neighbors.LocalOutlierFactor( + algorithm=algorithm, novelty=novelty, contamination=contamination + ) + X_64 = X.astype(np.float64, copy=True) + lof_64.fit(X_64) + + assert_allclose(lof_32.negative_outlier_factor_, lof_64.negative_outlier_factor_) + + for method in ("score_samples", "decision_function", "predict", "fit_predict"): + if hasattr(lof_32, method): + y_pred_32 = getattr(lof_32, method)(X_32) + y_pred_64 = getattr(lof_64, method)(X_64) + assert_allclose(y_pred_32, y_pred_64, atol=0.0002) + + +def test_lof_duplicate_samples(): + """ + Check that LocalOutlierFactor raises a warning when duplicate values + in the training data cause inaccurate results. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27839 + """ + + rng = np.random.default_rng(0) + + x = rng.permutation( + np.hstack( + [ + [0.1] * 1000, # constant values + np.linspace(0.1, 0.3, num=3000), + rng.random(500) * 100, # the clear outliers + ] + ) + ) + X = x.reshape(-1, 1) + + error_msg = ( + "Duplicate values are leading to incorrect results. " + "Increase the number of neighbors for more accurate results." + ) + + lof = neighbors.LocalOutlierFactor(n_neighbors=5, contamination=0.1) + + # Catch the warning + with pytest.warns(UserWarning, match=re.escape(error_msg)): + lof.fit_predict(X) diff --git a/.venv/Lib/site-packages/sklearn/neural_network/__init__.py b/.venv/Lib/site-packages/sklearn/neural_network/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..adbcdbc687e4d059a7ee0094803cf765c2334881 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neural_network/__init__.py @@ -0,0 +1,9 @@ +"""Models based on neural networks.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._multilayer_perceptron import MLPClassifier, MLPRegressor +from ._rbm import BernoulliRBM + +__all__ = ["BernoulliRBM", "MLPClassifier", "MLPRegressor"] diff --git a/.venv/Lib/site-packages/sklearn/neural_network/_stochastic_optimizers.py b/.venv/Lib/site-packages/sklearn/neural_network/_stochastic_optimizers.py new file mode 100644 index 0000000000000000000000000000000000000000..fe1e72860681ad8f248d3781d3d68b546f8ceaa4 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neural_network/_stochastic_optimizers.py @@ -0,0 +1,287 @@ +"""Stochastic optimization methods for MLP""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np + + +class BaseOptimizer: + """Base (Stochastic) gradient descent optimizer + + Parameters + ---------- + learning_rate_init : float, default=0.1 + The initial learning rate used. It controls the step-size in updating + the weights + + Attributes + ---------- + learning_rate : float + the current learning rate + """ + + def __init__(self, learning_rate_init=0.1): + self.learning_rate_init = learning_rate_init + self.learning_rate = float(learning_rate_init) + + def update_params(self, params, grads): + """Update parameters with given gradients + + Parameters + ---------- + params : list of length = len(coefs_) + len(intercepts_) + The concatenated list containing coefs_ and intercepts_ in MLP + model. 
Used for initializing velocities and updating params + + grads : list of length = len(params) + Containing gradients with respect to coefs_ and intercepts_ in MLP + model. So length should be aligned with params + """ + updates = self._get_updates(grads) + for param, update in zip((p for p in params), updates): + param += update + + def iteration_ends(self, time_step): + """Perform update to learning rate and potentially other states at the + end of an iteration + """ + pass + + def trigger_stopping(self, msg, verbose): + """Decides whether it is time to stop training + + Parameters + ---------- + msg : str + Message passed in for verbose output + + verbose : bool + Print message to stdin if True + + Returns + ------- + is_stopping : bool + True if training needs to stop + """ + if verbose: + print(msg + " Stopping.") + return True + + +class SGDOptimizer(BaseOptimizer): + """Stochastic gradient descent optimizer with momentum + + Parameters + ---------- + params : list, length = len(coefs_) + len(intercepts_) + The concatenated list containing coefs_ and intercepts_ in MLP model. + Used for initializing velocities and updating params + + learning_rate_init : float, default=0.1 + The initial learning rate used. It controls the step-size in updating + the weights + + lr_schedule : {'constant', 'adaptive', 'invscaling'}, default='constant' + Learning rate schedule for weight updates. + + -'constant', is a constant learning rate given by + 'learning_rate_init'. + + -'invscaling' gradually decreases the learning rate 'learning_rate_' at + each time step 't' using an inverse scaling exponent of 'power_t'. + learning_rate_ = learning_rate_init / pow(t, power_t) + + -'adaptive', keeps the learning rate constant to + 'learning_rate_init' as long as the training keeps decreasing. + Each time 2 consecutive epochs fail to decrease the training loss by + tol, or fail to increase validation score by tol if 'early_stopping' + is on, the current learning rate is divided by 5. + + momentum : float, default=0.9 + Value of momentum used, must be larger than or equal to 0 + + nesterov : bool, default=True + Whether to use nesterov's momentum or not. Use nesterov's if True + + power_t : float, default=0.5 + Power of time step 't' in inverse scaling. See `lr_schedule` for + more details. + + Attributes + ---------- + learning_rate : float + the current learning rate + + velocities : list, length = len(params) + velocities that are used to update params + """ + + def __init__( + self, + params, + learning_rate_init=0.1, + lr_schedule="constant", + momentum=0.9, + nesterov=True, + power_t=0.5, + ): + super().__init__(learning_rate_init) + + self.lr_schedule = lr_schedule + self.momentum = momentum + self.nesterov = nesterov + self.power_t = power_t + self.velocities = [np.zeros_like(param) for param in params] + + def iteration_ends(self, time_step): + """Perform updates to learning rate and potential other states at the + end of an iteration + + Parameters + ---------- + time_step : int + number of training samples trained on so far, used to update + learning rate for 'invscaling' + """ + if self.lr_schedule == "invscaling": + self.learning_rate = ( + float(self.learning_rate_init) / (time_step + 1) ** self.power_t + ) + + def trigger_stopping(self, msg, verbose): + if self.lr_schedule != "adaptive": + if verbose: + print(msg + " Stopping.") + return True + + if self.learning_rate <= 1e-6: + if verbose: + print(msg + " Learning rate too small. 
Stopping.") + return True + + self.learning_rate /= 5.0 + if verbose: + print(msg + " Setting learning rate to %f" % self.learning_rate) + return False + + def _get_updates(self, grads): + """Get the values used to update params with given gradients + + Parameters + ---------- + grads : list, length = len(coefs_) + len(intercepts_) + Containing gradients with respect to coefs_ and intercepts_ in MLP + model. So length should be aligned with params + + Returns + ------- + updates : list, length = len(grads) + The values to add to params + """ + updates = [ + self.momentum * velocity - self.learning_rate * grad + for velocity, grad in zip(self.velocities, grads) + ] + self.velocities = updates + + if self.nesterov: + updates = [ + self.momentum * velocity - self.learning_rate * grad + for velocity, grad in zip(self.velocities, grads) + ] + + return updates + + +class AdamOptimizer(BaseOptimizer): + """Stochastic gradient descent optimizer with Adam + + Note: All default values are from the original Adam paper + + Parameters + ---------- + params : list, length = len(coefs_) + len(intercepts_) + The concatenated list containing coefs_ and intercepts_ in MLP model. + Used for initializing velocities and updating params + + learning_rate_init : float, default=0.001 + The initial learning rate used. It controls the step-size in updating + the weights + + beta_1 : float, default=0.9 + Exponential decay rate for estimates of first moment vector, should be + in [0, 1) + + beta_2 : float, default=0.999 + Exponential decay rate for estimates of second moment vector, should be + in [0, 1) + + epsilon : float, default=1e-8 + Value for numerical stability + + Attributes + ---------- + learning_rate : float + The current learning rate + + t : int + Timestep + + ms : list, length = len(params) + First moment vectors + + vs : list, length = len(params) + Second moment vectors + + References + ---------- + :arxiv:`Kingma, Diederik, and Jimmy Ba (2014) "Adam: A method for + stochastic optimization." <1412.6980> + """ + + def __init__( + self, params, learning_rate_init=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8 + ): + super().__init__(learning_rate_init) + + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.t = 0 + self.ms = [np.zeros_like(param) for param in params] + self.vs = [np.zeros_like(param) for param in params] + + def _get_updates(self, grads): + """Get the values used to update params with given gradients + + Parameters + ---------- + grads : list, length = len(coefs_) + len(intercepts_) + Containing gradients with respect to coefs_ and intercepts_ in MLP + model. 
So length should be aligned with params + + Returns + ------- + updates : list, length = len(grads) + The values to add to params + """ + self.t += 1 + self.ms = [ + self.beta_1 * m + (1 - self.beta_1) * grad + for m, grad in zip(self.ms, grads) + ] + self.vs = [ + self.beta_2 * v + (1 - self.beta_2) * (grad**2) + for v, grad in zip(self.vs, grads) + ] + self.learning_rate = ( + self.learning_rate_init + * np.sqrt(1 - self.beta_2**self.t) + / (1 - self.beta_1**self.t) + ) + updates = [ + -self.learning_rate * m / (np.sqrt(v) + self.epsilon) + for m, v in zip(self.ms, self.vs) + ] + return updates diff --git a/.venv/Lib/site-packages/sklearn/neural_network/tests/__init__.py b/.venv/Lib/site-packages/sklearn/neural_network/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/Lib/site-packages/sklearn/neural_network/tests/test_base.py b/.venv/Lib/site-packages/sklearn/neural_network/tests/test_base.py new file mode 100644 index 0000000000000000000000000000000000000000..5f40efd5af5af1ee689305babb7d931428e8c595 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neural_network/tests/test_base.py @@ -0,0 +1,29 @@ +import numpy as np +import pytest + +from sklearn.neural_network._base import binary_log_loss, log_loss + + +def test_binary_log_loss_1_prob_finite(): + # y_proba is equal to one should result in a finite logloss + y_true = np.array([[0, 0, 1]]).T + y_prob = np.array([[0.9, 1.0, 1.0]]).T + + loss = binary_log_loss(y_true, y_prob) + assert np.isfinite(loss) + + +@pytest.mark.parametrize( + "y_true, y_prob", + [ + ( + np.array([[1, 0, 0], [0, 1, 0]]), + np.array([[0.0, 1.0, 0.0], [0.9, 0.05, 0.05]]), + ), + (np.array([[0, 0, 1]]).T, np.array([[0.9, 1.0, 1.0]]).T), + ], +) +def test_log_loss_1_prob_finite(y_true, y_prob): + # y_proba is equal to 1 should result in a finite logloss + loss = log_loss(y_true, y_prob) + assert np.isfinite(loss) diff --git a/.venv/Lib/site-packages/sklearn/neural_network/tests/test_mlp.py b/.venv/Lib/site-packages/sklearn/neural_network/tests/test_mlp.py new file mode 100644 index 0000000000000000000000000000000000000000..3f94b25defde4747f720f2facb0d09b2059ecb26 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neural_network/tests/test_mlp.py @@ -0,0 +1,1021 @@ +""" +Testing for Multi-layer Perceptron module (sklearn.neural_network) +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re +import sys +import warnings +from io import StringIO + +import joblib +import numpy as np +import pytest + +from sklearn.datasets import ( + load_digits, + load_iris, + make_multilabel_classification, + make_regression, +) +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics import roc_auc_score +from sklearn.neural_network import MLPClassifier, MLPRegressor +from sklearn.preprocessing import LabelBinarizer, MinMaxScaler, scale +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.fixes import CSR_CONTAINERS + +ACTIVATION_TYPES = ["identity", "logistic", "tanh", "relu"] + +X_digits, y_digits = load_digits(n_class=3, return_X_y=True) + +X_digits_multi = MinMaxScaler().fit_transform(X_digits[:200]) +y_digits_multi = y_digits[:200] + +X_digits, y_digits = load_digits(n_class=2, return_X_y=True) + +X_digits_binary = MinMaxScaler().fit_transform(X_digits[:200]) +y_digits_binary = y_digits[:200] + +classification_datasets = [ + 
(X_digits_multi, y_digits_multi), + (X_digits_binary, y_digits_binary), +] + +X_reg, y_reg = make_regression( + n_samples=200, n_features=10, bias=20.0, noise=100.0, random_state=7 +) +y_reg = scale(y_reg) +regression_datasets = [(X_reg, y_reg)] + +iris = load_iris() + +X_iris = iris.data +y_iris = iris.target + + +def test_alpha(): + # Test that larger alpha yields weights closer to zero + X = X_digits_binary[:100] + y = y_digits_binary[:100] + + alpha_vectors = [] + alpha_values = np.arange(2) + absolute_sum = lambda x: np.sum(np.abs(x)) + + for alpha in alpha_values: + mlp = MLPClassifier(hidden_layer_sizes=10, alpha=alpha, random_state=1) + with ignore_warnings(category=ConvergenceWarning): + mlp.fit(X, y) + alpha_vectors.append( + np.array([absolute_sum(mlp.coefs_[0]), absolute_sum(mlp.coefs_[1])]) + ) + + for i in range(len(alpha_values) - 1): + assert (alpha_vectors[i] > alpha_vectors[i + 1]).all() + + +def test_fit(): + # Test that the algorithm solution is equal to a worked out example. + X = np.array([[0.6, 0.8, 0.7]]) + y = np.array([0]) + mlp = MLPClassifier( + solver="sgd", + learning_rate_init=0.1, + alpha=0.1, + activation="logistic", + random_state=1, + max_iter=1, + hidden_layer_sizes=2, + momentum=0, + ) + # set weights + mlp.coefs_ = [0] * 2 + mlp.intercepts_ = [0] * 2 + mlp.n_outputs_ = 1 + mlp.coefs_[0] = np.array([[0.1, 0.2], [0.3, 0.1], [0.5, 0]]) + mlp.coefs_[1] = np.array([[0.1], [0.2]]) + mlp.intercepts_[0] = np.array([0.1, 0.1]) + mlp.intercepts_[1] = np.array([1.0]) + mlp._coef_grads = [] * 2 + mlp._intercept_grads = [] * 2 + mlp.n_features_in_ = 3 + + # Initialize parameters + mlp.n_iter_ = 0 + mlp.learning_rate_ = 0.1 + + # Compute the number of layers + mlp.n_layers_ = 3 + + # Pre-allocate gradient matrices + mlp._coef_grads = [0] * (mlp.n_layers_ - 1) + mlp._intercept_grads = [0] * (mlp.n_layers_ - 1) + + mlp.out_activation_ = "logistic" + mlp.t_ = 0 + mlp.best_loss_ = np.inf + mlp.loss_curve_ = [] + mlp._no_improvement_count = 0 + mlp._intercept_velocity = [ + np.zeros_like(intercepts) for intercepts in mlp.intercepts_ + ] + mlp._coef_velocity = [np.zeros_like(coefs) for coefs in mlp.coefs_] + + mlp.partial_fit(X, y, classes=[0, 1]) + # Manually worked out example + # h1 = g(X1 * W_i1 + b11) = g(0.6 * 0.1 + 0.8 * 0.3 + 0.7 * 0.5 + 0.1) + # = 0.679178699175393 + # h2 = g(X2 * W_i2 + b12) = g(0.6 * 0.2 + 0.8 * 0.1 + 0.7 * 0 + 0.1) + # = 0.574442516811659 + # o1 = g(h * W2 + b21) = g(0.679 * 0.1 + 0.574 * 0.2 + 1) + # = 0.7654329236196236 + # d21 = -(0 - 0.765) = 0.765 + # d11 = (1 - 0.679) * 0.679 * 0.765 * 0.1 = 0.01667 + # d12 = (1 - 0.574) * 0.574 * 0.765 * 0.2 = 0.0374 + # W1grad11 = X1 * d11 + alpha * W11 = 0.6 * 0.01667 + 0.1 * 0.1 = 0.0200 + # W1grad11 = X1 * d12 + alpha * W12 = 0.6 * 0.0374 + 0.1 * 0.2 = 0.04244 + # W1grad21 = X2 * d11 + alpha * W13 = 0.8 * 0.01667 + 0.1 * 0.3 = 0.043336 + # W1grad22 = X2 * d12 + alpha * W14 = 0.8 * 0.0374 + 0.1 * 0.1 = 0.03992 + # W1grad31 = X3 * d11 + alpha * W15 = 0.6 * 0.01667 + 0.1 * 0.5 = 0.060002 + # W1grad32 = X3 * d12 + alpha * W16 = 0.6 * 0.0374 + 0.1 * 0 = 0.02244 + # W2grad1 = h1 * d21 + alpha * W21 = 0.679 * 0.765 + 0.1 * 0.1 = 0.5294 + # W2grad2 = h2 * d21 + alpha * W22 = 0.574 * 0.765 + 0.1 * 0.2 = 0.45911 + # b1grad1 = d11 = 0.01667 + # b1grad2 = d12 = 0.0374 + # b2grad = d21 = 0.765 + # W1 = W1 - eta * [W1grad11, .., W1grad32] = [[0.1, 0.2], [0.3, 0.1], + # [0.5, 0]] - 0.1 * [[0.0200, 0.04244], [0.043336, 0.03992], + # [0.060002, 0.02244]] = [[0.098, 0.195756], [0.2956664, + # 0.096008], [0.4939998, 
-0.002244]] + # W2 = W2 - eta * [W2grad1, W2grad2] = [[0.1], [0.2]] - 0.1 * + # [[0.5294], [0.45911]] = [[0.04706], [0.154089]] + # b1 = b1 - eta * [b1grad1, b1grad2] = 0.1 - 0.1 * [0.01667, 0.0374] + # = [0.098333, 0.09626] + # b2 = b2 - eta * b2grad = 1.0 - 0.1 * 0.765 = 0.9235 + assert_almost_equal( + mlp.coefs_[0], + np.array([[0.098, 0.195756], [0.2956664, 0.096008], [0.4939998, -0.002244]]), + decimal=3, + ) + assert_almost_equal(mlp.coefs_[1], np.array([[0.04706], [0.154089]]), decimal=3) + assert_almost_equal(mlp.intercepts_[0], np.array([0.098333, 0.09626]), decimal=3) + assert_almost_equal(mlp.intercepts_[1], np.array(0.9235), decimal=3) + # Testing output + # h1 = g(X1 * W_i1 + b11) = g(0.6 * 0.098 + 0.8 * 0.2956664 + + # 0.7 * 0.4939998 + 0.098333) = 0.677 + # h2 = g(X2 * W_i2 + b12) = g(0.6 * 0.195756 + 0.8 * 0.096008 + + # 0.7 * -0.002244 + 0.09626) = 0.572 + # o1 = h * W2 + b21 = 0.677 * 0.04706 + + # 0.572 * 0.154089 + 0.9235 = 1.043 + # prob = sigmoid(o1) = 0.739 + assert_almost_equal(mlp.predict_proba(X)[0, 1], 0.739, decimal=3) + + +def test_gradient(): + # Test gradient. + + # This makes sure that the activation functions and their derivatives + # are correct. The numerical and analytical computation of the gradient + # should be close. + for n_labels in [2, 3]: + n_samples = 5 + n_features = 10 + random_state = np.random.RandomState(seed=42) + X = random_state.rand(n_samples, n_features) + y = 1 + np.mod(np.arange(n_samples) + 1, n_labels) + Y = LabelBinarizer().fit_transform(y) + + for activation in ACTIVATION_TYPES: + mlp = MLPClassifier( + activation=activation, + hidden_layer_sizes=10, + solver="lbfgs", + alpha=1e-5, + learning_rate_init=0.2, + max_iter=1, + random_state=1, + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp.fit(X, y) + + theta = np.hstack([l.ravel() for l in mlp.coefs_ + mlp.intercepts_]) + + layer_units = [X.shape[1]] + [mlp.hidden_layer_sizes] + [mlp.n_outputs_] + + activations = [] + deltas = [] + coef_grads = [] + intercept_grads = [] + + activations.append(X) + for i in range(mlp.n_layers_ - 1): + activations.append(np.empty((X.shape[0], layer_units[i + 1]))) + deltas.append(np.empty((X.shape[0], layer_units[i + 1]))) + + fan_in = layer_units[i] + fan_out = layer_units[i + 1] + coef_grads.append(np.empty((fan_in, fan_out))) + intercept_grads.append(np.empty(fan_out)) + + # analytically compute the gradients + def loss_grad_fun(t): + return mlp._loss_grad_lbfgs( + t, X, Y, activations, deltas, coef_grads, intercept_grads + ) + + [value, grad] = loss_grad_fun(theta) + numgrad = np.zeros(np.size(theta)) + n = np.size(theta, 0) + E = np.eye(n) + epsilon = 1e-5 + # numerically compute the gradients + for i in range(n): + dtheta = E[:, i] * epsilon + numgrad[i] = ( + loss_grad_fun(theta + dtheta)[0] - loss_grad_fun(theta - dtheta)[0] + ) / (epsilon * 2.0) + assert_almost_equal(numgrad, grad) + + +@pytest.mark.parametrize("X,y", classification_datasets) +def test_lbfgs_classification(X, y): + # Test lbfgs on classification. + # It should achieve a score higher than 0.95 for the binary and multi-class + # versions of the digits dataset. 
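+    # The first 150 samples are used for fitting; the held-out remainder is
+    # only used to check the shape and dtype of the predictions, while the
+    # 0.95 accuracy threshold is evaluated on the training portion.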
+ X_train = X[:150] + y_train = y[:150] + X_test = X[150:] + expected_shape_dtype = (X_test.shape[0], y_train.dtype.kind) + + for activation in ACTIVATION_TYPES: + mlp = MLPClassifier( + solver="lbfgs", + hidden_layer_sizes=50, + max_iter=150, + shuffle=True, + random_state=1, + activation=activation, + ) + mlp.fit(X_train, y_train) + y_predict = mlp.predict(X_test) + assert mlp.score(X_train, y_train) > 0.95 + assert (y_predict.shape[0], y_predict.dtype.kind) == expected_shape_dtype + + +@pytest.mark.parametrize("X,y", regression_datasets) +def test_lbfgs_regression(X, y): + # Test lbfgs on the regression dataset. + for activation in ACTIVATION_TYPES: + mlp = MLPRegressor( + solver="lbfgs", + hidden_layer_sizes=50, + max_iter=200, + tol=1e-3, + shuffle=True, + random_state=1, + activation=activation, + ) + mlp.fit(X, y) + if activation == "identity": + assert mlp.score(X, y) > 0.80 + else: + # Non linear models perform much better than linear bottleneck: + assert mlp.score(X, y) > 0.98 + + +@pytest.mark.parametrize("X,y", classification_datasets) +def test_lbfgs_classification_maxfun(X, y): + # Test lbfgs parameter max_fun. + # It should independently limit the number of iterations for lbfgs. + max_fun = 10 + # classification tests + for activation in ACTIVATION_TYPES: + mlp = MLPClassifier( + solver="lbfgs", + hidden_layer_sizes=50, + max_iter=150, + max_fun=max_fun, + shuffle=True, + random_state=1, + activation=activation, + ) + with pytest.warns(ConvergenceWarning): + mlp.fit(X, y) + assert max_fun >= mlp.n_iter_ + + +@pytest.mark.parametrize("X,y", regression_datasets) +def test_lbfgs_regression_maxfun(X, y): + # Test lbfgs parameter max_fun. + # It should independently limit the number of iterations for lbfgs. + max_fun = 10 + # regression tests + for activation in ACTIVATION_TYPES: + mlp = MLPRegressor( + solver="lbfgs", + hidden_layer_sizes=50, + tol=0.0, + max_iter=150, + max_fun=max_fun, + shuffle=True, + random_state=1, + activation=activation, + ) + with pytest.warns(ConvergenceWarning): + mlp.fit(X, y) + assert max_fun >= mlp.n_iter_ + + +def test_learning_rate_warmstart(): + # Tests that warm_start reuse past solutions. + X = [[3, 2], [1, 6], [5, 6], [-2, -4]] + y = [1, 1, 1, 0] + for learning_rate in ["invscaling", "constant"]: + mlp = MLPClassifier( + solver="sgd", + hidden_layer_sizes=4, + learning_rate=learning_rate, + max_iter=1, + power_t=0.25, + warm_start=True, + ) + with ignore_warnings(category=ConvergenceWarning): + mlp.fit(X, y) + prev_eta = mlp._optimizer.learning_rate + mlp.fit(X, y) + post_eta = mlp._optimizer.learning_rate + + if learning_rate == "constant": + assert prev_eta == post_eta + elif learning_rate == "invscaling": + assert mlp.learning_rate_init / pow(8 + 1, mlp.power_t) == post_eta + + +def test_multilabel_classification(): + # Test that multi-label classification works as expected. 
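+    # In the multilabel setting a sample may carry several labels at once,
+    # so the output layer uses one independent logistic unit per label
+    # rather than a softmax over mutually exclusive classes.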
+ # test fit method + X, y = make_multilabel_classification( + n_samples=50, random_state=0, return_indicator=True + ) + mlp = MLPClassifier( + solver="lbfgs", + hidden_layer_sizes=50, + alpha=1e-5, + max_iter=150, + random_state=0, + activation="logistic", + learning_rate_init=0.2, + ) + mlp.fit(X, y) + assert mlp.score(X, y) > 0.97 + + # test partial fit method + mlp = MLPClassifier( + solver="sgd", + hidden_layer_sizes=50, + max_iter=150, + random_state=0, + activation="logistic", + alpha=1e-5, + learning_rate_init=0.2, + ) + for i in range(100): + mlp.partial_fit(X, y, classes=[0, 1, 2, 3, 4]) + assert mlp.score(X, y) > 0.9 + + # Make sure early stopping still work now that splitting is stratified by + # default (it is disabled for multilabel classification) + mlp = MLPClassifier(early_stopping=True) + mlp.fit(X, y).predict(X) + + +def test_multioutput_regression(): + # Test that multi-output regression works as expected + X, y = make_regression(n_samples=200, n_targets=5, random_state=11) + mlp = MLPRegressor( + solver="lbfgs", hidden_layer_sizes=50, max_iter=200, tol=1e-2, random_state=1 + ) + mlp.fit(X, y) + assert mlp.score(X, y) > 0.9 + + +def test_partial_fit_classes_error(): + # Tests that passing different classes to partial_fit raises an error + X = [[3, 2]] + y = [0] + clf = MLPClassifier(solver="sgd") + clf.partial_fit(X, y, classes=[0, 1]) + with pytest.raises(ValueError): + clf.partial_fit(X, y, classes=[1, 2]) + + +def test_partial_fit_classification(): + # Test partial_fit on classification. + # `partial_fit` should yield the same results as 'fit' for binary and + # multi-class classification. + for X, y in classification_datasets: + mlp = MLPClassifier( + solver="sgd", + max_iter=100, + random_state=1, + tol=0, + alpha=1e-5, + learning_rate_init=0.2, + ) + + with ignore_warnings(category=ConvergenceWarning): + mlp.fit(X, y) + pred1 = mlp.predict(X) + mlp = MLPClassifier( + solver="sgd", random_state=1, alpha=1e-5, learning_rate_init=0.2 + ) + for i in range(100): + mlp.partial_fit(X, y, classes=np.unique(y)) + pred2 = mlp.predict(X) + assert_array_equal(pred1, pred2) + assert mlp.score(X, y) > 0.95 + + +def test_partial_fit_unseen_classes(): + # Non regression test for bug 6994 + # Tests for labeling errors in partial fit + + clf = MLPClassifier(random_state=0) + clf.partial_fit([[1], [2], [3]], ["a", "b", "c"], classes=["a", "b", "c", "d"]) + clf.partial_fit([[4]], ["d"]) + assert clf.score([[1], [2], [3], [4]], ["a", "b", "c", "d"]) > 0 + + +def test_partial_fit_regression(): + # Test partial_fit on regression. + # `partial_fit` should yield the same results as 'fit' for regression. + X = X_reg + y = y_reg + + for momentum in [0, 0.9]: + mlp = MLPRegressor( + solver="sgd", + max_iter=100, + activation="relu", + random_state=1, + learning_rate_init=0.01, + batch_size=X.shape[0], + momentum=momentum, + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp.fit(X, y) + pred1 = mlp.predict(X) + mlp = MLPRegressor( + solver="sgd", + activation="relu", + learning_rate_init=0.01, + random_state=1, + batch_size=X.shape[0], + momentum=momentum, + ) + for i in range(100): + mlp.partial_fit(X, y) + + pred2 = mlp.predict(X) + assert_allclose(pred1, pred2) + score = mlp.score(X, y) + assert score > 0.65 + + +def test_partial_fit_errors(): + # Test partial_fit error handling. 
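+    # Two failure modes are covered: passing a `classes` argument that does
+    # not contain the labels actually present in y, and requesting
+    # partial_fit from the batch-only lbfgs solver, which does not expose
+    # the method at all.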
+ X = [[3, 2], [1, 6]] + y = [1, 0] + + # no classes passed + with pytest.raises(ValueError): + MLPClassifier(solver="sgd").partial_fit(X, y, classes=[2]) + + # lbfgs doesn't support partial_fit + assert not hasattr(MLPClassifier(solver="lbfgs"), "partial_fit") + + +def test_nonfinite_params(): + # Check that MLPRegressor throws ValueError when dealing with non-finite + # parameter values + rng = np.random.RandomState(0) + n_samples = 10 + fmax = np.finfo(np.float64).max + X = fmax * rng.uniform(size=(n_samples, 2)) + y = rng.standard_normal(size=n_samples) + + clf = MLPRegressor() + msg = ( + "Solver produced non-finite parameter weights. The input data may contain large" + " values and need to be preprocessed." + ) + with pytest.raises(ValueError, match=msg): + with warnings.catch_warnings(): + # RuntimeWarning: overflow encountered in square + warnings.simplefilter("ignore") + clf.fit(X, y) + + +def test_predict_proba_binary(): + # Test that predict_proba works as expected for binary class. + X = X_digits_binary[:50] + y = y_digits_binary[:50] + + clf = MLPClassifier(hidden_layer_sizes=5, activation="logistic", random_state=1) + with ignore_warnings(category=ConvergenceWarning): + clf.fit(X, y) + y_proba = clf.predict_proba(X) + y_log_proba = clf.predict_log_proba(X) + + (n_samples, n_classes) = y.shape[0], 2 + + proba_max = y_proba.argmax(axis=1) + proba_log_max = y_log_proba.argmax(axis=1) + + assert y_proba.shape == (n_samples, n_classes) + assert_array_equal(proba_max, proba_log_max) + assert_allclose(y_log_proba, np.log(y_proba)) + + assert roc_auc_score(y, y_proba[:, 1]) == 1.0 + + +def test_predict_proba_multiclass(): + # Test that predict_proba works as expected for multi class. + X = X_digits_multi[:10] + y = y_digits_multi[:10] + + clf = MLPClassifier(hidden_layer_sizes=5) + with ignore_warnings(category=ConvergenceWarning): + clf.fit(X, y) + y_proba = clf.predict_proba(X) + y_log_proba = clf.predict_log_proba(X) + + (n_samples, n_classes) = y.shape[0], np.unique(y).size + + proba_max = y_proba.argmax(axis=1) + proba_log_max = y_log_proba.argmax(axis=1) + + assert y_proba.shape == (n_samples, n_classes) + assert_array_equal(proba_max, proba_log_max) + assert_allclose(y_log_proba, np.log(y_proba)) + + +def test_predict_proba_multilabel(): + # Test that predict_proba works as expected for multilabel. 
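+    # As a concrete (hypothetical) illustration, [0.9, 0.8, 0.1] is a valid
+    # multilabel probability row: each label is an independent Bernoulli
+    # output, so rows need not sum to 1.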
+ # Multilabel should not use softmax which makes probabilities sum to 1 + X, Y = make_multilabel_classification( + n_samples=50, random_state=0, return_indicator=True + ) + n_samples, n_classes = Y.shape + + clf = MLPClassifier(solver="lbfgs", hidden_layer_sizes=30, random_state=0) + clf.fit(X, Y) + y_proba = clf.predict_proba(X) + + assert y_proba.shape == (n_samples, n_classes) + assert_array_equal(y_proba > 0.5, Y) + + y_log_proba = clf.predict_log_proba(X) + proba_max = y_proba.argmax(axis=1) + proba_log_max = y_log_proba.argmax(axis=1) + + assert (y_proba.sum(1) - 1).dot(y_proba.sum(1) - 1) > 1e-10 + assert_array_equal(proba_max, proba_log_max) + assert_allclose(y_log_proba, np.log(y_proba)) + + +def test_shuffle(): + # Test that the shuffle parameter affects the training process (it should) + X, y = make_regression(n_samples=50, n_features=5, n_targets=1, random_state=0) + + # The coefficients will be identical if both do or do not shuffle + for shuffle in [True, False]: + mlp1 = MLPRegressor( + hidden_layer_sizes=1, + max_iter=1, + batch_size=1, + random_state=0, + shuffle=shuffle, + ) + mlp2 = MLPRegressor( + hidden_layer_sizes=1, + max_iter=1, + batch_size=1, + random_state=0, + shuffle=shuffle, + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp1.fit(X, y) + mlp2.fit(X, y) + + assert np.array_equal(mlp1.coefs_[0], mlp2.coefs_[0]) + + # The coefficients will be slightly different if shuffle=True + mlp1 = MLPRegressor( + hidden_layer_sizes=1, max_iter=1, batch_size=1, random_state=0, shuffle=True + ) + mlp2 = MLPRegressor( + hidden_layer_sizes=1, max_iter=1, batch_size=1, random_state=0, shuffle=False + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp1.fit(X, y) + mlp2.fit(X, y) + + assert not np.array_equal(mlp1.coefs_[0], mlp2.coefs_[0]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_matrices(csr_container): + # Test that sparse and dense input matrices output the same results. + X = X_digits_binary[:50] + y = y_digits_binary[:50] + X_sparse = csr_container(X) + mlp = MLPClassifier(solver="lbfgs", hidden_layer_sizes=15, random_state=1) + mlp.fit(X, y) + pred1 = mlp.predict(X) + mlp.fit(X_sparse, y) + pred2 = mlp.predict(X_sparse) + assert_almost_equal(pred1, pred2) + pred1 = mlp.predict(X) + pred2 = mlp.predict(X_sparse) + assert_array_equal(pred1, pred2) + + +def test_tolerance(): + # Test tolerance. + # It should force the solver to exit the loop when it converges. + X = [[3, 2], [1, 6]] + y = [1, 0] + clf = MLPClassifier(tol=0.5, max_iter=3000, solver="sgd") + clf.fit(X, y) + assert clf.max_iter > clf.n_iter_ + + +def test_verbose_sgd(): + # Test verbose. 
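+    # stdout is temporarily swapped for a StringIO buffer so the test can
+    # assert that verbose fitting printed per-iteration progress lines.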
+ X = [[3, 2], [1, 6]] + y = [1, 0] + clf = MLPClassifier(solver="sgd", max_iter=2, verbose=10, hidden_layer_sizes=2) + old_stdout = sys.stdout + sys.stdout = output = StringIO() + + with ignore_warnings(category=ConvergenceWarning): + clf.fit(X, y) + clf.partial_fit(X, y) + + sys.stdout = old_stdout + assert "Iteration" in output.getvalue() + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +def test_early_stopping(MLPEstimator): + X = X_digits_binary[:100] + y = y_digits_binary[:100] + tol = 0.2 + mlp_estimator = MLPEstimator( + tol=tol, max_iter=3000, solver="sgd", early_stopping=True + ) + mlp_estimator.fit(X, y) + assert mlp_estimator.max_iter > mlp_estimator.n_iter_ + + assert mlp_estimator.best_loss_ is None + assert isinstance(mlp_estimator.validation_scores_, list) + + valid_scores = mlp_estimator.validation_scores_ + best_valid_score = mlp_estimator.best_validation_score_ + assert max(valid_scores) == best_valid_score + assert best_valid_score + tol > valid_scores[-2] + assert best_valid_score + tol > valid_scores[-1] + + # check that the attributes `validation_scores_` and `best_validation_score_` + # are set to None when `early_stopping=False` + mlp_estimator = MLPEstimator( + tol=tol, max_iter=3000, solver="sgd", early_stopping=False + ) + mlp_estimator.fit(X, y) + assert mlp_estimator.validation_scores_ is None + assert mlp_estimator.best_validation_score_ is None + assert mlp_estimator.best_loss_ is not None + + +def test_adaptive_learning_rate(): + X = [[3, 2], [1, 6]] + y = [1, 0] + clf = MLPClassifier(tol=0.5, max_iter=3000, solver="sgd", learning_rate="adaptive") + clf.fit(X, y) + assert clf.max_iter > clf.n_iter_ + assert 1e-6 > clf._optimizer.learning_rate + + +def test_warm_start(): + X = X_iris + y = y_iris + + y_2classes = np.array([0] * 75 + [1] * 75) + y_3classes = np.array([0] * 40 + [1] * 40 + [2] * 70) + y_3classes_alt = np.array([0] * 50 + [1] * 50 + [3] * 50) + y_4classes = np.array([0] * 37 + [1] * 37 + [2] * 38 + [3] * 38) + y_5classes = np.array([0] * 30 + [1] * 30 + [2] * 30 + [3] * 30 + [4] * 30) + + # No error raised + clf = MLPClassifier( + hidden_layer_sizes=2, solver="lbfgs", warm_start=True, random_state=42, tol=1e-2 + ).fit(X, y) + clf.fit(X, y) + clf.fit(X, y_3classes) + + for y_i in (y_2classes, y_3classes_alt, y_4classes, y_5classes): + clf = MLPClassifier( + hidden_layer_sizes=2, + solver="lbfgs", + warm_start=True, + random_state=42, + tol=1e-2, + ).fit(X, y) + message = ( + "warm_start can only be used where `y` has the same " + "classes as in the previous call to fit." + " Previously got [0 1 2], `y` has %s" % np.unique(y_i) + ) + with pytest.raises(ValueError, match=re.escape(message)): + clf.fit(X, y_i) + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +def test_warm_start_full_iteration(MLPEstimator): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/16812 + # Check that the MLP estimator accomplish `max_iter` with a + # warm started estimator. 
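+    # The regression guarded against here caused a warm-started estimator to
+    # resume from a stale iteration budget and run fewer than max_iter
+    # epochs on the second fit; both fits below must report
+    # n_iter_ == max_iter.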
+ X, y = X_iris, y_iris + max_iter = 3 + clf = MLPEstimator( + hidden_layer_sizes=2, solver="sgd", warm_start=True, max_iter=max_iter + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + clf.fit(X, y) + assert max_iter == clf.n_iter_ + clf.fit(X, y) + assert max_iter == clf.n_iter_ + + +def test_n_iter_no_change(): + # test n_iter_no_change using binary data set + # the classifying fitting process is not prone to loss curve fluctuations + X = X_digits_binary[:100] + y = y_digits_binary[:100] + tol = 0.01 + max_iter = 3000 + + # test multiple n_iter_no_change + for n_iter_no_change in [2, 5, 10, 50, 100]: + clf = MLPClassifier( + tol=tol, max_iter=max_iter, solver="sgd", n_iter_no_change=n_iter_no_change + ) + clf.fit(X, y) + + # validate n_iter_no_change + assert clf._no_improvement_count == n_iter_no_change + 1 + assert max_iter > clf.n_iter_ + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_n_iter_no_change_inf(): + # test n_iter_no_change using binary data set + # the fitting process should go to max_iter iterations + X = X_digits_binary[:100] + y = y_digits_binary[:100] + + # set a ridiculous tolerance + # this should always trigger _update_no_improvement_count() + tol = 1e9 + + # fit + n_iter_no_change = np.inf + max_iter = 3000 + clf = MLPClassifier( + tol=tol, max_iter=max_iter, solver="sgd", n_iter_no_change=n_iter_no_change + ) + clf.fit(X, y) + + # validate n_iter_no_change doesn't cause early stopping + assert clf.n_iter_ == max_iter + + # validate _update_no_improvement_count() was always triggered + assert clf._no_improvement_count == clf.n_iter_ - 1 + + +def test_early_stopping_stratified(): + # Make sure data splitting for early stopping is stratified + X = [[1, 2], [2, 3], [3, 4], [4, 5]] + y = [0, 0, 0, 1] + + mlp = MLPClassifier(early_stopping=True) + with pytest.raises( + ValueError, match="The least populated class in y has only 1 member" + ): + mlp.fit(X, y) + + +def test_mlp_classifier_dtypes_casting(): + # Compare predictions for different dtypes + mlp_64 = MLPClassifier( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=100, tol=1e-1 + ) + mlp_64.fit(X_digits[:300], y_digits[:300]) + pred_64 = mlp_64.predict(X_digits[300:]) + proba_64 = mlp_64.predict_proba(X_digits[300:]) + + mlp_32 = MLPClassifier( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=100, tol=1e-1 + ) + mlp_32.fit(X_digits[:300].astype(np.float32), y_digits[:300]) + pred_32 = mlp_32.predict(X_digits[300:].astype(np.float32)) + proba_32 = mlp_32.predict_proba(X_digits[300:].astype(np.float32)) + + assert_array_equal(pred_64, pred_32) + assert_allclose(proba_64, proba_32, rtol=1e-02) + + +def test_mlp_regressor_dtypes_casting(): + mlp_64 = MLPRegressor( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=150, tol=1e-3 + ) + mlp_64.fit(X_digits[:300], y_digits[:300]) + pred_64 = mlp_64.predict(X_digits[300:]) + + mlp_32 = MLPRegressor( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=150, tol=1e-3 + ) + mlp_32.fit(X_digits[:300].astype(np.float32), y_digits[:300]) + pred_32 = mlp_32.predict(X_digits[300:].astype(np.float32)) + + assert_allclose(pred_64, pred_32, rtol=5e-04) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("Estimator", [MLPClassifier, MLPRegressor]) +def test_mlp_param_dtypes(dtype, Estimator): + # Checks if input dtype is used for network parameters + # and predictions + X, y = X_digits.astype(dtype), 
y_digits + mlp = Estimator( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=50, tol=1e-1 + ) + mlp.fit(X[:300], y[:300]) + pred = mlp.predict(X[300:]) + + assert all([intercept.dtype == dtype for intercept in mlp.intercepts_]) + + assert all([coef.dtype == dtype for coef in mlp.coefs_]) + + if Estimator == MLPRegressor: + assert pred.dtype == dtype + + +def test_mlp_loading_from_joblib_partial_fit(tmp_path): + """Loading from MLP and partial fitting updates weights. Non-regression + test for #19626.""" + pre_trained_estimator = MLPRegressor( + hidden_layer_sizes=(42,), random_state=42, learning_rate_init=0.01, max_iter=200 + ) + features, target = [[2]], [4] + + # Fit on x=2, y=4 + pre_trained_estimator.fit(features, target) + + # dump and load model + pickled_file = tmp_path / "mlp.pkl" + joblib.dump(pre_trained_estimator, pickled_file) + load_estimator = joblib.load(pickled_file) + + # Train for a more epochs on point x=2, y=1 + fine_tune_features, fine_tune_target = [[2]], [1] + + for _ in range(200): + load_estimator.partial_fit(fine_tune_features, fine_tune_target) + + # finetuned model learned the new target + predicted_value = load_estimator.predict(fine_tune_features) + assert_allclose(predicted_value, fine_tune_target, rtol=1e-4) + + +@pytest.mark.parametrize("Estimator", [MLPClassifier, MLPRegressor]) +def test_preserve_feature_names(Estimator): + """Check that feature names are preserved when early stopping is enabled. + + Feature names are required for consistency checks during scoring. + + Non-regression test for gh-24846 + """ + pd = pytest.importorskip("pandas") + rng = np.random.RandomState(0) + + X = pd.DataFrame(data=rng.randn(10, 2), columns=["colname_a", "colname_b"]) + y = pd.Series(data=np.full(10, 1), name="colname_y") + + model = Estimator(early_stopping=True, validation_fraction=0.2) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + model.fit(X, y) + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +def test_mlp_warm_start_with_early_stopping(MLPEstimator): + """Check that early stopping works with warm start.""" + mlp = MLPEstimator( + max_iter=10, random_state=0, warm_start=True, early_stopping=True + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp.fit(X_iris, y_iris) + n_validation_scores = len(mlp.validation_scores_) + mlp.set_params(max_iter=20) + mlp.fit(X_iris, y_iris) + assert len(mlp.validation_scores_) > n_validation_scores + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +@pytest.mark.parametrize("solver", ["sgd", "adam", "lbfgs"]) +def test_mlp_warm_start_no_convergence(MLPEstimator, solver): + """Check that we stop the number of iteration at `max_iter` when warm starting. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/24764 + """ + model = MLPEstimator( + solver=solver, + warm_start=True, + early_stopping=False, + max_iter=10, + n_iter_no_change=np.inf, + random_state=0, + ) + + with pytest.warns(ConvergenceWarning): + model.fit(X_iris, y_iris) + assert model.n_iter_ == 10 + + model.set_params(max_iter=20) + with pytest.warns(ConvergenceWarning): + model.fit(X_iris, y_iris) + assert model.n_iter_ == 20 + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +def test_mlp_partial_fit_after_fit(MLPEstimator): + """Check partial fit does not fail after fit when early_stopping=True. + + Non-regression test for gh-25693. 
+ """ + mlp = MLPEstimator(early_stopping=True, random_state=0).fit(X_iris, y_iris) + + msg = "partial_fit does not support early_stopping=True" + with pytest.raises(ValueError, match=msg): + mlp.partial_fit(X_iris, y_iris) + + +def test_mlp_diverging_loss(): + """Test that a diverging model does not raise errors when early stopping is enabled. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/29504 + """ + mlp = MLPRegressor( + hidden_layer_sizes=100, + activation="identity", + solver="sgd", + alpha=0.0001, + learning_rate="constant", + learning_rate_init=1, + shuffle=True, + max_iter=20, + early_stopping=True, + n_iter_no_change=10, + random_state=0, + ) + + with warnings.catch_warnings(): + # RuntimeWarning: overflow encountered in matmul + # ConvergenceWarning: Stochastic Optimizer: Maximum iteration + warnings.simplefilter("ignore", RuntimeWarning) + warnings.simplefilter("ignore", ConvergenceWarning) + mlp.fit(X_iris, y_iris) + + # In python, float("nan") != float("nan") + assert str(mlp.validation_scores_[-1]) == str(np.nan) + assert isinstance(mlp.validation_scores_[-1], float) diff --git a/.venv/Lib/site-packages/sklearn/neural_network/tests/test_rbm.py b/.venv/Lib/site-packages/sklearn/neural_network/tests/test_rbm.py new file mode 100644 index 0000000000000000000000000000000000000000..7dec2d165daa0e557c63113afc29a9c65ddbc258 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neural_network/tests/test_rbm.py @@ -0,0 +1,251 @@ +import re +import sys +from io import StringIO + +import numpy as np +import pytest + +from sklearn.datasets import load_digits +from sklearn.neural_network import BernoulliRBM +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS, LIL_CONTAINERS +from sklearn.utils.validation import assert_all_finite + +Xdigits, _ = load_digits(return_X_y=True) +Xdigits -= Xdigits.min() +Xdigits /= Xdigits.max() + + +def test_fit(): + X = Xdigits.copy() + + rbm = BernoulliRBM( + n_components=64, learning_rate=0.1, batch_size=10, n_iter=7, random_state=9 + ) + rbm.fit(X) + + assert_almost_equal(rbm.score_samples(X).mean(), -21.0, decimal=0) + + # in-place tricks shouldn't have modified X + assert_array_equal(X, Xdigits) + + +def test_partial_fit(): + X = Xdigits.copy() + rbm = BernoulliRBM( + n_components=64, learning_rate=0.1, batch_size=20, random_state=9 + ) + n_samples = X.shape[0] + n_batches = int(np.ceil(float(n_samples) / rbm.batch_size)) + batch_slices = np.array_split(X, n_batches) + + for i in range(7): + for batch in batch_slices: + rbm.partial_fit(batch) + + assert_almost_equal(rbm.score_samples(X).mean(), -21.0, decimal=0) + assert_array_equal(X, Xdigits) + + +def test_transform(): + X = Xdigits[:100] + rbm1 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42) + rbm1.fit(X) + + Xt1 = rbm1.transform(X) + Xt2 = rbm1._mean_hiddens(X) + + assert_array_equal(Xt1, Xt2) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_small_sparse(csr_container): + # BernoulliRBM should work on small sparse matrices. 
+ X = csr_container(Xdigits[:4]) + BernoulliRBM().fit(X) # no exception + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_small_sparse_partial_fit(sparse_container): + X_sparse = sparse_container(Xdigits[:100]) + X = Xdigits[:100].copy() + + rbm1 = BernoulliRBM( + n_components=64, learning_rate=0.1, batch_size=10, random_state=9 + ) + rbm2 = BernoulliRBM( + n_components=64, learning_rate=0.1, batch_size=10, random_state=9 + ) + + rbm1.partial_fit(X_sparse) + rbm2.partial_fit(X) + + assert_almost_equal( + rbm1.score_samples(X).mean(), rbm2.score_samples(X).mean(), decimal=0 + ) + + +def test_sample_hiddens(): + rng = np.random.RandomState(0) + X = Xdigits[:100] + rbm1 = BernoulliRBM(n_components=2, batch_size=5, n_iter=5, random_state=42) + rbm1.fit(X) + + h = rbm1._mean_hiddens(X[0]) + hs = np.mean([rbm1._sample_hiddens(X[0], rng) for i in range(100)], 0) + + assert_almost_equal(h, hs, decimal=1) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_fit_gibbs(csc_container): + # XXX: this test is very seed-dependent! It probably needs to be rewritten. + + # Gibbs on the RBM hidden layer should be able to recreate [[0], [1]] + # from the same input + rng = np.random.RandomState(42) + X = np.array([[0.0], [1.0]]) + rbm1 = BernoulliRBM(n_components=2, batch_size=2, n_iter=42, random_state=rng) + # you need that much iters + rbm1.fit(X) + assert_almost_equal( + rbm1.components_, np.array([[0.02649814], [0.02009084]]), decimal=4 + ) + assert_almost_equal(rbm1.gibbs(X), X) + + # Gibbs on the RBM hidden layer should be able to recreate [[0], [1]] from + # the same input even when the input is sparse, and test against non-sparse + rng = np.random.RandomState(42) + X = csc_container([[0.0], [1.0]]) + rbm2 = BernoulliRBM(n_components=2, batch_size=2, n_iter=42, random_state=rng) + rbm2.fit(X) + assert_almost_equal( + rbm2.components_, np.array([[0.02649814], [0.02009084]]), decimal=4 + ) + assert_almost_equal(rbm2.gibbs(X), X.toarray()) + assert_almost_equal(rbm1.components_, rbm2.components_) + + +def test_gibbs_smoke(): + # Check if we don't get NaNs sampling the full digits dataset. + # Also check that sampling again will yield different results. + X = Xdigits + rbm1 = BernoulliRBM(n_components=42, batch_size=40, n_iter=20, random_state=42) + rbm1.fit(X) + X_sampled = rbm1.gibbs(X) + assert_all_finite(X_sampled) + X_sampled2 = rbm1.gibbs(X) + assert np.all((X_sampled != X_sampled2).max(axis=1)) + + +@pytest.mark.parametrize("lil_containers", LIL_CONTAINERS) +def test_score_samples(lil_containers): + # Test score_samples (pseudo-likelihood) method. + # Assert that pseudo-likelihood is computed without clipping. + # See Fabian's blog, http://bit.ly/1iYefRk + rng = np.random.RandomState(42) + X = np.vstack([np.zeros(1000), np.ones(1000)]) + rbm1 = BernoulliRBM(n_components=10, batch_size=2, n_iter=10, random_state=rng) + rbm1.fit(X) + assert (rbm1.score_samples(X) < -300).all() + + # Sparse vs. dense should not affect the output. Also test sparse input + # validation. + rbm1.random_state = 42 + d_score = rbm1.score_samples(X) + rbm1.random_state = 42 + s_score = rbm1.score_samples(lil_containers(X)) + assert_almost_equal(d_score, s_score) + + # Test numerical stability (#2785): would previously generate infinities + # and crash with an exception. 
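+    # The pseudo-likelihood involves softplus terms of the form
+    # log(1 + exp(x)); for inputs as large as arange(1000) * 100 a naive
+    # evaluation overflows, so the call below only has to complete without
+    # raising.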
+ with np.errstate(under="ignore"): + rbm1.score_samples([np.arange(1000) * 100]) + + +def test_rbm_verbose(): + rbm = BernoulliRBM(n_iter=2, verbose=10) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + rbm.fit(Xdigits) + finally: + sys.stdout = old_stdout + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_sparse_and_verbose(csc_container): + # Make sure RBM works with sparse input when verbose=True + old_stdout = sys.stdout + sys.stdout = StringIO() + + X = csc_container([[0.0], [1.0]]) + rbm = BernoulliRBM( + n_components=2, batch_size=2, n_iter=1, random_state=42, verbose=True + ) + try: + rbm.fit(X) + s = sys.stdout.getvalue() + # make sure output is sound + assert re.match( + r"\[BernoulliRBM\] Iteration 1," + r" pseudo-likelihood = -?(\d)+(\.\d+)?," + r" time = (\d|\.)+s", + s, + ) + finally: + sys.stdout = old_stdout + + +@pytest.mark.parametrize( + "dtype_in, dtype_out", + [(np.float32, np.float32), (np.float64, np.float64), (int, np.float64)], +) +def test_transformer_dtypes_casting(dtype_in, dtype_out): + X = Xdigits[:100].astype(dtype_in) + rbm = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42) + Xt = rbm.fit_transform(X) + + # dtype_in and dtype_out should be consistent + assert Xt.dtype == dtype_out, "transform dtype: {} - original dtype: {}".format( + Xt.dtype, X.dtype + ) + + +def test_convergence_dtype_consistency(): + # float 64 transformer + X_64 = Xdigits[:100].astype(np.float64) + rbm_64 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42) + Xt_64 = rbm_64.fit_transform(X_64) + + # float 32 transformer + X_32 = Xdigits[:100].astype(np.float32) + rbm_32 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42) + Xt_32 = rbm_32.fit_transform(X_32) + + # results and attributes should be close enough in 32 bit and 64 bit + assert_allclose(Xt_64, Xt_32, rtol=1e-06, atol=0) + assert_allclose( + rbm_64.intercept_hidden_, rbm_32.intercept_hidden_, rtol=1e-06, atol=0 + ) + assert_allclose( + rbm_64.intercept_visible_, rbm_32.intercept_visible_, rtol=1e-05, atol=0 + ) + assert_allclose(rbm_64.components_, rbm_32.components_, rtol=1e-03, atol=0) + assert_allclose(rbm_64.h_samples_, rbm_32.h_samples_) + + +@pytest.mark.parametrize("method", ["fit", "partial_fit"]) +def test_feature_names_out(method): + """Check `get_feature_names_out` for `BernoulliRBM`.""" + n_components = 10 + rbm = BernoulliRBM(n_components=n_components) + getattr(rbm, method)(Xdigits) + + names = rbm.get_feature_names_out() + expected_names = [f"bernoullirbm{i}" for i in range(n_components)] + assert_array_equal(expected_names, names) diff --git a/.venv/Lib/site-packages/sklearn/neural_network/tests/test_stochastic_optimizers.py b/.venv/Lib/site-packages/sklearn/neural_network/tests/test_stochastic_optimizers.py new file mode 100644 index 0000000000000000000000000000000000000000..1087da642c3c497602bdc90a7aaa9da0e069ee57 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/neural_network/tests/test_stochastic_optimizers.py @@ -0,0 +1,112 @@ +import numpy as np + +from sklearn.neural_network._stochastic_optimizers import ( + AdamOptimizer, + BaseOptimizer, + SGDOptimizer, +) +from sklearn.utils._testing import assert_array_equal + +shapes = [(4, 6), (6, 8), (7, 8, 9)] + + +def test_base_optimizer(): + for lr in [10**i for i in range(-3, 4)]: + optimizer = BaseOptimizer(lr) + assert optimizer.trigger_stopping("", False) + + +def test_sgd_optimizer_no_momentum(): + params = [np.zeros(shape) for shape in shapes] + rng = 
np.random.RandomState(0) + + for lr in [10**i for i in range(-3, 4)]: + optimizer = SGDOptimizer(params, lr, momentum=0, nesterov=False) + grads = [rng.random_sample(shape) for shape in shapes] + expected = [param - lr * grad for param, grad in zip(params, grads)] + optimizer.update_params(params, grads) + + for exp, param in zip(expected, params): + assert_array_equal(exp, param) + + +def test_sgd_optimizer_momentum(): + params = [np.zeros(shape) for shape in shapes] + lr = 0.1 + rng = np.random.RandomState(0) + + for momentum in np.arange(0.5, 0.9, 0.1): + optimizer = SGDOptimizer(params, lr, momentum=momentum, nesterov=False) + velocities = [rng.random_sample(shape) for shape in shapes] + optimizer.velocities = velocities + grads = [rng.random_sample(shape) for shape in shapes] + updates = [ + momentum * velocity - lr * grad for velocity, grad in zip(velocities, grads) + ] + expected = [param + update for param, update in zip(params, updates)] + optimizer.update_params(params, grads) + + for exp, param in zip(expected, params): + assert_array_equal(exp, param) + + +def test_sgd_optimizer_trigger_stopping(): + params = [np.zeros(shape) for shape in shapes] + lr = 2e-6 + optimizer = SGDOptimizer(params, lr, lr_schedule="adaptive") + assert not optimizer.trigger_stopping("", False) + assert lr / 5 == optimizer.learning_rate + assert optimizer.trigger_stopping("", False) + + +def test_sgd_optimizer_nesterovs_momentum(): + params = [np.zeros(shape) for shape in shapes] + lr = 0.1 + rng = np.random.RandomState(0) + + for momentum in np.arange(0.5, 0.9, 0.1): + optimizer = SGDOptimizer(params, lr, momentum=momentum, nesterov=True) + velocities = [rng.random_sample(shape) for shape in shapes] + optimizer.velocities = velocities + grads = [rng.random_sample(shape) for shape in shapes] + updates = [ + momentum * velocity - lr * grad for velocity, grad in zip(velocities, grads) + ] + updates = [ + momentum * update - lr * grad for update, grad in zip(updates, grads) + ] + expected = [param + update for param, update in zip(params, updates)] + optimizer.update_params(params, grads) + + for exp, param in zip(expected, params): + assert_array_equal(exp, param) + + +def test_adam_optimizer(): + params = [np.zeros(shape) for shape in shapes] + lr = 0.001 + epsilon = 1e-8 + rng = np.random.RandomState(0) + + for beta_1 in np.arange(0.9, 1.0, 0.05): + for beta_2 in np.arange(0.995, 1.0, 0.001): + optimizer = AdamOptimizer(params, lr, beta_1, beta_2, epsilon) + ms = [rng.random_sample(shape) for shape in shapes] + vs = [rng.random_sample(shape) for shape in shapes] + t = 10 + optimizer.ms = ms + optimizer.vs = vs + optimizer.t = t - 1 + grads = [rng.random_sample(shape) for shape in shapes] + + ms = [beta_1 * m + (1 - beta_1) * grad for m, grad in zip(ms, grads)] + vs = [beta_2 * v + (1 - beta_2) * (grad**2) for v, grad in zip(vs, grads)] + learning_rate = lr * np.sqrt(1 - beta_2**t) / (1 - beta_1**t) + updates = [ + -learning_rate * m / (np.sqrt(v) + epsilon) for m, v in zip(ms, vs) + ] + expected = [param + update for param, update in zip(params, updates)] + + optimizer.update_params(params, grads) + for exp, param in zip(expected, params): + assert_array_equal(exp, param) diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/__init__.py b/.venv/Lib/site-packages/sklearn/preprocessing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9d48372874958cf43d13c4c4bebd90abc28344f3 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/__init__.py @@ -0,0 +1,63 @@ 
+"""Methods for scaling, centering, normalization, binarization, and more.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._data import ( + Binarizer, + KernelCenterer, + MaxAbsScaler, + MinMaxScaler, + Normalizer, + PowerTransformer, + QuantileTransformer, + RobustScaler, + StandardScaler, + add_dummy_feature, + binarize, + maxabs_scale, + minmax_scale, + normalize, + power_transform, + quantile_transform, + robust_scale, + scale, +) +from ._discretization import KBinsDiscretizer +from ._encoders import OneHotEncoder, OrdinalEncoder +from ._function_transformer import FunctionTransformer +from ._label import LabelBinarizer, LabelEncoder, MultiLabelBinarizer, label_binarize +from ._polynomial import PolynomialFeatures, SplineTransformer +from ._target_encoder import TargetEncoder + +__all__ = [ + "Binarizer", + "FunctionTransformer", + "KBinsDiscretizer", + "KernelCenterer", + "LabelBinarizer", + "LabelEncoder", + "MultiLabelBinarizer", + "MinMaxScaler", + "MaxAbsScaler", + "QuantileTransformer", + "Normalizer", + "OneHotEncoder", + "OrdinalEncoder", + "PowerTransformer", + "RobustScaler", + "SplineTransformer", + "StandardScaler", + "TargetEncoder", + "add_dummy_feature", + "PolynomialFeatures", + "binarize", + "normalize", + "scale", + "robust_scale", + "maxabs_scale", + "minmax_scale", + "label_binarize", + "quantile_transform", + "power_transform", +] diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.cp39-win_amd64.lib b/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.cp39-win_amd64.lib new file mode 100644 index 0000000000000000000000000000000000000000..bf29e3455aa6a7241418c917282ecbe907f7a300 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.cp39-win_amd64.lib differ diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.cp39-win_amd64.pyd b/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.cp39-win_amd64.pyd new file mode 100644 index 0000000000000000000000000000000000000000..e43d91b51982942b22afdec5ad9a70e3065d65d9 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.cp39-win_amd64.pyd differ diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.pyx new file mode 100644 index 0000000000000000000000000000000000000000..8a7260f2f48f6373981e9dd8fe8ecbaadb752bf6 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -0,0 +1,258 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ..utils._typedefs cimport uint8_t, int64_t, intp_t + +ctypedef uint8_t FLAG_t + +# We use the following verbatim block to determine whether the current +# platform's compiler supports 128-bit integer values intrinsically. +# This should work for GCC and CLANG on 64-bit architectures, but doesn't for +# MSVC on any architecture. We prefer to use 128-bit integers when possible +# because the intermediate calculations have a non-trivial risk of overflow. It +# is, however, very unlikely to come up on an average use case, hence 64-bit +# integers (i.e. `long long`) are "good enough" for most common cases. There is +# not much we can do to efficiently mitigate the overflow risk on the Windows +# platform at this time. 
Consider this a "best effort" design decision that +# could be revisited later in case someone comes up with a safer option that +# does not hurt the performance of the common cases. +# See `test_sizeof_LARGEST_INT_t()`for more information on exact type expectations. +cdef extern from *: + """ + #ifdef __SIZEOF_INT128__ + typedef __int128 LARGEST_INT_t; + #elif (__clang__ || __EMSCRIPTEN__) && !__i386__ + typedef _BitInt(128) LARGEST_INT_t; + #else + typedef long long LARGEST_INT_t; + #endif + """ + ctypedef long long LARGEST_INT_t + + +# Determine the size of `LARGEST_INT_t` at runtime. +# Used in `test_sizeof_LARGEST_INT_t`. +def _get_sizeof_LARGEST_INT_t(): + return sizeof(LARGEST_INT_t) + + +# TODO: use `{int,float}{32,64}_t` when cython#5230 is resolved: +# https://github.com/cython/cython/issues/5230 +ctypedef fused DATA_t: + float + double + int + long long +# INDEX_{A,B}_t are defined to generate a proper Cartesian product +# of types through Cython fused-type expansion. +ctypedef fused INDEX_A_t: + signed int + signed long long +ctypedef fused INDEX_B_t: + signed int + signed long long + +cdef inline int64_t _deg2_column( + LARGEST_INT_t n_features, + LARGEST_INT_t i, + LARGEST_INT_t j, + FLAG_t interaction_only +) nogil: + """Compute the index of the column for a degree 2 expansion + + n_features is the dimensionality of the input data, i and j are the indices + for the columns involved in the expansion. + """ + if interaction_only: + return n_features * i - i * (i + 3) / 2 - 1 + j + else: + return n_features * i - i* (i + 1) / 2 + j + + +cdef inline int64_t _deg3_column( + LARGEST_INT_t n_features, + LARGEST_INT_t i, + LARGEST_INT_t j, + LARGEST_INT_t k, + FLAG_t interaction_only +) nogil: + """Compute the index of the column for a degree 3 expansion + + n_features is the dimensionality of the input data, i, j and k are the indices + for the columns involved in the expansion. + """ + if interaction_only: + return ( + ( + (3 * n_features) * (n_features * i - i**2) + + i * (i**2 + 11) - (3 * j) * (j + 3) + ) / 6 + i**2 + n_features * (j - 1 - 2 * i) + k + ) + else: + return ( + ( + (3 * n_features) * (n_features * i - i**2) + + i ** 3 - i - (3 * j) * (j + 1) + ) / 6 + n_features * j + k + ) + + +def py_calc_expanded_nnz_deg2(n, interaction_only): + return n * (n + 1) // 2 - interaction_only * n + + +def py_calc_expanded_nnz_deg3(n, interaction_only): + return n * (n**2 + 3 * n + 2) // 6 - interaction_only * n**2 + + +cpdef int64_t _calc_expanded_nnz( + LARGEST_INT_t n, + FLAG_t interaction_only, + LARGEST_INT_t degree +): + """ + Calculates the number of non-zero interaction terms generated by the + non-zero elements of a single row. 
+ """ + # This is the maximum value before the intermediate computation + # d**2 + d overflows + # Solution to d**2 + d = maxint64 + # SymPy: solve(x**2 + x - int64_max, x) + cdef int64_t MAX_SAFE_INDEX_CALC_DEG2 = 3037000499 + + # This is the maximum value before the intermediate computation + # d**3 + 3 * d**2 + 2*d overflows + # Solution to d**3 + 3 * d**2 + 2*d = maxint64 + # SymPy: solve(x * (x**2 + 3 * x + 2) - int64_max, x) + cdef int64_t MAX_SAFE_INDEX_CALC_DEG3 = 2097151 + + if degree == 2: + # Only need to check when not using 128-bit integers + if sizeof(LARGEST_INT_t) < 16 and n <= MAX_SAFE_INDEX_CALC_DEG2: + return n * (n + 1) / 2 - interaction_only * n + return py_calc_expanded_nnz_deg2(n, interaction_only) + else: + # Only need to check when not using 128-bit integers + if sizeof(LARGEST_INT_t) < 16 and n <= MAX_SAFE_INDEX_CALC_DEG3: + return n * (n**2 + 3 * n + 2) / 6 - interaction_only * n**2 + return py_calc_expanded_nnz_deg3(n, interaction_only) + +cpdef int64_t _calc_total_nnz( + INDEX_A_t[:] indptr, + FLAG_t interaction_only, + int64_t degree, +): + """ + Calculates the number of non-zero interaction terms generated by the + non-zero elements across all rows for a single degree. + """ + cdef int64_t total_nnz=0 + cdef intp_t row_idx + for row_idx in range(len(indptr) - 1): + total_nnz += _calc_expanded_nnz( + indptr[row_idx + 1] - indptr[row_idx], + interaction_only, + degree + ) + return total_nnz + + +cpdef void _csr_polynomial_expansion( + const DATA_t[:] data, # IN READ-ONLY + const INDEX_A_t[:] indices, # IN READ-ONLY + const INDEX_A_t[:] indptr, # IN READ-ONLY + INDEX_A_t n_features, + DATA_t[:] result_data, # OUT + INDEX_B_t[:] result_indices, # OUT + INDEX_B_t[:] result_indptr, # OUT + FLAG_t interaction_only, + FLAG_t degree +): + """ + Perform a second or third degree polynomial or interaction expansion on a + compressed sparse row (CSR) matrix. The method used only takes products of + non-zero features. For a matrix with density :math:`d`, this results in a + speedup on the order of :math:`(1/d)^k` where :math:`k` is the degree of + the expansion, assuming all rows are of similar density. + + Parameters + ---------- + data : memory view on nd-array + The "data" attribute of the input CSR matrix. + + indices : memory view on nd-array + The "indices" attribute of the input CSR matrix. + + indptr : memory view on nd-array + The "indptr" attribute of the input CSR matrix. + + n_features : int + The dimensionality of the input CSR matrix. + + result_data : nd-array + The output CSR matrix's "data" attribute. + It is modified by this routine. + + result_indices : nd-array + The output CSR matrix's "indices" attribute. + It is modified by this routine. + + result_indptr : nd-array + The output CSR matrix's "indptr" attribute. + It is modified by this routine. + + interaction_only : int + 0 for a polynomial expansion, 1 for an interaction expansion. + + degree : int + The degree of the expansion. This must be either 2 or 3. + + References + ---------- + "Leveraging Sparsity to Speed Up Polynomial Feature Expansions of CSR + Matrices Using K-Simplex Numbers" by Andrew Nystrom and John Hughes. + """ + + # Make the arrays that will form the CSR matrix of the expansion. 
+ cdef INDEX_A_t row_i, row_starts, row_ends, i, j, k, i_ptr, j_ptr, k_ptr + cdef INDEX_B_t expanded_index=0, num_cols_in_row, col + with nogil: + result_indptr[0] = indptr[0] + for row_i in range(indptr.shape[0]-1): + row_starts = indptr[row_i] + row_ends = indptr[row_i + 1] + num_cols_in_row = 0 + for i_ptr in range(row_starts, row_ends): + i = indices[i_ptr] + for j_ptr in range(i_ptr + interaction_only, row_ends): + j = indices[j_ptr] + if degree == 2: + col = _deg2_column( + n_features, + i, j, + interaction_only + ) + result_indices[expanded_index] = col + result_data[expanded_index] = ( + data[i_ptr] * data[j_ptr] + ) + expanded_index += 1 + num_cols_in_row += 1 + else: + # degree == 3 + for k_ptr in range(j_ptr + interaction_only, row_ends): + k = indices[k_ptr] + col = _deg3_column( + n_features, + i, j, k, + interaction_only + ) + result_indices[expanded_index] = col + result_data[expanded_index] = ( + data[i_ptr] * data[j_ptr] * data[k_ptr] + ) + expanded_index += 1 + num_cols_in_row += 1 + + result_indptr[row_i+1] = result_indptr[row_i] + num_cols_in_row + return diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_data.py b/.venv/Lib/site-packages/sklearn/preprocessing/_data.py new file mode 100644 index 0000000000000000000000000000000000000000..b8f4ec3696dc3379521a359ecbc22db9a419a4d9 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_data.py @@ -0,0 +1,3683 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy import optimize, sparse, stats +from scipy.special import boxcox, inv_boxcox + +from sklearn.utils import metadata_routing + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + OneToOneFeatureMixin, + TransformerMixin, + _fit_context, +) +from ..utils import _array_api, check_array, resample +from ..utils._array_api import _modify_in_place_if_numpy, device, get_namespace +from ..utils._param_validation import Interval, Options, StrOptions, validate_params +from ..utils.extmath import _incremental_mean_and_var, row_norms +from ..utils.sparsefuncs import ( + incr_mean_variance_axis, + inplace_column_scale, + mean_variance_axis, + min_max_axis, +) +from ..utils.sparsefuncs_fast import ( + inplace_csr_row_normalize_l1, + inplace_csr_row_normalize_l2, +) +from ..utils.validation import ( + FLOAT_DTYPES, + _check_sample_weight, + check_is_fitted, + check_random_state, + validate_data, +) +from ._encoders import OneHotEncoder + +BOUNDS_THRESHOLD = 1e-7 + +__all__ = [ + "Binarizer", + "KernelCenterer", + "MinMaxScaler", + "MaxAbsScaler", + "Normalizer", + "OneHotEncoder", + "RobustScaler", + "StandardScaler", + "QuantileTransformer", + "PowerTransformer", + "add_dummy_feature", + "binarize", + "normalize", + "scale", + "robust_scale", + "maxabs_scale", + "minmax_scale", + "quantile_transform", + "power_transform", +] + + +def _is_constant_feature(var, mean, n_samples): + """Detect if a feature is indistinguishable from a constant feature. + + The detection is based on its computed variance and on the theoretical + error bounds of the '2 pass algorithm' for variance computation. + + See "Algorithms for computing the sample variance: analysis and + recommendations", by Chan, Golub, and LeVeque. + """ + # In scikit-learn, variance is always computed using float64 accumulators. 
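+    # The bound below sums the worst-case rounding error of the variance
+    # itself (~ n_samples * eps * var) and the squared rounding error that
+    # the accumulated mean can contribute (~ (n_samples * mean * eps) ** 2);
+    # a computed variance at or below this bound is treated as numerically
+    # zero.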
+ eps = np.finfo(np.float64).eps + + upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2 + return var <= upper_bound + + +def _handle_zeros_in_scale(scale, copy=True, constant_mask=None): + """Set scales of near constant features to 1. + + The goal is to avoid division by very small or zero values. + + Near constant features are detected automatically by identifying + scales close to machine precision unless they are precomputed by + the caller and passed with the `constant_mask` kwarg. + + Typically for standard scaling, the scales are the standard + deviation while near constant features are better detected on the + computed variances which are closer to machine precision by + construction. + """ + # if we are fitting on 1D arrays, scale might be a scalar + if np.isscalar(scale): + if scale == 0.0: + scale = 1.0 + return scale + # scale is an array + else: + xp, _ = get_namespace(scale) + if constant_mask is None: + # Detect near constant values to avoid dividing by a very small + # value that could lead to surprising results and numerical + # stability issues. + constant_mask = scale < 10 * xp.finfo(scale.dtype).eps + + if copy: + # New array to avoid side-effects + scale = xp.asarray(scale, copy=True) + scale[constant_mask] = 1.0 + return scale + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "axis": [Options(Integral, {0, 1})], + "with_mean": ["boolean"], + "with_std": ["boolean"], + "copy": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): + """Standardize a dataset along any axis. + + Center to the mean and component wise scale to unit variance. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to center and scale. + + axis : {0, 1}, default=0 + Axis used to compute the means and standard deviations along. If 0, + independently standardize each feature, otherwise (if 1) standardize + each sample. + + with_mean : bool, default=True + If True, center the data before scaling. + + with_std : bool, default=True + If True, scale the data to unit variance (or equivalently, + unit standard deviation). + + copy : bool, default=True + If False, try to avoid a copy and scale in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + StandardScaler : Performs scaling to unit variance using the Transformer + API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + This implementation will refuse to center scipy.sparse matrices + since it would make them non-sparse and would potentially crash the + program with memory exhaustion problems. + + Instead the caller is expected to either set explicitly + `with_mean=False` (in that case, only variance scaling will be + performed on the features of the CSC matrix) or to call `X.toarray()` + if he/she expects the materialized dense array to fit in memory. + + To avoid memory copy the caller should pass a CSC matrix. + + NaNs are treated as missing values: disregarded to compute the statistics, + and maintained during the data transformation. + + We use a biased estimator for the standard deviation, equivalent to + `numpy.std(x, ddof=0)`. 
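+    (That is, the variance is computed with ``n`` rather than ``n - 1`` in
+    the denominator, so no Bessel correction is applied.)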
Note that the choice of `ddof` is unlikely to affect model performance.
+
+    For a comparison of the different scalers, transformers, and normalizers,
+    see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
+
+    .. warning:: Risk of data leak
+
+        Do not use :func:`~sklearn.preprocessing.scale` unless you know
+        what you are doing. A common mistake is to apply it to the entire data
+        *before* splitting into training and test sets. This will bias the
+        model evaluation because information would have leaked from the test
+        set to the training set.
+        In general, we recommend using
+        :class:`~sklearn.preprocessing.StandardScaler` within a
+        :ref:`Pipeline ` in order to prevent most risks of data
+        leaking: `pipe = make_pipeline(StandardScaler(), LogisticRegression())`.
+
+    Examples
+    --------
+    >>> from sklearn.preprocessing import scale
+    >>> X = [[-2, 1, 2], [-1, 0, 1]]
+    >>> scale(X, axis=0)  # scaling each column independently
+    array([[-1., 1., 1.],
+           [ 1., -1., -1.]])
+    >>> scale(X, axis=1)  # scaling each row independently
+    array([[-1.37..., 0.39..., 0.98...],
+           [-1.22..., 0. , 1.22...]])
+    """
+    X = check_array(
+        X,
+        accept_sparse="csc",
+        copy=copy,
+        ensure_2d=False,
+        estimator="the scale function",
+        dtype=FLOAT_DTYPES,
+        ensure_all_finite="allow-nan",
+    )
+    if sparse.issparse(X):
+        if with_mean:
+            raise ValueError(
+                "Cannot center sparse matrices: pass `with_mean=False` instead."
+                " See docstring for motivation and alternatives."
+            )
+        if axis != 0:
+            raise ValueError(
+                "Can only scale sparse matrix on axis=0, got axis=%d" % axis
+            )
+        if with_std:
+            _, var = mean_variance_axis(X, axis=0)
+            var = _handle_zeros_in_scale(var, copy=False)
+            inplace_column_scale(X, 1 / np.sqrt(var))
+    else:
+        X = np.asarray(X)
+        if with_mean:
+            mean_ = np.nanmean(X, axis)
+        if with_std:
+            scale_ = np.nanstd(X, axis)
+        # Xr is a view on the original array that enables easy use of
+        # broadcasting on the axis in which we are interested
+        Xr = np.rollaxis(X, axis)
+        if with_mean:
+            Xr -= mean_
+            mean_1 = np.nanmean(Xr, axis=0)
+            # Verify that mean_1 is 'close to zero'. If X contains very
+            # large values, mean_1 can also be very large, due to a lack of
+            # precision of mean_. In this case, a pre-scaling of the
+            # concerned feature is efficient, for instance by its mean or
+            # maximum.
+            if not np.allclose(mean_1, 0):
+                warnings.warn(
+                    "Numerical issues were encountered "
+                    "when centering the data "
+                    "and might not be solved. Dataset may "
+                    "contain too large values. You may need "
+                    "to prescale your features."
+                )
+                Xr -= mean_1
+        if with_std:
+            scale_ = _handle_zeros_in_scale(scale_, copy=False)
+            Xr /= scale_
+            if with_mean:
+                mean_2 = np.nanmean(Xr, axis=0)
+                # If mean_2 is not 'close to zero', it comes from the fact that
+                # scale_ is very small so that mean_2 = mean_1/scale_ > 0, even
+                # if mean_1 was close to zero. The problem is thus essentially
+                # due to the lack of precision of mean_. A solution is then to
+                # subtract the mean again:
+                if not np.allclose(mean_2, 0):
+                    warnings.warn(
+                        "Numerical issues were encountered "
+                        "when scaling the data "
+                        "and might not be solved. The standard "
+                        "deviation of the data is probably "
+                        "very close to 0. "
+                    )
+                    Xr -= mean_2
+    return X
+
+
+class MinMaxScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
+    """Transform features by scaling each feature to a given range.
+
+    This estimator scales and translates each feature individually such
+    that it is in the given range on the training set, e.g.
between + zero and one. + + The transformation is given by:: + + X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) + X_scaled = X_std * (max - min) + min + + where min, max = feature_range. + + This transformation is often used as an alternative to zero mean, + unit variance scaling. + + `MinMaxScaler` doesn't reduce the effect of outliers, but it linearly + scales them down into a fixed range, where the largest occurring data point + corresponds to the maximum value and the smallest one corresponds to the + minimum value. For an example visualization, refer to :ref:`Compare + MinMaxScaler with other scalers `. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + feature_range : tuple (min, max), default=(0, 1) + Desired range of transformed data. + + copy : bool, default=True + Set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array). + + clip : bool, default=False + Set to True to clip transformed values of held-out data to + provided `feature range`. + + .. versionadded:: 0.24 + + Attributes + ---------- + min_ : ndarray of shape (n_features,) + Per feature adjustment for minimum. Equivalent to + ``min - X.min(axis=0) * self.scale_`` + + scale_ : ndarray of shape (n_features,) + Per feature relative scaling of the data. Equivalent to + ``(max - min) / (X.max(axis=0) - X.min(axis=0))`` + + .. versionadded:: 0.17 + *scale_* attribute. + + data_min_ : ndarray of shape (n_features,) + Per feature minimum seen in the data + + .. versionadded:: 0.17 + *data_min_* + + data_max_ : ndarray of shape (n_features,) + Per feature maximum seen in the data + + .. versionadded:: 0.17 + *data_max_* + + data_range_ : ndarray of shape (n_features,) + Per feature range ``(data_max_ - data_min_)`` seen in the data + + .. versionadded:: 0.17 + *data_range_* + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + n_samples_seen_ : int + The number of samples processed by the estimator. + It will be reset on new calls to fit, but increments across + ``partial_fit`` calls. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + minmax_scale : Equivalent function without the estimator API. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + Examples + -------- + >>> from sklearn.preprocessing import MinMaxScaler + >>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]] + >>> scaler = MinMaxScaler() + >>> print(scaler.fit(data)) + MinMaxScaler() + >>> print(scaler.data_max_) + [ 1. 18.] + >>> print(scaler.transform(data)) + [[0. 0. ] + [0.25 0.25] + [0.5 0.5 ] + [1. 1. ]] + >>> print(scaler.transform([[2, 2]])) + [[1.5 0. ]] + """ + + _parameter_constraints: dict = { + "feature_range": [tuple], + "copy": ["boolean"], + "clip": ["boolean"], + } + + def __init__(self, feature_range=(0, 1), *, copy=True, clip=False): + self.feature_range = feature_range + self.copy = copy + self.clip = clip + + def _reset(self): + """Reset internal data-dependent state of the scaler, if necessary. + + __init__ parameters are not touched. 
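+
+        :meth:`fit` calls this before delegating to :meth:`partial_fit`, so
+        only repeated ``partial_fit`` calls accumulate state across batches.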
+ """ + # Checking one attribute is enough, because they are all set together + # in partial_fit + if hasattr(self, "scale_"): + del self.scale_ + del self.min_ + del self.n_samples_seen_ + del self.data_min_ + del self.data_max_ + del self.data_range_ + + def fit(self, X, y=None): + """Compute the minimum and maximum to be used for later scaling. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to compute the per-feature minimum and maximum + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + # Reset internal state before fitting + self._reset() + return self.partial_fit(X, y) + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None): + """Online computation of min and max on X for later scaling. + + All of X is processed as a single batch. This is intended for cases + when :meth:`fit` is not feasible due to very large number of + `n_samples` or because X is read from a continuous stream. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + feature_range = self.feature_range + if feature_range[0] >= feature_range[1]: + raise ValueError( + "Minimum of desired feature range must be smaller than maximum. Got %s." + % str(feature_range) + ) + + if sparse.issparse(X): + raise TypeError( + "MinMaxScaler does not support sparse input. " + "Consider using MaxAbsScaler instead." + ) + + xp, _ = get_namespace(X) + + first_pass = not hasattr(self, "n_samples_seen_") + X = validate_data( + self, + X, + reset=first_pass, + dtype=_array_api.supported_float_dtypes(xp), + ensure_all_finite="allow-nan", + ) + + data_min = _array_api._nanmin(X, axis=0, xp=xp) + data_max = _array_api._nanmax(X, axis=0, xp=xp) + + if first_pass: + self.n_samples_seen_ = X.shape[0] + else: + data_min = xp.minimum(self.data_min_, data_min) + data_max = xp.maximum(self.data_max_, data_max) + self.n_samples_seen_ += X.shape[0] + + data_range = data_max - data_min + self.scale_ = (feature_range[1] - feature_range[0]) / _handle_zeros_in_scale( + data_range, copy=True + ) + self.min_ = feature_range[0] - data_min * self.scale_ + self.data_min_ = data_min + self.data_max_ = data_max + self.data_range_ = data_range + return self + + def transform(self, X): + """Scale features of X according to feature_range. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data that will be transformed. + + Returns + ------- + Xt : ndarray of shape (n_samples, n_features) + Transformed data. + """ + check_is_fitted(self) + + xp, _ = get_namespace(X) + + X = validate_data( + self, + X, + copy=self.copy, + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ensure_all_finite="allow-nan", + reset=False, + ) + + X *= self.scale_ + X += self.min_ + if self.clip: + device_ = device(X) + X = _modify_in_place_if_numpy( + xp, + xp.clip, + X, + xp.asarray(self.feature_range[0], dtype=X.dtype, device=device_), + xp.asarray(self.feature_range[1], dtype=X.dtype, device=device_), + out=X, + ) + return X + + def inverse_transform(self, X): + """Undo the scaling of X according to feature_range. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data that will be transformed. It cannot be sparse. 
+ + Returns + ------- + Xt : ndarray of shape (n_samples, n_features) + Transformed data. + """ + check_is_fitted(self) + + xp, _ = get_namespace(X) + + X = check_array( + X, + copy=self.copy, + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + X -= self.min_ + X /= self.scale_ + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + tags.array_api_support = True + return tags + + +@validate_params( + { + "X": ["array-like"], + "axis": [Options(Integral, {0, 1})], + }, + prefer_skip_nested_validation=False, +) +def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): + """Transform features by scaling each feature to a given range. + + This estimator scales and translates each feature individually such + that it is in the given range on the training set, i.e. between + zero and one. + + The transformation is given by (when ``axis=0``):: + + X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) + X_scaled = X_std * (max - min) + min + + where min, max = feature_range. + + The transformation is calculated as (when ``axis=0``):: + + X_scaled = scale * X + min - X.min(axis=0) * scale + where scale = (max - min) / (X.max(axis=0) - X.min(axis=0)) + + This transformation is often used as an alternative to zero mean, + unit variance scaling. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.17 + *minmax_scale* function interface + to :class:`~sklearn.preprocessing.MinMaxScaler`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + + feature_range : tuple (min, max), default=(0, 1) + Desired range of transformed data. + + axis : {0, 1}, default=0 + Axis used to scale along. If 0, independently scale each feature, + otherwise (if 1) scale each sample. + + copy : bool, default=True + If False, try to avoid a copy and scale in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_tr : ndarray of shape (n_samples, n_features) + The transformed data. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.minmax_scale` unless you know + what you are doing. A common mistake is to apply it to the entire data + *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.MinMaxScaler` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking: `pipe = make_pipeline(MinMaxScaler(), LogisticRegression())`. + + See Also + -------- + MinMaxScaler : Performs scaling to a given range using the Transformer + API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + Examples + -------- + >>> from sklearn.preprocessing import minmax_scale + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> minmax_scale(X, axis=0) # scale each column independently + array([[0., 1., 1.], + [1., 0., 0.]]) + >>> minmax_scale(X, axis=1) # scale each row independently + array([[0. , 0.75, 1. ], + [0. , 0.5 , 1. ]]) + """ + # Unlike the scaler object, this function allows 1d input. 
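+    # A 1d input is treated as a single feature: it is reshaped to a
+    # column below, scaled, and raveled back to 1d before returning.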
+ # If copy is required, it will be done inside the scaler object. + X = check_array( + X, + copy=False, + ensure_2d=False, + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + original_ndim = X.ndim + + if original_ndim == 1: + X = X.reshape(X.shape[0], 1) + + s = MinMaxScaler(feature_range=feature_range, copy=copy) + if axis == 0: + X = s.fit_transform(X) + else: + X = s.fit_transform(X.T).T + + if original_ndim == 1: + X = X.ravel() + + return X + + +class StandardScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Standardize features by removing the mean and scaling to unit variance. + + The standard score of a sample `x` is calculated as: + + .. code-block:: text + + z = (x - u) / s + + where `u` is the mean of the training samples or zero if `with_mean=False`, + and `s` is the standard deviation of the training samples or one if + `with_std=False`. + + Centering and scaling happen independently on each feature by computing + the relevant statistics on the samples in the training set. Mean and + standard deviation are then stored to be used on later data using + :meth:`transform`. + + Standardization of a dataset is a common requirement for many + machine learning estimators: they might behave badly if the + individual features do not more or less look like standard normally + distributed data (e.g. Gaussian with 0 mean and unit variance). + + For instance many elements used in the objective function of + a learning algorithm (such as the RBF kernel of Support Vector + Machines or the L1 and L2 regularizers of linear models) assume that + all features are centered around 0 and have variance in the same + order. If a feature has a variance that is orders of magnitude larger + than others, it might dominate the objective function and make the + estimator unable to learn from other features correctly as expected. + + `StandardScaler` is sensitive to outliers, and the features may scale + differently from each other in the presence of outliers. For an example + visualization, refer to :ref:`Compare StandardScaler with other scalers + `. + + This scaler can also be applied to sparse CSR or CSC matrices by passing + `with_mean=False` to avoid breaking the sparsity structure of the data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + copy : bool, default=True + If False, try to avoid a copy and do inplace scaling instead. + This is not guaranteed to always work inplace; e.g. if the data is + not a NumPy array or scipy.sparse CSR matrix, a copy may still be + returned. + + with_mean : bool, default=True + If True, center the data before scaling. + This does not work (and will raise an exception) when attempted on + sparse matrices, because centering them entails building a dense + matrix which in common use cases is likely to be too large to fit in + memory. + + with_std : bool, default=True + If True, scale the data to unit variance (or equivalently, + unit standard deviation). + + Attributes + ---------- + scale_ : ndarray of shape (n_features,) or None + Per feature relative scaling of the data to achieve zero mean and unit + variance. Generally this is calculated using `np.sqrt(var_)`. If a + variance is zero, we can't achieve unit variance, and the data is left + as-is, giving a scaling factor of 1. `scale_` is equal to `None` + when `with_std=False`. + + .. versionadded:: 0.17 + *scale_* + + mean_ : ndarray of shape (n_features,) or None + The mean value for each feature in the training set. 
+ Equal to ``None`` when ``with_mean=False`` and ``with_std=False``. + + var_ : ndarray of shape (n_features,) or None + The variance for each feature in the training set. Used to compute + `scale_`. Equal to ``None`` when ``with_mean=False`` and + ``with_std=False``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_seen_ : int or ndarray of shape (n_features,) + The number of samples processed by the estimator for each feature. + If there are no missing samples, the ``n_samples_seen`` will be an + integer, otherwise it will be an array of dtype int. If + `sample_weights` are used it will be a float (if no missing data) + or an array of dtype float that sums the weights seen so far. + Will be reset on new calls to fit, but increments across + ``partial_fit`` calls. + + See Also + -------- + scale : Equivalent function without the estimator API. + + :class:`~sklearn.decomposition.PCA` : Further removes the linear + correlation across features with 'whiten=True'. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + We use a biased estimator for the standard deviation, equivalent to + `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to + affect model performance. + + Examples + -------- + >>> from sklearn.preprocessing import StandardScaler + >>> data = [[0, 0], [0, 0], [1, 1], [1, 1]] + >>> scaler = StandardScaler() + >>> print(scaler.fit(data)) + StandardScaler() + >>> print(scaler.mean_) + [0.5 0.5] + >>> print(scaler.transform(data)) + [[-1. -1.] + [-1. -1.] + [ 1. 1.] + [ 1. 1.]] + >>> print(scaler.transform([[2, 2]])) + [[3. 3.]] + """ + + _parameter_constraints: dict = { + "copy": ["boolean"], + "with_mean": ["boolean"], + "with_std": ["boolean"], + } + + def __init__(self, *, copy=True, with_mean=True, with_std=True): + self.with_mean = with_mean + self.with_std = with_std + self.copy = copy + + def _reset(self): + """Reset internal data-dependent state of the scaler, if necessary. + + __init__ parameters are not touched. + """ + # Checking one attribute is enough, because they are all set together + # in partial_fit + if hasattr(self, "scale_"): + del self.scale_ + del self.n_samples_seen_ + del self.mean_ + del self.var_ + + def fit(self, X, y=None, sample_weight=None): + """Compute the mean and std to be used for later scaling. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + sample_weight : array-like of shape (n_samples,), default=None + Individual weights for each sample. + + .. versionadded:: 0.24 + parameter *sample_weight* support to StandardScaler. + + Returns + ------- + self : object + Fitted scaler. + """ + # Reset internal state before fitting + self._reset() + return self.partial_fit(X, y, sample_weight) + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None, sample_weight=None): + """Online computation of mean and std on X for later scaling. + + All of X is processed as a single batch. 
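+        Statistics from earlier calls are merged with those of the new batch,
+        so repeated calls are equivalent, up to floating-point rounding, to a
+        single fit on the concatenated data.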
This is intended for cases + when :meth:`fit` is not feasible due to very large number of + `n_samples` or because X is read from a continuous stream. + + The algorithm for incremental mean and std is given in Equation 1.5a,b + in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Algorithms + for computing the sample variance: Analysis and recommendations." + The American Statistician 37.3 (1983): 242-247: + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + sample_weight : array-like of shape (n_samples,), default=None + Individual weights for each sample. + + .. versionadded:: 0.24 + parameter *sample_weight* support to StandardScaler. + + Returns + ------- + self : object + Fitted scaler. + """ + first_call = not hasattr(self, "n_samples_seen_") + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + reset=first_call, + ) + n_features = X.shape[1] + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + # Even in the case of `with_mean=False`, we update the mean anyway + # This is needed for the incremental computation of the var + # See incr_mean_variance_axis and _incremental_mean_variance_axis + + # if n_samples_seen_ is an integer (i.e. no missing values), we need to + # transform it to a NumPy array of shape (n_features,) required by + # incr_mean_variance_axis and _incremental_variance_axis + dtype = np.int64 if sample_weight is None else X.dtype + if not hasattr(self, "n_samples_seen_"): + self.n_samples_seen_ = np.zeros(n_features, dtype=dtype) + elif np.size(self.n_samples_seen_) == 1: + self.n_samples_seen_ = np.repeat(self.n_samples_seen_, X.shape[1]) + self.n_samples_seen_ = self.n_samples_seen_.astype(dtype, copy=False) + + if sparse.issparse(X): + if self.with_mean: + raise ValueError( + "Cannot center sparse matrices: pass `with_mean=False` " + "instead. See docstring for motivation and alternatives." 
+                )
+            sparse_constructor = (
+                sparse.csr_matrix if X.format == "csr" else sparse.csc_matrix
+            )
+
+            if self.with_std:
+                # First pass
+                if not hasattr(self, "scale_"):
+                    self.mean_, self.var_, self.n_samples_seen_ = mean_variance_axis(
+                        X, axis=0, weights=sample_weight, return_sum_weights=True
+                    )
+                # Next passes
+                else:
+                    (
+                        self.mean_,
+                        self.var_,
+                        self.n_samples_seen_,
+                    ) = incr_mean_variance_axis(
+                        X,
+                        axis=0,
+                        last_mean=self.mean_,
+                        last_var=self.var_,
+                        last_n=self.n_samples_seen_,
+                        weights=sample_weight,
+                    )
+                # We force the mean and variance to float64 for large arrays
+                # See https://github.com/scikit-learn/scikit-learn/pull/12338
+                self.mean_ = self.mean_.astype(np.float64, copy=False)
+                self.var_ = self.var_.astype(np.float64, copy=False)
+            else:
+                self.mean_ = None  # as with_mean must be False for sparse
+                self.var_ = None
+                weights = _check_sample_weight(sample_weight, X)
+                sum_weights_nan = weights @ sparse_constructor(
+                    (np.isnan(X.data), X.indices, X.indptr), shape=X.shape
+                )
+                self.n_samples_seen_ += (np.sum(weights) - sum_weights_nan).astype(
+                    dtype
+                )
+        else:
+            # First pass
+            if not hasattr(self, "scale_"):
+                self.mean_ = 0.0
+                if self.with_std:
+                    self.var_ = 0.0
+                else:
+                    self.var_ = None
+
+            if not self.with_mean and not self.with_std:
+                self.mean_ = None
+                self.var_ = None
+                self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)
+
+            else:
+                self.mean_, self.var_, self.n_samples_seen_ = _incremental_mean_and_var(
+                    X,
+                    self.mean_,
+                    self.var_,
+                    self.n_samples_seen_,
+                    sample_weight=sample_weight,
+                )
+
+        # for backward-compatibility, reduce n_samples_seen_ to an integer
+        # if the number of samples is the same for each feature (i.e. no
+        # missing values)
+        if np.ptp(self.n_samples_seen_) == 0:
+            self.n_samples_seen_ = self.n_samples_seen_[0]
+
+        if self.with_std:
+            # Extract the list of near constant features on the raw variances,
+            # before taking the square root.
+            constant_mask = _is_constant_feature(
+                self.var_, self.mean_, self.n_samples_seen_
+            )
+            self.scale_ = _handle_zeros_in_scale(
+                np.sqrt(self.var_), copy=False, constant_mask=constant_mask
+            )
+        else:
+            self.scale_ = None
+
+        return self
+
+    def transform(self, X, copy=None):
+        """Perform standardization by centering and scaling.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The data used to scale along the features axis.
+        copy : bool, default=None
+            Copy the input X or not.
+
+        Returns
+        -------
+        X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
+            Transformed array.
+        """
+        check_is_fitted(self)
+
+        copy = copy if copy is not None else self.copy
+        X = validate_data(
+            self,
+            X,
+            reset=False,
+            accept_sparse="csr",
+            copy=copy,
+            dtype=FLOAT_DTYPES,
+            force_writeable=True,
+            ensure_all_finite="allow-nan",
+        )
+
+        if sparse.issparse(X):
+            if self.with_mean:
+                raise ValueError(
+                    "Cannot center sparse matrices: pass `with_mean=False` "
+                    "instead. See docstring for motivation and alternatives."
+                )
+            if self.scale_ is not None:
+                inplace_column_scale(X, 1 / self.scale_)
+        else:
+            if self.with_mean:
+                X -= self.mean_
+            if self.with_std:
+                X /= self.scale_
+        return X
+
+    def inverse_transform(self, X, copy=None):
+        """Scale back the data to the original representation.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The data used to scale along the features axis.
+        copy : bool, default=None
+            Copy the input X or not.
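+            If ``None``, the value of ``self.copy`` set at construction is
+            used.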
+
+        Returns
+        -------
+        X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
+            Transformed array.
+        """
+        check_is_fitted(self)
+
+        copy = copy if copy is not None else self.copy
+        X = check_array(
+            X,
+            accept_sparse="csr",
+            copy=copy,
+            dtype=FLOAT_DTYPES,
+            force_writeable=True,
+            ensure_all_finite="allow-nan",
+        )
+
+        if sparse.issparse(X):
+            if self.with_mean:
+                raise ValueError(
+                    "Cannot uncenter sparse matrices: pass `with_mean=False` "
+                    "instead. See docstring for motivation and alternatives."
+                )
+            if self.scale_ is not None:
+                inplace_column_scale(X, self.scale_)
+        else:
+            if self.with_std:
+                X *= self.scale_
+            if self.with_mean:
+                X += self.mean_
+        return X
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.allow_nan = True
+        tags.transformer_tags.preserves_dtype = ["float64", "float32"]
+        return tags
+
+
+class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
+    """Scale each feature by its maximum absolute value.
+
+    This estimator scales each feature individually such
+    that the maximal absolute value of each feature in the
+    training set will be 1.0. It does not shift/center the data, and
+    thus does not destroy any sparsity.
+
+    This scaler can also be applied to sparse CSR or CSC matrices.
+
+    `MaxAbsScaler` doesn't reduce the effect of outliers; it only linearly
+    scales them down. For an example visualization, refer to :ref:`Compare
+    MaxAbsScaler with other scalers `.
+
+    .. versionadded:: 0.17
+
+    Parameters
+    ----------
+    copy : bool, default=True
+        Set to False to perform inplace scaling and avoid a copy (if the input
+        is already a numpy array).
+
+    Attributes
+    ----------
+    scale_ : ndarray of shape (n_features,)
+        Per feature relative scaling of the data.
+
+        .. versionadded:: 0.17
+           *scale_* attribute.
+
+    max_abs_ : ndarray of shape (n_features,)
+        Per feature maximum absolute value.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    n_samples_seen_ : int
+        The number of samples processed by the estimator. Will be reset on
+        new calls to fit, but increments across ``partial_fit`` calls.
+
+    See Also
+    --------
+    maxabs_scale : Equivalent function without the estimator API.
+
+    Notes
+    -----
+    NaNs are treated as missing values: disregarded in fit, and maintained in
+    transform.
+
+    Examples
+    --------
+    >>> from sklearn.preprocessing import MaxAbsScaler
+    >>> X = [[ 1., -1., 2.],
+    ...      [ 2., 0., 0.],
+    ...      [ 0., 1., -1.]]
+    >>> transformer = MaxAbsScaler().fit(X)
+    >>> transformer
+    MaxAbsScaler()
+    >>> transformer.transform(X)
+    array([[ 0.5, -1. , 1. ],
+           [ 1. , 0. , 0. ],
+           [ 0. , 1. , -0.5]])
+    """
+
+    _parameter_constraints: dict = {"copy": ["boolean"]}
+
+    def __init__(self, *, copy=True):
+        self.copy = copy
+
+    def _reset(self):
+        """Reset internal data-dependent state of the scaler, if necessary.
+
+        __init__ parameters are not touched.
+        """
+        # Checking one attribute is enough, because they are all set together
+        # in partial_fit
+        if hasattr(self, "scale_"):
+            del self.scale_
+            del self.n_samples_seen_
+            del self.max_abs_
+
+    def fit(self, X, y=None):
+        """Compute the maximum absolute value to be used for later scaling.
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the per-feature minimum and maximum + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + # Reset internal state before fitting + self._reset() + return self.partial_fit(X, y) + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None): + """Online computation of max absolute value of X for later scaling. + + All of X is processed as a single batch. This is intended for cases + when :meth:`fit` is not feasible due to very large number of + `n_samples` or because X is read from a continuous stream. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + xp, _ = get_namespace(X) + + first_pass = not hasattr(self, "n_samples_seen_") + X = validate_data( + self, + X, + reset=first_pass, + accept_sparse=("csr", "csc"), + dtype=_array_api.supported_float_dtypes(xp), + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + mins, maxs = min_max_axis(X, axis=0, ignore_nan=True) + max_abs = np.maximum(np.abs(mins), np.abs(maxs)) + else: + max_abs = _array_api._nanmax(xp.abs(X), axis=0, xp=xp) + + if first_pass: + self.n_samples_seen_ = X.shape[0] + else: + max_abs = xp.maximum(self.max_abs_, max_abs) + self.n_samples_seen_ += X.shape[0] + + self.max_abs_ = max_abs + self.scale_ = _handle_zeros_in_scale(max_abs, copy=True) + return self + + def transform(self, X): + """Scale the data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data that should be scaled. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + check_is_fitted(self) + + xp, _ = get_namespace(X) + + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + copy=self.copy, + reset=False, + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + inplace_column_scale(X, 1.0 / self.scale_) + else: + X /= self.scale_ + return X + + def inverse_transform(self, X): + """Scale back the data to the original representation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data that should be transformed back. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + check_is_fitted(self) + + xp, _ = get_namespace(X) + + X = check_array( + X, + accept_sparse=("csr", "csc"), + copy=self.copy, + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + inplace_column_scale(X, self.scale_) + else: + X *= self.scale_ + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "axis": [Options(Integral, {0, 1})], + }, + prefer_skip_nested_validation=False, +) +def maxabs_scale(X, *, axis=0, copy=True): + """Scale each feature to the [-1, 1] range without breaking the sparsity. 
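+
+    This is a functional shortcut: internally a
+    :class:`~sklearn.preprocessing.MaxAbsScaler` is fitted and applied in a
+    single step.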
+ + This estimator scales each feature individually such + that the maximal absolute value of each feature in the + training set will be 1.0. + + This scaler can also be applied to sparse CSR or CSC matrices. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data. + + axis : {0, 1}, default=0 + Axis used to scale along. If 0, independently scale each feature, + otherwise (if 1) scale each sample. + + copy : bool, default=True + If False, try to avoid a copy and scale in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.maxabs_scale` unless you know + what you are doing. A common mistake is to apply it to the entire data + *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.MaxAbsScaler` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking: `pipe = make_pipeline(MaxAbsScaler(), LogisticRegression())`. + + See Also + -------- + MaxAbsScaler : Performs scaling to the [-1, 1] range using + the Transformer API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + NaNs are treated as missing values: disregarded to compute the statistics, + and maintained during the data transformation. + + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + Examples + -------- + >>> from sklearn.preprocessing import maxabs_scale + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> maxabs_scale(X, axis=0) # scale each column independently + array([[-1. , 1. , 1. ], + [-0.5, 0. , 0.5]]) + >>> maxabs_scale(X, axis=1) # scale each row independently + array([[-1. , 0.5, 1. ], + [-1. , 0. , 1. ]]) + """ + # Unlike the scaler object, this function allows 1d input. + + # If copy is required, it will be done inside the scaler object. + X = check_array( + X, + accept_sparse=("csr", "csc"), + copy=False, + ensure_2d=False, + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + original_ndim = X.ndim + + if original_ndim == 1: + X = X.reshape(X.shape[0], 1) + + s = MaxAbsScaler(copy=copy) + if axis == 0: + X = s.fit_transform(X) + else: + X = s.fit_transform(X.T).T + + if original_ndim == 1: + X = X.ravel() + + return X + + +class RobustScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Scale features using statistics that are robust to outliers. + + This Scaler removes the median and scales the data according to + the quantile range (defaults to IQR: Interquartile Range). + The IQR is the range between the 1st quartile (25th quantile) + and the 3rd quartile (75th quantile). + + Centering and scaling happen independently on each feature by + computing the relevant statistics on the samples in the training + set. Median and interquartile range are then stored to be used on + later data using the :meth:`transform` method. + + Standardization of a dataset is a common preprocessing for many machine + learning estimators. Typically this is done by removing the mean and + scaling to unit variance. 
However, outliers can often influence the sample
+    mean / variance in a negative way. In such cases, using the median and the
+    interquartile range often gives better results. For an example visualization
+    and comparison to other scalers, refer to :ref:`Compare RobustScaler with
+    other scalers `.
+
+    .. versionadded:: 0.17
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    with_centering : bool, default=True
+        If `True`, center the data before scaling.
+        This will cause :meth:`transform` to raise an exception when attempted
+        on sparse matrices, because centering them entails building a dense
+        matrix which in common use cases is likely to be too large to fit in
+        memory.
+
+    with_scaling : bool, default=True
+        If `True`, scale the data to interquartile range.
+
+    quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0, \
+        default=(25.0, 75.0)
+        Quantile range used to calculate `scale_`. By default this is equal to
+        the IQR, i.e., `q_min` is the first quartile and `q_max` is the third
+        quartile.
+
+        .. versionadded:: 0.18
+
+    copy : bool, default=True
+        If `False`, try to avoid a copy and do inplace scaling instead.
+        This is not guaranteed to always work inplace; e.g. if the data is
+        not a NumPy array or scipy.sparse CSR matrix, a copy may still be
+        returned.
+
+    unit_variance : bool, default=False
+        If `True`, scale data so that normally distributed features have a
+        variance of 1. In general, if the difference between the x-values of
+        `q_max` and `q_min` for a standard normal distribution is greater
+        than 1, the dataset will be scaled down. If less than 1, the dataset
+        will be scaled up.
+
+        .. versionadded:: 0.24
+
+    Attributes
+    ----------
+    center_ : array of floats
+        The median value for each feature in the training set.
+
+    scale_ : array of floats
+        The (scaled) interquartile range for each feature in the training set.
+
+        .. versionadded:: 0.17
+           *scale_* attribute.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    robust_scale : Equivalent function without the estimator API.
+    sklearn.decomposition.PCA : Further removes the linear correlation across
+        features with 'whiten=True'.
+
+    Notes
+    -----
+
+    https://en.wikipedia.org/wiki/Median
+    https://en.wikipedia.org/wiki/Interquartile_range
+
+    Examples
+    --------
+    >>> from sklearn.preprocessing import RobustScaler
+    >>> X = [[ 1., -2., 2.],
+    ...      [ -2., 1., 3.],
+    ...      [ 4., 1., -2.]]
+    >>> transformer = RobustScaler().fit(X)
+    >>> transformer
+    RobustScaler()
+    >>> transformer.transform(X)
+    array([[ 0. , -2. , 0. ],
+           [-1. , 0. , 0.4],
+           [ 1. , 0. , -1.6]])
+    """
+
+    _parameter_constraints: dict = {
+        "with_centering": ["boolean"],
+        "with_scaling": ["boolean"],
+        "quantile_range": [tuple],
+        "copy": ["boolean"],
+        "unit_variance": ["boolean"],
+    }
+
+    def __init__(
+        self,
+        *,
+        with_centering=True,
+        with_scaling=True,
+        quantile_range=(25.0, 75.0),
+        copy=True,
+        unit_variance=False,
+    ):
+        self.with_centering = with_centering
+        self.with_scaling = with_scaling
+        self.quantile_range = quantile_range
+        self.unit_variance = unit_variance
+        self.copy = copy
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None):
+        """Compute the median and quantiles to be used for scaling.
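+
+        The per-feature median becomes ``center_`` and the (possibly
+        rescaled) ``quantile_range`` spread becomes ``scale_``; both are
+        reused by :meth:`transform`.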
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the median and quantiles + used for later scaling along the features axis. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Fitted scaler. + """ + # at fit, convert sparse matrices to csc for optimized computation of + # the quantiles + X = validate_data( + self, + X, + accept_sparse="csc", + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + + q_min, q_max = self.quantile_range + if not 0 <= q_min <= q_max <= 100: + raise ValueError("Invalid quantile range: %s" % str(self.quantile_range)) + + if self.with_centering: + if sparse.issparse(X): + raise ValueError( + "Cannot center sparse matrices: use `with_centering=False`" + " instead. See docstring for motivation and alternatives." + ) + self.center_ = np.nanmedian(X, axis=0) + else: + self.center_ = None + + if self.with_scaling: + quantiles = [] + for feature_idx in range(X.shape[1]): + if sparse.issparse(X): + column_nnz_data = X.data[ + X.indptr[feature_idx] : X.indptr[feature_idx + 1] + ] + column_data = np.zeros(shape=X.shape[0], dtype=X.dtype) + column_data[: len(column_nnz_data)] = column_nnz_data + else: + column_data = X[:, feature_idx] + + quantiles.append(np.nanpercentile(column_data, self.quantile_range)) + + quantiles = np.transpose(quantiles) + + self.scale_ = quantiles[1] - quantiles[0] + self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False) + if self.unit_variance: + adjust = stats.norm.ppf(q_max / 100.0) - stats.norm.ppf(q_min / 100.0) + self.scale_ = self.scale_ / adjust + else: + self.scale_ = None + + return self + + def transform(self, X): + """Center and scale the data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the specified axis. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + check_is_fitted(self) + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + copy=self.copy, + dtype=FLOAT_DTYPES, + force_writeable=True, + reset=False, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + if self.with_scaling: + inplace_column_scale(X, 1.0 / self.scale_) + else: + if self.with_centering: + X -= self.center_ + if self.with_scaling: + X /= self.scale_ + return X + + def inverse_transform(self, X): + """Scale back the data to the original representation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The rescaled data to be transformed back. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. 
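+
+        Notes
+        -----
+        Inverts :meth:`transform`: the data is multiplied back by ``scale_``
+        (if scaling was enabled) and shifted back by ``center_`` (if
+        centering was enabled).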
+ """ + check_is_fitted(self) + X = check_array( + X, + accept_sparse=("csr", "csc"), + copy=self.copy, + dtype=FLOAT_DTYPES, + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + if self.with_scaling: + inplace_column_scale(X, self.scale_) + else: + if self.with_scaling: + X *= self.scale_ + if self.with_centering: + X += self.center_ + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +@validate_params( + {"X": ["array-like", "sparse matrix"], "axis": [Options(Integral, {0, 1})]}, + prefer_skip_nested_validation=False, +) +def robust_scale( + X, + *, + axis=0, + with_centering=True, + with_scaling=True, + quantile_range=(25.0, 75.0), + copy=True, + unit_variance=False, +): + """Standardize a dataset along any axis. + + Center to the median and component wise scale + according to the interquartile range. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_sample, n_features) + The data to center and scale. + + axis : int, default=0 + Axis used to compute the medians and IQR along. If 0, + independently scale each feature, otherwise (if 1) scale + each sample. + + with_centering : bool, default=True + If `True`, center the data before scaling. + + with_scaling : bool, default=True + If `True`, scale the data to unit variance (or equivalently, + unit standard deviation). + + quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0,\ + default=(25.0, 75.0) + Quantile range used to calculate `scale_`. By default this is equal to + the IQR, i.e., `q_min` is the first quantile and `q_max` is the third + quantile. + + .. versionadded:: 0.18 + + copy : bool, default=True + If False, try to avoid a copy and scale in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + unit_variance : bool, default=False + If `True`, scale data so that normally distributed features have a + variance of 1. In general, if the difference between the x-values of + `q_max` and `q_min` for a standard normal distribution is greater + than 1, the dataset will be scaled down. If less than 1, the dataset + will be scaled up. + + .. versionadded:: 0.24 + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + RobustScaler : Performs centering and scaling using the Transformer API + (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + This implementation will refuse to center scipy.sparse matrices + since it would make them non-sparse and would potentially crash the + program with memory exhaustion problems. + + Instead the caller is expected to either set explicitly + `with_centering=False` (in that case, only variance scaling will be + performed on the features of the CSR matrix) or to call `X.toarray()` + if he/she expects the materialized dense array to fit in memory. + + To avoid memory copy the caller should pass a CSR matrix. + + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.robust_scale` unless you know + what you are doing. A common mistake is to apply it to the entire data + *before* splitting into training and test sets. 
This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.RobustScaler` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking: `pipe = make_pipeline(RobustScaler(), LogisticRegression())`. + + Examples + -------- + >>> from sklearn.preprocessing import robust_scale + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> robust_scale(X, axis=0) # scale each column independently + array([[-1., 1., 1.], + [ 1., -1., -1.]]) + >>> robust_scale(X, axis=1) # scale each row independently + array([[-1.5, 0. , 0.5], + [-1. , 0. , 1. ]]) + """ + X = check_array( + X, + accept_sparse=("csr", "csc"), + copy=False, + ensure_2d=False, + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + original_ndim = X.ndim + + if original_ndim == 1: + X = X.reshape(X.shape[0], 1) + + s = RobustScaler( + with_centering=with_centering, + with_scaling=with_scaling, + quantile_range=quantile_range, + unit_variance=unit_variance, + copy=copy, + ) + if axis == 0: + X = s.fit_transform(X) + else: + X = s.fit_transform(X.T).T + + if original_ndim == 1: + X = X.ravel() + + return X + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "norm": [StrOptions({"l1", "l2", "max"})], + "axis": [Options(Integral, {0, 1})], + "copy": ["boolean"], + "return_norm": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def normalize(X, norm="l2", *, axis=1, copy=True, return_norm=False): + """Scale input vectors individually to unit norm (vector length). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to normalize, element by element. + scipy.sparse matrices should be in CSR format to avoid an + un-necessary copy. + + norm : {'l1', 'l2', 'max'}, default='l2' + The norm to use to normalize each non zero sample (or each non-zero + feature if axis is 0). + + axis : {0, 1}, default=1 + Define axis used to normalize the data along. If 1, independently + normalize each sample, otherwise (if 0) normalize each feature. + + copy : bool, default=True + If False, try to avoid a copy and normalize in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + return_norm : bool, default=False + Whether to return the computed norms. + + Returns + ------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Normalized input X. + + norms : ndarray of shape (n_samples, ) if axis=1 else (n_features, ) + An array of norms along given axis for X. + When X is sparse, a NotImplementedError will be raised + for norm 'l1' or 'l2'. + + See Also + -------- + Normalizer : Performs normalization using the Transformer API + (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + Examples + -------- + >>> from sklearn.preprocessing import normalize + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> normalize(X, norm="l1") # L1 normalization each row independently + array([[-0.4, 0.2, 0.4], + [-0.5, 0. , 0.5]]) + >>> normalize(X, norm="l2") # L2 normalization each row independently + array([[-0.66..., 0.33..., 0.66...], + [-0.70..., 0. 
, 0.70...]]) + """ + if axis == 0: + sparse_format = "csc" + else: # axis == 1: + sparse_format = "csr" + + xp, _ = get_namespace(X) + + X = check_array( + X, + accept_sparse=sparse_format, + copy=copy, + estimator="the normalize function", + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ) + if axis == 0: + X = X.T + + if sparse.issparse(X): + if return_norm and norm in ("l1", "l2"): + raise NotImplementedError( + "return_norm=True is not implemented " + "for sparse matrices with norm 'l1' " + "or norm 'l2'" + ) + if norm == "l1": + inplace_csr_row_normalize_l1(X) + elif norm == "l2": + inplace_csr_row_normalize_l2(X) + elif norm == "max": + mins, maxes = min_max_axis(X, 1) + norms = np.maximum(abs(mins), maxes) + norms_elementwise = norms.repeat(np.diff(X.indptr)) + mask = norms_elementwise != 0 + X.data[mask] /= norms_elementwise[mask] + else: + if norm == "l1": + norms = xp.sum(xp.abs(X), axis=1) + elif norm == "l2": + norms = row_norms(X) + elif norm == "max": + norms = xp.max(xp.abs(X), axis=1) + norms = _handle_zeros_in_scale(norms, copy=False) + X /= norms[:, None] + + if axis == 0: + X = X.T + + if return_norm: + return X, norms + else: + return X + + +class Normalizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Normalize samples individually to unit norm. + + Each sample (i.e. each row of the data matrix) with at least one + non zero component is rescaled independently of other samples so + that its norm (l1, l2 or inf) equals one. + + This transformer is able to work both with dense numpy arrays and + scipy.sparse matrix (use CSR format if you want to avoid the burden of + a copy / conversion). + + Scaling inputs to unit norms is a common operation for text + classification or clustering for instance. For instance the dot + product of two l2-normalized TF-IDF vectors is the cosine similarity + of the vectors and is the base similarity metric for the Vector + Space Model commonly used by the Information Retrieval community. + + For an example visualization, refer to :ref:`Compare Normalizer with other + scalers `. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + norm : {'l1', 'l2', 'max'}, default='l2' + The norm to use to normalize each non zero sample. If norm='max' + is used, values will be rescaled by the maximum of the absolute + values. + + copy : bool, default=True + Set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array or a scipy.sparse + CSR matrix). + + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + normalize : Equivalent function without the estimator API. + + Notes + ----- + This estimator is :term:`stateless` and does not need to be fitted. + However, we recommend to call :meth:`fit_transform` instead of + :meth:`transform`, as parameter validation is only performed in + :meth:`fit`. + + Examples + -------- + >>> from sklearn.preprocessing import Normalizer + >>> X = [[4, 1, 2, 2], + ... [1, 3, 9, 3], + ... [5, 7, 5, 1]] + >>> transformer = Normalizer().fit(X) # fit does nothing. 
+ >>> transformer + Normalizer() + >>> transformer.transform(X) + array([[0.8, 0.2, 0.4, 0.4], + [0.1, 0.3, 0.9, 0.3], + [0.5, 0.7, 0.5, 0.1]]) + """ + + _parameter_constraints: dict = { + "norm": [StrOptions({"l1", "l2", "max"})], + "copy": ["boolean"], + } + + def __init__(self, norm="l2", *, copy=True): + self.norm = norm + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Only validates estimator's parameters. + + This method allows to: (i) validate the estimator's parameters and + (ii) be consistent with the scikit-learn transformer API. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to estimate the normalization parameters. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Fitted transformer. + """ + validate_data(self, X, accept_sparse="csr") + return self + + def transform(self, X, copy=None): + """Scale each non zero row of X to unit norm. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to normalize, row by row. scipy.sparse matrices should be + in CSR format to avoid an un-necessary copy. + + copy : bool, default=None + Copy the input X or not. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + copy = copy if copy is not None else self.copy + X = validate_data( + self, X, accept_sparse="csr", force_writeable=True, copy=copy, reset=False + ) + return normalize(X, norm=self.norm, axis=1, copy=False) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.requires_fit = False + tags.array_api_support = True + return tags + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "threshold": [Interval(Real, None, None, closed="neither")], + "copy": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def binarize(X, *, threshold=0.0, copy=True): + """Boolean thresholding of array-like or scipy.sparse matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to binarize, element by element. + scipy.sparse matrices should be in CSR or CSC format to avoid an + un-necessary copy. + + threshold : float, default=0.0 + Feature values below or equal to this are replaced by 0, above it by 1. + Threshold may not be less than 0 for operations on sparse matrices. + + copy : bool, default=True + If False, try to avoid a copy and binarize in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an object dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + Binarizer : Performs binarization using the Transformer API + (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`). 
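+
+    Notes
+    -----
+    Values strictly greater than `threshold` map to 1; values equal to it map
+    to 0. A one-line boundary-case sketch (an illustrative addition):
+    ``binarize([[0.5]], threshold=0.5)`` returns ``array([[0.]])``.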
+ + Examples + -------- + >>> from sklearn.preprocessing import binarize + >>> X = [[0.4, 0.6, 0.5], [0.6, 0.1, 0.2]] + >>> binarize(X, threshold=0.5) + array([[0., 1., 0.], + [1., 0., 0.]]) + """ + X = check_array(X, accept_sparse=["csr", "csc"], force_writeable=True, copy=copy) + if sparse.issparse(X): + if threshold < 0: + raise ValueError("Cannot binarize a sparse matrix with threshold < 0") + cond = X.data > threshold + not_cond = np.logical_not(cond) + X.data[cond] = 1 + X.data[not_cond] = 0 + X.eliminate_zeros() + else: + cond = X > threshold + not_cond = np.logical_not(cond) + X[cond] = 1 + X[not_cond] = 0 + return X + + +class Binarizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Binarize data (set feature values to 0 or 1) according to a threshold. + + Values greater than the threshold map to 1, while values less than + or equal to the threshold map to 0. With the default threshold of 0, + only positive values map to 1. + + Binarization is a common operation on text count data where the + analyst can decide to only consider the presence or absence of a + feature rather than a quantified number of occurrences for instance. + + It can also be used as a pre-processing step for estimators that + consider boolean random variables (e.g. modelled using the Bernoulli + distribution in a Bayesian setting). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + threshold : float, default=0.0 + Feature values below or equal to this are replaced by 0, above it by 1. + Threshold may not be less than 0 for operations on sparse matrices. + + copy : bool, default=True + Set to False to perform inplace binarization and avoid a copy (if + the input is already a numpy array or a scipy.sparse CSR matrix). + + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + binarize : Equivalent function without the estimator API. + KBinsDiscretizer : Bin continuous data into intervals. + OneHotEncoder : Encode categorical features as a one-hot numeric array. + + Notes + ----- + If the input is a sparse matrix, only the non-zero values are subject + to update by the :class:`Binarizer` class. + + This estimator is :term:`stateless` and does not need to be fitted. + However, we recommend to call :meth:`fit_transform` instead of + :meth:`transform`, as parameter validation is only performed in + :meth:`fit`. + + Examples + -------- + >>> from sklearn.preprocessing import Binarizer + >>> X = [[ 1., -1., 2.], + ... [ 2., 0., 0.], + ... [ 0., 1., -1.]] + >>> transformer = Binarizer().fit(X) # fit does nothing. + >>> transformer + Binarizer() + >>> transformer.transform(X) + array([[1., 0., 1.], + [1., 0., 0.], + [0., 1., 0.]]) + """ + + _parameter_constraints: dict = { + "threshold": [Real], + "copy": ["boolean"], + } + + def __init__(self, *, threshold=0.0, copy=True): + self.threshold = threshold + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Only validates estimator's parameters. + + This method allows to: (i) validate the estimator's parameters and + (ii) be consistent with the scikit-learn transformer API. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data. + + y : None + Ignored. 
+
+        Returns
+        -------
+        self : object
+            Fitted transformer.
+        """
+        validate_data(self, X, accept_sparse="csr")
+        return self
+
+    def transform(self, X, copy=None):
+        """Binarize each element of X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The data to binarize, element by element.
+            scipy.sparse matrices should be in CSR format to avoid an
+            unnecessary copy.
+
+        copy : bool, default=None
+            Copy the input X or not.
+
+        Returns
+        -------
+        X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
+            Transformed array.
+        """
+        copy = copy if copy is not None else self.copy
+        # TODO: This should be refactored because binarize also calls
+        # check_array
+        X = validate_data(
+            self,
+            X,
+            accept_sparse=["csr", "csc"],
+            force_writeable=True,
+            copy=copy,
+            reset=False,
+        )
+        return binarize(X, threshold=self.threshold, copy=False)
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.requires_fit = False
+        return tags
+
+
+class KernelCenterer(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
+    r"""Center an arbitrary kernel matrix :math:`K`.
+
+    Let us define a kernel :math:`K` such that:
+
+    .. math::
+        K(X, Y) = \phi(X) . \phi(Y)^{T}
+
+    :math:`\phi(X)` is a function mapping the rows of :math:`X` to a
+    Hilbert space and :math:`K` is of shape `(n_samples, n_samples)`.
+
+    This class computes :math:`\tilde{K}(X, Y)` such that:
+
+    .. math::
+        \tilde{K}(X, Y) = \tilde{\phi}(X) . \tilde{\phi}(Y)^{T}
+
+    :math:`\tilde{\phi}(X)` is the centered mapped data in the Hilbert
+    space.
+
+    `KernelCenterer` centers the features without explicitly computing the
+    mapping :math:`\phi(\cdot)`. Working with centered kernels is sometimes
+    required in algebraic computations such as the eigendecomposition
+    performed by :class:`~sklearn.decomposition.KernelPCA`, for instance.
+
+    Read more in the :ref:`User Guide `.
+
+    Attributes
+    ----------
+    K_fit_rows_ : ndarray of shape (n_samples,)
+        Average of each column of kernel matrix.
+
+    K_fit_all_ : float
+        Average of kernel matrix.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    sklearn.kernel_approximation.Nystroem : Approximate a kernel map
+        using a subset of the training data.
+
+    References
+    ----------
+    .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller.
+       "Nonlinear component analysis as a kernel eigenvalue problem."
+       Neural computation 10.5 (1998): 1299-1319.
+       `_
+
+    Examples
+    --------
+    >>> from sklearn.preprocessing import KernelCenterer
+    >>> from sklearn.metrics.pairwise import pairwise_kernels
+    >>> X = [[ 1., -2., 2.],
+    ...      [ -2., 1., 3.],
+    ...      [ 4., 1., -2.]]
+    >>> K = pairwise_kernels(X, metric='linear')
+    >>> K
+    array([[  9.,   2.,  -2.],
+           [  2.,  14., -13.],
+           [ -2., -13.,  21.]])
+    >>> transformer = KernelCenterer().fit(K)
+    >>> transformer
+    KernelCenterer()
+    >>> transformer.transform(K)
+    array([[  5.,   0.,  -5.],
+           [  0.,  14., -14.],
+           [ -5., -14.,  19.]])
+    """
+
+    # X is called K in these methods.
+    __metadata_request__transform = {"K": metadata_routing.UNUSED}
+    __metadata_request__fit = {"K": metadata_routing.UNUSED}
+
+    def fit(self, K, y=None):
+        """Fit KernelCenterer.
+
+        Parameters
+        ----------
+        K : ndarray of shape (n_samples, n_samples)
+            Kernel matrix.
+
+        y : None
+            Ignored.
+
+        Returns
+        -------
+        self : object
+            Returns the instance itself.
+        """
+        xp, _ = get_namespace(K)
+
+        K = validate_data(self, K, dtype=_array_api.supported_float_dtypes(xp))
+
+        if K.shape[0] != K.shape[1]:
+            raise ValueError(
+                "Kernel matrix must be a square matrix."
+                " Input is a {}x{} matrix.".format(K.shape[0], K.shape[1])
+            )
+
+        n_samples = K.shape[0]
+        self.K_fit_rows_ = xp.sum(K, axis=0) / n_samples
+        self.K_fit_all_ = xp.sum(self.K_fit_rows_) / n_samples
+        return self
+
+    def transform(self, K, copy=True):
+        """Center kernel matrix.
+
+        Parameters
+        ----------
+        K : ndarray of shape (n_samples1, n_samples2)
+            Kernel matrix.
+
+        copy : bool, default=True
+            Set to False to perform inplace computation.
+
+        Returns
+        -------
+        K_new : ndarray of shape (n_samples1, n_samples2)
+            The centered kernel matrix.
+        """
+        check_is_fitted(self)
+
+        xp, _ = get_namespace(K)
+
+        K = validate_data(
+            self,
+            K,
+            copy=copy,
+            force_writeable=True,
+            dtype=_array_api.supported_float_dtypes(xp),
+            reset=False,
+        )
+
+        K_pred_cols = (xp.sum(K, axis=1) / self.K_fit_rows_.shape[0])[:, None]
+
+        K -= self.K_fit_rows_
+        K -= K_pred_cols
+        K += self.K_fit_all_
+
+        return K
+
+    @property
+    def _n_features_out(self):
+        """Number of transformed output features."""
+        # Used by ClassNamePrefixFeaturesOutMixin. This model preserves the
+        # number of input features but this is not a one-to-one mapping in the
+        # usual sense. Hence the choice not to use OneToOneFeatureMixin to
+        # implement get_feature_names_out for this class.
+        return self.n_features_in_
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.pairwise = True
+        tags.array_api_support = True
+        return tags
+
+
+@validate_params(
+    {
+        "X": ["array-like", "sparse matrix"],
+        "value": [Interval(Real, None, None, closed="neither")],
+    },
+    prefer_skip_nested_validation=True,
+)
+def add_dummy_feature(X, value=1.0):
+    """Augment dataset with an additional dummy feature.
+
+    This is useful for fitting an intercept term with implementations which
+    cannot otherwise fit it directly.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        Data.
+
+    value : float, default=1.0
+        Value to use for the dummy feature.
+
+    Returns
+    -------
+    X : {ndarray, sparse matrix} of shape (n_samples, n_features + 1)
+        Same data with dummy feature added as first column.
+
+    Examples
+    --------
+    >>> from sklearn.preprocessing import add_dummy_feature
+    >>> add_dummy_feature([[0, 1], [1, 0]])
+    array([[1., 0., 1.],
+           [1., 1., 0.]])
+    """
+    X = check_array(X, accept_sparse=["csc", "csr", "coo"], dtype=FLOAT_DTYPES)
+    n_samples, n_features = X.shape
+    shape = (n_samples, n_features + 1)
+    if sparse.issparse(X):
+        if X.format == "coo":
+            # Shift columns to the right.
+            col = X.col + 1
+            # Column indices of dummy feature are 0 everywhere.
+            col = np.concatenate((np.zeros(n_samples), col))
+            # Row indices of dummy feature are 0, ..., n_samples-1.
+            row = np.concatenate((np.arange(n_samples), X.row))
+            # Prepend the dummy feature n_samples times.
+            data = np.concatenate((np.full(n_samples, value), X.data))
+            return sparse.coo_matrix((data, (row, col)), shape)
+        elif X.format == "csc":
+            # Shift index pointers since we need to add n_samples elements.
+            indptr = X.indptr + n_samples
+            # indptr[0] must be 0.
+            indptr = np.concatenate((np.array([0]), indptr))
+            # Row indices of dummy feature are 0, ..., n_samples-1.
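+            # (The original row indices follow unchanged below: prepending a
+            # column only shifts the column bookkeeping, never the rows.)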
+            indices = np.concatenate((np.arange(n_samples), X.indices))
+            # Prepend the dummy feature n_samples times.
+            data = np.concatenate((np.full(n_samples, value), X.data))
+            return sparse.csc_matrix((data, indices, indptr), shape)
+        else:
+            klass = X.__class__
+            return klass(add_dummy_feature(X.tocoo(), value))
+    else:
+        return np.hstack((np.full((n_samples, 1), value), X))
+
+
+class QuantileTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
+    """Transform features using quantiles information.
+
+    This method transforms the features to follow a uniform or a normal
+    distribution. Therefore, for a given feature, this transformation tends
+    to spread out the most frequent values. It also reduces the impact of
+    (marginal) outliers: this is therefore a robust preprocessing scheme.
+
+    The transformation is applied on each feature independently. First an
+    estimate of the cumulative distribution function of a feature is
+    used to map the original values to a uniform distribution. The obtained
+    values are then mapped to the desired output distribution using the
+    associated quantile function. Feature values of new/unseen data that fall
+    below or above the fitted range will be mapped to the bounds of the output
+    distribution. Note that this transform is non-linear. It may distort linear
+    correlations between variables measured at the same scale but renders
+    variables measured at different scales more directly comparable.
+
+    For example visualizations, refer to :ref:`Compare QuantileTransformer with
+    other scalers `.
+
+    Read more in the :ref:`User Guide `.
+
+    .. versionadded:: 0.19
+
+    Parameters
+    ----------
+    n_quantiles : int, default=1000 or n_samples
+        Number of quantiles to be computed. It corresponds to the number
+        of landmarks used to discretize the cumulative distribution function.
+        If n_quantiles is larger than the number of samples, n_quantiles is set
+        to the number of samples as a larger number of quantiles does not give
+        a better approximation of the cumulative distribution function
+        estimator.
+
+    output_distribution : {'uniform', 'normal'}, default='uniform'
+        Marginal distribution for the transformed data. The choices are
+        'uniform' (default) or 'normal'.
+
+    ignore_implicit_zeros : bool, default=False
+        Only applies to sparse matrices. If True, the sparse entries of the
+        matrix are discarded to compute the quantile statistics. If False,
+        these entries are treated as zeros.
+
+    subsample : int or None, default=10_000
+        Maximum number of samples used to estimate the quantiles for
+        computational efficiency. Note that the subsampling procedure may
+        differ for value-identical sparse and dense matrices.
+        Disable subsampling by setting `subsample=None`.
+
+        .. versionadded:: 1.5
+           The option `None` to disable subsampling was added.
+
+    random_state : int, RandomState instance or None, default=None
+        Determines random number generation for subsampling and smoothing
+        noise.
+        Please see ``subsample`` for more details.
+        Pass an int for reproducible results across multiple function calls.
+        See :term:`Glossary `.
+
+    copy : bool, default=True
+        Set to False to perform inplace transformation and avoid a copy (if the
+        input is already a numpy array).
+
+    Attributes
+    ----------
+    n_quantiles_ : int
+        The actual number of quantiles used to discretize the cumulative
+        distribution function.
+
+    quantiles_ : ndarray of shape (n_quantiles, n_features)
+        The values corresponding to the quantiles of reference.
+ + references_ : ndarray of shape (n_quantiles, ) + Quantiles of references. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + quantile_transform : Equivalent function without the estimator API. + PowerTransformer : Perform mapping to a normal distribution using a power + transform. + StandardScaler : Perform standardization that is faster, but less robust + to outliers. + RobustScaler : Perform robust standardization that removes the influence + of outliers but does not put outliers and inliers on the same scale. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import QuantileTransformer + >>> rng = np.random.RandomState(0) + >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) + >>> qt = QuantileTransformer(n_quantiles=10, random_state=0) + >>> qt.fit_transform(X) + array([...]) + """ + + _parameter_constraints: dict = { + "n_quantiles": [Interval(Integral, 1, None, closed="left")], + "output_distribution": [StrOptions({"uniform", "normal"})], + "ignore_implicit_zeros": ["boolean"], + "subsample": [Interval(Integral, 1, None, closed="left"), None], + "random_state": ["random_state"], + "copy": ["boolean"], + } + + def __init__( + self, + *, + n_quantiles=1000, + output_distribution="uniform", + ignore_implicit_zeros=False, + subsample=10_000, + random_state=None, + copy=True, + ): + self.n_quantiles = n_quantiles + self.output_distribution = output_distribution + self.ignore_implicit_zeros = ignore_implicit_zeros + self.subsample = subsample + self.random_state = random_state + self.copy = copy + + def _dense_fit(self, X, random_state): + """Compute percentiles for dense matrices. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data used to scale along the features axis. + """ + if self.ignore_implicit_zeros: + warnings.warn( + "'ignore_implicit_zeros' takes effect only with" + " sparse matrix. This parameter has no effect." + ) + + n_samples, n_features = X.shape + references = self.references_ * 100 + + if self.subsample is not None and self.subsample < n_samples: + # Take a subsample of `X` + X = resample( + X, replace=False, n_samples=self.subsample, random_state=random_state + ) + + self.quantiles_ = np.nanpercentile(X, references, axis=0) + # Due to floating-point precision error in `np.nanpercentile`, + # make sure that quantiles are monotonically increasing. + # Upstream issue in numpy: + # https://github.com/numpy/numpy/issues/14685 + self.quantiles_ = np.maximum.accumulate(self.quantiles_) + + def _sparse_fit(self, X, random_state): + """Compute percentiles for sparse matrices. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features) + The data used to scale along the features axis. The sparse matrix + needs to be nonnegative. If a sparse matrix is provided, + it will be converted into a sparse ``csc_matrix``. 
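+            Implicit zeros count toward the quantile statistics unless
+            ``ignore_implicit_zeros`` is True.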
+ """ + n_samples, n_features = X.shape + references = self.references_ * 100 + + self.quantiles_ = [] + for feature_idx in range(n_features): + column_nnz_data = X.data[X.indptr[feature_idx] : X.indptr[feature_idx + 1]] + if self.subsample is not None and len(column_nnz_data) > self.subsample: + column_subsample = self.subsample * len(column_nnz_data) // n_samples + if self.ignore_implicit_zeros: + column_data = np.zeros(shape=column_subsample, dtype=X.dtype) + else: + column_data = np.zeros(shape=self.subsample, dtype=X.dtype) + column_data[:column_subsample] = random_state.choice( + column_nnz_data, size=column_subsample, replace=False + ) + else: + if self.ignore_implicit_zeros: + column_data = np.zeros(shape=len(column_nnz_data), dtype=X.dtype) + else: + column_data = np.zeros(shape=n_samples, dtype=X.dtype) + column_data[: len(column_nnz_data)] = column_nnz_data + + if not column_data.size: + # if no nnz, an error will be raised for computing the + # quantiles. Force the quantiles to be zeros. + self.quantiles_.append([0] * len(references)) + else: + self.quantiles_.append(np.nanpercentile(column_data, references)) + self.quantiles_ = np.transpose(self.quantiles_) + # due to floating-point precision error in `np.nanpercentile`, + # make sure the quantiles are monotonically increasing + # Upstream issue in numpy: + # https://github.com/numpy/numpy/issues/14685 + self.quantiles_ = np.maximum.accumulate(self.quantiles_) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Compute the quantiles used for transforming. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + nonnegative if `ignore_implicit_zeros` is False. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted transformer. + """ + if self.subsample is not None and self.n_quantiles > self.subsample: + raise ValueError( + "The number of quantiles cannot be greater than" + " the number of samples used. Got {} quantiles" + " and {} samples.".format(self.n_quantiles, self.subsample) + ) + + X = self._check_inputs(X, in_fit=True, copy=False) + n_samples = X.shape[0] + + if self.n_quantiles > n_samples: + warnings.warn( + "n_quantiles (%s) is greater than the total number " + "of samples (%s). n_quantiles is set to " + "n_samples." 
% (self.n_quantiles, n_samples) + ) + self.n_quantiles_ = max(1, min(self.n_quantiles, n_samples)) + + rng = check_random_state(self.random_state) + + # Create the quantiles of reference + self.references_ = np.linspace(0, 1, self.n_quantiles_, endpoint=True) + if sparse.issparse(X): + self._sparse_fit(X, rng) + else: + self._dense_fit(X, rng) + + return self + + def _transform_col(self, X_col, quantiles, inverse): + """Private function to transform a single feature.""" + + output_distribution = self.output_distribution + + if not inverse: + lower_bound_x = quantiles[0] + upper_bound_x = quantiles[-1] + lower_bound_y = 0 + upper_bound_y = 1 + else: + lower_bound_x = 0 + upper_bound_x = 1 + lower_bound_y = quantiles[0] + upper_bound_y = quantiles[-1] + # for inverse transform, match a uniform distribution + with np.errstate(invalid="ignore"): # hide NaN comparison warnings + if output_distribution == "normal": + X_col = stats.norm.cdf(X_col) + # else output distribution is already a uniform distribution + + # find index for lower and higher bounds + with np.errstate(invalid="ignore"): # hide NaN comparison warnings + if output_distribution == "normal": + lower_bounds_idx = X_col - BOUNDS_THRESHOLD < lower_bound_x + upper_bounds_idx = X_col + BOUNDS_THRESHOLD > upper_bound_x + if output_distribution == "uniform": + lower_bounds_idx = X_col == lower_bound_x + upper_bounds_idx = X_col == upper_bound_x + + isfinite_mask = ~np.isnan(X_col) + X_col_finite = X_col[isfinite_mask] + if not inverse: + # Interpolate in one direction and in the other and take the + # mean. This is in case of repeated values in the features + # and hence repeated quantiles + # + # If we don't do this, only one extreme of the duplicated is + # used (the upper when we do ascending, and the + # lower for descending). We take the mean of these two + X_col[isfinite_mask] = 0.5 * ( + np.interp(X_col_finite, quantiles, self.references_) + - np.interp(-X_col_finite, -quantiles[::-1], -self.references_[::-1]) + ) + else: + X_col[isfinite_mask] = np.interp(X_col_finite, self.references_, quantiles) + + X_col[upper_bounds_idx] = upper_bound_y + X_col[lower_bounds_idx] = lower_bound_y + # for forward transform, match the output distribution + if not inverse: + with np.errstate(invalid="ignore"): # hide NaN comparison warnings + if output_distribution == "normal": + X_col = stats.norm.ppf(X_col) + # find the value to clip the data to avoid mapping to + # infinity. Clip such that the inverse transform will be + # consistent + clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1)) + clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD - np.spacing(1))) + X_col = np.clip(X_col, clip_min, clip_max) + # else output distribution is uniform and the ppf is the + # identity function so we let X_col unchanged + + return X_col + + def _check_inputs(self, X, in_fit, accept_sparse_negative=False, copy=False): + """Check inputs before fit and transform.""" + X = validate_data( + self, + X, + reset=in_fit, + accept_sparse="csc", + copy=copy, + dtype=FLOAT_DTYPES, + # only set force_writeable for the validation at transform time because + # it's the only place where QuantileTransformer performs inplace operations. + force_writeable=True if not in_fit else None, + ensure_all_finite="allow-nan", + ) + # we only accept positive sparse matrix when ignore_implicit_zeros is + # false and that we call fit or transform. 
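+        # (Note: inverse_transform calls _check_inputs with
+        # accept_sparse_negative=True, so the non-negativity check below is
+        # skipped there.)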
+ with np.errstate(invalid="ignore"): # hide NaN comparison warnings + if ( + not accept_sparse_negative + and not self.ignore_implicit_zeros + and (sparse.issparse(X) and np.any(X.data < 0)) + ): + raise ValueError( + "QuantileTransformer only accepts non-negative sparse matrices." + ) + + return X + + def _transform(self, X, inverse=False): + """Forward and inverse transform. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data used to scale along the features axis. + + inverse : bool, default=False + If False, apply forward transform. If True, apply + inverse transform. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + Projected data. + """ + if sparse.issparse(X): + for feature_idx in range(X.shape[1]): + column_slice = slice(X.indptr[feature_idx], X.indptr[feature_idx + 1]) + X.data[column_slice] = self._transform_col( + X.data[column_slice], self.quantiles_[:, feature_idx], inverse + ) + else: + for feature_idx in range(X.shape[1]): + X[:, feature_idx] = self._transform_col( + X[:, feature_idx], self.quantiles_[:, feature_idx], inverse + ) + + return X + + def transform(self, X): + """Feature-wise transformation of the data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + nonnegative if `ignore_implicit_zeros` is False. + + Returns + ------- + Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) + The projected data. + """ + check_is_fitted(self) + X = self._check_inputs(X, in_fit=False, copy=self.copy) + + return self._transform(X, inverse=False) + + def inverse_transform(self, X): + """Back-projection to the original space. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + nonnegative if `ignore_implicit_zeros` is False. + + Returns + ------- + Xt : {ndarray, sparse matrix} of (n_samples, n_features) + The projected data. + """ + check_is_fitted(self) + X = self._check_inputs( + X, in_fit=False, accept_sparse_negative=True, copy=self.copy + ) + + return self._transform(X, inverse=True) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +@validate_params( + {"X": ["array-like", "sparse matrix"], "axis": [Options(Integral, {0, 1})]}, + prefer_skip_nested_validation=False, +) +def quantile_transform( + X, + *, + axis=0, + n_quantiles=1000, + output_distribution="uniform", + ignore_implicit_zeros=False, + subsample=int(1e5), + random_state=None, + copy=True, +): + """Transform features using quantiles information. + + This method transforms the features to follow a uniform or a normal + distribution. Therefore, for a given feature, this transformation tends + to spread out the most frequent values. It also reduces the impact of + (marginal) outliers: this is therefore a robust preprocessing scheme. + + The transformation is applied on each feature independently. First an + estimate of the cumulative distribution function of a feature is + used to map the original values to a uniform distribution. 
The obtained
+    values are then mapped to the desired output distribution using the
+    associated quantile function. Feature values of new/unseen data that fall
+    below or above the fitted range will be mapped to the bounds of the output
+    distribution. Note that this transform is non-linear. It may distort linear
+    correlations between variables measured at the same scale but renders
+    variables measured at different scales more directly comparable.
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        The data to transform.
+
+    axis : int, default=0
+        Axis along which the quantiles are computed. If 0,
+        transform each feature, otherwise (if 1) transform each sample.
+
+    n_quantiles : int, default=1000 or n_samples
+        Number of quantiles to be computed. It corresponds to the number
+        of landmarks used to discretize the cumulative distribution function.
+        If n_quantiles is larger than the number of samples, n_quantiles is set
+        to the number of samples as a larger number of quantiles does not give
+        a better approximation of the cumulative distribution function
+        estimator.
+
+    output_distribution : {'uniform', 'normal'}, default='uniform'
+        Marginal distribution for the transformed data. The choices are
+        'uniform' (default) or 'normal'.
+
+    ignore_implicit_zeros : bool, default=False
+        Only applies to sparse matrices. If True, the sparse entries of the
+        matrix are discarded to compute the quantile statistics. If False,
+        these entries are treated as zeros.
+
+    subsample : int or None, default=1e5
+        Maximum number of samples used to estimate the quantiles for
+        computational efficiency. Note that the subsampling procedure may
+        differ for value-identical sparse and dense matrices.
+        Disable subsampling by setting `subsample=None`.
+
+        .. versionadded:: 1.5
+           The option `None` to disable subsampling was added.
+
+    random_state : int, RandomState instance or None, default=None
+        Determines random number generation for subsampling and smoothing
+        noise.
+        Please see ``subsample`` for more details.
+        Pass an int for reproducible results across multiple function calls.
+        See :term:`Glossary `.
+
+    copy : bool, default=True
+        If False, try to avoid a copy and transform in place.
+        This is not guaranteed to always work in place; e.g. if the data is
+        a numpy array with an int dtype, a copy will be returned even with
+        copy=False.
+
+        .. versionchanged:: 0.23
+            The default value of `copy` changed from False to True in 0.23.
+
+    Returns
+    -------
+    Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)
+        The transformed data.
+
+    See Also
+    --------
+    QuantileTransformer : Performs quantile-based scaling using the
+        Transformer API (e.g. as part of a preprocessing
+        :class:`~sklearn.pipeline.Pipeline`).
+    power_transform : Maps data to a normal distribution using a
+        power transformation.
+    scale : Performs standardization that is faster, but less robust
+        to outliers.
+    robust_scale : Performs robust standardization that removes the influence
+        of outliers but does not put outliers and inliers on the same scale.
+
+    Notes
+    -----
+    NaNs are treated as missing values: disregarded in fit, and maintained in
+    transform.
+
+    .. warning:: Risk of data leak
+
+        Do not use :func:`~sklearn.preprocessing.quantile_transform` unless
+        you know what you are doing. A common mistake is to apply it
+        to the entire data *before* splitting into training and
+        test sets. This will bias the model evaluation because
+        information would have leaked from the test set to the
+        training set.
+        In general, we recommend using
+        :class:`~sklearn.preprocessing.QuantileTransformer` within a
+        :ref:`Pipeline ` in order to prevent most risks of data
+        leaking: `pipe = make_pipeline(QuantileTransformer(),
+        LogisticRegression())`.
+
+    For a comparison of the different scalers, transformers, and normalizers,
+    see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.preprocessing import quantile_transform
+    >>> rng = np.random.RandomState(0)
+    >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0)
+    >>> quantile_transform(X, n_quantiles=10, random_state=0, copy=True)
+    array([...])
+    """
+    n = QuantileTransformer(
+        n_quantiles=n_quantiles,
+        output_distribution=output_distribution,
+        subsample=subsample,
+        ignore_implicit_zeros=ignore_implicit_zeros,
+        random_state=random_state,
+        copy=copy,
+    )
+    if axis == 0:
+        X = n.fit_transform(X)
+    else:  # axis == 1
+        X = n.fit_transform(X.T).T
+    return X
+
+
+class PowerTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
+    """Apply a power transform featurewise to make data more Gaussian-like.
+
+    Power transforms are a family of parametric, monotonic transformations
+    that are applied to make data more Gaussian-like. This is useful for
+    modeling issues related to heteroscedasticity (non-constant variance),
+    or other situations where normality is desired.
+
+    Currently, PowerTransformer supports the Box-Cox transform and the
+    Yeo-Johnson transform. The optimal parameter for stabilizing variance and
+    minimizing skewness is estimated through maximum likelihood.
+
+    Box-Cox requires input data to be strictly positive, while Yeo-Johnson
+    supports both positive and negative data.
+
+    By default, zero-mean, unit-variance normalization is applied to the
+    transformed data.
+
+    For an example visualization, refer to :ref:`Compare PowerTransformer with
+    other scalers `. To see the
+    effect of Box-Cox and Yeo-Johnson transformations on different
+    distributions, see:
+    :ref:`sphx_glr_auto_examples_preprocessing_plot_map_data_to_normal.py`.
+
+    Read more in the :ref:`User Guide `.
+
+    .. versionadded:: 0.20
+
+    Parameters
+    ----------
+    method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson'
+        The power transform method. Available methods are:
+
+        - 'yeo-johnson' [1]_, works with positive and negative values
+        - 'box-cox' [2]_, only works with strictly positive values
+
+    standardize : bool, default=True
+        Set to True to apply zero-mean, unit-variance normalization to the
+        transformed output.
+
+    copy : bool, default=True
+        Set to False to perform inplace computation during transformation.
+
+    Attributes
+    ----------
+    lambdas_ : ndarray of float of shape (n_features,)
+        The parameters of the power transformation for the selected features.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    power_transform : Equivalent function without the estimator API.
+
+    QuantileTransformer : Maps data to a standard normal distribution with
+        the parameter `output_distribution='normal'`.
+ + Notes + ----- + NaNs are treated as missing values: disregarded in ``fit``, and maintained + in ``transform``. + + References + ---------- + + .. [1] :doi:`I.K. Yeo and R.A. Johnson, "A new family of power + transformations to improve normality or symmetry." Biometrika, + 87(4), pp.954-959, (2000). <10.1093/biomet/87.4.954>` + + .. [2] :doi:`G.E.P. Box and D.R. Cox, "An Analysis of Transformations", + Journal of the Royal Statistical Society B, 26, 211-252 (1964). + <10.1111/j.2517-6161.1964.tb00553.x>` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import PowerTransformer + >>> pt = PowerTransformer() + >>> data = [[1, 2], [3, 2], [4, 5]] + >>> print(pt.fit(data)) + PowerTransformer() + >>> print(pt.lambdas_) + [ 1.386... -3.100...] + >>> print(pt.transform(data)) + [[-1.316... -0.707...] + [ 0.209... -0.707...] + [ 1.106... 1.414...]] + """ + + _parameter_constraints: dict = { + "method": [StrOptions({"yeo-johnson", "box-cox"})], + "standardize": ["boolean"], + "copy": ["boolean"], + } + + def __init__(self, method="yeo-johnson", *, standardize=True, copy=True): + self.method = method + self.standardize = standardize + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Estimate the optimal parameter lambda for each feature. + + The optimal lambda parameter for minimizing skewness is estimated on + each feature independently using maximum likelihood. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to estimate the optimal transformation parameters. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted transformer. + """ + self._fit(X, y=y, force_transform=False) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Fit `PowerTransformer` to `X`, then transform `X`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to estimate the optimal transformation parameters + and to be transformed using a power transformation. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_features) + Transformed data. 
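+
+        Examples
+        --------
+        A minimal sketch (an illustrative addition; only the output shape is
+        asserted, since the fitted lambdas depend on the data):
+
+        >>> import numpy as np
+        >>> from sklearn.preprocessing import PowerTransformer
+        >>> PowerTransformer().fit_transform(np.array([[1.0], [2.0], [3.0]])).shape
+        (3, 1)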
+ """ + return self._fit(X, y, force_transform=True) + + def _fit(self, X, y=None, force_transform=False): + X = self._check_input(X, in_fit=True, check_positive=True) + + if not self.copy and not force_transform: # if call from fit() + X = X.copy() # force copy so that fit does not change X inplace + + n_samples = X.shape[0] + mean = np.mean(X, axis=0, dtype=np.float64) + var = np.var(X, axis=0, dtype=np.float64) + + optim_function = { + "box-cox": self._box_cox_optimize, + "yeo-johnson": self._yeo_johnson_optimize, + }[self.method] + + transform_function = { + "box-cox": boxcox, + "yeo-johnson": self._yeo_johnson_transform, + }[self.method] + + with np.errstate(invalid="ignore"): # hide NaN warnings + self.lambdas_ = np.empty(X.shape[1], dtype=X.dtype) + for i, col in enumerate(X.T): + # For yeo-johnson, leave constant features unchanged + # lambda=1 corresponds to the identity transformation + is_constant_feature = _is_constant_feature(var[i], mean[i], n_samples) + if self.method == "yeo-johnson" and is_constant_feature: + self.lambdas_[i] = 1.0 + continue + + self.lambdas_[i] = optim_function(col) + + if self.standardize or force_transform: + X[:, i] = transform_function(X[:, i], self.lambdas_[i]) + + if self.standardize: + self._scaler = StandardScaler(copy=False).set_output(transform="default") + if force_transform: + X = self._scaler.fit_transform(X) + else: + self._scaler.fit(X) + + return X + + def transform(self, X): + """Apply the power transform to each feature using the fitted lambdas. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to be transformed using a power transformation. + + Returns + ------- + X_trans : ndarray of shape (n_samples, n_features) + The transformed data. + """ + check_is_fitted(self) + X = self._check_input(X, in_fit=False, check_positive=True, check_shape=True) + + transform_function = { + "box-cox": boxcox, + "yeo-johnson": self._yeo_johnson_transform, + }[self.method] + for i, lmbda in enumerate(self.lambdas_): + with np.errstate(invalid="ignore"): # hide NaN warnings + X[:, i] = transform_function(X[:, i], lmbda) + + if self.standardize: + X = self._scaler.transform(X) + + return X + + def inverse_transform(self, X): + """Apply the inverse power transformation using the fitted lambdas. + + The inverse of the Box-Cox transformation is given by:: + + if lambda_ == 0: + X = exp(X_trans) + else: + X = (X_trans * lambda_ + 1) ** (1 / lambda_) + + The inverse of the Yeo-Johnson transformation is given by:: + + if X >= 0 and lambda_ == 0: + X = exp(X_trans) - 1 + elif X >= 0 and lambda_ != 0: + X = (X_trans * lambda_ + 1) ** (1 / lambda_) - 1 + elif X < 0 and lambda_ != 2: + X = 1 - (-(2 - lambda_) * X_trans + 1) ** (1 / (2 - lambda_)) + elif X < 0 and lambda_ == 2: + X = 1 - exp(-X_trans) + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The transformed data. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + The original data. 
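+
+        Examples
+        --------
+        A round-trip sketch (an illustrative addition; recovery is exact up
+        to floating point error):
+
+        >>> import numpy as np
+        >>> from sklearn.preprocessing import PowerTransformer
+        >>> X = np.array([[1.0], [2.0], [4.0]])
+        >>> pt = PowerTransformer().fit(X)
+        >>> np.allclose(pt.inverse_transform(pt.transform(X)), X)
+        True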
+ """ + check_is_fitted(self) + X = self._check_input(X, in_fit=False, check_shape=True) + + if self.standardize: + X = self._scaler.inverse_transform(X) + + inv_fun = { + "box-cox": inv_boxcox, + "yeo-johnson": self._yeo_johnson_inverse_transform, + }[self.method] + for i, lmbda in enumerate(self.lambdas_): + with np.errstate(invalid="ignore"): # hide NaN warnings + X[:, i] = inv_fun(X[:, i], lmbda) + + return X + + def _yeo_johnson_inverse_transform(self, x, lmbda): + """Return inverse-transformed input x following Yeo-Johnson inverse + transform with parameter lambda. + """ + x_inv = np.zeros_like(x) + pos = x >= 0 + + # when x >= 0 + if abs(lmbda) < np.spacing(1.0): + x_inv[pos] = np.exp(x[pos]) - 1 + else: # lmbda != 0 + x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1 + + # when x < 0 + if abs(lmbda - 2) > np.spacing(1.0): + x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1, 1 / (2 - lmbda)) + else: # lmbda == 2 + x_inv[~pos] = 1 - np.exp(-x[~pos]) + + return x_inv + + def _yeo_johnson_transform(self, x, lmbda): + """Return transformed input x following Yeo-Johnson transform with + parameter lambda. + """ + + out = np.zeros_like(x) + pos = x >= 0 # binary mask + + # when x >= 0 + if abs(lmbda) < np.spacing(1.0): + out[pos] = np.log1p(x[pos]) + else: # lmbda != 0 + out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda + + # when x < 0 + if abs(lmbda - 2) > np.spacing(1.0): + out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda) + else: # lmbda == 2 + out[~pos] = -np.log1p(-x[~pos]) + + return out + + def _box_cox_optimize(self, x): + """Find and return optimal lambda parameter of the Box-Cox transform by + MLE, for observed data x. + + We here use scipy builtins which uses the brent optimizer. + """ + mask = np.isnan(x) + if np.all(mask): + raise ValueError("Column must not be all nan.") + + # the computation of lambda is influenced by NaNs so we need to + # get rid of them + _, lmbda = stats.boxcox(x[~mask], lmbda=None) + + return lmbda + + def _yeo_johnson_optimize(self, x): + """Find and return optimal lambda parameter of the Yeo-Johnson + transform by MLE, for observed data x. + + Like for Box-Cox, MLE is done via the brent optimizer. + """ + x_tiny = np.finfo(np.float64).tiny + + def _neg_log_likelihood(lmbda): + """Return the negative log likelihood of the observed data x as a + function of lambda.""" + x_trans = self._yeo_johnson_transform(x, lmbda) + n_samples = x.shape[0] + x_trans_var = x_trans.var() + + # Reject transformed data that would raise a RuntimeWarning in np.log + if x_trans_var < x_tiny: + return np.inf + + log_var = np.log(x_trans_var) + loglike = -n_samples / 2 * log_var + loglike += (lmbda - 1) * (np.sign(x) * np.log1p(np.abs(x))).sum() + + return -loglike + + # the computation of lambda is influenced by NaNs so we need to + # get rid of them + x = x[~np.isnan(x)] + # choosing bracket -2, 2 like for boxcox + return optimize.brent(_neg_log_likelihood, brack=(-2, 2)) + + def _check_input(self, X, in_fit, check_positive=False, check_shape=False): + """Validate the input before fit and transform. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + in_fit : bool + Whether or not `_check_input` is called from `fit` or other + methods, e.g. `predict`, `transform`, etc. + + check_positive : bool, default=False + If True, check that all data is positive and non-zero (only if + ``self.method=='box-cox'``). 
+ + check_shape : bool, default=False + If True, check that n_features matches the length of self.lambdas_ + """ + X = validate_data( + self, + X, + ensure_2d=True, + dtype=FLOAT_DTYPES, + force_writeable=True, + copy=self.copy, + ensure_all_finite="allow-nan", + reset=in_fit, + ) + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", r"All-NaN (slice|axis) encountered") + if check_positive and self.method == "box-cox" and np.nanmin(X) <= 0: + raise ValueError( + "The Box-Cox transformation can only be " + "applied to strictly positive data" + ) + + if check_shape and not X.shape[1] == len(self.lambdas_): + raise ValueError( + "Input data has a different number of features " + "than fitting data. Should have {n}, data has {m}".format( + n=len(self.lambdas_), m=X.shape[1] + ) + ) + + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +@validate_params( + {"X": ["array-like"]}, + prefer_skip_nested_validation=False, +) +def power_transform(X, method="yeo-johnson", *, standardize=True, copy=True): + """Parametric, monotonic transformation to make data more Gaussian-like. + + Power transforms are a family of parametric, monotonic transformations + that are applied to make data more Gaussian-like. This is useful for + modeling issues related to heteroscedasticity (non-constant variance), + or other situations where normality is desired. + + Currently, power_transform supports the Box-Cox transform and the + Yeo-Johnson transform. The optimal parameter for stabilizing variance and + minimizing skewness is estimated through maximum likelihood. + + Box-Cox requires input data to be strictly positive, while Yeo-Johnson + supports both positive or negative data. + + By default, zero-mean, unit-variance normalization is applied to the + transformed data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to be transformed using a power transformation. + + method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson' + The power transform method. Available methods are: + + - 'yeo-johnson' [1]_, works with positive and negative values + - 'box-cox' [2]_, only works with strictly positive values + + .. versionchanged:: 0.23 + The default value of the `method` parameter changed from + 'box-cox' to 'yeo-johnson' in 0.23. + + standardize : bool, default=True + Set to True to apply zero-mean, unit-variance normalization to the + transformed output. + + copy : bool, default=True + If False, try to avoid a copy and transform in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_trans : ndarray of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + PowerTransformer : Equivalent transformation with the + Transformer API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + quantile_transform : Maps data to a standard normal distribution with + the parameter `output_distribution='normal'`. + + Notes + ----- + NaNs are treated as missing values: disregarded in ``fit``, and maintained + in ``transform``. + + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + References + ---------- + + .. [1] I.K. Yeo and R.A. 
Johnson, "A new family of power transformations to + improve normality or symmetry." Biometrika, 87(4), pp.954-959, + (2000). + + .. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal + of the Royal Statistical Society B, 26, 211-252 (1964). + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import power_transform + >>> data = [[1, 2], [3, 2], [4, 5]] + >>> print(power_transform(data, method='box-cox')) + [[-1.332... -0.707...] + [ 0.256... -0.707...] + [ 1.076... 1.414...]] + + .. warning:: Risk of data leak. + Do not use :func:`~sklearn.preprocessing.power_transform` unless you + know what you are doing. A common mistake is to apply it to the entire + data *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.PowerTransformer` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking, e.g.: `pipe = make_pipeline(PowerTransformer(), + LogisticRegression())`. + """ + pt = PowerTransformer(method=method, standardize=standardize, copy=copy) + return pt.fit_transform(X) diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_discretization.py b/.venv/Lib/site-packages/sklearn/preprocessing/_discretization.py new file mode 100644 index 0000000000000000000000000000000000000000..ee505239dd9deea2d8680b6393548749df14f151 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_discretization.py @@ -0,0 +1,464 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +import warnings +from numbers import Integral + +import numpy as np + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils import resample +from ..utils._param_validation import Interval, Options, StrOptions +from ..utils.deprecation import _deprecate_Xt_in_inverse_transform +from ..utils.stats import _weighted_percentile +from ..utils.validation import ( + _check_feature_names_in, + _check_sample_weight, + check_array, + check_is_fitted, + validate_data, +) +from ._encoders import OneHotEncoder + + +class KBinsDiscretizer(TransformerMixin, BaseEstimator): + """ + Bin continuous data into intervals. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + Parameters + ---------- + n_bins : int or array-like of shape (n_features,), default=5 + The number of bins to produce. Raises ValueError if ``n_bins < 2``. + + encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot' + Method used to encode the transformed result. + + - 'onehot': Encode the transformed result with one-hot encoding + and return a sparse matrix. Ignored features are always + stacked to the right. + - 'onehot-dense': Encode the transformed result with one-hot encoding + and return a dense array. Ignored features are always + stacked to the right. + - 'ordinal': Return the bin identifier encoded as an integer value. + + strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile' + Strategy used to define the widths of the bins. + + - 'uniform': All bins in each feature have identical widths. + - 'quantile': All bins in each feature have the same number of points. + - 'kmeans': Values in each bin have the same nearest center of a 1D + k-means cluster. + + For an example of the different strategies see: + :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`. 
+ + dtype : {np.float32, np.float64}, default=None + The desired data-type for the output. If None, output dtype is + consistent with input dtype. Only np.float32 and np.float64 are + supported. + + .. versionadded:: 0.24 + + subsample : int or None, default=200_000 + Maximum number of samples, used to fit the model, for computational + efficiency. + `subsample=None` means that all the training samples are used when + computing the quantiles that determine the binning thresholds. + Since quantile computation relies on sorting each column of `X` and + that sorting has an `n log(n)` time complexity, + it is recommended to use subsampling on datasets with a + very large number of samples. + + .. versionchanged:: 1.3 + The default value of `subsample` changed from `None` to `200_000` when + `strategy="quantile"`. + + .. versionchanged:: 1.5 + The default value of `subsample` changed from `None` to `200_000` when + `strategy="uniform"` or `strategy="kmeans"`. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for subsampling. + Pass an int for reproducible results across multiple function calls. + See the `subsample` parameter for more details. + See :term:`Glossary `. + + .. versionadded:: 1.1 + + Attributes + ---------- + bin_edges_ : ndarray of ndarray of shape (n_features,) + The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )`` + Ignored features will have empty arrays. + + n_bins_ : ndarray of shape (n_features,), dtype=np.int64 + Number of bins per feature. Bins whose width are too small + (i.e., <= 1e-8) are removed with a warning. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + Binarizer : Class used to bin values as ``0`` or + ``1`` based on a parameter ``threshold``. + + Notes + ----- + + For a visualization of discretization on different datasets refer to + :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`. + On the effect of discretization on linear models see: + :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`. + + In bin edges for feature ``i``, the first and last values are used only for + ``inverse_transform``. During transform, bin edges are extended to:: + + np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf]) + + You can combine ``KBinsDiscretizer`` with + :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess + part of the features. + + ``KBinsDiscretizer`` might produce constant features (e.g., when + ``encode = 'onehot'`` and certain bins do not contain any data). + These features can be removed with feature selection algorithms + (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`). + + Examples + -------- + >>> from sklearn.preprocessing import KBinsDiscretizer + >>> X = [[-2, 1, -4, -1], + ... [-1, 2, -3, -0.5], + ... [ 0, 3, -2, 0.5], + ... [ 1, 4, -1, 2]] + >>> est = KBinsDiscretizer( + ... n_bins=3, encode='ordinal', strategy='uniform' + ... ) + >>> est.fit(X) + KBinsDiscretizer(...) + >>> Xt = est.transform(X) + >>> Xt # doctest: +SKIP + array([[ 0., 0., 0., 0.], + [ 1., 1., 1., 0.], + [ 2., 2., 2., 1.], + [ 2., 2., 2., 2.]]) + + Sometimes it may be useful to convert the data back into the original + feature space. 
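# Illustrative sketch of the three `strategy` options documented above,
# using toy data (an assumption for the demo) and the fitted `bin_edges_`
# attribute: 'uniform' spaces edges evenly over the value range,
# 'quantile' balances samples per bin, and 'kmeans' puts edges midway
# between 1D cluster centers.
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X = np.array([[0.0], [0.1], [0.2], [1.0], [5.0], [9.0], [9.5], [10.0]])
for strategy in ("uniform", "quantile", "kmeans"):
    est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy).fit(X)
    print(strategy, est.bin_edges_[0])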
The ``inverse_transform`` function converts the binned + data into the original feature space. Each value will be equal to the mean + of the two bin edges. + + >>> est.bin_edges_[0] + array([-2., -1., 0., 1.]) + >>> est.inverse_transform(Xt) + array([[-1.5, 1.5, -3.5, -0.5], + [-0.5, 2.5, -2.5, -0.5], + [ 0.5, 3.5, -1.5, 0.5], + [ 0.5, 3.5, -1.5, 1.5]]) + """ + + _parameter_constraints: dict = { + "n_bins": [Interval(Integral, 2, None, closed="left"), "array-like"], + "encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})], + "strategy": [StrOptions({"uniform", "quantile", "kmeans"})], + "dtype": [Options(type, {np.float64, np.float32}), None], + "subsample": [Interval(Integral, 1, None, closed="left"), None], + "random_state": ["random_state"], + } + + def __init__( + self, + n_bins=5, + *, + encode="onehot", + strategy="quantile", + dtype=None, + subsample=200_000, + random_state=None, + ): + self.n_bins = n_bins + self.encode = encode + self.strategy = strategy + self.dtype = dtype + self.subsample = subsample + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, sample_weight=None): + """ + Fit the estimator. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data to be discretized. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + sample_weight : ndarray of shape (n_samples,) + Contains weight values to be associated with each sample. + Cannot be used when `strategy` is set to `"uniform"`. + + .. versionadded:: 1.3 + + Returns + ------- + self : object + Returns the instance itself. + """ + X = validate_data(self, X, dtype="numeric") + + if self.dtype in (np.float64, np.float32): + output_dtype = self.dtype + else: # self.dtype is None + output_dtype = X.dtype + + n_samples, n_features = X.shape + + if sample_weight is not None and self.strategy == "uniform": + raise ValueError( + "`sample_weight` was provided but it cannot be " + "used with strategy='uniform'. Got strategy=" + f"{self.strategy!r} instead." + ) + + if self.subsample is not None and n_samples > self.subsample: + # Take a subsample of `X` + X = resample( + X, + replace=False, + n_samples=self.subsample, + random_state=self.random_state, + ) + + n_features = X.shape[1] + n_bins = self._validate_n_bins(n_features) + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + bin_edges = np.zeros(n_features, dtype=object) + for jj in range(n_features): + column = X[:, jj] + col_min, col_max = column.min(), column.max() + + if col_min == col_max: + warnings.warn( + "Feature %d is constant and will be replaced with 0." 
% jj + ) + n_bins[jj] = 1 + bin_edges[jj] = np.array([-np.inf, np.inf]) + continue + + if self.strategy == "uniform": + bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1) + + elif self.strategy == "quantile": + quantiles = np.linspace(0, 100, n_bins[jj] + 1) + if sample_weight is None: + bin_edges[jj] = np.asarray(np.percentile(column, quantiles)) + else: + bin_edges[jj] = np.asarray( + [ + _weighted_percentile(column, sample_weight, q) + for q in quantiles + ], + dtype=np.float64, + ) + elif self.strategy == "kmeans": + from ..cluster import KMeans # fixes import loops + + # Deterministic initialization with uniform spacing + uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1) + init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5 + + # 1D k-means procedure + km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1) + centers = km.fit( + column[:, None], sample_weight=sample_weight + ).cluster_centers_[:, 0] + # Must sort, centers may be unsorted even with sorted init + centers.sort() + bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5 + bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max] + + # Remove bins whose width are too small (i.e., <= 1e-8) + if self.strategy in ("quantile", "kmeans"): + mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8 + bin_edges[jj] = bin_edges[jj][mask] + if len(bin_edges[jj]) - 1 != n_bins[jj]: + warnings.warn( + "Bins whose width are too small (i.e., <= " + "1e-8) in feature %d are removed. Consider " + "decreasing the number of bins." % jj + ) + n_bins[jj] = len(bin_edges[jj]) - 1 + + self.bin_edges_ = bin_edges + self.n_bins_ = n_bins + + if "onehot" in self.encode: + self._encoder = OneHotEncoder( + categories=[np.arange(i) for i in self.n_bins_], + sparse_output=self.encode == "onehot", + dtype=output_dtype, + ) + # Fit the OneHotEncoder with toy datasets + # so that it's ready for use after the KBinsDiscretizer is fitted + self._encoder.fit(np.zeros((1, len(self.n_bins_)))) + + return self + + def _validate_n_bins(self, n_features): + """Returns n_bins_, the number of bins per feature.""" + orig_bins = self.n_bins + if isinstance(orig_bins, Integral): + return np.full(n_features, orig_bins, dtype=int) + + n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False) + + if n_bins.ndim > 1 or n_bins.shape[0] != n_features: + raise ValueError("n_bins must be a scalar or array of shape (n_features,).") + + bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins) + + violating_indices = np.where(bad_nbins_value)[0] + if violating_indices.shape[0] > 0: + indices = ", ".join(str(i) for i in violating_indices) + raise ValueError( + "{} received an invalid number " + "of bins at indices {}. Number of bins " + "must be at least 2, and must be an int.".format( + KBinsDiscretizer.__name__, indices + ) + ) + return n_bins + + def transform(self, X): + """ + Discretize the data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data to be discretized. + + Returns + ------- + Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64} + Data in the binned space. Will be a sparse matrix if + `self.encode='onehot'` and ndarray otherwise. 
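# Illustrative sketch of the `sample_weight` handling in `fit` above:
# weights shift the 'quantile' bin edges (via _weighted_percentile), while
# strategy='uniform' rejects weights outright. The toy data and weights
# are assumptions for the demo.
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X = np.arange(10, dtype=float).reshape(-1, 1)
w = np.array([10, 10, 10, 1, 1, 1, 1, 1, 1, 1], dtype=float)

est = KBinsDiscretizer(n_bins=2, encode="ordinal", strategy="quantile")
# The heavily weighted low values pull the interior edge to the left.
print(est.fit(X, sample_weight=w).bin_edges_[0])

try:
    KBinsDiscretizer(n_bins=2, encode="ordinal", strategy="uniform").fit(
        X, sample_weight=w
    )
except ValueError as exc:
    print(exc)  # sample_weight cannot be used with strategy='uniform'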
+ """ + check_is_fitted(self) + + # check input and attribute dtypes + dtype = (np.float64, np.float32) if self.dtype is None else self.dtype + Xt = validate_data(self, X, copy=True, dtype=dtype, reset=False) + + bin_edges = self.bin_edges_ + for jj in range(Xt.shape[1]): + Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right") + + if self.encode == "ordinal": + return Xt + + dtype_init = None + if "onehot" in self.encode: + dtype_init = self._encoder.dtype + self._encoder.dtype = Xt.dtype + try: + Xt_enc = self._encoder.transform(Xt) + finally: + # revert the initial dtype to avoid modifying self. + self._encoder.dtype = dtype_init + return Xt_enc + + def inverse_transform(self, X=None, *, Xt=None): + """ + Transform discretized data back to original feature space. + + Note that this function does not regenerate the original data + due to discretization rounding. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Transformed data in the binned space. + + Xt : array-like of shape (n_samples, n_features) + Transformed data in the binned space. + + .. deprecated:: 1.5 + `Xt` was deprecated in 1.5 and will be removed in 1.7. Use `X` instead. + + Returns + ------- + Xinv : ndarray, dtype={np.float32, np.float64} + Data in the original feature space. + """ + X = _deprecate_Xt_in_inverse_transform(X, Xt) + + check_is_fitted(self) + + if "onehot" in self.encode: + X = self._encoder.inverse_transform(X) + + Xinv = check_array(X, copy=True, dtype=(np.float64, np.float32)) + n_features = self.n_bins_.shape[0] + if Xinv.shape[1] != n_features: + raise ValueError( + "Incorrect number of features. Expecting {}, received {}.".format( + n_features, Xinv.shape[1] + ) + ) + + for jj in range(n_features): + bin_edges = self.bin_edges_[jj] + bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5 + Xinv[:, jj] = bin_centers[(Xinv[:, jj]).astype(np.int64)] + + return Xinv + + def get_feature_names_out(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + check_is_fitted(self, "n_features_in_") + input_features = _check_feature_names_in(self, input_features) + if hasattr(self, "_encoder"): + return self._encoder.get_feature_names_out(input_features) + + # ordinal encoding + return input_features diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_encoders.py b/.venv/Lib/site-packages/sklearn/preprocessing/_encoders.py new file mode 100644 index 0000000000000000000000000000000000000000..58ad8bca28a1c13c2304a52364e1a7329eea56e5 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_encoders.py @@ -0,0 +1,1698 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import warnings +from numbers import Integral + +import numpy as np +from scipy import sparse + +from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin, _fit_context +from ..utils import _safe_indexing, check_array +from ..utils._encode import _check_unknown, _encode, _get_counts, _unique +from ..utils._mask import _get_mask +from ..utils._missing import is_scalar_nan +from ..utils._param_validation import Interval, RealNotInt, StrOptions +from ..utils._set_output import _get_output_config +from ..utils.validation import ( + _check_feature_names, + _check_feature_names_in, + _check_n_features, + check_is_fitted, +) + +__all__ = ["OneHotEncoder", "OrdinalEncoder"] + + +class _BaseEncoder(TransformerMixin, BaseEstimator): + """ + Base class for encoders that includes the code to categorize and + transform the input features. + + """ + + def _check_X(self, X, ensure_all_finite=True): + """ + Perform custom check_array: + - convert list of strings to object dtype + - check for missing values for object dtype data (check_array does + not do that) + - return list of features (arrays): this list of features is + constructed feature by feature to preserve the data types + of pandas DataFrame columns, as otherwise information is lost + and cannot be used, e.g. for the `categories_` attribute. + + """ + if not (hasattr(X, "iloc") and getattr(X, "ndim", 0) == 2): + # if not a dataframe, do normal check_array validation + X_temp = check_array(X, dtype=None, ensure_all_finite=ensure_all_finite) + if not hasattr(X, "dtype") and np.issubdtype(X_temp.dtype, np.str_): + X = check_array(X, dtype=object, ensure_all_finite=ensure_all_finite) + else: + X = X_temp + needs_validation = False + else: + # pandas dataframe, do validation later column by column, in order + # to keep the dtype information to be used in the encoder. + needs_validation = ensure_all_finite + + n_samples, n_features = X.shape + X_columns = [] + + for i in range(n_features): + Xi = _safe_indexing(X, indices=i, axis=1) + Xi = check_array( + Xi, ensure_2d=False, dtype=None, ensure_all_finite=needs_validation + ) + X_columns.append(Xi) + + return X_columns, n_samples, n_features + + def _fit( + self, + X, + handle_unknown="error", + ensure_all_finite=True, + return_counts=False, + return_and_ignore_missing_for_infrequent=False, + ): + self._check_infrequent_enabled() + _check_n_features(self, X, reset=True) + _check_feature_names(self, X, reset=True) + X_list, n_samples, n_features = self._check_X( + X, ensure_all_finite=ensure_all_finite + ) + self.n_features_in_ = n_features + + if self.categories != "auto": + if len(self.categories) != n_features: + raise ValueError( + "Shape mismatch: if categories is an array," + " it has to be of shape (n_features,)." 
+ ) + + self.categories_ = [] + category_counts = [] + compute_counts = return_counts or self._infrequent_enabled + + for i in range(n_features): + Xi = X_list[i] + + if self.categories == "auto": + result = _unique(Xi, return_counts=compute_counts) + if compute_counts: + cats, counts = result + category_counts.append(counts) + else: + cats = result + else: + if np.issubdtype(Xi.dtype, np.str_): + # Always convert string categories to objects to avoid + # unexpected string truncation for longer category labels + # passed in the constructor. + Xi_dtype = object + else: + Xi_dtype = Xi.dtype + + cats = np.array(self.categories[i], dtype=Xi_dtype) + if ( + cats.dtype == object + and isinstance(cats[0], bytes) + and Xi.dtype.kind != "S" + ): + msg = ( + f"In column {i}, the predefined categories have type 'bytes'" + " which is incompatible with values of type" + f" '{type(Xi[0]).__name__}'." + ) + raise ValueError(msg) + + # `nan` must be the last stated category + for category in cats[:-1]: + if is_scalar_nan(category): + raise ValueError( + "Nan should be the last element in user" + f" provided categories, see categories {cats}" + f" in column #{i}" + ) + + if cats.size != len(_unique(cats)): + msg = ( + f"In column {i}, the predefined categories" + " contain duplicate elements." + ) + raise ValueError(msg) + + if Xi.dtype.kind not in "OUS": + sorted_cats = np.sort(cats) + error_msg = ( + "Unsorted categories are not supported for numerical categories" + ) + # if there are nans, nan should be the last element + stop_idx = -1 if np.isnan(sorted_cats[-1]) else None + if np.any(sorted_cats[:stop_idx] != cats[:stop_idx]): + raise ValueError(error_msg) + + if handle_unknown == "error": + diff = _check_unknown(Xi, cats) + if diff: + msg = ( + "Found unknown categories {0} in column {1}" + " during fit".format(diff, i) + ) + raise ValueError(msg) + if compute_counts: + category_counts.append(_get_counts(Xi, cats)) + + self.categories_.append(cats) + + output = {"n_samples": n_samples} + if return_counts: + output["category_counts"] = category_counts + + missing_indices = {} + if return_and_ignore_missing_for_infrequent: + for feature_idx, categories_for_idx in enumerate(self.categories_): + if is_scalar_nan(categories_for_idx[-1]): + # `nan` values can only be placed in the latest position + missing_indices[feature_idx] = categories_for_idx.size - 1 + output["missing_indices"] = missing_indices + + if self._infrequent_enabled: + self._fit_infrequent_category_mapping( + n_samples, + category_counts, + missing_indices, + ) + return output + + def _transform( + self, + X, + handle_unknown="error", + ensure_all_finite=True, + warn_on_unknown=False, + ignore_category_indices=None, + ): + X_list, n_samples, n_features = self._check_X( + X, ensure_all_finite=ensure_all_finite + ) + _check_feature_names(self, X, reset=False) + _check_n_features(self, X, reset=False) + + X_int = np.zeros((n_samples, n_features), dtype=int) + X_mask = np.ones((n_samples, n_features), dtype=bool) + + columns_with_unknown = [] + for i in range(n_features): + Xi = X_list[i] + diff, valid_mask = _check_unknown(Xi, self.categories_[i], return_mask=True) + + if not np.all(valid_mask): + if handle_unknown == "error": + msg = ( + "Found unknown categories {0} in column {1}" + " during transform".format(diff, i) + ) + raise ValueError(msg) + else: + if warn_on_unknown: + columns_with_unknown.append(i) + # Set the problematic rows to an acceptable value and + # continue `The rows are marked `X_mask` and will be + # removed later. 
+ X_mask[:, i] = valid_mask + # cast Xi into the largest string type necessary + # to handle different lengths of numpy strings + if ( + self.categories_[i].dtype.kind in ("U", "S") + and self.categories_[i].itemsize > Xi.itemsize + ): + Xi = Xi.astype(self.categories_[i].dtype) + elif self.categories_[i].dtype.kind == "O" and Xi.dtype.kind == "U": + # categories are objects and Xi are numpy strings. + # Cast Xi to an object dtype to prevent truncation + # when setting invalid values. + Xi = Xi.astype("O") + else: + Xi = Xi.copy() + + Xi[~valid_mask] = self.categories_[i][0] + # We use check_unknown=False, since _check_unknown was + # already called above. + X_int[:, i] = _encode(Xi, uniques=self.categories_[i], check_unknown=False) + if columns_with_unknown: + warnings.warn( + ( + "Found unknown categories in columns " + f"{columns_with_unknown} during transform. These " + "unknown categories will be encoded as all zeros" + ), + UserWarning, + ) + + self._map_infrequent_categories(X_int, X_mask, ignore_category_indices) + return X_int, X_mask + + @property + def infrequent_categories_(self): + """Infrequent categories for each feature.""" + # raises an AttributeError if `_infrequent_indices` is not defined + infrequent_indices = self._infrequent_indices + return [ + None if indices is None else category[indices] + for category, indices in zip(self.categories_, infrequent_indices) + ] + + def _check_infrequent_enabled(self): + """ + This functions checks whether _infrequent_enabled is True or False. + This has to be called after parameter validation in the fit function. + """ + max_categories = getattr(self, "max_categories", None) + min_frequency = getattr(self, "min_frequency", None) + self._infrequent_enabled = ( + max_categories is not None and max_categories >= 1 + ) or min_frequency is not None + + def _identify_infrequent(self, category_count, n_samples, col_idx): + """Compute the infrequent indices. + + Parameters + ---------- + category_count : ndarray of shape (n_cardinality,) + Category counts. + + n_samples : int + Number of samples. + + col_idx : int + Index of the current category. Only used for the error message. + + Returns + ------- + output : ndarray of shape (n_infrequent_categories,) or None + If there are infrequent categories, indices of infrequent + categories. Otherwise None. + """ + if isinstance(self.min_frequency, numbers.Integral): + infrequent_mask = category_count < self.min_frequency + elif isinstance(self.min_frequency, numbers.Real): + min_frequency_abs = n_samples * self.min_frequency + infrequent_mask = category_count < min_frequency_abs + else: + infrequent_mask = np.zeros(category_count.shape[0], dtype=bool) + + n_current_features = category_count.size - infrequent_mask.sum() + 1 + if self.max_categories is not None and self.max_categories < n_current_features: + # max_categories includes the one infrequent category + frequent_category_count = self.max_categories - 1 + if frequent_category_count == 0: + # All categories are infrequent + infrequent_mask[:] = True + else: + # stable sort to preserve original count order + smallest_levels = np.argsort(category_count, kind="mergesort")[ + :-frequent_category_count + ] + infrequent_mask[smallest_levels] = True + + output = np.flatnonzero(infrequent_mask) + return output if output.size > 0 else None + + def _fit_infrequent_category_mapping( + self, n_samples, category_counts, missing_indices + ): + """Fit infrequent categories. + + Defines the private attribute: `_default_to_infrequent_mappings`. 
For + feature `i`, `_default_to_infrequent_mappings[i]` defines the mapping + from the integer encoding returned by `super().transform()` into + infrequent categories. If `_default_to_infrequent_mappings[i]` is None, + there were no infrequent categories in the training set. + + For example if categories 0, 2 and 4 were frequent, while categories + 1, 3, 5 were infrequent for feature 7, then these categories are mapped + to a single output: + `_default_to_infrequent_mappings[7] = array([0, 3, 1, 3, 2, 3])` + + Defines private attribute: `_infrequent_indices`. `_infrequent_indices[i]` + is an array of indices such that + `categories_[i][_infrequent_indices[i]]` are all the infrequent category + labels. If the feature `i` has no infrequent categories + `_infrequent_indices[i]` is None. + + .. versionadded:: 1.1 + + Parameters + ---------- + n_samples : int + Number of samples in training set. + category_counts: list of ndarray + `category_counts[i]` is the category counts corresponding to + `self.categories_[i]`. + missing_indices : dict + Dict mapping from feature_idx to category index with a missing value. + """ + # Remove missing value from counts, so it is not considered as infrequent + if missing_indices: + category_counts_ = [] + for feature_idx, count in enumerate(category_counts): + if feature_idx in missing_indices: + category_counts_.append( + np.delete(count, missing_indices[feature_idx]) + ) + else: + category_counts_.append(count) + else: + category_counts_ = category_counts + + self._infrequent_indices = [ + self._identify_infrequent(category_count, n_samples, col_idx) + for col_idx, category_count in enumerate(category_counts_) + ] + + # compute mapping from default mapping to infrequent mapping + self._default_to_infrequent_mappings = [] + + for feature_idx, infreq_idx in enumerate(self._infrequent_indices): + cats = self.categories_[feature_idx] + # no infrequent categories + if infreq_idx is None: + self._default_to_infrequent_mappings.append(None) + continue + + n_cats = len(cats) + if feature_idx in missing_indices: + # Missing index was removed from this category when computing + # infrequent indices, thus we need to decrease the number of + # total categories when considering the infrequent mapping. + n_cats -= 1 + + # infrequent indices exist + mapping = np.empty(n_cats, dtype=np.int64) + n_infrequent_cats = infreq_idx.size + + # infrequent categories are mapped to the last element. + n_frequent_cats = n_cats - n_infrequent_cats + mapping[infreq_idx] = n_frequent_cats + + frequent_indices = np.setdiff1d(np.arange(n_cats), infreq_idx) + mapping[frequent_indices] = np.arange(n_frequent_cats) + + self._default_to_infrequent_mappings.append(mapping) + + def _map_infrequent_categories(self, X_int, X_mask, ignore_category_indices): + """Map infrequent categories to integer representing the infrequent category. + + This modifies X_int in-place. Values that were invalid based on `X_mask` + are mapped to the infrequent category if there was an infrequent + category for that feature. + + Parameters + ---------- + X_int: ndarray of shape (n_samples, n_features) + Integer encoded categories. + + X_mask: ndarray of shape (n_samples, n_features) + Bool mask for valid values in `X_int`. + + ignore_category_indices : dict + Dictionary mapping from feature_idx to category index to ignore. + Ignored indexes will not be grouped and the original ordinal encoding + will remain. 
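# Public-API sketch of the infrequent-category bookkeeping described
# above: with an integer `min_frequency`, categories seen fewer times are
# grouped together and surfaced via `infrequent_categories_`. The toy
# data is an assumption for the demo.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T
enc = OneHotEncoder(min_frequency=4, sparse_output=False).fit(X)
print(enc.infrequent_categories_)  # [array(['d'], dtype=object)]
# A float min_frequency (e.g. 0.2) would use 0.2 * n_samples as the
# threshold instead, which here would also mark 'a' (5 of 38) infrequent.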
+ """ + if not self._infrequent_enabled: + return + + ignore_category_indices = ignore_category_indices or {} + + for col_idx in range(X_int.shape[1]): + infrequent_idx = self._infrequent_indices[col_idx] + if infrequent_idx is None: + continue + + X_int[~X_mask[:, col_idx], col_idx] = infrequent_idx[0] + if self.handle_unknown == "infrequent_if_exist": + # All the unknown values are now mapped to the + # infrequent_idx[0], which makes the unknown values valid + # This is needed in `transform` when the encoding is formed + # using `X_mask`. + X_mask[:, col_idx] = True + + # Remaps encoding in `X_int` where the infrequent categories are + # grouped together. + for i, mapping in enumerate(self._default_to_infrequent_mappings): + if mapping is None: + continue + + if i in ignore_category_indices: + # Update rows that are **not** ignored + rows_to_update = X_int[:, i] != ignore_category_indices[i] + else: + rows_to_update = slice(None) + + X_int[rows_to_update, i] = np.take(mapping, X_int[rows_to_update, i]) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.categorical = True + tags.input_tags.allow_nan = True + return tags + + +class OneHotEncoder(_BaseEncoder): + """ + Encode categorical features as a one-hot numeric array. + + The input to this transformer should be an array-like of integers or + strings, denoting the values taken on by categorical (discrete) features. + The features are encoded using a one-hot (aka 'one-of-K' or 'dummy') + encoding scheme. This creates a binary column for each category and + returns a sparse matrix or dense array (depending on the ``sparse_output`` + parameter). + + By default, the encoder derives the categories based on the unique values + in each feature. Alternatively, you can also specify the `categories` + manually. + + This encoding is needed for feeding categorical data to many scikit-learn + estimators, notably linear models and SVMs with the standard kernels. + + Note: a one-hot encoding of y labels should use a LabelBinarizer + instead. + + Read more in the :ref:`User Guide `. + For a comparison of different encoders, refer to: + :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. + + Parameters + ---------- + categories : 'auto' or a list of array-like, default='auto' + Categories (unique values) per feature: + + - 'auto' : Determine categories automatically from the training data. + - list : ``categories[i]`` holds the categories expected in the ith + column. The passed categories should not mix strings and numeric + values within a single feature, and should be sorted in case of + numeric values. + + The used categories can be found in the ``categories_`` attribute. + + .. versionadded:: 0.20 + + drop : {'first', 'if_binary'} or an array-like of shape (n_features,), \ + default=None + Specifies a methodology to use to drop one of the categories per + feature. This is useful in situations where perfectly collinear + features cause problems, such as when feeding the resulting data + into an unregularized linear regression model. + + However, dropping one category breaks the symmetry of the original + representation and can therefore induce a bias in downstream models, + for instance for penalized linear classification or regression models. + + - None : retain all features (the default). + - 'first' : drop the first category in each feature. If only one + category is present, the feature will be dropped entirely. + - 'if_binary' : drop the first category in each feature with two + categories. 
Features with 1 or more than 2 categories are + left intact. + - array : ``drop[i]`` is the category in feature ``X[:, i]`` that + should be dropped. + + When `max_categories` or `min_frequency` is configured to group + infrequent categories, the dropping behavior is handled after the + grouping. + + .. versionadded:: 0.21 + The parameter `drop` was added in 0.21. + + .. versionchanged:: 0.23 + The option `drop='if_binary'` was added in 0.23. + + .. versionchanged:: 1.1 + Support for dropping infrequent categories. + + sparse_output : bool, default=True + When ``True``, it returns a :class:`scipy.sparse.csr_matrix`, + i.e. a sparse matrix in "Compressed Sparse Row" (CSR) format. + + .. versionadded:: 1.2 + `sparse` was renamed to `sparse_output` + + dtype : number type, default=np.float64 + Desired dtype of output. + + handle_unknown : {'error', 'ignore', 'infrequent_if_exist', 'warn'}, \ + default='error' + Specifies the way unknown categories are handled during :meth:`transform`. + + - 'error' : Raise an error if an unknown category is present during transform. + - 'ignore' : When an unknown category is encountered during + transform, the resulting one-hot encoded columns for this feature + will be all zeros. In the inverse transform, an unknown category + will be denoted as None. + - 'infrequent_if_exist' : When an unknown category is encountered + during transform, the resulting one-hot encoded columns for this + feature will map to the infrequent category if it exists. The + infrequent category will be mapped to the last position in the + encoding. During inverse transform, an unknown category will be + mapped to the category denoted `'infrequent'` if it exists. If the + `'infrequent'` category does not exist, then :meth:`transform` and + :meth:`inverse_transform` will handle an unknown category as with + `handle_unknown='ignore'`. Infrequent categories exist based on + `min_frequency` and `max_categories`. Read more in the + :ref:`User Guide `. + - 'warn' : When an unknown category is encountered during transform + a warning is issued, and the encoding then proceeds as described for + `handle_unknown="infrequent_if_exist"`. + + .. versionchanged:: 1.1 + `'infrequent_if_exist'` was added to automatically handle unknown + categories and infrequent categories. + + .. versionadded:: 1.6 + The option `"warn"` was added in 1.6. + + min_frequency : int or float, default=None + Specifies the minimum frequency below which a category will be + considered infrequent. + + - If `int`, categories with a smaller cardinality will be considered + infrequent. + + - If `float`, categories with a smaller cardinality than + `min_frequency * n_samples` will be considered infrequent. + + .. versionadded:: 1.1 + Read more in the :ref:`User Guide `. + + max_categories : int, default=None + Specifies an upper limit to the number of output features for each input + feature when considering infrequent categories. If there are infrequent + categories, `max_categories` includes the category representing the + infrequent categories along with the frequent categories. If `None`, + there is no limit to the number of output features. + + .. versionadded:: 1.1 + Read more in the :ref:`User Guide `. + + feature_name_combiner : "concat" or callable, default="concat" + Callable with signature `def callable(input_feature, category)` that returns a + string. This is used to create feature names to be returned by + :meth:`get_feature_names_out`. 
+ + `"concat"` concatenates encoded feature name and category with + `feature + "_" + str(category)`.E.g. feature X with values 1, 6, 7 create + feature names `X_1, X_6, X_7`. + + .. versionadded:: 1.3 + + Attributes + ---------- + categories_ : list of arrays + The categories of each feature determined during fitting + (in order of the features in X and corresponding with the output + of ``transform``). This includes the category specified in ``drop`` + (if any). + + drop_idx_ : array of shape (n_features,) + - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category + to be dropped for each feature. + - ``drop_idx_[i] = None`` if no category is to be dropped from the + feature with index ``i``, e.g. when `drop='if_binary'` and the + feature isn't binary. + - ``drop_idx_ = None`` if all the transformed features will be + retained. + + If infrequent categories are enabled by setting `min_frequency` or + `max_categories` to a non-default value and `drop_idx[i]` corresponds + to a infrequent category, then the entire infrequent category is + dropped. + + .. versionchanged:: 0.23 + Added the possibility to contain `None` values. + + infrequent_categories_ : list of ndarray + Defined only if infrequent categories are enabled by setting + `min_frequency` or `max_categories` to a non-default value. + `infrequent_categories_[i]` are the infrequent categories for feature + `i`. If the feature `i` has no infrequent categories + `infrequent_categories_[i]` is None. + + .. versionadded:: 1.1 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 1.0 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + feature_name_combiner : callable or None + Callable with signature `def callable(input_feature, category)` that returns a + string. This is used to create feature names to be returned by + :meth:`get_feature_names_out`. + + .. versionadded:: 1.3 + + See Also + -------- + OrdinalEncoder : Performs an ordinal (integer) + encoding of the categorical features. + TargetEncoder : Encodes categorical features using the target. + sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of + dictionary items (also handles string-valued features). + sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot + encoding of dictionary items or strings. + LabelBinarizer : Binarizes labels in a one-vs-all + fashion. + MultiLabelBinarizer : Transforms between iterable of + iterables and a multilabel format, e.g. a (samples x classes) binary + matrix indicating the presence of a class label. + + Examples + -------- + Given a dataset with two features, we let the encoder find the unique + values per feature and transform the data to a binary one-hot encoding. 
+ + >>> from sklearn.preprocessing import OneHotEncoder + + One can discard categories not seen during `fit`: + + >>> enc = OneHotEncoder(handle_unknown='ignore') + >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] + >>> enc.fit(X) + OneHotEncoder(handle_unknown='ignore') + >>> enc.categories_ + [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] + >>> enc.transform([['Female', 1], ['Male', 4]]).toarray() + array([[1., 0., 1., 0., 0.], + [0., 1., 0., 0., 0.]]) + >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]]) + array([['Male', 1], + [None, 2]], dtype=object) + >>> enc.get_feature_names_out(['gender', 'group']) + array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], ...) + + One can always drop the first column for each feature: + + >>> drop_enc = OneHotEncoder(drop='first').fit(X) + >>> drop_enc.categories_ + [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] + >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray() + array([[0., 0., 0.], + [1., 1., 0.]]) + + Or drop a column for feature only having 2 categories: + + >>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X) + >>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray() + array([[0., 1., 0., 0.], + [1., 0., 1., 0.]]) + + One can change the way feature names are created. + + >>> def custom_combiner(feature, category): + ... return str(feature) + "_" + type(category).__name__ + "_" + str(category) + >>> custom_fnames_enc = OneHotEncoder(feature_name_combiner=custom_combiner).fit(X) + >>> custom_fnames_enc.get_feature_names_out() + array(['x0_str_Female', 'x0_str_Male', 'x1_int_1', 'x1_int_2', 'x1_int_3'], + dtype=object) + + Infrequent categories are enabled by setting `max_categories` or `min_frequency`. + + >>> import numpy as np + >>> X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T + >>> ohe = OneHotEncoder(max_categories=3, sparse_output=False).fit(X) + >>> ohe.infrequent_categories_ + [array(['a', 'd'], dtype=object)] + >>> ohe.transform([["a"], ["b"]]) + array([[0., 0., 1.], + [1., 0., 0.]]) + """ + + _parameter_constraints: dict = { + "categories": [StrOptions({"auto"}), list], + "drop": [StrOptions({"first", "if_binary"}), "array-like", None], + "dtype": "no_validation", # validation delegated to numpy + "handle_unknown": [ + StrOptions({"error", "ignore", "infrequent_if_exist", "warn"}) + ], + "max_categories": [Interval(Integral, 1, None, closed="left"), None], + "min_frequency": [ + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="neither"), + None, + ], + "sparse_output": ["boolean"], + "feature_name_combiner": [StrOptions({"concat"}), callable], + } + + def __init__( + self, + *, + categories="auto", + drop=None, + sparse_output=True, + dtype=np.float64, + handle_unknown="error", + min_frequency=None, + max_categories=None, + feature_name_combiner="concat", + ): + self.categories = categories + self.sparse_output = sparse_output + self.dtype = dtype + self.handle_unknown = handle_unknown + self.drop = drop + self.min_frequency = min_frequency + self.max_categories = max_categories + self.feature_name_combiner = feature_name_combiner + + def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx): + """Convert `drop_idx` into the index for infrequent categories. + + If there are no infrequent categories, then `drop_idx` is + returned. This method is called in `_set_drop_idx` when the `drop` + parameter is an array-like. 
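# Illustrative sketch of the drop/infrequent interaction handled by
# `_map_drop_idx_to_infrequent`: dropping a frequent category works after
# the grouping, while explicitly dropping a category that was grouped as
# infrequent raises. The toy data is an assumption for the demo.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T

enc = OneHotEncoder(max_categories=3, sparse_output=False, drop=["b"]).fit(X)
print(enc.drop_idx_)  # index of 'b' in categories_[0]

try:
    OneHotEncoder(max_categories=3, drop=["a"]).fit(X)
except ValueError as exc:
    print(exc)  # 'a' cannot be dropped because it is infrequent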
+ """ + if not self._infrequent_enabled: + return drop_idx + + default_to_infrequent = self._default_to_infrequent_mappings[feature_idx] + if default_to_infrequent is None: + return drop_idx + + # Raise error when explicitly dropping a category that is infrequent + infrequent_indices = self._infrequent_indices[feature_idx] + if infrequent_indices is not None and drop_idx in infrequent_indices: + categories = self.categories_[feature_idx] + raise ValueError( + f"Unable to drop category {categories[drop_idx].item()!r} from" + f" feature {feature_idx} because it is infrequent" + ) + return default_to_infrequent[drop_idx] + + def _set_drop_idx(self): + """Compute the drop indices associated with `self.categories_`. + + If `self.drop` is: + - `None`, No categories have been dropped. + - `'first'`, All zeros to drop the first category. + - `'if_binary'`, All zeros if the category is binary and `None` + otherwise. + - array-like, The indices of the categories that match the + categories in `self.drop`. If the dropped category is an infrequent + category, then the index for the infrequent category is used. This + means that the entire infrequent category is dropped. + + This methods defines a public `drop_idx_` and a private + `_drop_idx_after_grouping`. + + - `drop_idx_`: Public facing API that references the drop category in + `self.categories_`. + - `_drop_idx_after_grouping`: Used internally to drop categories *after* the + infrequent categories are grouped together. + + If there are no infrequent categories or drop is `None`, then + `drop_idx_=_drop_idx_after_grouping`. + """ + if self.drop is None: + drop_idx_after_grouping = None + elif isinstance(self.drop, str): + if self.drop == "first": + drop_idx_after_grouping = np.zeros(len(self.categories_), dtype=object) + elif self.drop == "if_binary": + n_features_out_no_drop = [len(cat) for cat in self.categories_] + if self._infrequent_enabled: + for i, infreq_idx in enumerate(self._infrequent_indices): + if infreq_idx is None: + continue + n_features_out_no_drop[i] -= infreq_idx.size - 1 + + drop_idx_after_grouping = np.array( + [ + 0 if n_features_out == 2 else None + for n_features_out in n_features_out_no_drop + ], + dtype=object, + ) + + else: + drop_array = np.asarray(self.drop, dtype=object) + droplen = len(drop_array) + + if droplen != len(self.categories_): + msg = ( + "`drop` should have length equal to the number " + "of features ({}), got {}" + ) + raise ValueError(msg.format(len(self.categories_), droplen)) + missing_drops = [] + drop_indices = [] + for feature_idx, (drop_val, cat_list) in enumerate( + zip(drop_array, self.categories_) + ): + if not is_scalar_nan(drop_val): + drop_idx = np.where(cat_list == drop_val)[0] + if drop_idx.size: # found drop idx + drop_indices.append( + self._map_drop_idx_to_infrequent(feature_idx, drop_idx[0]) + ) + else: + missing_drops.append((feature_idx, drop_val)) + continue + + # drop_val is nan, find nan in categories manually + if is_scalar_nan(cat_list[-1]): + drop_indices.append( + self._map_drop_idx_to_infrequent(feature_idx, cat_list.size - 1) + ) + else: # nan is missing + missing_drops.append((feature_idx, drop_val)) + + if any(missing_drops): + msg = ( + "The following categories were supposed to be " + "dropped, but were not found in the training " + "data.\n{}".format( + "\n".join( + [ + "Category: {}, Feature: {}".format(c, v) + for c, v in missing_drops + ] + ) + ) + ) + raise ValueError(msg) + drop_idx_after_grouping = np.array(drop_indices, dtype=object) + + # 
`_drop_idx_after_grouping` are the categories to drop *after* the infrequent + # categories are grouped together. If needed, we remap `drop_idx` back + # to the categories seen in `self.categories_`. + self._drop_idx_after_grouping = drop_idx_after_grouping + + if not self._infrequent_enabled or drop_idx_after_grouping is None: + self.drop_idx_ = self._drop_idx_after_grouping + else: + drop_idx_ = [] + for feature_idx, drop_idx in enumerate(drop_idx_after_grouping): + default_to_infrequent = self._default_to_infrequent_mappings[ + feature_idx + ] + if drop_idx is None or default_to_infrequent is None: + orig_drop_idx = drop_idx + else: + orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0] + + drop_idx_.append(orig_drop_idx) + + self.drop_idx_ = np.asarray(drop_idx_, dtype=object) + + def _compute_transformed_categories(self, i, remove_dropped=True): + """Compute the transformed categories used for column `i`. + + 1. If there are infrequent categories, the category is named + 'infrequent_sklearn'. + 2. Dropped columns are removed when remove_dropped=True. + """ + cats = self.categories_[i] + + if self._infrequent_enabled: + infreq_map = self._default_to_infrequent_mappings[i] + if infreq_map is not None: + frequent_mask = infreq_map < infreq_map.max() + infrequent_cat = "infrequent_sklearn" + # infrequent category is always at the end + cats = np.concatenate( + (cats[frequent_mask], np.array([infrequent_cat], dtype=object)) + ) + + if remove_dropped: + cats = self._remove_dropped_categories(cats, i) + return cats + + def _remove_dropped_categories(self, categories, i): + """Remove dropped categories.""" + if ( + self._drop_idx_after_grouping is not None + and self._drop_idx_after_grouping[i] is not None + ): + return np.delete(categories, self._drop_idx_after_grouping[i]) + return categories + + def _compute_n_features_outs(self): + """Compute the n_features_out for each input feature.""" + output = [len(cats) for cats in self.categories_] + + if self._drop_idx_after_grouping is not None: + for i, drop_idx in enumerate(self._drop_idx_after_grouping): + if drop_idx is not None: + output[i] -= 1 + + if not self._infrequent_enabled: + return output + + # infrequent is enabled, the number of features out are reduced + # because the infrequent categories are grouped together + for i, infreq_idx in enumerate(self._infrequent_indices): + if infreq_idx is None: + continue + output[i] -= infreq_idx.size - 1 + + return output + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """ + Fit OneHotEncoder to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + Returns + ------- + self + Fitted encoder. + """ + self._fit( + X, + handle_unknown=self.handle_unknown, + ensure_all_finite="allow-nan", + ) + self._set_drop_idx() + self._n_features_outs = self._compute_n_features_outs() + return self + + def transform(self, X): + """ + Transform X using one-hot encoding. + + If `sparse_output=True` (default), it returns an instance of + :class:`scipy.sparse._csr.csr_matrix` (CSR format). + + If there are infrequent categories for a feature, set by specifying + `max_categories` or `min_frequency`, the infrequent categories are + grouped into a single category. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to encode. 
+ + Returns + ------- + X_out : {ndarray, sparse matrix} of shape \ + (n_samples, n_encoded_features) + Transformed input. If `sparse_output=True`, a sparse matrix will be + returned. + """ + check_is_fitted(self) + transform_output = _get_output_config("transform", estimator=self)["dense"] + if transform_output != "default" and self.sparse_output: + capitalize_transform_output = transform_output.capitalize() + raise ValueError( + f"{capitalize_transform_output} output does not support sparse data." + f" Set sparse_output=False to output {transform_output} dataframes or" + f" disable {capitalize_transform_output} output via" + '` ohe.set_output(transform="default").' + ) + + # validation of X happens in _check_X called by _transform + if self.handle_unknown == "warn": + warn_on_unknown, handle_unknown = True, "infrequent_if_exist" + else: + warn_on_unknown = self.drop is not None and self.handle_unknown in { + "ignore", + "infrequent_if_exist", + } + handle_unknown = self.handle_unknown + X_int, X_mask = self._transform( + X, + handle_unknown=handle_unknown, + ensure_all_finite="allow-nan", + warn_on_unknown=warn_on_unknown, + ) + + n_samples, n_features = X_int.shape + + if self._drop_idx_after_grouping is not None: + to_drop = self._drop_idx_after_grouping.copy() + # We remove all the dropped categories from mask, and decrement all + # categories that occur after them to avoid an empty column. + keep_cells = X_int != to_drop + for i, cats in enumerate(self.categories_): + # drop='if_binary' but feature isn't binary + if to_drop[i] is None: + # set to cardinality to not drop from X_int + to_drop[i] = len(cats) + + to_drop = to_drop.reshape(1, -1) + X_int[X_int > to_drop] -= 1 + X_mask &= keep_cells + + mask = X_mask.ravel() + feature_indices = np.cumsum([0] + self._n_features_outs) + indices = (X_int + feature_indices[:-1]).ravel()[mask] + + indptr = np.empty(n_samples + 1, dtype=int) + indptr[0] = 0 + np.sum(X_mask, axis=1, out=indptr[1:], dtype=indptr.dtype) + np.cumsum(indptr[1:], out=indptr[1:]) + data = np.ones(indptr[-1]) + + out = sparse.csr_matrix( + (data, indices, indptr), + shape=(n_samples, feature_indices[-1]), + dtype=self.dtype, + ) + if not self.sparse_output: + return out.toarray() + else: + return out + + def inverse_transform(self, X): + """ + Convert the data back to the original representation. + + When unknown categories are encountered (all zeros in the + one-hot encoding), ``None`` is used to represent this category. If the + feature with the unknown category has a dropped category, the dropped + category will be its inverse. + + For a given input feature, if there is an infrequent category, + 'infrequent_sklearn' will be used to represent the infrequent category. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape \ + (n_samples, n_encoded_features) + The transformed data. + + Returns + ------- + X_tr : ndarray of shape (n_samples, n_features) + Inverse transformed array. + """ + check_is_fitted(self) + X = check_array(X, accept_sparse="csr") + + n_samples, _ = X.shape + n_features = len(self.categories_) + + n_features_out = np.sum(self._n_features_outs) + + # validate shape of passed X + msg = ( + "Shape of the passed X data is not correct. Expected {0} columns, got {1}." 
+ ) + if X.shape[1] != n_features_out: + raise ValueError(msg.format(n_features_out, X.shape[1])) + + transformed_features = [ + self._compute_transformed_categories(i, remove_dropped=False) + for i, _ in enumerate(self.categories_) + ] + + # create resulting array of appropriate dtype + dt = np.result_type(*[cat.dtype for cat in transformed_features]) + X_tr = np.empty((n_samples, n_features), dtype=dt) + + j = 0 + found_unknown = {} + + if self._infrequent_enabled: + infrequent_indices = self._infrequent_indices + else: + infrequent_indices = [None] * n_features + + for i in range(n_features): + cats_wo_dropped = self._remove_dropped_categories( + transformed_features[i], i + ) + n_categories = cats_wo_dropped.shape[0] + + # Only happens if there was a column with a unique + # category. In this case we just fill the column with this + # unique category value. + if n_categories == 0: + X_tr[:, i] = self.categories_[i][self._drop_idx_after_grouping[i]] + j += n_categories + continue + sub = X[:, j : j + n_categories] + # for sparse X argmax returns 2D matrix, ensure 1D array + labels = np.asarray(sub.argmax(axis=1)).flatten() + X_tr[:, i] = cats_wo_dropped[labels] + + if self.handle_unknown == "ignore" or ( + self.handle_unknown in ("infrequent_if_exist", "warn") + and infrequent_indices[i] is None + ): + unknown = np.asarray(sub.sum(axis=1) == 0).flatten() + # ignored unknown categories: we have a row of all zero + if unknown.any(): + # if categories were dropped then unknown categories will + # be mapped to the dropped category + if ( + self._drop_idx_after_grouping is None + or self._drop_idx_after_grouping[i] is None + ): + found_unknown[i] = unknown + else: + X_tr[unknown, i] = self.categories_[i][ + self._drop_idx_after_grouping[i] + ] + else: + dropped = np.asarray(sub.sum(axis=1) == 0).flatten() + if dropped.any(): + if self._drop_idx_after_grouping is None: + all_zero_samples = np.flatnonzero(dropped) + raise ValueError( + f"Samples {all_zero_samples} can not be inverted " + "when drop=None and handle_unknown='error' " + "because they contain all zeros" + ) + # we can safely assume that all of the nulls in each column + # are the dropped value + drop_idx = self._drop_idx_after_grouping[i] + X_tr[dropped, i] = transformed_features[i][drop_idx] + + j += n_categories + + # if ignored are found: potentially need to upcast result to + # insert None values + if found_unknown: + if X_tr.dtype != object: + X_tr = X_tr.astype(object) + + for idx, mask in found_unknown.items(): + X_tr[mask, idx] = None + + return X_tr + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
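# Illustrative sketch: with infrequent grouping enabled, the grouped
# column is reported as 'infrequent_sklearn' by get_feature_names_out,
# matching `_compute_transformed_categories` above. The toy data and the
# 'letter' feature name are assumptions for the demo.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T
enc = OneHotEncoder(max_categories=3, sparse_output=False).fit(X)
print(enc.get_feature_names_out(["letter"]))
# ['letter_b' 'letter_c' 'letter_infrequent_sklearn']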
+ """ + check_is_fitted(self) + input_features = _check_feature_names_in(self, input_features) + cats = [ + self._compute_transformed_categories(i) + for i, _ in enumerate(self.categories_) + ] + + name_combiner = self._check_get_feature_name_combiner() + feature_names = [] + for i in range(len(cats)): + names = [name_combiner(input_features[i], t) for t in cats[i]] + feature_names.extend(names) + + return np.array(feature_names, dtype=object) + + def _check_get_feature_name_combiner(self): + if self.feature_name_combiner == "concat": + return lambda feature, category: feature + "_" + str(category) + else: # callable + dry_run_combiner = self.feature_name_combiner("feature", "category") + if not isinstance(dry_run_combiner, str): + raise TypeError( + "When `feature_name_combiner` is a callable, it should return a " + f"Python string. Got {type(dry_run_combiner)} instead." + ) + return self.feature_name_combiner + + +class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder): + """ + Encode categorical features as an integer array. + + The input to this transformer should be an array-like of integers or + strings, denoting the values taken on by categorical (discrete) features. + The features are converted to ordinal integers. This results in + a single column of integers (0 to n_categories - 1) per feature. + + Read more in the :ref:`User Guide `. + For a comparison of different encoders, refer to: + :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. + + .. versionadded:: 0.20 + + Parameters + ---------- + categories : 'auto' or a list of array-like, default='auto' + Categories (unique values) per feature: + + - 'auto' : Determine categories automatically from the training data. + - list : ``categories[i]`` holds the categories expected in the ith + column. The passed categories should not mix strings and numeric + values, and should be sorted in case of numeric values. + + The used categories can be found in the ``categories_`` attribute. + + dtype : number type, default=np.float64 + Desired dtype of output. + + handle_unknown : {'error', 'use_encoded_value'}, default='error' + When set to 'error' an error will be raised in case an unknown + categorical feature is present during transform. When set to + 'use_encoded_value', the encoded value of unknown categories will be + set to the value given for the parameter `unknown_value`. In + :meth:`inverse_transform`, an unknown category will be denoted as None. + + .. versionadded:: 0.24 + + unknown_value : int or np.nan, default=None + When the parameter handle_unknown is set to 'use_encoded_value', this + parameter is required and will set the encoded value of unknown + categories. It has to be distinct from the values used to encode any of + the categories in `fit`. If set to np.nan, the `dtype` parameter must + be a float dtype. + + .. versionadded:: 0.24 + + encoded_missing_value : int or np.nan, default=np.nan + Encoded value of missing categories. If set to `np.nan`, then the `dtype` + parameter must be a float dtype. + + .. versionadded:: 1.1 + + min_frequency : int or float, default=None + Specifies the minimum frequency below which a category will be + considered infrequent. + + - If `int`, categories with a smaller cardinality will be considered + infrequent. + + - If `float`, categories with a smaller cardinality than + `min_frequency * n_samples` will be considered infrequent. + + .. versionadded:: 1.3 + Read more in the :ref:`User Guide `. 
+ + max_categories : int, default=None + Specifies an upper limit to the number of output categories for each input + feature when considering infrequent categories. If there are infrequent + categories, `max_categories` includes the category representing the + infrequent categories along with the frequent categories. If `None`, + there is no limit to the number of output features. + + `max_categories` do **not** take into account missing or unknown + categories. Setting `unknown_value` or `encoded_missing_value` to an + integer will increase the number of unique integer codes by one each. + This can result in up to `max_categories + 2` integer codes. + + .. versionadded:: 1.3 + Read more in the :ref:`User Guide `. + + Attributes + ---------- + categories_ : list of arrays + The categories of each feature determined during ``fit`` (in order of + the features in X and corresponding with the output of ``transform``). + This does not include categories that weren't seen during ``fit``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 1.0 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + infrequent_categories_ : list of ndarray + Defined only if infrequent categories are enabled by setting + `min_frequency` or `max_categories` to a non-default value. + `infrequent_categories_[i]` are the infrequent categories for feature + `i`. If the feature `i` has no infrequent categories + `infrequent_categories_[i]` is None. + + .. versionadded:: 1.3 + + See Also + -------- + OneHotEncoder : Performs a one-hot encoding of categorical features. This encoding + is suitable for low to medium cardinality categorical variables, both in + supervised and unsupervised settings. + TargetEncoder : Encodes categorical features using supervised signal + in a classification or regression pipeline. This encoding is typically + suitable for high cardinality categorical variables. + LabelEncoder : Encodes target labels with values between 0 and + ``n_classes-1``. + + Notes + ----- + With a high proportion of `nan` values, inferring categories becomes slow with + Python versions before 3.10. The handling of `nan` values was improved + from Python 3.10 onwards, (c.f. + `bpo-43475 `_). + + Examples + -------- + Given a dataset with two features, we let the encoder find the unique + values per feature and transform the data to an ordinal encoding. + + >>> from sklearn.preprocessing import OrdinalEncoder + >>> enc = OrdinalEncoder() + >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] + >>> enc.fit(X) + OrdinalEncoder() + >>> enc.categories_ + [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] + >>> enc.transform([['Female', 3], ['Male', 1]]) + array([[0., 2.], + [1., 0.]]) + + >>> enc.inverse_transform([[1, 0], [0, 1]]) + array([['Male', 1], + ['Female', 2]], dtype=object) + + By default, :class:`OrdinalEncoder` is lenient towards missing values by + propagating them. + + >>> import numpy as np + >>> X = [['Male', 1], ['Female', 3], ['Female', np.nan]] + >>> enc.fit_transform(X) + array([[ 1., 0.], + [ 0., 1.], + [ 0., nan]]) + + You can use the parameter `encoded_missing_value` to encode missing values. + + >>> enc.set_params(encoded_missing_value=-1).fit_transform(X) + array([[ 1., 0.], + [ 0., 1.], + [ 0., -1.]]) + + Infrequent categories are enabled by setting `max_categories` or `min_frequency`. 
+ In the following example, "a" and "d" are considered infrequent and grouped + together into a single category, "b" and "c" are their own categories, unknown + values are encoded as 3 and missing values are encoded as 4. + + >>> X_train = np.array( + ... [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3 + [np.nan]], + ... dtype=object).T + >>> enc = OrdinalEncoder( + ... handle_unknown="use_encoded_value", unknown_value=3, + ... max_categories=3, encoded_missing_value=4) + >>> _ = enc.fit(X_train) + >>> X_test = np.array([["a"], ["b"], ["c"], ["d"], ["e"], [np.nan]], dtype=object) + >>> enc.transform(X_test) + array([[2.], + [0.], + [1.], + [2.], + [3.], + [4.]]) + """ + + _parameter_constraints: dict = { + "categories": [StrOptions({"auto"}), list], + "dtype": "no_validation", # validation delegated to numpy + "encoded_missing_value": [Integral, type(np.nan)], + "handle_unknown": [StrOptions({"error", "use_encoded_value"})], + "unknown_value": [Integral, type(np.nan), None], + "max_categories": [Interval(Integral, 1, None, closed="left"), None], + "min_frequency": [ + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="neither"), + None, + ], + } + + def __init__( + self, + *, + categories="auto", + dtype=np.float64, + handle_unknown="error", + unknown_value=None, + encoded_missing_value=np.nan, + min_frequency=None, + max_categories=None, + ): + self.categories = categories + self.dtype = dtype + self.handle_unknown = handle_unknown + self.unknown_value = unknown_value + self.encoded_missing_value = encoded_missing_value + self.min_frequency = min_frequency + self.max_categories = max_categories + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """ + Fit the OrdinalEncoder to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + Returns + ------- + self : object + Fitted encoder. + """ + if self.handle_unknown == "use_encoded_value": + if is_scalar_nan(self.unknown_value): + if np.dtype(self.dtype).kind != "f": + raise ValueError( + "When unknown_value is np.nan, the dtype " + "parameter should be " + f"a float dtype. Got {self.dtype}." + ) + elif not isinstance(self.unknown_value, numbers.Integral): + raise TypeError( + "unknown_value should be an integer or " + "np.nan when " + "handle_unknown is 'use_encoded_value', " + f"got {self.unknown_value}." + ) + elif self.unknown_value is not None: + raise TypeError( + "unknown_value should only be set when " + "handle_unknown is 'use_encoded_value', " + f"got {self.unknown_value}." 
+ ) + + # `_fit` will only raise an error when `self.handle_unknown="error"` + fit_results = self._fit( + X, + handle_unknown=self.handle_unknown, + ensure_all_finite="allow-nan", + return_and_ignore_missing_for_infrequent=True, + ) + self._missing_indices = fit_results["missing_indices"] + + cardinalities = [len(categories) for categories in self.categories_] + if self._infrequent_enabled: + # Cardinality decreases because the infrequent categories are grouped + # together + for feature_idx, infrequent in enumerate(self.infrequent_categories_): + if infrequent is not None: + cardinalities[feature_idx] -= len(infrequent) + + # missing values are not considered part of the cardinality + # when considering unknown categories or encoded_missing_value + for cat_idx, categories_for_idx in enumerate(self.categories_): + if is_scalar_nan(categories_for_idx[-1]): + cardinalities[cat_idx] -= 1 + + if self.handle_unknown == "use_encoded_value": + for cardinality in cardinalities: + if 0 <= self.unknown_value < cardinality: + raise ValueError( + "The used value for unknown_value " + f"{self.unknown_value} is one of the " + "values already used for encoding the " + "seen categories." + ) + + if self._missing_indices: + if np.dtype(self.dtype).kind != "f" and is_scalar_nan( + self.encoded_missing_value + ): + raise ValueError( + "There are missing values in features " + f"{list(self._missing_indices)}. For OrdinalEncoder to " + f"encode missing values with dtype: {self.dtype}, set " + "encoded_missing_value to a non-nan value, or " + "set dtype to a float" + ) + + if not is_scalar_nan(self.encoded_missing_value): + # Features are invalid when they contain a missing category + # and encoded_missing_value was already used to encode a + # known category + invalid_features = [ + cat_idx + for cat_idx, cardinality in enumerate(cardinalities) + if cat_idx in self._missing_indices + and 0 <= self.encoded_missing_value < cardinality + ] + + if invalid_features: + # Use feature names if they are available + if hasattr(self, "feature_names_in_"): + invalid_features = self.feature_names_in_[invalid_features] + raise ValueError( + f"encoded_missing_value ({self.encoded_missing_value}) " + "is already used to encode a known category in features: " + f"{invalid_features}" + ) + + return self + + def transform(self, X): + """ + Transform X to ordinal codes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to encode. + + Returns + ------- + X_out : ndarray of shape (n_samples, n_features) + Transformed input. + """ + check_is_fitted(self, "categories_") + X_int, X_mask = self._transform( + X, + handle_unknown=self.handle_unknown, + ensure_all_finite="allow-nan", + ignore_category_indices=self._missing_indices, + ) + X_trans = X_int.astype(self.dtype, copy=False) + + for cat_idx, missing_idx in self._missing_indices.items(): + X_missing_mask = X_int[:, cat_idx] == missing_idx + X_trans[X_missing_mask, cat_idx] = self.encoded_missing_value + + # create separate category for unknown values + if self.handle_unknown == "use_encoded_value": + X_trans[~X_mask] = self.unknown_value + return X_trans + + def inverse_transform(self, X): + """ + Convert the data back to the original representation. + + Parameters + ---------- + X : array-like of shape (n_samples, n_encoded_features) + The transformed data. + + Returns + ------- + X_tr : ndarray of shape (n_samples, n_features) + Inverse transformed array. 
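+
+ Examples
+ --------
+ A minimal round-trip sketch on toy data (the output dtype follows the
+ dtypes of `categories_`):
+
+ >>> from sklearn.preprocessing import OrdinalEncoder
+ >>> enc = OrdinalEncoder().fit([['a'], ['b']])
+ >>> enc.inverse_transform([[1.0], [0.0]])
+ array([['b'],
+ ['a']], dtype=object)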
+ """ + check_is_fitted(self) + X = check_array(X, ensure_all_finite="allow-nan") + + n_samples, _ = X.shape + n_features = len(self.categories_) + + # validate shape of passed X + msg = ( + "Shape of the passed X data is not correct. Expected {0} columns, got {1}." + ) + if X.shape[1] != n_features: + raise ValueError(msg.format(n_features, X.shape[1])) + + # create resulting array of appropriate dtype + dt = np.result_type(*[cat.dtype for cat in self.categories_]) + X_tr = np.empty((n_samples, n_features), dtype=dt) + + found_unknown = {} + infrequent_masks = {} + + infrequent_indices = getattr(self, "_infrequent_indices", None) + + for i in range(n_features): + labels = X[:, i] + + # replace values of X[:, i] that were nan with actual indices + if i in self._missing_indices: + X_i_mask = _get_mask(labels, self.encoded_missing_value) + labels[X_i_mask] = self._missing_indices[i] + + rows_to_update = slice(None) + categories = self.categories_[i] + + if infrequent_indices is not None and infrequent_indices[i] is not None: + # Compute mask for frequent categories + infrequent_encoding_value = len(categories) - len(infrequent_indices[i]) + infrequent_masks[i] = labels == infrequent_encoding_value + rows_to_update = ~infrequent_masks[i] + + # Remap categories to be only frequent categories. The infrequent + # categories will be mapped to "infrequent_sklearn" later + frequent_categories_mask = np.ones_like(categories, dtype=bool) + frequent_categories_mask[infrequent_indices[i]] = False + categories = categories[frequent_categories_mask] + + if self.handle_unknown == "use_encoded_value": + unknown_labels = _get_mask(labels, self.unknown_value) + found_unknown[i] = unknown_labels + + known_labels = ~unknown_labels + if isinstance(rows_to_update, np.ndarray): + rows_to_update &= known_labels + else: + rows_to_update = known_labels + + labels_int = labels[rows_to_update].astype("int64", copy=False) + X_tr[rows_to_update, i] = categories[labels_int] + + if found_unknown or infrequent_masks: + X_tr = X_tr.astype(object, copy=False) + + # insert None values for unknown values + if found_unknown: + for idx, mask in found_unknown.items(): + X_tr[mask, idx] = None + + if infrequent_masks: + for idx, mask in infrequent_masks.items(): + X_tr[mask, idx] = "infrequent_sklearn" + + return X_tr diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_function_transformer.py b/.venv/Lib/site-packages/sklearn/preprocessing/_function_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..10bfcb34980b70a3438d003d6a511bf0c0887d34 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_function_transformer.py @@ -0,0 +1,445 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from functools import partial + +import numpy as np + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils._estimator_html_repr import _VisualBlock +from ..utils._param_validation import StrOptions +from ..utils._set_output import ( + _get_adapter_from_container, + _get_output_config, +) +from ..utils.metaestimators import available_if +from ..utils.validation import ( + _allclose_dense_sparse, + _check_feature_names, + _check_feature_names_in, + _check_n_features, + _get_feature_names, + _is_pandas_df, + _is_polars_df, + check_array, + validate_data, +) + + +def _identity(X): + """The identity function.""" + return X + + +class FunctionTransformer(TransformerMixin, BaseEstimator): + """Constructs a transformer from an 
arbitrary callable.
+
+ A FunctionTransformer forwards its X (and optionally y) arguments to a
+ user-defined function or function object and returns the result of this
+ function. This is useful for stateless transformations such as taking the
+ log of frequencies, doing custom scaling, etc.
+
+ Note: If a lambda is used as the function, then the resulting
+ transformer will not be pickleable.
+
+ .. versionadded:: 0.17
+
+ Read more in the :ref:`User Guide `.
+
+ Parameters
+ ----------
+ func : callable, default=None
+ The callable to use for the transformation. This will be passed
+ the same arguments as transform, with args and kwargs forwarded.
+ If func is None, then func will be the identity function.
+
+ inverse_func : callable, default=None
+ The callable to use for the inverse transformation. This will be
+ passed the same arguments as inverse transform, with args and
+ kwargs forwarded. If inverse_func is None, then inverse_func
+ will be the identity function.
+
+ validate : bool, default=False
+ Indicate that the input X array should be checked before calling
+ ``func``. The possibilities are:
+
+ - If False, there is no input validation.
+ - If True, then X will be converted to a 2-dimensional NumPy array or
+ sparse matrix. If the conversion is not possible an exception is
+ raised.
+
+ .. versionchanged:: 0.22
+ The default of ``validate`` changed from True to False.
+
+ accept_sparse : bool, default=False
+ Indicate that func accepts a sparse matrix as input. If validate is
+ False, this has no effect. Otherwise, if accept_sparse is False,
+ sparse matrix inputs will cause an exception to be raised.
+
+ check_inverse : bool, default=True
+ Whether to check that ``func`` followed by ``inverse_func`` leads to
+ the original inputs. It can be used for a sanity check, raising a
+ warning when the condition is not fulfilled.
+
+ .. versionadded:: 0.20
+
+ feature_names_out : callable, 'one-to-one' or None, default=None
+ Determines the list of feature names that will be returned by the
+ `get_feature_names_out` method. If it is 'one-to-one', then the output
+ feature names will be equal to the input feature names. If it is a
+ callable, then it must take two positional arguments: this
+ `FunctionTransformer` (`self`) and an array-like of input feature names
+ (`input_features`). It must return an array-like of output feature
+ names. The `get_feature_names_out` method is only defined if
+ `feature_names_out` is not None.
+
+ See ``get_feature_names_out`` for more details.
+
+ .. versionadded:: 1.1
+
+ kw_args : dict, default=None
+ Dictionary of additional keyword arguments to pass to func.
+
+ .. versionadded:: 0.18
+
+ inv_kw_args : dict, default=None
+ Dictionary of additional keyword arguments to pass to inverse_func.
+
+ .. versionadded:: 0.18
+
+ Attributes
+ ----------
+ n_features_in_ : int
+ Number of features seen during :term:`fit`.
+
+ .. versionadded:: 0.24
+
+ feature_names_in_ : ndarray of shape (`n_features_in_`,)
+ Names of features seen during :term:`fit`. Defined only when `X` has feature
+ names that are all strings.
+
+ .. versionadded:: 1.0
+
+ See Also
+ --------
+ MaxAbsScaler : Scale each feature by its maximum absolute value.
+ StandardScaler : Standardize features by removing the mean and
+ scaling to unit variance.
+ LabelBinarizer : Binarize labels in a one-vs-all fashion.
+ MultiLabelBinarizer : Transform between iterable of iterables
+ and a multilabel format.
+
+ Notes
+ -----
+ If `func` returns an output with a `columns` attribute, then the column names
+ are enforced to be consistent with the output of `get_feature_names_out`.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from sklearn.preprocessing import FunctionTransformer
+ >>> transformer = FunctionTransformer(np.log1p)
+ >>> X = np.array([[0, 1], [2, 3]])
+ >>> transformer.transform(X)
+ array([[0. , 0.6931...],
+ [1.0986..., 1.3862...]])
+ """
+
+ _parameter_constraints: dict = {
+ "func": [callable, None],
+ "inverse_func": [callable, None],
+ "validate": ["boolean"],
+ "accept_sparse": ["boolean"],
+ "check_inverse": ["boolean"],
+ "feature_names_out": [callable, StrOptions({"one-to-one"}), None],
+ "kw_args": [dict, None],
+ "inv_kw_args": [dict, None],
+ }
+
+ def __init__(
+ self,
+ func=None,
+ inverse_func=None,
+ *,
+ validate=False,
+ accept_sparse=False,
+ check_inverse=True,
+ feature_names_out=None,
+ kw_args=None,
+ inv_kw_args=None,
+ ):
+ self.func = func
+ self.inverse_func = inverse_func
+ self.validate = validate
+ self.accept_sparse = accept_sparse
+ self.check_inverse = check_inverse
+ self.feature_names_out = feature_names_out
+ self.kw_args = kw_args
+ self.inv_kw_args = inv_kw_args
+
+ def _check_input(self, X, *, reset):
+ if self.validate:
+ return validate_data(self, X, accept_sparse=self.accept_sparse, reset=reset)
+ elif reset:
+ # Set feature_names_in_ and n_features_in_ even if validate=False
+ # We run this only when reset==True to store the attributes but not
+ # validate them, because validate=False
+ _check_n_features(self, X, reset=reset)
+ _check_feature_names(self, X, reset=reset)
+ return X
+
+ def _check_inverse_transform(self, X):
+ """Check that func and inverse_func are the inverse."""
+ idx_selected = slice(None, None, max(1, X.shape[0] // 100))
+ X_round_trip = self.inverse_transform(self.transform(X[idx_selected]))
+
+ if hasattr(X, "dtype"):
+ dtypes = [X.dtype]
+ elif hasattr(X, "dtypes"):
+ # Dataframes can have multiple dtypes
+ dtypes = X.dtypes
+
+ if not all(np.issubdtype(d, np.number) for d in dtypes):
+ raise ValueError(
+ "'check_inverse' is only supported when all the elements in `X` are"
+ " numerical."
+ )
+
+ if not _allclose_dense_sparse(X[idx_selected], X_round_trip):
+ warnings.warn(
+ (
+ "The provided functions are not strictly"
+ " inverse of each other. If you are sure you"
+ " want to proceed regardless, set"
+ " 'check_inverse=False'."
+ ),
+ UserWarning,
+ )
+
+ @_fit_context(prefer_skip_nested_validation=True)
+ def fit(self, X, y=None):
+ """Fit transformer by checking X.
+
+ If ``validate`` is ``True``, ``X`` will be checked.
+
+ Parameters
+ ----------
+ X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
+ if `validate=True` else any object that `func` can handle
+ Input array.
+
+ y : Ignored
+ Not used, present here for API consistency by convention.
+
+ Returns
+ -------
+ self : object
+ FunctionTransformer class instance.
+ """
+ X = self._check_input(X, reset=True)
+ if self.check_inverse and not (self.func is None or self.inverse_func is None):
+ self._check_inverse_transform(X)
+ return self
+
+ def transform(self, X):
+ """Transform X using the forward function.
+
+ Parameters
+ ----------
+ X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
+ if `validate=True` else any object that `func` can handle
+ Input array.
+
+ Returns
+ -------
+ X_out : array-like, shape (n_samples, n_features)
+ Transformed input.
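+
+ Examples
+ --------
+ A minimal sketch with a stateless NumPy ufunc (no `fit` is needed since
+ `FunctionTransformer` is stateless):
+
+ >>> import numpy as np
+ >>> from sklearn.preprocessing import FunctionTransformer
+ >>> FunctionTransformer(np.sqrt).transform(np.array([[1.0, 4.0]]))
+ array([[1., 2.]])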
+ """ + X = self._check_input(X, reset=False) + out = self._transform(X, func=self.func, kw_args=self.kw_args) + output_config = _get_output_config("transform", self)["dense"] + + if hasattr(out, "columns") and self.feature_names_out is not None: + # check the consistency between the column provided by `transform` and + # the the column names provided by `get_feature_names_out`. + feature_names_out = self.get_feature_names_out() + if list(out.columns) != list(feature_names_out): + # we can override the column names of the output if it is inconsistent + # with the column names provided by `get_feature_names_out` in the + # following cases: + # * `func` preserved the column names between the input and the output + # * the input column names are all numbers + # * the output is requested to be a DataFrame (pandas or polars) + feature_names_in = getattr( + X, "feature_names_in_", _get_feature_names(X) + ) + same_feature_names_in_out = feature_names_in is not None and list( + feature_names_in + ) == list(out.columns) + not_all_str_columns = not all( + isinstance(col, str) for col in out.columns + ) + if same_feature_names_in_out or not_all_str_columns: + adapter = _get_adapter_from_container(out) + out = adapter.create_container( + X_output=out, + X_original=out, + columns=feature_names_out, + inplace=False, + ) + else: + raise ValueError( + "The output generated by `func` have different column names " + "than the ones provided by `get_feature_names_out`. " + f"Got output with columns names: {list(out.columns)} and " + "`get_feature_names_out` returned: " + f"{list(self.get_feature_names_out())}. " + "The column names can be overridden by setting " + "`set_output(transform='pandas')` or " + "`set_output(transform='polars')` such that the column names " + "are set to the names provided by `get_feature_names_out`." + ) + + if self.feature_names_out is None: + warn_msg = ( + "When `set_output` is configured to be '{0}', `func` should return " + "a {0} DataFrame to follow the `set_output` API or `feature_names_out`" + " should be defined." + ) + if output_config == "pandas" and not _is_pandas_df(out): + warnings.warn(warn_msg.format("pandas")) + elif output_config == "polars" and not _is_polars_df(out): + warnings.warn(warn_msg.format("polars")) + + return out + + def inverse_transform(self, X): + """Transform X using the inverse function. + + Parameters + ---------- + X : {array-like, sparse-matrix} of shape (n_samples, n_features) \ + if `validate=True` else any object that `inverse_func` can handle + Input array. + + Returns + ------- + X_out : array-like, shape (n_samples, n_features) + Transformed input. + """ + if self.validate: + X = check_array(X, accept_sparse=self.accept_sparse) + return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args) + + @available_if(lambda self: self.feature_names_out is not None) + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + This method is only defined if `feature_names_out` is not None. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input feature names. + + - If `input_features` is None, then `feature_names_in_` is + used as the input feature names. If `feature_names_in_` is not + defined, then names are generated: + `[x0, x1, ..., x(n_features_in_ - 1)]`. + - If `input_features` is array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. 
+
+ Returns
+ -------
+ feature_names_out : ndarray of str objects
+ Transformed feature names.
+
+ - If `feature_names_out` is 'one-to-one', the input feature names
+ are returned (see `input_features` above). This requires
+ `feature_names_in_` and/or `n_features_in_` to be defined, which
+ is done automatically if `validate=True`. Alternatively, you can
+ set them in `func`.
+ - If `feature_names_out` is a callable, then it is called with two
+ arguments, `self` and `input_features`, and its return value is
+ returned by this method.
+ """
+ if hasattr(self, "n_features_in_") or input_features is not None:
+ input_features = _check_feature_names_in(self, input_features)
+ if self.feature_names_out == "one-to-one":
+ names_out = input_features
+ elif callable(self.feature_names_out):
+ names_out = self.feature_names_out(self, input_features)
+ else:
+ raise ValueError(
+ f"feature_names_out={self.feature_names_out!r} is invalid. "
+ 'It must either be "one-to-one" or a callable with two '
+ "arguments: the function transformer and an array-like of "
+ "input feature names. The callable must return an array-like "
+ "of output feature names."
+ )
+ return np.asarray(names_out, dtype=object)
+
+ def _transform(self, X, func=None, kw_args=None):
+ if func is None:
+ func = _identity
+
+ return func(X, **(kw_args if kw_args else {}))
+
+ def __sklearn_is_fitted__(self):
+ """Return True since FunctionTransformer is stateless."""
+ return True
+
+ def __sklearn_tags__(self):
+ tags = super().__sklearn_tags__()
+ tags.no_validation = not self.validate
+ tags.requires_fit = False
+ return tags
+
+ def set_output(self, *, transform=None):
+ """Set output container.
+
+ See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
+ for an example on how to use the API.
+
+ Parameters
+ ----------
+ transform : {"default", "pandas", "polars"}, default=None
+ Configure output of `transform` and `fit_transform`.
+
+ - `"default"`: Default output format of a transformer
+ - `"pandas"`: DataFrame output
+ - `"polars"`: Polars output
+ - `None`: Transform configuration is unchanged
+
+ .. versionadded:: 1.4
+ `"polars"` option was added.
+
+ Returns
+ -------
+ self : estimator instance
+ Estimator instance.
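+
+ Examples
+ --------
+ A minimal sketch of requesting pandas output (this assumes pandas is
+ installed by the time `transform` is later called):
+
+ >>> from sklearn.preprocessing import FunctionTransformer
+ >>> ft = FunctionTransformer(feature_names_out="one-to-one")
+ >>> _ = ft.set_output(transform="pandas")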
+ """ + if not hasattr(self, "_sklearn_output_config"): + self._sklearn_output_config = {} + + self._sklearn_output_config["transform"] = transform + return self + + def _get_function_name(self): + """Get the name display of the `func` used in HTML representation.""" + if hasattr(self.func, "__name__"): + return self.func.__name__ + if isinstance(self.func, partial): + return self.func.func.__name__ + return f"{self.func.__class__.__name__}(...)" + + def _sk_visual_block_(self): + return _VisualBlock( + "single", + self, + names=self._get_function_name(), + name_details=str(self), + name_caption="FunctionTransformer", + doc_link_label="FunctionTransformer", + ) diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_label.py b/.venv/Lib/site-packages/sklearn/preprocessing/_label.py new file mode 100644 index 0000000000000000000000000000000000000000..57dcaa08c1ed3707d2feb066b42adb81f8ebfd74 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_label.py @@ -0,0 +1,963 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import array +import itertools +import warnings +from collections import defaultdict +from numbers import Integral + +import numpy as np +import scipy.sparse as sp + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils import column_or_1d +from ..utils._array_api import _setdiff1d, device, get_namespace +from ..utils._encode import _encode, _unique +from ..utils._param_validation import Interval, validate_params +from ..utils.multiclass import type_of_target, unique_labels +from ..utils.sparsefuncs import min_max_axis +from ..utils.validation import _num_samples, check_array, check_is_fitted + +__all__ = [ + "label_binarize", + "LabelBinarizer", + "LabelEncoder", + "MultiLabelBinarizer", +] + + +class LabelEncoder(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): + """Encode target labels with value between 0 and n_classes-1. + + This transformer should be used to encode target values, *i.e.* `y`, and + not the input `X`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.12 + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + Holds the label for each class. + + See Also + -------- + OrdinalEncoder : Encode categorical features using an ordinal encoding + scheme. + OneHotEncoder : Encode categorical features as a one-hot numeric array. + + Examples + -------- + `LabelEncoder` can be used to normalize labels. + + >>> from sklearn.preprocessing import LabelEncoder + >>> le = LabelEncoder() + >>> le.fit([1, 2, 2, 6]) + LabelEncoder() + >>> le.classes_ + array([1, 2, 6]) + >>> le.transform([1, 1, 2, 6]) + array([0, 0, 1, 2]...) + >>> le.inverse_transform([0, 0, 1, 2]) + array([1, 1, 2, 6]) + + It can also be used to transform non-numerical labels (as long as they are + hashable and comparable) to numerical labels. + + >>> le = LabelEncoder() + >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) + LabelEncoder() + >>> list(le.classes_) + [np.str_('amsterdam'), np.str_('paris'), np.str_('tokyo')] + >>> le.transform(["tokyo", "tokyo", "paris"]) + array([2, 2, 1]...) + >>> list(le.inverse_transform([2, 2, 1])) + [np.str_('tokyo'), np.str_('tokyo'), np.str_('paris')] + """ + + def fit(self, y): + """Fit label encoder. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + self : returns an instance of self. + Fitted label encoder. 
+ """ + y = column_or_1d(y, warn=True) + self.classes_ = _unique(y) + return self + + def fit_transform(self, y): + """Fit label encoder and return encoded labels. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + y : array-like of shape (n_samples,) + Encoded labels. + """ + y = column_or_1d(y, warn=True) + self.classes_, y = _unique(y, return_inverse=True) + return y + + def transform(self, y): + """Transform labels to normalized encoding. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + y : array-like of shape (n_samples,) + Labels as normalized encodings. + """ + check_is_fitted(self) + xp, _ = get_namespace(y) + y = column_or_1d(y, dtype=self.classes_.dtype, warn=True) + # transform of empty array is empty array + if _num_samples(y) == 0: + return xp.asarray([]) + + return _encode(y, uniques=self.classes_) + + def inverse_transform(self, y): + """Transform labels back to original encoding. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + y : ndarray of shape (n_samples,) + Original encoding. + """ + check_is_fitted(self) + xp, _ = get_namespace(y) + y = column_or_1d(y, warn=True) + # inverse transform of empty array is empty array + if _num_samples(y) == 0: + return xp.asarray([]) + + diff = _setdiff1d( + ar1=y, + ar2=xp.arange(self.classes_.shape[0], device=device(y)), + xp=xp, + ) + if diff.shape[0]: + raise ValueError("y contains previously unseen labels: %s" % str(diff)) + y = xp.asarray(y) + return xp.take(self.classes_, y, axis=0) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.array_api_support = True + tags.input_tags.two_d_array = False + tags.target_tags.one_d_labels = True + return tags + + +class LabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): + """Binarize labels in a one-vs-all fashion. + + Several regression and binary classification algorithms are + available in scikit-learn. A simple way to extend these algorithms + to the multi-class classification case is to use the so-called + one-vs-all scheme. + + At learning time, this simply consists in learning one regressor + or binary classifier per class. In doing so, one needs to convert + multi-class labels to binary labels (belong or does not belong + to the class). `LabelBinarizer` makes this process easy with the + transform method. + + At prediction time, one assigns the class for which the corresponding + model gave the greatest confidence. `LabelBinarizer` makes this easy + with the :meth:`inverse_transform` method. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + neg_label : int, default=0 + Value with which negative labels must be encoded. + + pos_label : int, default=1 + Value with which positive labels must be encoded. + + sparse_output : bool, default=False + True if the returned array from transform is desired to be in sparse + CSR format. + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + Holds the label for each class. + + y_type_ : str + Represents the type of the target data as evaluated by + :func:`~sklearn.utils.multiclass.type_of_target`. Possible type are + 'continuous', 'continuous-multioutput', 'binary', 'multiclass', + 'multiclass-multioutput', 'multilabel-indicator', and 'unknown'. + + sparse_input_ : bool + `True` if the input data to transform is given as a sparse matrix, + `False` otherwise. 
+
+ See Also
+ --------
+ label_binarize : Function to perform the transform operation of
+ LabelBinarizer with fixed classes.
+ OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
+ scheme.
+
+ Examples
+ --------
+ >>> from sklearn.preprocessing import LabelBinarizer
+ >>> lb = LabelBinarizer()
+ >>> lb.fit([1, 2, 6, 4, 2])
+ LabelBinarizer()
+ >>> lb.classes_
+ array([1, 2, 4, 6])
+ >>> lb.transform([1, 6])
+ array([[1, 0, 0, 0],
+ [0, 0, 0, 1]])
+
+ Binary targets transform to a column vector
+
+ >>> lb = LabelBinarizer()
+ >>> lb.fit_transform(['yes', 'no', 'no', 'yes'])
+ array([[1],
+ [0],
+ [0],
+ [1]])
+
+ Passing a 2D matrix for multilabel classification
+
+ >>> import numpy as np
+ >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]]))
+ LabelBinarizer()
+ >>> lb.classes_
+ array([0, 1, 2])
+ >>> lb.transform([0, 1, 2, 1])
+ array([[1, 0, 0],
+ [0, 1, 0],
+ [0, 0, 1],
+ [0, 1, 0]])
+ """
+
+ _parameter_constraints: dict = {
+ "neg_label": [Integral],
+ "pos_label": [Integral],
+ "sparse_output": ["boolean"],
+ }
+
+ def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False):
+ self.neg_label = neg_label
+ self.pos_label = pos_label
+ self.sparse_output = sparse_output
+
+ @_fit_context(prefer_skip_nested_validation=True)
+ def fit(self, y):
+ """Fit label binarizer.
+
+ Parameters
+ ----------
+ y : ndarray of shape (n_samples,) or (n_samples, n_classes)
+ Target values. The 2-d matrix should only contain 0 and 1,
+ representing multilabel classification.
+
+ Returns
+ -------
+ self : object
+ Returns the instance itself.
+ """
+ if self.neg_label >= self.pos_label:
+ raise ValueError(
+ f"neg_label={self.neg_label} must be strictly less than "
+ f"pos_label={self.pos_label}."
+ )
+
+ if self.sparse_output and (self.pos_label == 0 or self.neg_label != 0):
+ raise ValueError(
+ "Sparse binarization is only supported with non "
+ "zero pos_label and zero neg_label, got "
+ f"pos_label={self.pos_label} and neg_label={self.neg_label}"
+ )
+
+ self.y_type_ = type_of_target(y, input_name="y")
+
+ if "multioutput" in self.y_type_:
+ raise ValueError(
+ "Multioutput target data is not supported with label binarization"
+ )
+ if _num_samples(y) == 0:
+ raise ValueError("y has 0 samples: %r" % y)
+
+ self.sparse_input_ = sp.issparse(y)
+ self.classes_ = unique_labels(y)
+ return self
+
+ def fit_transform(self, y):
+ """Fit label binarizer/transform multi-class labels to binary labels.
+
+ The output of transform is sometimes referred to as
+ the 1-of-K coding scheme.
+
+ Parameters
+ ----------
+ y : {ndarray, sparse matrix} of shape (n_samples,) or \
+ (n_samples, n_classes)
+ Target values. The 2-d matrix should only contain 0 and 1,
+ representing multilabel classification. Sparse matrix can be
+ CSR, CSC, COO, DOK, or LIL.
+
+ Returns
+ -------
+ Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
+ Shape will be (n_samples, 1) for binary problems. Sparse matrix
+ will be of CSR format.
+ """
+ return self.fit(y).transform(y)
+
+ def transform(self, y):
+ """Transform multi-class labels to binary labels.
+
+ The output of transform is sometimes referred to by some authors as
+ the 1-of-K coding scheme.
+
+ Parameters
+ ----------
+ y : {array, sparse matrix} of shape (n_samples,) or \
+ (n_samples, n_classes)
+ Target values. The 2-d matrix should only contain 0 and 1,
+ representing multilabel classification. Sparse matrix can be
+ CSR, CSC, COO, DOK, or LIL.
+
+ Returns
+ -------
+ Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
+ Shape will be (n_samples, 1) for binary problems. Sparse matrix
+ will be of CSR format.
+ """
+ check_is_fitted(self)
+
+ y_is_multilabel = type_of_target(y).startswith("multilabel")
+ if y_is_multilabel and not self.y_type_.startswith("multilabel"):
+ raise ValueError("The object was not fitted with multilabel input.")
+
+ return label_binarize(
+ y,
+ classes=self.classes_,
+ pos_label=self.pos_label,
+ neg_label=self.neg_label,
+ sparse_output=self.sparse_output,
+ )
+
+ def inverse_transform(self, Y, threshold=None):
+ """Transform binary labels back to multi-class labels.
+
+ Parameters
+ ----------
+ Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
+ Target values. All sparse matrices are converted to CSR before
+ inverse transformation.
+
+ threshold : float, default=None
+ Threshold used in the binary and multi-label cases.
+
+ Use 0 when ``Y`` contains the output of :term:`decision_function`
+ (classifier).
+ Use 0.5 when ``Y`` contains the output of :term:`predict_proba`.
+
+ If None, the threshold is assumed to be halfway between
+ neg_label and pos_label.
+
+ Returns
+ -------
+ y : {ndarray, sparse matrix} of shape (n_samples,)
+ Target values. Sparse matrix will be of CSR format.
+
+ Notes
+ -----
+ In the case when the binary labels are fractional
+ (probabilistic), :meth:`inverse_transform` chooses the class with the
+ greatest value. Typically, this allows using the output of a
+ linear model's :term:`decision_function` method directly as the input
+ of :meth:`inverse_transform`.
+ """
+ check_is_fitted(self)
+
+ if threshold is None:
+ threshold = (self.pos_label + self.neg_label) / 2.0
+
+ if self.y_type_ == "multiclass":
+ y_inv = _inverse_binarize_multiclass(Y, self.classes_)
+ else:
+ y_inv = _inverse_binarize_thresholding(
+ Y, self.y_type_, self.classes_, threshold
+ )
+
+ if self.sparse_input_:
+ y_inv = sp.csr_matrix(y_inv)
+ elif sp.issparse(y_inv):
+ y_inv = y_inv.toarray()
+
+ return y_inv
+
+ def __sklearn_tags__(self):
+ tags = super().__sklearn_tags__()
+ tags.input_tags.two_d_array = False
+ tags.target_tags.one_d_labels = True
+ return tags
+
+
+ @validate_params(
+ {
+ "y": ["array-like", "sparse matrix"],
+ "classes": ["array-like"],
+ "neg_label": [Interval(Integral, None, None, closed="neither")],
+ "pos_label": [Interval(Integral, None, None, closed="neither")],
+ "sparse_output": ["boolean"],
+ },
+ prefer_skip_nested_validation=True,
+ )
+ def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False):
+ """Binarize labels in a one-vs-all fashion.
+
+ Several regression and binary classification algorithms are
+ available in scikit-learn. A simple way to extend these algorithms
+ to the multi-class classification case is to use the so-called
+ one-vs-all scheme.
+
+ This function makes it possible to compute this transformation for a
+ fixed set of class labels known ahead of time.
+
+ Parameters
+ ----------
+ y : array-like or sparse matrix
+ Sequence of integer labels or multilabel data to encode.
+
+ classes : array-like of shape (n_classes,)
+ Uniquely holds the label for each class.
+
+ neg_label : int, default=0
+ Value with which negative labels must be encoded.
+
+ pos_label : int, default=1
+ Value with which positive labels must be encoded.
+
+ sparse_output : bool, default=False
+ Set to True if output binary array is desired in CSR sparse format.
+ + Returns + ------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Shape will be (n_samples, 1) for binary problems. Sparse matrix will + be of CSR format. + + See Also + -------- + LabelBinarizer : Class used to wrap the functionality of label_binarize and + allow for fitting to classes independently of the transform operation. + + Examples + -------- + >>> from sklearn.preprocessing import label_binarize + >>> label_binarize([1, 6], classes=[1, 2, 4, 6]) + array([[1, 0, 0, 0], + [0, 0, 0, 1]]) + + The class ordering is preserved: + + >>> label_binarize([1, 6], classes=[1, 6, 4, 2]) + array([[1, 0, 0, 0], + [0, 1, 0, 0]]) + + Binary targets transform to a column vector + + >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes']) + array([[1], + [0], + [0], + [1]]) + """ + if not isinstance(y, list): + # XXX Workaround that will be removed when list of list format is + # dropped + y = check_array( + y, input_name="y", accept_sparse="csr", ensure_2d=False, dtype=None + ) + else: + if _num_samples(y) == 0: + raise ValueError("y has 0 samples: %r" % y) + if neg_label >= pos_label: + raise ValueError( + "neg_label={0} must be strictly less than pos_label={1}.".format( + neg_label, pos_label + ) + ) + + if sparse_output and (pos_label == 0 or neg_label != 0): + raise ValueError( + "Sparse binarization is only supported with non " + "zero pos_label and zero neg_label, got " + "pos_label={0} and neg_label={1}" + "".format(pos_label, neg_label) + ) + + # To account for pos_label == 0 in the dense case + pos_switch = pos_label == 0 + if pos_switch: + pos_label = -neg_label + + y_type = type_of_target(y) + if "multioutput" in y_type: + raise ValueError( + "Multioutput target data is not supported with label binarization" + ) + if y_type == "unknown": + raise ValueError("The type of target data is not known") + + n_samples = y.shape[0] if sp.issparse(y) else len(y) + n_classes = len(classes) + classes = np.asarray(classes) + + if y_type == "binary": + if n_classes == 1: + if sparse_output: + return sp.csr_matrix((n_samples, 1), dtype=int) + else: + Y = np.zeros((len(y), 1), dtype=int) + Y += neg_label + return Y + elif len(classes) >= 3: + y_type = "multiclass" + + sorted_class = np.sort(classes) + if y_type == "multilabel-indicator": + y_n_classes = y.shape[1] if hasattr(y, "shape") else len(y[0]) + if classes.size != y_n_classes: + raise ValueError( + "classes {0} mismatch with the labels {1} found in the data".format( + classes, unique_labels(y) + ) + ) + + if y_type in ("binary", "multiclass"): + y = column_or_1d(y) + + # pick out the known labels from y + y_in_classes = np.isin(y, classes) + y_seen = y[y_in_classes] + indices = np.searchsorted(sorted_class, y_seen) + indptr = np.hstack((0, np.cumsum(y_in_classes))) + + data = np.empty_like(indices) + data.fill(pos_label) + Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes)) + elif y_type == "multilabel-indicator": + Y = sp.csr_matrix(y) + if pos_label != 1: + data = np.empty_like(Y.data) + data.fill(pos_label) + Y.data = data + else: + raise ValueError( + "%s target data is not supported with label binarization" % y_type + ) + + if not sparse_output: + Y = Y.toarray() + Y = Y.astype(int, copy=False) + + if neg_label != 0: + Y[Y == 0] = neg_label + + if pos_switch: + Y[Y == pos_label] = 0 + else: + Y.data = Y.data.astype(int, copy=False) + + # preserve label ordering + if np.any(classes != sorted_class): + indices = np.searchsorted(sorted_class, classes) + Y = Y[:, indices] + + if y_type == 
"binary": + if sparse_output: + Y = Y.getcol(-1) + else: + Y = Y[:, -1].reshape((-1, 1)) + + return Y + + +def _inverse_binarize_multiclass(y, classes): + """Inverse label binarization transformation for multiclass. + + Multiclass uses the maximal score instead of a threshold. + """ + classes = np.asarray(classes) + + if sp.issparse(y): + # Find the argmax for each row in y where y is a CSR matrix + + y = y.tocsr() + n_samples, n_outputs = y.shape + outputs = np.arange(n_outputs) + row_max = min_max_axis(y, 1)[1] + row_nnz = np.diff(y.indptr) + + y_data_repeated_max = np.repeat(row_max, row_nnz) + # picks out all indices obtaining the maximum per row + y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data) + + # For corner case where last row has a max of 0 + if row_max[-1] == 0: + y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)]) + + # Gets the index of the first argmax in each row from y_i_all_argmax + index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1]) + # first argmax of each row + y_ind_ext = np.append(y.indices, [0]) + y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]] + # Handle rows of all 0 + y_i_argmax[np.where(row_nnz == 0)[0]] = 0 + + # Handles rows with max of 0 that contain negative numbers + samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)] + for i in samples: + ind = y.indices[y.indptr[i] : y.indptr[i + 1]] + y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0] + + return classes[y_i_argmax] + else: + return classes.take(y.argmax(axis=1), mode="clip") + + +def _inverse_binarize_thresholding(y, output_type, classes, threshold): + """Inverse label binarization transformation using thresholding.""" + + if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2: + raise ValueError("output_type='binary', but y.shape = {0}".format(y.shape)) + + if output_type != "binary" and y.shape[1] != len(classes): + raise ValueError( + "The number of class is not equal to the number of dimension of y." + ) + + classes = np.asarray(classes) + + # Perform thresholding + if sp.issparse(y): + if threshold > 0: + if y.format not in ("csr", "csc"): + y = y.tocsr() + y.data = np.array(y.data > threshold, dtype=int) + y.eliminate_zeros() + else: + y = np.array(y.toarray() > threshold, dtype=int) + else: + y = np.array(y > threshold, dtype=int) + + # Inverse transform data + if output_type == "binary": + if sp.issparse(y): + y = y.toarray() + if y.ndim == 2 and y.shape[1] == 2: + return classes[y[:, 1]] + else: + if len(classes) == 1: + return np.repeat(classes[0], len(y)) + else: + return classes[y.ravel()] + + elif output_type == "multilabel-indicator": + return y + + else: + raise ValueError("{0} format is not supported".format(output_type)) + + +class MultiLabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): + """Transform between iterable of iterables and a multilabel format. + + Although a list of sets or tuples is a very intuitive format for multilabel + data, it is unwieldy to process. This transformer converts between this + intuitive format and the supported multilabel format: a (samples x classes) + binary matrix indicating the presence of a class label. + + Parameters + ---------- + classes : array-like of shape (n_classes,), default=None + Indicates an ordering for the class labels. + All entries should be unique (cannot contain duplicate classes). + + sparse_output : bool, default=False + Set to True if output binary array is desired in CSR sparse format. 
+ + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + A copy of the `classes` parameter when provided. + Otherwise it corresponds to the sorted set of classes found + when fitting. + + See Also + -------- + OneHotEncoder : Encode categorical features using a one-hot aka one-of-K + scheme. + + Examples + -------- + >>> from sklearn.preprocessing import MultiLabelBinarizer + >>> mlb = MultiLabelBinarizer() + >>> mlb.fit_transform([(1, 2), (3,)]) + array([[1, 1, 0], + [0, 0, 1]]) + >>> mlb.classes_ + array([1, 2, 3]) + + >>> mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}]) + array([[0, 1, 1], + [1, 0, 0]]) + >>> list(mlb.classes_) + ['comedy', 'sci-fi', 'thriller'] + + A common mistake is to pass in a list, which leads to the following issue: + + >>> mlb = MultiLabelBinarizer() + >>> mlb.fit(['sci-fi', 'thriller', 'comedy']) + MultiLabelBinarizer() + >>> mlb.classes_ + array(['-', 'c', 'd', 'e', 'f', 'h', 'i', 'l', 'm', 'o', 'r', 's', 't', + 'y'], dtype=object) + + To correct this, the list of labels should be passed in as: + + >>> mlb = MultiLabelBinarizer() + >>> mlb.fit([['sci-fi', 'thriller', 'comedy']]) + MultiLabelBinarizer() + >>> mlb.classes_ + array(['comedy', 'sci-fi', 'thriller'], dtype=object) + """ + + _parameter_constraints: dict = { + "classes": ["array-like", None], + "sparse_output": ["boolean"], + } + + def __init__(self, *, classes=None, sparse_output=False): + self.classes = classes + self.sparse_output = sparse_output + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, y): + """Fit the label sets binarizer, storing :term:`classes_`. + + Parameters + ---------- + y : iterable of iterables + A set of labels (any orderable and hashable object) for each + sample. If the `classes` parameter is set, `y` will not be + iterated. + + Returns + ------- + self : object + Fitted estimator. + """ + self._cached_dict = None + + if self.classes is None: + classes = sorted(set(itertools.chain.from_iterable(y))) + elif len(set(self.classes)) < len(self.classes): + raise ValueError( + "The classes argument contains duplicate " + "classes. Remove these duplicates before passing " + "them to MultiLabelBinarizer." + ) + else: + classes = self.classes + dtype = int if all(isinstance(c, int) for c in classes) else object + self.classes_ = np.empty(len(classes), dtype=dtype) + self.classes_[:] = classes + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, y): + """Fit the label sets binarizer and transform the given label sets. + + Parameters + ---------- + y : iterable of iterables + A set of labels (any orderable and hashable object) for each + sample. If the `classes` parameter is set, `y` will not be + iterated. + + Returns + ------- + y_indicator : {ndarray, sparse matrix} of shape (n_samples, n_classes) + A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` + is in `y[i]`, and 0 otherwise. Sparse matrix will be of CSR + format. 
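+
+ Examples
+ --------
+ A minimal sketch on two toy label sets (classes are sorted during
+ fitting, here to ['a', 'b']):
+
+ >>> from sklearn.preprocessing import MultiLabelBinarizer
+ >>> MultiLabelBinarizer().fit_transform([{"a"}, {"a", "b"}])
+ array([[1, 0],
+ [1, 1]])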
+ """ + if self.classes is not None: + return self.fit(y).transform(y) + + self._cached_dict = None + + # Automatically increment on new class + class_mapping = defaultdict(int) + class_mapping.default_factory = class_mapping.__len__ + yt = self._transform(y, class_mapping) + + # sort classes and reorder columns + tmp = sorted(class_mapping, key=class_mapping.get) + + # (make safe for tuples) + dtype = int if all(isinstance(c, int) for c in tmp) else object + class_mapping = np.empty(len(tmp), dtype=dtype) + class_mapping[:] = tmp + self.classes_, inverse = np.unique(class_mapping, return_inverse=True) + # ensure yt.indices keeps its current dtype + yt.indices = np.asarray(inverse[yt.indices], dtype=yt.indices.dtype) + + if not self.sparse_output: + yt = yt.toarray() + + return yt + + def transform(self, y): + """Transform the given label sets. + + Parameters + ---------- + y : iterable of iterables + A set of labels (any orderable and hashable object) for each + sample. If the `classes` parameter is set, `y` will not be + iterated. + + Returns + ------- + y_indicator : array or CSR matrix, shape (n_samples, n_classes) + A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in + `y[i]`, and 0 otherwise. + """ + check_is_fitted(self) + + class_to_index = self._build_cache() + yt = self._transform(y, class_to_index) + + if not self.sparse_output: + yt = yt.toarray() + + return yt + + def _build_cache(self): + if self._cached_dict is None: + self._cached_dict = dict(zip(self.classes_, range(len(self.classes_)))) + + return self._cached_dict + + def _transform(self, y, class_mapping): + """Transforms the label sets with a given mapping. + + Parameters + ---------- + y : iterable of iterables + A set of labels (any orderable and hashable object) for each + sample. If the `classes` parameter is set, `y` will not be + iterated. + + class_mapping : Mapping + Maps from label to column index in label indicator matrix. + + Returns + ------- + y_indicator : sparse matrix of shape (n_samples, n_classes) + Label indicator matrix. Will be of CSR format. + """ + indices = array.array("i") + indptr = array.array("i", [0]) + unknown = set() + for labels in y: + index = set() + for label in labels: + try: + index.add(class_mapping[label]) + except KeyError: + unknown.add(label) + indices.extend(index) + indptr.append(len(indices)) + if unknown: + warnings.warn( + "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str)) + ) + data = np.ones(len(indices), dtype=int) + + return sp.csr_matrix( + (data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping)) + ) + + def inverse_transform(self, yt): + """Transform the given indicator matrix into label sets. + + Parameters + ---------- + yt : {ndarray, sparse matrix} of shape (n_samples, n_classes) + A matrix containing only 1s ands 0s. + + Returns + ------- + y : list of tuples + The set of labels for each sample such that `y[i]` consists of + `classes_[j]` for each `yt[i, j] == 1`. 
+ """ + check_is_fitted(self) + + if yt.shape[1] != len(self.classes_): + raise ValueError( + "Expected indicator for {0} classes, but got {1}".format( + len(self.classes_), yt.shape[1] + ) + ) + + if sp.issparse(yt): + yt = yt.tocsr() + if len(yt.data) != 0 and len(np.setdiff1d(yt.data, [0, 1])) > 0: + raise ValueError("Expected only 0s and 1s in label indicator.") + return [ + tuple(self.classes_.take(yt.indices[start:end])) + for start, end in zip(yt.indptr[:-1], yt.indptr[1:]) + ] + else: + unexpected = np.setdiff1d(yt, [0, 1]) + if len(unexpected) > 0: + raise ValueError( + "Expected only 0s and 1s in label indicator. Also got {0}".format( + unexpected + ) + ) + return [tuple(self.classes_.compress(indicators)) for indicators in yt] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.two_d_array = False + tags.target_tags.two_d_labels = True + return tags diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_polynomial.py b/.venv/Lib/site-packages/sklearn/preprocessing/_polynomial.py new file mode 100644 index 0000000000000000000000000000000000000000..457d17ab0695075c4e852d55394feab0cba4bd8d --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_polynomial.py @@ -0,0 +1,1173 @@ +""" +This file contains preprocessing tools based on polynomials. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import collections +from itertools import chain, combinations +from itertools import combinations_with_replacement as combinations_w_r +from numbers import Integral + +import numpy as np +from scipy import sparse +from scipy.interpolate import BSpline +from scipy.special import comb + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils import check_array +from ..utils._param_validation import Interval, StrOptions +from ..utils.fixes import parse_version, sp_version +from ..utils.stats import _weighted_percentile +from ..utils.validation import ( + FLOAT_DTYPES, + _check_feature_names_in, + _check_sample_weight, + check_is_fitted, + validate_data, +) +from ._csr_polynomial_expansion import ( + _calc_expanded_nnz, + _calc_total_nnz, + _csr_polynomial_expansion, +) + +__all__ = [ + "PolynomialFeatures", + "SplineTransformer", +] + + +def _create_expansion(X, interaction_only, deg, n_features, cumulative_size=0): + """Helper function for creating and appending sparse expansion matrices""" + + total_nnz = _calc_total_nnz(X.indptr, interaction_only, deg) + expanded_col = _calc_expanded_nnz(n_features, interaction_only, deg) + + if expanded_col == 0: + return None + # This only checks whether each block needs 64bit integers upon + # expansion. We prefer to keep int32 indexing where we can, + # since currently SciPy's CSR construction downcasts when possible, + # so we prefer to avoid an unnecessary cast. The dtype may still + # change in the concatenation process if needed. + # See: https://github.com/scipy/scipy/issues/16569 + max_indices = expanded_col - 1 + max_indptr = total_nnz + max_int32 = np.iinfo(np.int32).max + needs_int64 = max(max_indices, max_indptr) > max_int32 + index_dtype = np.int64 if needs_int64 else np.int32 + + # This is a pretty specific bug that is hard to work around by a user, + # hence we do not detail the entire bug and all possible avoidance + # mechnasisms. Instead we recommend upgrading scipy or shrinking their data. 
+ cumulative_size += expanded_col + if ( + sp_version < parse_version("1.8.0") + and cumulative_size - 1 > max_int32 + and not needs_int64 + ): + raise ValueError( + "In scipy versions `<1.8.0`, the function `scipy.sparse.hstack`" + " sometimes produces negative columns when the output shape contains" + " `n_cols` too large to be represented by a 32bit signed" + " integer. To avoid this error, either use a version" + " of scipy `>=1.8.0` or alter the `PolynomialFeatures`" + " transformer to produce fewer than 2^31 output features." + ) + + # Result of the expansion, modified in place by the + # `_csr_polynomial_expansion` routine. + expanded_data = np.empty(shape=total_nnz, dtype=X.data.dtype) + expanded_indices = np.empty(shape=total_nnz, dtype=index_dtype) + expanded_indptr = np.empty(shape=X.indptr.shape[0], dtype=index_dtype) + _csr_polynomial_expansion( + X.data, + X.indices, + X.indptr, + X.shape[1], + expanded_data, + expanded_indices, + expanded_indptr, + interaction_only, + deg, + ) + return sparse.csr_matrix( + (expanded_data, expanded_indices, expanded_indptr), + shape=(X.indptr.shape[0] - 1, expanded_col), + dtype=X.dtype, + ) + + +class PolynomialFeatures(TransformerMixin, BaseEstimator): + """Generate polynomial and interaction features. + + Generate a new feature matrix consisting of all polynomial combinations + of the features with degree less than or equal to the specified degree. + For example, if an input sample is two dimensional and of the form + [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + degree : int or tuple (min_degree, max_degree), default=2 + If a single int is given, it specifies the maximal degree of the + polynomial features. If a tuple `(min_degree, max_degree)` is passed, + then `min_degree` is the minimum and `max_degree` is the maximum + polynomial degree of the generated features. Note that `min_degree=0` + and `min_degree=1` are equivalent as outputting the degree zero term is + determined by `include_bias`. + + interaction_only : bool, default=False + If `True`, only interaction features are produced: features that are + products of at most `degree` *distinct* input features, i.e. terms with + power of 2 or higher of the same input feature are excluded: + + - included: `x[0]`, `x[1]`, `x[0] * x[1]`, etc. + - excluded: `x[0] ** 2`, `x[0] ** 2 * x[1]`, etc. + + include_bias : bool, default=True + If `True` (default), then include a bias column, the feature in which + all polynomial powers are zero (i.e. a column of ones - acts as an + intercept term in a linear model). + + order : {'C', 'F'}, default='C' + Order of output array in the dense case. `'F'` order is faster to + compute, but may slow down subsequent estimators. + + .. versionadded:: 0.21 + + Attributes + ---------- + powers_ : ndarray of shape (`n_output_features_`, `n_features_in_`) + `powers_[i, j]` is the exponent of the jth input in the ith output. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_output_features_ : int + The total number of polynomial output features. The number of output + features is computed by iterating over all suitably sized combinations + of input features. 
+ + See Also + -------- + SplineTransformer : Transformer that generates univariate B-spline bases + for features. + + Notes + ----- + Be aware that the number of features in the output array scales + polynomially in the number of features of the input array, and + exponentially in the degree. High degrees can cause overfitting. + + See :ref:`examples/linear_model/plot_polynomial_interpolation.py + ` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import PolynomialFeatures + >>> X = np.arange(6).reshape(3, 2) + >>> X + array([[0, 1], + [2, 3], + [4, 5]]) + >>> poly = PolynomialFeatures(2) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0., 0., 1.], + [ 1., 2., 3., 4., 6., 9.], + [ 1., 4., 5., 16., 20., 25.]]) + >>> poly = PolynomialFeatures(interaction_only=True) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0.], + [ 1., 2., 3., 6.], + [ 1., 4., 5., 20.]]) + """ + + _parameter_constraints: dict = { + "degree": [Interval(Integral, 0, None, closed="left"), "array-like"], + "interaction_only": ["boolean"], + "include_bias": ["boolean"], + "order": [StrOptions({"C", "F"})], + } + + def __init__( + self, degree=2, *, interaction_only=False, include_bias=True, order="C" + ): + self.degree = degree + self.interaction_only = interaction_only + self.include_bias = include_bias + self.order = order + + @staticmethod + def _combinations( + n_features, min_degree, max_degree, interaction_only, include_bias + ): + comb = combinations if interaction_only else combinations_w_r + start = max(1, min_degree) + iter = chain.from_iterable( + comb(range(n_features), i) for i in range(start, max_degree + 1) + ) + if include_bias: + iter = chain(comb(range(n_features), 0), iter) + return iter + + @staticmethod + def _num_combinations( + n_features, min_degree, max_degree, interaction_only, include_bias + ): + """Calculate number of terms in polynomial expansion + + This should be equivalent to counting the number of terms returned by + _combinations(...) but much faster. + """ + + if interaction_only: + combinations = sum( + [ + comb(n_features, i, exact=True) + for i in range(max(1, min_degree), min(max_degree, n_features) + 1) + ] + ) + else: + combinations = comb(n_features + max_degree, max_degree, exact=True) - 1 + if min_degree > 0: + d = min_degree - 1 + combinations -= comb(n_features + d, d, exact=True) - 1 + + if include_bias: + combinations += 1 + + return combinations + + @property + def powers_(self): + """Exponent for each of the inputs in the output.""" + check_is_fitted(self) + + combinations = self._combinations( + n_features=self.n_features_in_, + min_degree=self._min_degree, + max_degree=self._max_degree, + interaction_only=self.interaction_only, + include_bias=self.include_bias, + ) + return np.vstack( + [np.bincount(c, minlength=self.n_features_in_) for c in combinations] + ) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features is None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
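+
+ Examples
+ --------
+ A minimal sketch for two input features with `degree=2` (the bias
+ column is named "1"):
+
+ >>> import numpy as np
+ >>> from sklearn.preprocessing import PolynomialFeatures
+ >>> poly = PolynomialFeatures(degree=2).fit(np.arange(4).reshape(2, 2))
+ >>> poly.get_feature_names_out(["a", "b"])
+ array(['1', 'a', 'b', 'a^2', 'a b', 'b^2'], dtype=object)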
+ """ + powers = self.powers_ + input_features = _check_feature_names_in(self, input_features) + feature_names = [] + for row in powers: + inds = np.where(row)[0] + if len(inds): + name = " ".join( + ( + "%s^%d" % (input_features[ind], exp) + if exp != 1 + else input_features[ind] + ) + for ind, exp in zip(inds, row[inds]) + ) + else: + name = "1" + feature_names.append(name) + return np.asarray(feature_names, dtype=object) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """ + Compute number of output features. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Fitted transformer. + """ + _, n_features = validate_data(self, X, accept_sparse=True).shape + + if isinstance(self.degree, Integral): + if self.degree == 0 and not self.include_bias: + raise ValueError( + "Setting degree to zero and include_bias to False would result in" + " an empty output array." + ) + + self._min_degree = 0 + self._max_degree = self.degree + elif ( + isinstance(self.degree, collections.abc.Iterable) and len(self.degree) == 2 + ): + self._min_degree, self._max_degree = self.degree + if not ( + isinstance(self._min_degree, Integral) + and isinstance(self._max_degree, Integral) + and self._min_degree >= 0 + and self._min_degree <= self._max_degree + ): + raise ValueError( + "degree=(min_degree, max_degree) must " + "be non-negative integers that fulfil " + "min_degree <= max_degree, got " + f"{self.degree}." + ) + elif self._max_degree == 0 and not self.include_bias: + raise ValueError( + "Setting both min_degree and max_degree to zero and include_bias to" + " False would result in an empty output array." + ) + else: + raise ValueError( + "degree must be a non-negative int or tuple " + "(min_degree, max_degree), got " + f"{self.degree}." + ) + + self.n_output_features_ = self._num_combinations( + n_features=n_features, + min_degree=self._min_degree, + max_degree=self._max_degree, + interaction_only=self.interaction_only, + include_bias=self.include_bias, + ) + if self.n_output_features_ > np.iinfo(np.intp).max: + msg = ( + "The output that would result from the current configuration would" + f" have {self.n_output_features_} features which is too large to be" + f" indexed by {np.intp().dtype.name}. Please change some or all of the" + " following:\n- The number of features in the input, currently" + f" {n_features=}\n- The range of degrees to calculate, currently" + f" [{self._min_degree}, {self._max_degree}]\n- Whether to include only" + f" interaction terms, currently {self.interaction_only}\n- Whether to" + f" include a bias term, currently {self.include_bias}." + ) + if ( + np.intp == np.int32 + and self.n_output_features_ <= np.iinfo(np.int64).max + ): # pragma: nocover + msg += ( + "\nNote that the current Python runtime has a limited 32 bit " + "address space and that this configuration would have been " + "admissible if run on a 64 bit Python runtime." + ) + raise ValueError(msg) + # We also record the number of output features for + # _max_degree = 0 + self._n_out_full = self._num_combinations( + n_features=n_features, + min_degree=0, + max_degree=self._max_degree, + interaction_only=self.interaction_only, + include_bias=self.include_bias, + ) + + return self + + def transform(self, X): + """Transform data to polynomial features. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to transform, row by row. + + Prefer CSR over CSC for sparse input (for speed), but CSC is + required if the degree is 4 or higher. If the degree is less than + 4 and the input format is CSC, it will be converted to CSR, have + its polynomial features generated, then converted back to CSC. + + If the degree is 2 or 3, the method described in "Leveraging + Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices + Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is + used, which is much faster than the method used on CSC input. For + this reason, a CSC input will be converted to CSR, and the output + will be converted back to CSC prior to being returned, hence the + preference of CSR. + + Returns + ------- + XP : {ndarray, sparse matrix} of shape (n_samples, NP) + The matrix of features, where `NP` is the number of polynomial + features generated from the combination of inputs. If a sparse + matrix is provided, it will be converted into a sparse + `csr_matrix`. + """ + check_is_fitted(self) + + X = validate_data( + self, + X, + order="F", + dtype=FLOAT_DTYPES, + reset=False, + accept_sparse=("csr", "csc"), + ) + + n_samples, n_features = X.shape + max_int32 = np.iinfo(np.int32).max + if sparse.issparse(X) and X.format == "csr": + if self._max_degree > 3: + return self.transform(X.tocsc()).tocsr() + to_stack = [] + if self.include_bias: + to_stack.append( + sparse.csr_matrix(np.ones(shape=(n_samples, 1), dtype=X.dtype)) + ) + if self._min_degree <= 1 and self._max_degree > 0: + to_stack.append(X) + + cumulative_size = sum(mat.shape[1] for mat in to_stack) + for deg in range(max(2, self._min_degree), self._max_degree + 1): + expanded = _create_expansion( + X=X, + interaction_only=self.interaction_only, + deg=deg, + n_features=n_features, + cumulative_size=cumulative_size, + ) + if expanded is not None: + to_stack.append(expanded) + cumulative_size += expanded.shape[1] + if len(to_stack) == 0: + # edge case: deal with empty matrix + XP = sparse.csr_matrix((n_samples, 0), dtype=X.dtype) + else: + # `scipy.sparse.hstack` breaks in scipy<1.9.2 + # when `n_output_features_ > max_int32` + all_int32 = all(mat.indices.dtype == np.int32 for mat in to_stack) + if ( + sp_version < parse_version("1.9.2") + and self.n_output_features_ > max_int32 + and all_int32 + ): + raise ValueError( # pragma: no cover + "In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`" + " produces negative columns when:\n1. The output shape contains" + " `n_cols` too large to be represented by a 32bit signed" + " integer.\n2. 
All sub-matrices to be stacked have indices of" + " dtype `np.int32`.\nTo avoid this error, either use a version" + " of scipy `>=1.9.2` or alter the `PolynomialFeatures`" + " transformer to produce fewer than 2^31 output features" + ) + XP = sparse.hstack(to_stack, dtype=X.dtype, format="csr") + elif sparse.issparse(X) and X.format == "csc" and self._max_degree < 4: + return self.transform(X.tocsr()).tocsc() + elif sparse.issparse(X): + combinations = self._combinations( + n_features=n_features, + min_degree=self._min_degree, + max_degree=self._max_degree, + interaction_only=self.interaction_only, + include_bias=self.include_bias, + ) + columns = [] + for combi in combinations: + if combi: + out_col = 1 + for col_idx in combi: + out_col = X[:, [col_idx]].multiply(out_col) + columns.append(out_col) + else: + bias = sparse.csc_matrix(np.ones((X.shape[0], 1))) + columns.append(bias) + XP = sparse.hstack(columns, dtype=X.dtype).tocsc() + else: + # Do as if _min_degree = 0 and cut down array after the + # computation, i.e. use _n_out_full instead of n_output_features_. + XP = np.empty( + shape=(n_samples, self._n_out_full), dtype=X.dtype, order=self.order + ) + + # What follows is a faster implementation of: + # for i, comb in enumerate(combinations): + # XP[:, i] = X[:, comb].prod(1) + # This implementation uses two optimisations. + # First one is broadcasting, + # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1] + # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2] + # ... + # multiply ([X[:, start:end], X[:, start]) -> ... + # Second optimisation happens for degrees >= 3. + # Xi^3 is computed reusing previous computation: + # Xi^3 = Xi^2 * Xi. + + # degree 0 term + if self.include_bias: + XP[:, 0] = 1 + current_col = 1 + else: + current_col = 0 + + if self._max_degree == 0: + return XP + + # degree 1 term + XP[:, current_col : current_col + n_features] = X + index = list(range(current_col, current_col + n_features)) + current_col += n_features + index.append(current_col) + + # loop over degree >= 2 terms + for _ in range(2, self._max_degree + 1): + new_index = [] + end = index[-1] + for feature_idx in range(n_features): + start = index[feature_idx] + new_index.append(current_col) + if self.interaction_only: + start += index[feature_idx + 1] - index[feature_idx] + next_col = current_col + end - start + if next_col <= current_col: + break + # XP[:, start:end] are terms of degree d - 1 + # that exclude feature #feature_idx. + np.multiply( + XP[:, start:end], + X[:, feature_idx : feature_idx + 1], + out=XP[:, current_col:next_col], + casting="no", + ) + current_col = next_col + + new_index.append(current_col) + index = new_index + + if self._min_degree > 1: + n_XP, n_Xout = self._n_out_full, self.n_output_features_ + if self.include_bias: + Xout = np.empty( + shape=(n_samples, n_Xout), dtype=XP.dtype, order=self.order + ) + Xout[:, 0] = 1 + Xout[:, 1:] = XP[:, n_XP - n_Xout + 1 :] + else: + Xout = XP[:, n_XP - n_Xout :].copy() + XP = Xout + return XP + + +class SplineTransformer(TransformerMixin, BaseEstimator): + """Generate univariate B-spline bases for features. + + Generate a new feature matrix consisting of + `n_splines=n_knots + degree - 1` (`n_knots - 1` for + `extrapolation="periodic"`) spline basis functions + (B-splines) of polynomial order=`degree` for each feature. + + In order to learn more about the SplineTransformer class go to: + :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py` + + Read more in the :ref:`User Guide `. + + .. 
versionadded:: 1.0 + + Parameters + ---------- + n_knots : int, default=5 + Number of knots of the splines if `knots` equals one of + {'uniform', 'quantile'}. Must be larger or equal 2. Ignored if `knots` + is array-like. + + degree : int, default=3 + The polynomial degree of the spline basis. Must be a non-negative + integer. + + knots : {'uniform', 'quantile'} or array-like of shape \ + (n_knots, n_features), default='uniform' + Set knot positions such that first knot <= features <= last knot. + + - If 'uniform', `n_knots` number of knots are distributed uniformly + from min to max values of the features. + - If 'quantile', they are distributed uniformly along the quantiles of + the features. + - If an array-like is given, it directly specifies the sorted knot + positions including the boundary knots. Note that, internally, + `degree` number of knots are added before the first knot, the same + after the last knot. + + extrapolation : {'error', 'constant', 'linear', 'continue', 'periodic'}, \ + default='constant' + If 'error', values outside the min and max values of the training + features raises a `ValueError`. If 'constant', the value of the + splines at minimum and maximum value of the features is used as + constant extrapolation. If 'linear', a linear extrapolation is used. + If 'continue', the splines are extrapolated as is, i.e. option + `extrapolate=True` in :class:`scipy.interpolate.BSpline`. If + 'periodic', periodic splines with a periodicity equal to the distance + between the first and last knot are used. Periodic splines enforce + equal function values and derivatives at the first and last knot. + For example, this makes it possible to avoid introducing an arbitrary + jump between Dec 31st and Jan 1st in spline features derived from a + naturally periodic "day-of-year" input feature. In this case it is + recommended to manually set the knot values to control the period. + + include_bias : bool, default=True + If False, then the last spline element inside the data range + of a feature is dropped. As B-splines sum to one over the spline basis + functions for each data point, they implicitly include a bias term, + i.e. a column of ones. It acts as an intercept term in a linear models. + + order : {'C', 'F'}, default='C' + Order of output array in the dense case. `'F'` order is faster to compute, but + may slow down subsequent estimators. + + sparse_output : bool, default=False + Will return sparse CSR matrix if set True else will return an array. This + option is only available with `scipy>=1.8`. + + .. versionadded:: 1.2 + + Attributes + ---------- + bsplines_ : list of shape (n_features,) + List of BSplines objects, one for each feature. + + n_features_in_ : int + The total number of input features. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_features_out_ : int + The total number of output features, which is computed as + `n_features * n_splines`, where `n_splines` is + the number of bases elements of the B-splines, + `n_knots + degree - 1` for non-periodic splines and + `n_knots - 1` for periodic ones. + If `include_bias=False`, then it is only + `n_features * (n_splines - 1)`. + + See Also + -------- + KBinsDiscretizer : Transformer that bins continuous data into intervals. + + PolynomialFeatures : Transformer that generates polynomial and interaction + features. 
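# Arithmetic behind `n_features_out_` (standalone sketch): each feature gets
# n_knots + degree - 1 spline basis functions for non-periodic splines, minus
# one per feature when include_bias=False.
import numpy as np
from sklearn.preprocessing import SplineTransformer

X = np.linspace(0.0, 1.0, 20).reshape(-1, 2)  # 10 samples, 2 features
spl = SplineTransformer(n_knots=4, degree=3).fit(X)
print(spl.n_features_out_)  # 2 * (4 + 3 - 1) = 12

spl_no_bias = SplineTransformer(n_knots=4, degree=3, include_bias=False).fit(X)
print(spl_no_bias.n_features_out_)  # 2 * (4 + 3 - 1 - 1) = 10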
+ + Notes + ----- + High degrees and a high number of knots can cause overfitting. + + See :ref:`examples/linear_model/plot_polynomial_interpolation.py + `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import SplineTransformer + >>> X = np.arange(6).reshape(6, 1) + >>> spline = SplineTransformer(degree=2, n_knots=3) + >>> spline.fit_transform(X) + array([[0.5 , 0.5 , 0. , 0. ], + [0.18, 0.74, 0.08, 0. ], + [0.02, 0.66, 0.32, 0. ], + [0. , 0.32, 0.66, 0.02], + [0. , 0.08, 0.74, 0.18], + [0. , 0. , 0.5 , 0.5 ]]) + """ + + _parameter_constraints: dict = { + "n_knots": [Interval(Integral, 2, None, closed="left")], + "degree": [Interval(Integral, 0, None, closed="left")], + "knots": [StrOptions({"uniform", "quantile"}), "array-like"], + "extrapolation": [ + StrOptions({"error", "constant", "linear", "continue", "periodic"}) + ], + "include_bias": ["boolean"], + "order": [StrOptions({"C", "F"})], + "sparse_output": ["boolean"], + } + + def __init__( + self, + n_knots=5, + degree=3, + *, + knots="uniform", + extrapolation="constant", + include_bias=True, + order="C", + sparse_output=False, + ): + self.n_knots = n_knots + self.degree = degree + self.knots = knots + self.extrapolation = extrapolation + self.include_bias = include_bias + self.order = order + self.sparse_output = sparse_output + + @staticmethod + def _get_base_knot_positions(X, n_knots=10, knots="uniform", sample_weight=None): + """Calculate base knot positions. + + Base knots such that first knot <= feature <= last knot. For the + B-spline construction with scipy.interpolate.BSpline, 2*degree knots + beyond the base interval are added. + + Returns + ------- + knots : ndarray of shape (n_knots, n_features), dtype=np.float64 + Knot positions (points) of base interval. + """ + if knots == "quantile": + percentiles = 100 * np.linspace( + start=0, stop=1, num=n_knots, dtype=np.float64 + ) + + if sample_weight is None: + knots = np.percentile(X, percentiles, axis=0) + else: + knots = np.array( + [ + _weighted_percentile(X, sample_weight, percentile) + for percentile in percentiles + ] + ) + + else: + # knots == 'uniform': + # Note that the variable `knots` has already been validated and + # `else` is therefore safe. + # Disregard observations with zero weight. + mask = slice(None, None, 1) if sample_weight is None else sample_weight > 0 + x_min = np.amin(X[mask], axis=0) + x_max = np.amax(X[mask], axis=0) + + knots = np.linspace( + start=x_min, + stop=x_max, + num=n_knots, + endpoint=True, + dtype=np.float64, + ) + + return knots + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
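# The naming scheme of `get_feature_names_out` (standalone sketch): input
# feature i yields columns "<name>_sp_0" ... "<name>_sp_{n_splines - 1}".
import numpy as np
from sklearn.preprocessing import SplineTransformer

X = np.arange(10, dtype=float).reshape(-1, 1)
spl = SplineTransformer(n_knots=3, degree=2).fit(X)
print(spl.get_feature_names_out())
# ['x0_sp_0' 'x0_sp_1' 'x0_sp_2' 'x0_sp_3']   (n_splines = 3 + 2 - 1 = 4)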
+ """ + check_is_fitted(self, "n_features_in_") + n_splines = self.bsplines_[0].c.shape[1] + + input_features = _check_feature_names_in(self, input_features) + feature_names = [] + for i in range(self.n_features_in_): + for j in range(n_splines - 1 + self.include_bias): + feature_names.append(f"{input_features[i]}_sp_{j}") + return np.asarray(feature_names, dtype=object) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, sample_weight=None): + """Compute knot positions of splines. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + + y : None + Ignored. + + sample_weight : array-like of shape (n_samples,), default = None + Individual weights for each sample. Used to calculate quantiles if + `knots="quantile"`. For `knots="uniform"`, zero weighted + observations are ignored for finding the min and max of `X`. + + Returns + ------- + self : object + Fitted transformer. + """ + X = validate_data( + self, + X, + reset=True, + accept_sparse=False, + ensure_min_samples=2, + ensure_2d=True, + ) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + _, n_features = X.shape + + if isinstance(self.knots, str): + base_knots = self._get_base_knot_positions( + X, n_knots=self.n_knots, knots=self.knots, sample_weight=sample_weight + ) + else: + base_knots = check_array(self.knots, dtype=np.float64) + if base_knots.shape[0] < 2: + raise ValueError("Number of knots, knots.shape[0], must be >= 2.") + elif base_knots.shape[1] != n_features: + raise ValueError("knots.shape[1] == n_features is violated.") + elif not np.all(np.diff(base_knots, axis=0) > 0): + raise ValueError("knots must be sorted without duplicates.") + + if self.sparse_output and sp_version < parse_version("1.8.0"): + raise ValueError( + "Option sparse_output=True is only available with scipy>=1.8.0, " + f"but here scipy=={sp_version} is used." + ) + + # number of knots for base interval + n_knots = base_knots.shape[0] + + if self.extrapolation == "periodic" and n_knots <= self.degree: + raise ValueError( + "Periodic splines require degree < n_knots. Got n_knots=" + f"{n_knots} and degree={self.degree}." + ) + + # number of splines basis functions + if self.extrapolation != "periodic": + n_splines = n_knots + self.degree - 1 + else: + # periodic splines have self.degree less degrees of freedom + n_splines = n_knots - 1 + + degree = self.degree + n_out = n_features * n_splines + # We have to add degree number of knots below, and degree number knots + # above the base knots in order to make the spline basis complete. + if self.extrapolation == "periodic": + # For periodic splines the spacing of the first / last degree knots + # needs to be a continuation of the spacing of the last / first + # base knots. + period = base_knots[-1] - base_knots[0] + knots = np.r_[ + base_knots[-(degree + 1) : -1] - period, + base_knots, + base_knots[1 : (degree + 1)] + period, + ] + + else: + # Eilers & Marx in "Flexible smoothing with B-splines and + # penalties" https://doi.org/10.1214/ss/1038425655 advice + # against repeating first and last knot several times, which + # would have inferior behaviour at boundaries if combined with + # a penalty (hence P-Spline). We follow this advice even if our + # splines are unpenalized. 
Meaning we do not: + # knots = np.r_[ + # np.tile(base_knots.min(axis=0), reps=[degree, 1]), + # base_knots, + # np.tile(base_knots.max(axis=0), reps=[degree, 1]) + # ] + # Instead, we reuse the distance of the 2 fist/last knots. + dist_min = base_knots[1] - base_knots[0] + dist_max = base_knots[-1] - base_knots[-2] + + knots = np.r_[ + np.linspace( + base_knots[0] - degree * dist_min, + base_knots[0] - dist_min, + num=degree, + ), + base_knots, + np.linspace( + base_knots[-1] + dist_max, + base_knots[-1] + degree * dist_max, + num=degree, + ), + ] + + # With a diagonal coefficient matrix, we get back the spline basis + # elements, i.e. the design matrix of the spline. + # Note, BSpline appreciates C-contiguous float64 arrays as c=coef. + coef = np.eye(n_splines, dtype=np.float64) + if self.extrapolation == "periodic": + coef = np.concatenate((coef, coef[:degree, :])) + + extrapolate = self.extrapolation in ["periodic", "continue"] + + bsplines = [ + BSpline.construct_fast( + knots[:, i], coef, self.degree, extrapolate=extrapolate + ) + for i in range(n_features) + ] + self.bsplines_ = bsplines + + self.n_features_out_ = n_out - n_features * (1 - self.include_bias) + return self + + def transform(self, X): + """Transform each feature data to B-splines. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to transform. + + Returns + ------- + XBS : {ndarray, sparse matrix} of shape (n_samples, n_features * n_splines) + The matrix of features, where n_splines is the number of bases + elements of the B-splines, n_knots + degree - 1. + """ + check_is_fitted(self) + + X = validate_data(self, X, reset=False, accept_sparse=False, ensure_2d=True) + + n_samples, n_features = X.shape + n_splines = self.bsplines_[0].c.shape[1] + degree = self.degree + + # TODO: Remove this condition, once scipy 1.10 is the minimum version. + # Only scipy => 1.10 supports design_matrix(.., extrapolate=..). + # The default (implicit in scipy < 1.10) is extrapolate=False. + scipy_1_10 = sp_version >= parse_version("1.10.0") + # Note: self.bsplines_[0].extrapolate is True for extrapolation in + # ["periodic", "continue"] + if scipy_1_10: + use_sparse = self.sparse_output + kwargs_extrapolate = {"extrapolate": self.bsplines_[0].extrapolate} + else: + use_sparse = self.sparse_output and not self.bsplines_[0].extrapolate + kwargs_extrapolate = dict() + + # Note that scipy BSpline returns float64 arrays and converts input + # x=X[:, i] to c-contiguous float64. + n_out = self.n_features_out_ + n_features * (1 - self.include_bias) + if X.dtype in FLOAT_DTYPES: + dtype = X.dtype + else: + dtype = np.float64 + if use_sparse: + output_list = [] + else: + XBS = np.zeros((n_samples, n_out), dtype=dtype, order=self.order) + + for i in range(n_features): + spl = self.bsplines_[i] + + if self.extrapolation in ("continue", "error", "periodic"): + if self.extrapolation == "periodic": + # With periodic extrapolation we map x to the segment + # [spl.t[k], spl.t[n]]. + # This is equivalent to BSpline(.., extrapolate="periodic") + # for scipy>=1.0.0. + n = spl.t.size - spl.k - 1 + # Assign to new array to avoid inplace operation + x = spl.t[spl.k] + (X[:, i] - spl.t[spl.k]) % ( + spl.t[n] - spl.t[spl.k] + ) + else: + x = X[:, i] + + if use_sparse: + XBS_sparse = BSpline.design_matrix( + x, spl.t, spl.k, **kwargs_extrapolate + ) + if self.extrapolation == "periodic": + # See the construction of coef in fit. 
We need to add the last + # degree spline basis function to the first degree ones and + # then drop the last ones. + # Note: See comment about SparseEfficiencyWarning below. + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[:, :degree] += XBS_sparse[:, -degree:] + XBS_sparse = XBS_sparse[:, :-degree] + else: + XBS[:, (i * n_splines) : ((i + 1) * n_splines)] = spl(x) + else: # extrapolation in ("constant", "linear") + xmin, xmax = spl.t[degree], spl.t[-degree - 1] + # spline values at boundaries + f_min, f_max = spl(xmin), spl(xmax) + mask = (xmin <= X[:, i]) & (X[:, i] <= xmax) + if use_sparse: + mask_inv = ~mask + x = X[:, i].copy() + # Set some arbitrary values outside boundary that will be reassigned + # later. + x[mask_inv] = spl.t[self.degree] + XBS_sparse = BSpline.design_matrix(x, spl.t, spl.k) + # Note: Without converting to lil_matrix we would get: + # scipy.sparse._base.SparseEfficiencyWarning: Changing the sparsity + # structure of a csr_matrix is expensive. lil_matrix is more + # efficient. + if np.any(mask_inv): + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask_inv, :] = 0 + else: + XBS[mask, (i * n_splines) : ((i + 1) * n_splines)] = spl(X[mask, i]) + + # Note for extrapolation: + # 'continue' is already returned as is by scipy BSplines + if self.extrapolation == "error": + # BSpline with extrapolate=False does not raise an error, but + # outputs np.nan. + if (use_sparse and np.any(np.isnan(XBS_sparse.data))) or ( + not use_sparse + and np.any( + np.isnan(XBS[:, (i * n_splines) : ((i + 1) * n_splines)]) + ) + ): + raise ValueError( + "X contains values beyond the limits of the knots." + ) + elif self.extrapolation == "constant": + # Set all values beyond xmin and xmax to the value of the + # spline basis functions at those two positions. + # Only the first degree and last degree number of splines + # have non-zero values at the boundaries. + + mask = X[:, i] < xmin + if np.any(mask): + if use_sparse: + # Note: See comment about SparseEfficiencyWarning above. + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask, :degree] = f_min[:degree] + + else: + XBS[mask, (i * n_splines) : (i * n_splines + degree)] = f_min[ + :degree + ] + + mask = X[:, i] > xmax + if np.any(mask): + if use_sparse: + # Note: See comment about SparseEfficiencyWarning above. + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask, -degree:] = f_max[-degree:] + else: + XBS[ + mask, + ((i + 1) * n_splines - degree) : ((i + 1) * n_splines), + ] = f_max[-degree:] + + elif self.extrapolation == "linear": + # Continue the degree first and degree last spline bases + # linearly beyond the boundaries, with slope = derivative at + # the boundary. + # Note that all others have derivative = value = 0 at the + # boundaries. + + # spline derivatives = slopes at boundaries + fp_min, fp_max = spl(xmin, nu=1), spl(xmax, nu=1) + # Compute the linear continuation. + if degree <= 1: + # For degree=1, the derivative of 2nd spline is not zero at + # boundary. For degree=0 it is the same as 'constant'. + degree += 1 + for j in range(degree): + mask = X[:, i] < xmin + if np.any(mask): + linear_extr = f_min[j] + (X[mask, i] - xmin) * fp_min[j] + if use_sparse: + # Note: See comment about SparseEfficiencyWarning above. + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask, j] = linear_extr + else: + XBS[mask, i * n_splines + j] = linear_extr + + mask = X[:, i] > xmax + if np.any(mask): + k = n_splines - 1 - j + linear_extr = f_max[k] + (X[mask, i] - xmax) * fp_max[k] + if use_sparse: + # Note: See comment about SparseEfficiencyWarning above. 
+ XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask, k : k + 1] = linear_extr[:, None] + else: + XBS[mask, i * n_splines + k] = linear_extr + + if use_sparse: + XBS_sparse = XBS_sparse.tocsr() + output_list.append(XBS_sparse) + + if use_sparse: + # TODO: Remove this conditional error when the minimum supported version of + # SciPy is 1.9.2 + # `scipy.sparse.hstack` breaks in scipy<1.9.2 + # when `n_features_out_ > max_int32` + max_int32 = np.iinfo(np.int32).max + all_int32 = True + for mat in output_list: + all_int32 &= mat.indices.dtype == np.int32 + if ( + sp_version < parse_version("1.9.2") + and self.n_features_out_ > max_int32 + and all_int32 + ): + raise ValueError( + "In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`" + " produces negative columns when:\n1. The output shape contains" + " `n_cols` too large to be represented by a 32bit signed" + " integer.\n. All sub-matrices to be stacked have indices of" + " dtype `np.int32`.\nTo avoid this error, either use a version" + " of scipy `>=1.9.2` or alter the `SplineTransformer`" + " transformer to produce fewer than 2^31 output features" + ) + XBS = sparse.hstack(output_list, format="csr") + elif self.sparse_output: + # TODO: Remove ones scipy 1.10 is the minimum version. See comments above. + XBS = sparse.csr_matrix(XBS) + + if self.include_bias: + return XBS + else: + # We throw away one spline basis per feature. + # We chose the last one. + indices = [j for j in range(XBS.shape[1]) if (j + 1) % n_splines != 0] + return XBS[:, indices] diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder.py b/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..171aafe7a03ed5711cbf0d3181c686199e6d370c --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder.py @@ -0,0 +1,534 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np + +from ..base import OneToOneFeatureMixin, _fit_context +from ..utils._param_validation import Interval, StrOptions +from ..utils.multiclass import type_of_target +from ..utils.validation import ( + _check_feature_names_in, + _check_y, + check_consistent_length, + check_is_fitted, +) +from ._encoders import _BaseEncoder +from ._target_encoder_fast import _fit_encoding_fast, _fit_encoding_fast_auto_smooth + + +class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder): + """Target Encoder for regression and classification targets. + + Each category is encoded based on a shrunk estimate of the average target + values for observations belonging to the category. The encoding scheme mixes + the global target mean with the target mean conditioned on the value of the + category (see [MIC]_). + + When the target type is "multiclass", encodings are based + on the conditional probability estimate for each class. The target is first + binarized using the "one-vs-all" scheme via + :class:`~sklearn.preprocessing.LabelBinarizer`, then the average target + value for each class and each category is used for encoding, resulting in + `n_features` * `n_classes` encoded output features. + + :class:`TargetEncoder` considers missing values, such as `np.nan` or `None`, + as another category and encodes them like any other category. Categories + that are not seen during :meth:`fit` are encoded with the target mean, i.e. + `target_mean_`. 
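# Behaviour for unseen categories (standalone sketch, scikit-learn >= 1.3):
# with smooth=0.0 each seen category maps to its own target mean, while a
# category unseen during fit is mapped to the global `target_mean_`.
import numpy as np
from sklearn.preprocessing import TargetEncoder

X = np.array([["a"], ["a"], ["b"], ["b"]], dtype=object)
y = [10.0, 12.0, 20.0, 22.0]

enc = TargetEncoder(smooth=0.0).fit(X, y)
print(enc.target_mean_)  # 16.0, the global mean of y
print(enc.transform(np.array([["c"]], dtype=object)))  # [[16.]] for unseen "c"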
+ + For a demo on the importance of the `TargetEncoder` internal cross-fitting, + see + :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder_cross_val.py`. + For a comparison of different encoders, refer to + :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. Read + more in the :ref:`User Guide `. + + .. note:: + `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide ` for details. + + .. versionadded:: 1.3 + + Parameters + ---------- + categories : "auto" or list of shape (n_features,) of array-like, default="auto" + Categories (unique values) per feature: + + - `"auto"` : Determine categories automatically from the training data. + - list : `categories[i]` holds the categories expected in the i-th column. The + passed categories should not mix strings and numeric values within a single + feature, and should be sorted in case of numeric values. + + The used categories are stored in the `categories_` fitted attribute. + + target_type : {"auto", "continuous", "binary", "multiclass"}, default="auto" + Type of target. + + - `"auto"` : Type of target is inferred with + :func:`~sklearn.utils.multiclass.type_of_target`. + - `"continuous"` : Continuous target + - `"binary"` : Binary target + - `"multiclass"` : Multiclass target + + .. note:: + The type of target inferred with `"auto"` may not be the desired target + type used for modeling. For example, if the target consisted of integers + between 0 and 100, then :func:`~sklearn.utils.multiclass.type_of_target` + will infer the target as `"multiclass"`. In this case, setting + `target_type="continuous"` will specify the target as a regression + problem. The `target_type_` attribute gives the target type used by the + encoder. + + .. versionchanged:: 1.4 + Added the option 'multiclass'. + + smooth : "auto" or float, default="auto" + The amount of mixing of the target mean conditioned on the value of the + category with the global target mean. A larger `smooth` value will put + more weight on the global target mean. + If `"auto"`, then `smooth` is set to an empirical Bayes estimate. + + cv : int, default=5 + Determines the number of folds in the :term:`cross fitting` strategy used in + :meth:`fit_transform`. For classification targets, `StratifiedKFold` is used + and for continuous targets, `KFold` is used. + + shuffle : bool, default=True + Whether to shuffle the data in :meth:`fit_transform` before splitting into + folds. Note that the samples within each split will not be shuffled. + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold. Otherwise, this + parameter has no effect. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + encodings_ : list of shape (n_features,) or (n_features * n_classes) of \ + ndarray + Encodings learnt on all of `X`. + For feature `i`, `encodings_[i]` are the encodings matching the + categories listed in `categories_[i]`. When `target_type_` is + "multiclass", the encoding for feature `i` and class `j` is stored in + `encodings_[j + (i * len(classes_))]`. 
E.g., for 2 features (f) and + 3 classes (c), encodings are ordered: + f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2, + + categories_ : list of shape (n_features,) of ndarray + The categories of each input feature determined during fitting or + specified in `categories` + (in order of the features in `X` and corresponding with the output + of :meth:`transform`). + + target_type_ : str + Type of target. + + target_mean_ : float + The overall mean of the target. This value is only used in :meth:`transform` + to encode categories. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + classes_ : ndarray or None + If `target_type_` is 'binary' or 'multiclass', holds the label for each class, + otherwise `None`. + + See Also + -------- + OrdinalEncoder : Performs an ordinal (integer) encoding of the categorical features. + Contrary to TargetEncoder, this encoding is not supervised. Treating the + resulting encoding as a numerical features therefore lead arbitrarily + ordered values and therefore typically lead to lower predictive performance + when used as preprocessing for a classifier or regressor. + OneHotEncoder : Performs a one-hot encoding of categorical features. This + unsupervised encoding is better suited for low cardinality categorical + variables as it generate one new feature per unique category. + + References + ---------- + .. [MIC] :doi:`Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality + categorical attributes in classification and prediction problems" + SIGKDD Explor. Newsl. 3, 1 (July 2001), 27–32. <10.1145/507533.507538>` + + Examples + -------- + With `smooth="auto"`, the smoothing parameter is set to an empirical Bayes estimate: + + >>> import numpy as np + >>> from sklearn.preprocessing import TargetEncoder + >>> X = np.array([["dog"] * 20 + ["cat"] * 30 + ["snake"] * 38], dtype=object).T + >>> y = [90.3] * 5 + [80.1] * 15 + [20.4] * 5 + [20.1] * 25 + [21.2] * 8 + [49] * 30 + >>> enc_auto = TargetEncoder(smooth="auto") + >>> X_trans = enc_auto.fit_transform(X, y) + + >>> # A high `smooth` parameter puts more weight on global mean on the categorical + >>> # encodings: + >>> enc_high_smooth = TargetEncoder(smooth=5000.0).fit(X, y) + >>> enc_high_smooth.target_mean_ + np.float64(44...) + >>> enc_high_smooth.encodings_ + [array([44..., 44..., 44...])] + + >>> # On the other hand, a low `smooth` parameter puts more weight on target + >>> # conditioned on the value of the categorical: + >>> enc_low_smooth = TargetEncoder(smooth=1.0).fit(X, y) + >>> enc_low_smooth.encodings_ + [array([20..., 80..., 43...])] + """ + + _parameter_constraints: dict = { + "categories": [StrOptions({"auto"}), list], + "target_type": [StrOptions({"auto", "continuous", "binary", "multiclass"})], + "smooth": [StrOptions({"auto"}), Interval(Real, 0, None, closed="left")], + "cv": [Interval(Integral, 2, None, closed="left")], + "shuffle": ["boolean"], + "random_state": ["random_state"], + } + + def __init__( + self, + categories="auto", + target_type="auto", + smooth="auto", + cv=5, + shuffle=True, + random_state=None, + ): + self.categories = categories + self.smooth = smooth + self.target_type = target_type + self.cv = cv + self.shuffle = shuffle + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit the :class:`TargetEncoder` to X and y. 
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + y : array-like of shape (n_samples,) + The target data used to encode the categories. + + Returns + ------- + self : object + Fitted encoder. + """ + self._fit_encodings_all(X, y) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y): + """Fit :class:`TargetEncoder` and transform X with the target encoding. + + .. note:: + `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide `. for details. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + y : array-like of shape (n_samples,) + The target data used to encode the categories. + + Returns + ------- + X_trans : ndarray of shape (n_samples, n_features) or \ + (n_samples, (n_features * n_classes)) + Transformed input. + """ + from ..model_selection import KFold, StratifiedKFold # avoid circular import + + X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(X, y) + + # The cv splitter is voluntarily restricted to *KFold to enforce non + # overlapping validation folds, otherwise the fit_transform output will + # not be well-specified. + if self.target_type_ == "continuous": + cv = KFold(self.cv, shuffle=self.shuffle, random_state=self.random_state) + else: + cv = StratifiedKFold( + self.cv, shuffle=self.shuffle, random_state=self.random_state + ) + + # If 'multiclass' multiply axis=1 by num classes else keep shape the same + if self.target_type_ == "multiclass": + X_out = np.empty( + (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)), + dtype=np.float64, + ) + else: + X_out = np.empty_like(X_ordinal, dtype=np.float64) + + for train_idx, test_idx in cv.split(X, y): + X_train, y_train = X_ordinal[train_idx, :], y_encoded[train_idx] + y_train_mean = np.mean(y_train, axis=0) + + if self.target_type_ == "multiclass": + encodings = self._fit_encoding_multiclass( + X_train, + y_train, + n_categories, + y_train_mean, + ) + else: + encodings = self._fit_encoding_binary_or_continuous( + X_train, + y_train, + n_categories, + y_train_mean, + ) + self._transform_X_ordinal( + X_out, + X_ordinal, + ~X_known_mask, + test_idx, + encodings, + y_train_mean, + ) + return X_out + + def transform(self, X): + """Transform X with the target encoding. + + .. note:: + `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide `. for details. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + Returns + ------- + X_trans : ndarray of shape (n_samples, n_features) or \ + (n_samples, (n_features * n_classes)) + Transformed input. 
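# The cross-fitting note above in practice (standalone sketch): fit_transform
# encodes each fold with statistics from the other folds, so it generally
# differs from transform, which applies the encodings learnt on all of X.
import numpy as np
from sklearn.preprocessing import TargetEncoder

rng = np.random.RandomState(0)
X = rng.choice(["a", "b", "c"], size=(50, 1)).astype(object)
y = rng.normal(size=50)

enc = TargetEncoder(cv=5, shuffle=True, random_state=0)
X_cross = enc.fit_transform(X, y)  # fold-dependent, cross-fitted encodings
X_plain = enc.transform(X)         # encodings learnt on all of X
print(np.allclose(X_cross, X_plain))  # False in general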
+ """ + X_ordinal, X_known_mask = self._transform( + X, handle_unknown="ignore", ensure_all_finite="allow-nan" + ) + + # If 'multiclass' multiply axis=1 by num of classes else keep shape the same + if self.target_type_ == "multiclass": + X_out = np.empty( + (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)), + dtype=np.float64, + ) + else: + X_out = np.empty_like(X_ordinal, dtype=np.float64) + + self._transform_X_ordinal( + X_out, + X_ordinal, + ~X_known_mask, + slice(None), + self.encodings_, + self.target_mean_, + ) + return X_out + + def _fit_encodings_all(self, X, y): + """Fit a target encoding with all the data.""" + # avoid circular import + from ..preprocessing import ( + LabelBinarizer, + LabelEncoder, + ) + + check_consistent_length(X, y) + self._fit(X, handle_unknown="ignore", ensure_all_finite="allow-nan") + + if self.target_type == "auto": + accepted_target_types = ("binary", "multiclass", "continuous") + inferred_type_of_target = type_of_target(y, input_name="y") + if inferred_type_of_target not in accepted_target_types: + raise ValueError( + "Unknown label type: Target type was inferred to be " + f"{inferred_type_of_target!r}. Only {accepted_target_types} are " + "supported." + ) + self.target_type_ = inferred_type_of_target + else: + self.target_type_ = self.target_type + + self.classes_ = None + if self.target_type_ == "binary": + label_encoder = LabelEncoder() + y = label_encoder.fit_transform(y) + self.classes_ = label_encoder.classes_ + elif self.target_type_ == "multiclass": + label_binarizer = LabelBinarizer() + y = label_binarizer.fit_transform(y) + self.classes_ = label_binarizer.classes_ + else: # continuous + y = _check_y(y, y_numeric=True, estimator=self) + + self.target_mean_ = np.mean(y, axis=0) + + X_ordinal, X_known_mask = self._transform( + X, handle_unknown="ignore", ensure_all_finite="allow-nan" + ) + n_categories = np.fromiter( + (len(category_for_feature) for category_for_feature in self.categories_), + dtype=np.int64, + count=len(self.categories_), + ) + if self.target_type_ == "multiclass": + encodings = self._fit_encoding_multiclass( + X_ordinal, + y, + n_categories, + self.target_mean_, + ) + else: + encodings = self._fit_encoding_binary_or_continuous( + X_ordinal, + y, + n_categories, + self.target_mean_, + ) + self.encodings_ = encodings + + return X_ordinal, X_known_mask, y, n_categories + + def _fit_encoding_binary_or_continuous( + self, X_ordinal, y, n_categories, target_mean + ): + """Learn target encodings.""" + if self.smooth == "auto": + y_variance = np.var(y) + encodings = _fit_encoding_fast_auto_smooth( + X_ordinal, + y, + n_categories, + target_mean, + y_variance, + ) + else: + encodings = _fit_encoding_fast( + X_ordinal, + y, + n_categories, + self.smooth, + target_mean, + ) + return encodings + + def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean): + """Learn multiclass encodings. + + Learn encodings for each class (c) then reorder encodings such that + the same features (f) are grouped together. 
`reorder_index` enables + converting from: + f0_c0, f1_c0, f0_c1, f1_c1, f0_c2, f1_c2 + to: + f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2 + """ + n_features = self.n_features_in_ + n_classes = len(self.classes_) + + encodings = [] + for i in range(n_classes): + y_class = y[:, i] + encoding = self._fit_encoding_binary_or_continuous( + X_ordinal, + y_class, + n_categories, + target_mean[i], + ) + encodings.extend(encoding) + + reorder_index = ( + idx + for start in range(n_features) + for idx in range(start, (n_classes * n_features), n_features) + ) + return [encodings[idx] for idx in reorder_index] + + def _transform_X_ordinal( + self, + X_out, + X_ordinal, + X_unknown_mask, + row_indices, + encodings, + target_mean, + ): + """Transform X_ordinal using encodings. + + In the multiclass case, `X_ordinal` and `X_unknown_mask` have column + (axis=1) size `n_features`, while `encodings` has length of size + `n_features * n_classes`. `feat_idx` deals with this by repeating + feature indices by `n_classes` E.g., for 3 features, 2 classes: + 0,0,1,1,2,2 + + Additionally, `target_mean` is of shape (`n_classes`,) so `mean_idx` + cycles through 0 to `n_classes` - 1, `n_features` times. + """ + if self.target_type_ == "multiclass": + n_classes = len(self.classes_) + for e_idx, encoding in enumerate(encodings): + # Repeat feature indices by n_classes + feat_idx = e_idx // n_classes + # Cycle through each class + mean_idx = e_idx % n_classes + X_out[row_indices, e_idx] = encoding[X_ordinal[row_indices, feat_idx]] + X_out[X_unknown_mask[:, feat_idx], e_idx] = target_mean[mean_idx] + else: + for e_idx, encoding in enumerate(encodings): + X_out[row_indices, e_idx] = encoding[X_ordinal[row_indices, e_idx]] + X_out[X_unknown_mask[:, e_idx], e_idx] = target_mean + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. `feature_names_in_` is used unless it is + not defined, in which case the following input feature names are + generated: `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + When `type_of_target_` is "multiclass" the names are of the format + '_'. 
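# Multiclass output layout (standalone sketch, scikit-learn >= 1.4 for
# target_type="multiclass"): one column per (feature, class) pair, named by
# suffixing the class label onto the feature name.
import numpy as np
from sklearn.preprocessing import TargetEncoder

X = np.array([["a"], ["b"], ["a"], ["b"], ["a"], ["b"]], dtype=object)
y = [0, 1, 2, 0, 1, 2]

enc = TargetEncoder(target_type="multiclass", cv=2).fit(X, y)
print(enc.get_feature_names_out())  # ['x0_0' 'x0_1' 'x0_2']
print(enc.transform(X).shape)       # (6, 3): n_features * n_classes columns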
+ """ + check_is_fitted(self, "n_features_in_") + feature_names = _check_feature_names_in(self, input_features) + if self.target_type_ == "multiclass": + feature_names = [ + f"{feature_name}_{class_name}" + for feature_name in feature_names + for class_name in self.classes_ + ] + return np.asarray(feature_names, dtype=object) + else: + return feature_names + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.required = True + return tags diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.cp39-win_amd64.lib b/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.cp39-win_amd64.lib new file mode 100644 index 0000000000000000000000000000000000000000..9c58f0c23834feaba3bf6547c479c919a350b534 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.cp39-win_amd64.lib differ diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.cp39-win_amd64.pyd b/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.cp39-win_amd64.pyd new file mode 100644 index 0000000000000000000000000000000000000000..e654cb7f4b09e30332955c3be1eec2fb9e956caf Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.cp39-win_amd64.pyd differ diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.pyx b/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..cb07d531147bbd55434cfa435f5b180d1d144aee --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/_target_encoder_fast.pyx @@ -0,0 +1,167 @@ +from libc.math cimport isnan +from libcpp.vector cimport vector + +from ..utils._typedefs cimport float32_t, float64_t, int32_t, int64_t + +import numpy as np + + +ctypedef fused INT_DTYPE: + int64_t + int32_t + +ctypedef fused Y_DTYPE: + int64_t + int32_t + float64_t + float32_t + + +def _fit_encoding_fast( + INT_DTYPE[:, ::1] X_int, + const Y_DTYPE[:] y, + int64_t[::1] n_categories, + double smooth, + double y_mean, +): + """Fit a target encoding on X_int and y. + + This implementation uses Eq 7 from [1] to compute the encoding. + As stated in the paper, Eq 7 is the same as Eq 3. + + [1]: Micci-Barreca, Daniele. 
"A preprocessing scheme for high-cardinality + categorical attributes in classification and prediction problems" + """ + cdef: + int64_t sample_idx, feat_idx, cat_idx, n_cats + INT_DTYPE X_int_tmp + int n_samples = X_int.shape[0] + int n_features = X_int.shape[1] + double smooth_sum = smooth * y_mean + int64_t max_n_cats = np.max(n_categories) + double[::1] sums = np.empty(max_n_cats, dtype=np.float64) + double[::1] counts = np.empty(max_n_cats, dtype=np.float64) + list encodings = [] + double[::1] current_encoding + # Gives access to encodings without gil + vector[double*] encoding_vec + + encoding_vec.resize(n_features) + for feat_idx in range(n_features): + current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64) + encoding_vec[feat_idx] = ¤t_encoding[0] + encodings.append(np.asarray(current_encoding)) + + with nogil: + for feat_idx in range(n_features): + n_cats = n_categories[feat_idx] + + for cat_idx in range(n_cats): + sums[cat_idx] = smooth_sum + counts[cat_idx] = smooth + + for sample_idx in range(n_samples): + X_int_tmp = X_int[sample_idx, feat_idx] + # -1 are unknown categories, which are not counted + if X_int_tmp == -1: + continue + sums[X_int_tmp] += y[sample_idx] + counts[X_int_tmp] += 1.0 + + for cat_idx in range(n_cats): + if counts[cat_idx] == 0: + encoding_vec[feat_idx][cat_idx] = y_mean + else: + encoding_vec[feat_idx][cat_idx] = sums[cat_idx] / counts[cat_idx] + + return encodings + + +def _fit_encoding_fast_auto_smooth( + INT_DTYPE[:, ::1] X_int, + const Y_DTYPE[:] y, + int64_t[::1] n_categories, + double y_mean, + double y_variance, +): + """Fit a target encoding on X_int and y with auto smoothing. + + This implementation uses Eq 5 and 6 from [1]. + + [1]: Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality + categorical attributes in classification and prediction problems" + """ + cdef: + int64_t sample_idx, feat_idx, cat_idx, n_cats + INT_DTYPE X_int_tmp + double diff + int n_samples = X_int.shape[0] + int n_features = X_int.shape[1] + int64_t max_n_cats = np.max(n_categories) + double[::1] means = np.empty(max_n_cats, dtype=np.float64) + int64_t[::1] counts = np.empty(max_n_cats, dtype=np.int64) + double[::1] sum_of_squared_diffs = np.empty(max_n_cats, dtype=np.float64) + double lambda_ + list encodings = [] + double[::1] current_encoding + # Gives access to encodings without gil + vector[double*] encoding_vec + + encoding_vec.resize(n_features) + for feat_idx in range(n_features): + current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64) + encoding_vec[feat_idx] = ¤t_encoding[0] + encodings.append(np.asarray(current_encoding)) + + # TODO: parallelize this with OpenMP prange. When n_features >= n_threads, it's + # probably good to parallelize the outer loop. When n_features is too small, + # then it would probably better to parallelize the nested loops on n_samples and + # n_cats, but the code to handle thread-local temporary variables might be + # significantly more complex. 
+ with nogil: + for feat_idx in range(n_features): + n_cats = n_categories[feat_idx] + + for cat_idx in range(n_cats): + means[cat_idx] = 0.0 + counts[cat_idx] = 0 + sum_of_squared_diffs[cat_idx] = 0.0 + + # first pass to compute the mean + for sample_idx in range(n_samples): + X_int_tmp = X_int[sample_idx, feat_idx] + + # -1 are unknown categories, which are not counted + if X_int_tmp == -1: + continue + counts[X_int_tmp] += 1 + means[X_int_tmp] += y[sample_idx] + + for cat_idx in range(n_cats): + means[cat_idx] /= counts[cat_idx] + + # second pass to compute the sum of squared differences + for sample_idx in range(n_samples): + X_int_tmp = X_int[sample_idx, feat_idx] + if X_int_tmp == -1: + continue + diff = y[sample_idx] - means[X_int_tmp] + sum_of_squared_diffs[X_int_tmp] += diff * diff + + for cat_idx in range(n_cats): + lambda_ = ( + y_variance * counts[cat_idx] / + (y_variance * counts[cat_idx] + sum_of_squared_diffs[cat_idx] / + counts[cat_idx]) + ) + if isnan(lambda_): + # A nan can happen when: + # 1. counts[cat_idx] == 0 + # 2. y_variance == 0 and sum_of_squared_diffs[cat_idx] == 0 + encoding_vec[feat_idx][cat_idx] = y_mean + else: + encoding_vec[feat_idx][cat_idx] = ( + lambda_ * means[cat_idx] + (1 - lambda_) * y_mean + ) + + return encodings diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/meson.build b/.venv/Lib/site-packages/sklearn/preprocessing/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..fcd4cc8d50740bf09955ccf08b850dde1cd55293 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/meson.build @@ -0,0 +1,16 @@ +py.extension_module( + '_csr_polynomial_expansion', + ['_csr_polynomial_expansion.pyx', utils_cython_tree], + cython_args: cython_args, + subdir: 'sklearn/preprocessing', + install: true +) + +py.extension_module( + '_target_encoder_fast', + ['_target_encoder_fast.pyx', utils_cython_tree], + override_options: ['cython_language=cpp'], + cython_args: cython_args, + subdir: 'sklearn/preprocessing', + install: true +) diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_common.py b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_common.py new file mode 100644 index 0000000000000000000000000000000000000000..0d0fbd461bfe5d4158865bd956dbe4d2e12fbc3d --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_common.py @@ -0,0 +1,187 @@ +import warnings + +import numpy as np +import pytest + +from sklearn.base import clone +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import ( + MaxAbsScaler, + MinMaxScaler, + PowerTransformer, + QuantileTransformer, + RobustScaler, + StandardScaler, + maxabs_scale, + minmax_scale, + power_transform, + quantile_transform, + robust_scale, + scale, +) +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import ( + BSR_CONTAINERS, + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + DIA_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) + +iris = load_iris() + + +def _get_valid_samples_by_column(X, col): + """Get non NaN samples in column of X""" + return X[:, [col]][~np.isnan(X[:, col])] + + +@pytest.mark.parametrize( + "est, func, support_sparse, strictly_positive, omit_kwargs", + [ + (MaxAbsScaler(), maxabs_scale, True, False, []), + (MinMaxScaler(), minmax_scale, False, False, ["clip"]), + (StandardScaler(), scale, False, False, []), + (StandardScaler(with_mean=False), scale, True, False, []), + 
(PowerTransformer("yeo-johnson"), power_transform, False, False, []), + (PowerTransformer("box-cox"), power_transform, False, True, []), + (QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []), + (RobustScaler(), robust_scale, False, False, []), + (RobustScaler(with_centering=False), robust_scale, True, False, []), + ], +) +def test_missing_value_handling( + est, func, support_sparse, strictly_positive, omit_kwargs +): + # check that the preprocessing method let pass nan + rng = np.random.RandomState(42) + X = iris.data.copy() + n_missing = 50 + X[ + rng.randint(X.shape[0], size=n_missing), rng.randint(X.shape[1], size=n_missing) + ] = np.nan + if strictly_positive: + X += np.nanmin(X) + 0.1 + X_train, X_test = train_test_split(X, random_state=1) + # sanity check + assert not np.all(np.isnan(X_train), axis=0).any() + assert np.any(np.isnan(X_train), axis=0).all() + assert np.any(np.isnan(X_test), axis=0).all() + X_test[:, 0] = np.nan # make sure this boundary case is tested + + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + Xt = est.fit(X_train).transform(X_test) + # ensure no warnings are raised + # missing values should still be missing, and only them + assert_array_equal(np.isnan(Xt), np.isnan(X_test)) + + # check that the function leads to the same results as the class + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + Xt_class = est.transform(X_train) + kwargs = est.get_params() + # remove the parameters which should be omitted because they + # are not defined in the counterpart function of the preprocessing class + for kwarg in omit_kwargs: + _ = kwargs.pop(kwarg) + Xt_func = func(X_train, **kwargs) + assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class)) + assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)]) + + # check that the inverse transform keep NaN + Xt_inv = est.inverse_transform(Xt) + assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test)) + # FIXME: we can introduce equal_nan=True in recent version of numpy. + # For the moment which just check that non-NaN values are almost equal. 
+ assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)]) + + for i in range(X.shape[1]): + # train only on non-NaN + est.fit(_get_valid_samples_by_column(X_train, i)) + # check transforming with NaN works even when training without NaN + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + Xt_col = est.transform(X_test[:, [i]]) + assert_allclose(Xt_col, Xt[:, [i]]) + # check non-NaN is handled as before - the 1st column is all nan + if not np.isnan(X_test[:, i]).all(): + Xt_col_nonan = est.transform(_get_valid_samples_by_column(X_test, i)) + assert_array_equal(Xt_col_nonan, Xt_col[~np.isnan(Xt_col.squeeze())]) + + if support_sparse: + est_dense = clone(est) + est_sparse = clone(est) + + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + Xt_dense = est_dense.fit(X_train).transform(X_test) + Xt_inv_dense = est_dense.inverse_transform(Xt_dense) + + for sparse_container in ( + BSR_CONTAINERS + + COO_CONTAINERS + + CSC_CONTAINERS + + CSR_CONTAINERS + + DIA_CONTAINERS + + DOK_CONTAINERS + + LIL_CONTAINERS + ): + # check that the dense and sparse inputs lead to the same results + # precompute the matrix to avoid catching side warnings + X_train_sp = sparse_container(X_train) + X_test_sp = sparse_container(X_test) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", PendingDeprecationWarning) + warnings.simplefilter("error", RuntimeWarning) + Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp) + + assert_allclose(Xt_sp.toarray(), Xt_dense) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", PendingDeprecationWarning) + warnings.simplefilter("error", RuntimeWarning) + Xt_inv_sp = est_sparse.inverse_transform(Xt_sp) + + assert_allclose(Xt_inv_sp.toarray(), Xt_inv_dense) + + +@pytest.mark.parametrize( + "est, func", + [ + (MaxAbsScaler(), maxabs_scale), + (MinMaxScaler(), minmax_scale), + (StandardScaler(), scale), + (StandardScaler(with_mean=False), scale), + (PowerTransformer("yeo-johnson"), power_transform), + ( + PowerTransformer("box-cox"), + power_transform, + ), + (QuantileTransformer(n_quantiles=3), quantile_transform), + (RobustScaler(), robust_scale), + (RobustScaler(with_centering=False), robust_scale), + ], +) +def test_missing_value_pandas_na_support(est, func): + # Test pandas IntegerArray with pd.NA + pd = pytest.importorskip("pandas") + + X = np.array( + [ + [1, 2, 3, np.nan, np.nan, 4, 5, 1], + [np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8], + [1, 2, 3, 4, 5, 6, 7, 8], + ] + ).T + + # Creates dataframe with IntegerArrays with pd.NA + X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c"]) + X_df["c"] = X_df["c"].astype("int") + + X_trans = est.fit_transform(X) + X_df_trans = est.fit_transform(X_df) + + assert_allclose(X_trans, X_df_trans) diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_data.py b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_data.py new file mode 100644 index 0000000000000000000000000000000000000000..04c67a15cd92903e0d10c6dea3e6fda55abfb5d7 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_data.py @@ -0,0 +1,2621 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re +import warnings + +import numpy as np +import numpy.linalg as la +import pytest +from scipy import sparse, stats + +from sklearn import datasets +from sklearn.base import clone +from sklearn.exceptions import NotFittedError +from sklearn.metrics.pairwise import linear_kernel +from 
sklearn.model_selection import cross_val_predict
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import (
+    Binarizer,
+    KernelCenterer,
+    MaxAbsScaler,
+    MinMaxScaler,
+    Normalizer,
+    PowerTransformer,
+    QuantileTransformer,
+    RobustScaler,
+    StandardScaler,
+    add_dummy_feature,
+    maxabs_scale,
+    minmax_scale,
+    normalize,
+    power_transform,
+    quantile_transform,
+    robust_scale,
+    scale,
+)
+from sklearn.preprocessing._data import BOUNDS_THRESHOLD, _handle_zeros_in_scale
+from sklearn.svm import SVR
+from sklearn.utils import gen_batches, shuffle
+from sklearn.utils._array_api import (
+    yield_namespace_device_dtype_combinations,
+)
+from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids
+from sklearn.utils._testing import (
+    _convert_container,
+    assert_allclose,
+    assert_allclose_dense_sparse,
+    assert_almost_equal,
+    assert_array_almost_equal,
+    assert_array_equal,
+    assert_array_less,
+    skip_if_32bit,
+)
+from sklearn.utils.estimator_checks import (
+    check_array_api_input_and_values,
+)
+from sklearn.utils.fixes import (
+    COO_CONTAINERS,
+    CSC_CONTAINERS,
+    CSR_CONTAINERS,
+    LIL_CONTAINERS,
+)
+from sklearn.utils.sparsefuncs import mean_variance_axis
+
+iris = datasets.load_iris()
+
+# Make some data to be used many times
+rng = np.random.RandomState(0)
+n_features = 30
+n_samples = 1000
+offsets = rng.uniform(-1, 1, size=n_features)
+scales = rng.uniform(1, 10, size=n_features)
+X_2d = rng.randn(n_samples, n_features) * scales + offsets
+X_1row = X_2d[0, :].reshape(1, n_features)
+X_1col = X_2d[:, 0].reshape(n_samples, 1)
+X_list_1row = X_1row.tolist()
+X_list_1col = X_1col.tolist()
+
+
+def toarray(a):
+    if hasattr(a, "toarray"):
+        a = a.toarray()
+    return a
+
+
+def _check_dim_1axis(a):
+    return np.asarray(a).shape[0]
+
+
+def assert_correct_incr(i, batch_start, batch_stop, n, chunk_size, n_samples_seen):
+    if batch_stop != n:
+        assert (i + 1) * chunk_size == n_samples_seen
+    else:
+        assert i * chunk_size + (batch_stop - batch_start) == n_samples_seen
+
+
+def test_raises_value_error_if_sample_weights_greater_than_1d():
+    # Sample weights must be either scalar or 1D
+
+    n_sampless = [2, 3]
+    n_featuress = [3, 2]
+
+    for n_samples, n_features in zip(n_sampless, n_featuress):
+        X = rng.randn(n_samples, n_features)
+        y = rng.randn(n_samples)
+
+        scaler = StandardScaler()
+
+        # make sure an error is raised when sample weights have more than
+        # one dimension
+        sample_weight_notOK = rng.randn(n_samples, 1) ** 2
+        with pytest.raises(ValueError):
+            scaler.fit(X, y, sample_weight=sample_weight_notOK)
+
+
+@pytest.mark.parametrize(
+    ["Xw", "X", "sample_weight"],
+    [
+        ([[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [1, 2, 3], [4, 5, 6]], [2.0, 1.0]),
+        (
+            [[1, 0, 1], [0, 0, 1]],
+            [[1, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1]],
+            np.array([1, 3]),
+        ),
+        (
+            [[1, np.nan, 1], [np.nan, np.nan, 1]],
+            [
+                [1, np.nan, 1],
+                [np.nan, np.nan, 1],
+                [np.nan, np.nan, 1],
+                [np.nan, np.nan, 1],
+            ],
+            np.array([1, 3]),
+        ),
+    ],
+)
+@pytest.mark.parametrize("array_constructor", ["array", "sparse_csr", "sparse_csc"])
+def test_standard_scaler_sample_weight(Xw, X, sample_weight, array_constructor):
+    with_mean = not array_constructor.startswith("sparse")
+    X = _convert_container(X, array_constructor)
+    Xw = _convert_container(Xw, array_constructor)
+
+    # weighted StandardScaler
+    yw = np.ones(Xw.shape[0])
+    scaler_w = StandardScaler(with_mean=with_mean)
+    scaler_w.fit(Xw, yw, sample_weight=sample_weight)
+
+    # unweighted, but with repeated samples
+    y = np.ones(X.shape[0])
+    scaler =
StandardScaler(with_mean=with_mean)
+    scaler.fit(X, y)
+
+    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
+
+    assert_almost_equal(scaler.mean_, scaler_w.mean_)
+    assert_almost_equal(scaler.var_, scaler_w.var_)
+    assert_almost_equal(scaler.transform(X_test), scaler_w.transform(X_test))
+
+
+def test_standard_scaler_1d():
+    # Test scaling of dataset along single axis
+    for X in [X_1row, X_1col, X_list_1row, X_list_1col]:
+        scaler = StandardScaler()
+        X_scaled = scaler.fit(X).transform(X, copy=True)
+
+        if isinstance(X, list):
+            X = np.array(X)  # cast only after scaling done
+
+        if _check_dim_1axis(X) == 1:
+            assert_almost_equal(scaler.mean_, X.ravel())
+            assert_almost_equal(scaler.scale_, np.ones(n_features))
+            assert_array_almost_equal(X_scaled.mean(axis=0), np.zeros_like(n_features))
+            assert_array_almost_equal(X_scaled.std(axis=0), np.zeros_like(n_features))
+        else:
+            assert_almost_equal(scaler.mean_, X.mean())
+            assert_almost_equal(scaler.scale_, X.std())
+            assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
+            assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
+        assert scaler.n_samples_seen_ == X.shape[0]
+
+        # check inverse transform
+        X_scaled_back = scaler.inverse_transform(X_scaled)
+        assert_array_almost_equal(X_scaled_back, X)
+
+    # Constant feature
+    X = np.ones((5, 1))
+    scaler = StandardScaler()
+    X_scaled = scaler.fit(X).transform(X, copy=True)
+    assert_almost_equal(scaler.mean_, 1.0)
+    assert_almost_equal(scaler.scale_, 1.0)
+    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
+    assert_array_almost_equal(X_scaled.std(axis=0), 0.0)
+    assert scaler.n_samples_seen_ == X.shape[0]
+
+
+@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS)
+@pytest.mark.parametrize("add_sample_weight", [False, True])
+def test_standard_scaler_dtype(add_sample_weight, sparse_container):
+    # Ensure scaling does not affect dtype
+    rng = np.random.RandomState(0)
+    n_samples = 10
+    n_features = 3
+    if add_sample_weight:
+        sample_weight = np.ones(n_samples)
+    else:
+        sample_weight = None
+    with_mean = True
+    if sparse_container is not None:
+        # scipy sparse containers do not support float16, see
+        # https://github.com/scipy/scipy/issues/7408 for more details.
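A minimal sketch of the dtype contract checked by this test (the `*_demo`
names are illustrative):

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    X32_demo = np.random.RandomState(0).randn(10, 3).astype(np.float32)
    scaler_demo = StandardScaler().fit(X32_demo)
    assert scaler_demo.transform(X32_demo).dtype == np.float32
    assert scaler_demo.mean_.dtype == np.float64  # stats accumulated in float64

The float16 exclusion below follows from the scipy limitation just cited.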
+ supported_dtype = [np.float64, np.float32] + else: + supported_dtype = [np.float64, np.float32, np.float16] + for dtype in supported_dtype: + X = rng.randn(n_samples, n_features).astype(dtype) + if sparse_container is not None: + X = sparse_container(X) + with_mean = False + + scaler = StandardScaler(with_mean=with_mean) + X_scaled = scaler.fit(X, sample_weight=sample_weight).transform(X) + assert X.dtype == X_scaled.dtype + assert scaler.mean_.dtype == np.float64 + assert scaler.scale_.dtype == np.float64 + + +@pytest.mark.parametrize( + "scaler", + [ + StandardScaler(with_mean=False), + RobustScaler(with_centering=False), + ], +) +@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) +@pytest.mark.parametrize("add_sample_weight", [False, True]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("constant", [0, 1.0, 100.0]) +def test_standard_scaler_constant_features( + scaler, add_sample_weight, sparse_container, dtype, constant +): + if isinstance(scaler, RobustScaler) and add_sample_weight: + pytest.skip(f"{scaler.__class__.__name__} does not yet support sample_weight") + + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 1 + if add_sample_weight: + fit_params = dict(sample_weight=rng.uniform(size=n_samples) * 2) + else: + fit_params = {} + X_array = np.full(shape=(n_samples, n_features), fill_value=constant, dtype=dtype) + X = X_array if sparse_container is None else sparse_container(X_array) + X_scaled = scaler.fit(X, **fit_params).transform(X) + + if isinstance(scaler, StandardScaler): + # The variance info should be close to zero for constant features. + assert_allclose(scaler.var_, np.zeros(X.shape[1]), atol=1e-7) + + # Constant features should not be scaled (scale of 1.): + assert_allclose(scaler.scale_, np.ones(X.shape[1])) + + assert X_scaled is not X # make sure we make a copy + assert_allclose_dense_sparse(X_scaled, X) + + if isinstance(scaler, StandardScaler) and not add_sample_weight: + # Also check consistency with the standard scale function. + X_scaled_2 = scale(X, with_mean=scaler.with_mean) + assert X_scaled_2 is not X # make sure we did a copy + assert_allclose_dense_sparse(X_scaled_2, X) + + +@pytest.mark.parametrize("n_samples", [10, 100, 10_000]) +@pytest.mark.parametrize("average", [1e-10, 1, 1e10]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) +def test_standard_scaler_near_constant_features( + n_samples, sparse_container, average, dtype +): + # Check that when the variance is too small (var << mean**2) the feature + # is considered constant and not scaled. + + scale_min, scale_max = -30, 19 + scales = np.array([10**i for i in range(scale_min, scale_max + 1)], dtype=dtype) + + n_features = scales.shape[0] + X = np.empty((n_samples, n_features), dtype=dtype) + # Make a dataset of known var = scales**2 and mean = average + X[: n_samples // 2, :] = average + scales + X[n_samples // 2 :, :] = average - scales + X_array = X if sparse_container is None else sparse_container(X) + + scaler = StandardScaler(with_mean=False).fit(X_array) + + # StandardScaler uses float64 accumulators even if the data has a float32 + # dtype. + eps = np.finfo(np.float64).eps + + # if var < bound = N.eps.var + N².eps².mean², the feature is considered + # constant and the scale_ attribute is set to 1. 
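A worked instance of the bound above, assuming n_samples = 100 and
average = 1.0 (one of the parametrized combinations; numbers are float64
approximations):

    import numpy as np

    eps = np.finfo(np.float64).eps          # ~2.22e-16
    n, avg, s = 100, 1.0, 1e-20             # sample count, mean, feature scale
    bound = n * eps * s**2 + n**2 * eps**2 * avg**2   # ~4.9e-28 here
    assert s**2 < bound   # var = 1e-40 is below the bound -> treated as constant

A feature with scale 1e-10 instead has var = 1e-20 > bound and is scaled
normally.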
+    bounds = n_samples * eps * scales**2 + n_samples**2 * eps**2 * average**2
+    within_bounds = scales**2 <= bounds
+
+    # Check that scale_min is small enough to have some scales below the
+    # bound and therefore detected as constant:
+    assert np.any(within_bounds)
+
+    # Check that such features are actually treated as constant by the scaler:
+    assert all(scaler.var_[within_bounds] <= bounds[within_bounds])
+    assert_allclose(scaler.scale_[within_bounds], 1.0)
+
+    # Depending on the dtype of X, some features might not actually be
+    # representable as non constant for small scales (even if above the
+    # precision bound of the float64 variance estimate). Such features should
+    # be correctly detected as constant with 0 variance by StandardScaler.
+    representable_diff = X[0, :] - X[-1, :] != 0
+    assert_allclose(scaler.var_[np.logical_not(representable_diff)], 0)
+    assert_allclose(scaler.scale_[np.logical_not(representable_diff)], 1)
+
+    # The other features are scaled and scale_ is equal to sqrt(var_) assuming
+    # that scales are large enough for average + scale and average - scale to
+    # be distinct in X (depending on X's dtype).
+    common_mask = np.logical_and(scales**2 > bounds, representable_diff)
+    assert_allclose(scaler.scale_[common_mask], np.sqrt(scaler.var_)[common_mask])
+
+
+def test_scale_1d():
+    # 1-d inputs
+    X_list = [1.0, 3.0, 5.0, 0.0]
+    X_arr = np.array(X_list)
+
+    for X in [X_list, X_arr]:
+        X_scaled = scale(X)
+        assert_array_almost_equal(X_scaled.mean(), 0.0)
+        assert_array_almost_equal(X_scaled.std(), 1.0)
+        assert_array_equal(scale(X, with_mean=False, with_std=False), X)
+
+
+@skip_if_32bit
+def test_standard_scaler_numerical_stability():
+    # Test numerical stability of scaling
+    # np.log(1e-5) is taken because its floating point representation
+    # was empirically found to cause numerical problems with np.mean & np.std.
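A plausible reading of why 8 samples are safe but 10 are not (a hedged aside,
not a claim about exact numpy internals): the sum of 8 identical float64
values, 8 * v, is exact because multiplying by a power of two only shifts the
exponent, so the mean equals v exactly and the standard deviation is exactly
0. With 10 values, 10 * v is rounded, the mean differs from v by about one
ulp, and the computed standard deviation becomes a tiny non-zero number on
the order of eps * |v| instead of 0, which triggers the warning asserted
below.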
+    x = np.full(8, np.log(1e-5), dtype=np.float64)
+    # This does not raise a warning as the number of samples is too low
+    # to trigger the problem in recent numpy
+    with warnings.catch_warnings():
+        warnings.simplefilter("error", UserWarning)
+        scale(x)
+    assert_array_almost_equal(scale(x), np.zeros(8))
+
+    # with 2 more samples, the std computation runs into numerical issues:
+    x = np.full(10, np.log(1e-5), dtype=np.float64)
+    warning_message = "standard deviation of the data is probably very close to 0"
+    with pytest.warns(UserWarning, match=warning_message):
+        x_scaled = scale(x)
+    assert_array_almost_equal(x_scaled, np.zeros(10))
+
+    x = np.full(10, 1e-100, dtype=np.float64)
+    with warnings.catch_warnings():
+        warnings.simplefilter("error", UserWarning)
+        x_small_scaled = scale(x)
+    assert_array_almost_equal(x_small_scaled, np.zeros(10))
+
+    # Large values can cause (often recoverable) numerical stability issues:
+    x_big = np.full(10, 1e100, dtype=np.float64)
+    warning_message = "Dataset may contain too large values"
+    with pytest.warns(UserWarning, match=warning_message):
+        x_big_scaled = scale(x_big)
+    assert_array_almost_equal(x_big_scaled, np.zeros(10))
+    assert_array_almost_equal(x_big_scaled, x_small_scaled)
+    with pytest.warns(UserWarning, match=warning_message):
+        x_big_centered = scale(x_big, with_std=False)
+    assert_array_almost_equal(x_big_centered, np.zeros(10))
+    assert_array_almost_equal(x_big_centered, x_small_scaled)
+
+
+def test_scaler_2d_arrays():
+    # Test scaling of 2d array along first axis
+    rng = np.random.RandomState(0)
+    n_features = 5
+    n_samples = 4
+    X = rng.randn(n_samples, n_features)
+    X[:, 0] = 0.0  # first feature is always zero
+
+    scaler = StandardScaler()
+    X_scaled = scaler.fit(X).transform(X, copy=True)
+    assert not np.any(np.isnan(X_scaled))
+    assert scaler.n_samples_seen_ == n_samples
+
+    assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
+    assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
+    # Check that X has been copied
+    assert X_scaled is not X
+
+    # check inverse transform
+    X_scaled_back = scaler.inverse_transform(X_scaled)
+    assert X_scaled_back is not X
+    assert X_scaled_back is not X_scaled
+    assert_array_almost_equal(X_scaled_back, X)
+
+    X_scaled = scale(X, axis=1, with_std=False)
+    assert not np.any(np.isnan(X_scaled))
+    assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0])
+    X_scaled = scale(X, axis=1, with_std=True)
+    assert not np.any(np.isnan(X_scaled))
+    assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0])
+    assert_array_almost_equal(X_scaled.std(axis=1), n_samples * [1.0])
+    # Check that the data hasn't been modified
+    assert X_scaled is not X
+
+    X_scaled = scaler.fit(X).transform(X, copy=False)
+    assert not np.any(np.isnan(X_scaled))
+    assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
+    assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
+    # Check that X has not been copied
+    assert X_scaled is X
+
+    X = rng.randn(4, 5)
+    X[:, 0] = 1.0  # first feature is a constant, non zero feature
+    scaler = StandardScaler()
+    X_scaled = scaler.fit(X).transform(X, copy=True)
+    assert not np.any(np.isnan(X_scaled))
+    assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
+    assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
+    # Check that X has not been copied
+    assert X_scaled is not X
+
+
+def test_scaler_float16_overflow():
+    # Test that the scaler does not overflow on float16 numpy arrays
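A hedged illustration of the failure mode being guarded against: float16
overflows to inf near its maximum, e.g.

    import numpy as np

    big = np.float16(65500)
    with np.errstate(over="ignore"):
        assert np.isinf(big + big)  # the sum exceeds the float16 range

so a naive accumulation over 200000 samples of magnitude 5-10 (a total around
1.5e6) cannot be carried out in float16 itself.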
rng = np.random.RandomState(0)
+    # float16 has a maximum of 65500.0. In the worst case the accumulated sum
+    # is at least 5 * 200000 = 1,000,000, which is enough to overflow the
+    # data type
+    X = rng.uniform(5, 10, [200000, 1]).astype(np.float16)
+
+    with np.errstate(over="raise"):
+        scaler = StandardScaler().fit(X)
+        X_scaled = scaler.transform(X)
+
+    # Calculate the float64 equivalent to verify result
+    X_scaled_f64 = StandardScaler().fit_transform(X.astype(np.float64))
+
+    # Overflow calculations may cause -inf, inf, or nan. Since there is no nan
+    # input, all of the outputs should be finite. This may be redundant since a
+    # FloatingPointError exception will be thrown on overflow above.
+    assert np.all(np.isfinite(X_scaled))
+
+    # The normal distribution is very unlikely to go above 4. At 4.0-8.0 the
+    # float16 precision is 2^-8 which is around 0.004. Thus only 2 decimals are
+    # checked to account for precision differences.
+    assert_array_almost_equal(X_scaled, X_scaled_f64, decimal=2)
+
+
+def test_handle_zeros_in_scale():
+    s1 = np.array([0, 1e-16, 1, 2, 3])
+    s2 = _handle_zeros_in_scale(s1, copy=True)
+
+    assert_allclose(s1, np.array([0, 1e-16, 1, 2, 3]))
+    assert_allclose(s2, np.array([1, 1, 1, 2, 3]))
+
+
+def test_minmax_scaler_partial_fit():
+    # Test that partial_fit, run over many batches of size 1 and 50,
+    # gives the same results as fit
+    X = X_2d
+    n = X.shape[0]
+
+    for chunk_size in [1, 2, 50, n, n + 42]:
+        # Test mean at the end of the process
+        scaler_batch = MinMaxScaler().fit(X)
+
+        scaler_incr = MinMaxScaler()
+        for batch in gen_batches(n_samples, chunk_size):
+            scaler_incr = scaler_incr.partial_fit(X[batch])
+
+        assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_)
+        assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_)
+        assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
+        assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_)
+        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
+        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)
+
+        # Test std after 1 step
+        batch0 = slice(0, chunk_size)
+        scaler_batch = MinMaxScaler().fit(X[batch0])
+        scaler_incr = MinMaxScaler().partial_fit(X[batch0])
+
+        assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_)
+        assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_)
+        assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
+        assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_)
+        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
+        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)
+
+        # Test std until the end of the partial fits
+        scaler_batch = MinMaxScaler().fit(X)
+        scaler_incr = MinMaxScaler()  # Clean estimator
+        for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
+            scaler_incr = scaler_incr.partial_fit(X[batch])
+            assert_correct_incr(
+                i,
+                batch_start=batch.start,
+                batch_stop=batch.stop,
+                n=n,
+                chunk_size=chunk_size,
+                n_samples_seen=scaler_incr.n_samples_seen_,
+            )
+
+
+def test_standard_scaler_partial_fit():
+    # Test that partial_fit, run over many batches of size 1 and 50,
+    # gives the same results as fit
+    X = X_2d
+    n = X.shape[0]
+
+    for chunk_size in [1, 2, 50, n, n + 42]:
+        # Test mean at the end of the process
+        scaler_batch = StandardScaler(with_std=False).fit(X)
+
+        scaler_incr = StandardScaler(with_std=False)
+        for batch in gen_batches(n_samples, chunk_size):
+            scaler_incr = scaler_incr.partial_fit(X[batch])
+
assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_)
+        assert scaler_batch.var_ == scaler_incr.var_  # both are None
+        assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
+
+        # Test std after 1 step
+        batch0 = slice(0, chunk_size)
+        scaler_incr = StandardScaler().partial_fit(X[batch0])
+        if chunk_size == 1:
+            assert_array_almost_equal(
+                np.zeros(n_features, dtype=np.float64), scaler_incr.var_
+            )
+            assert_array_almost_equal(
+                np.ones(n_features, dtype=np.float64), scaler_incr.scale_
+            )
+        else:
+            assert_array_almost_equal(np.var(X[batch0], axis=0), scaler_incr.var_)
+            assert_array_almost_equal(
+                np.std(X[batch0], axis=0), scaler_incr.scale_
+            )  # no constants
+
+        # Test std until the end of the partial fits
+        scaler_batch = StandardScaler().fit(X)
+        scaler_incr = StandardScaler()  # Clean estimator
+        for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
+            scaler_incr = scaler_incr.partial_fit(X[batch])
+            assert_correct_incr(
+                i,
+                batch_start=batch.start,
+                batch_stop=batch.stop,
+                n=n,
+                chunk_size=chunk_size,
+                n_samples_seen=scaler_incr.n_samples_seen_,
+            )
+
+        assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_)
+        assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
+
+
+@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
+def test_standard_scaler_partial_fit_numerical_stability(sparse_container):
+    # Test that the incremental computation does not introduce significant
+    # errors for large datasets with values of large magnitude
+    rng = np.random.RandomState(0)
+    n_features = 2
+    n_samples = 100
+    offsets = rng.uniform(-1e15, 1e15, size=n_features)
+    scales = rng.uniform(1e3, 1e6, size=n_features)
+    X = rng.randn(n_samples, n_features) * scales + offsets
+
+    scaler_batch = StandardScaler().fit(X)
+    scaler_incr = StandardScaler()
+    for chunk in X:
+        scaler_incr = scaler_incr.partial_fit(chunk.reshape(1, n_features))
+
+    # Regardless of absolute values, they must not differ by more than
+    # 6 significant digits
+    tol = 10 ** (-6)
+    assert_allclose(scaler_incr.mean_, scaler_batch.mean_, rtol=tol)
+    assert_allclose(scaler_incr.var_, scaler_batch.var_, rtol=tol)
+    assert_allclose(scaler_incr.scale_, scaler_batch.scale_, rtol=tol)
+    # NOTE Be aware that for much larger offsets std is very unstable (last
+    # assert) while mean is OK.
+
+    # Sparse input
+    size = (100, 3)
+    scale = 1e20
+    X = sparse_container(rng.randint(0, 2, size).astype(np.float64) * scale)
+
+    # with_mean=False is required with sparse input
+    scaler = StandardScaler(with_mean=False).fit(X)
+    scaler_incr = StandardScaler(with_mean=False)
+
+    for chunk in X:
+        if chunk.ndim == 1:
+            # Sparse arrays can be 1D (in scipy 1.14 and later) while old
+            # sparse matrix instances are always 2D.
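Concretely (a hedged restatement of the comment above): iterating over a
scipy >= 1.14 csr_array yields 1-D rows of shape (n_features,), while
iterating over a legacy csr_matrix yields 2-D rows of shape (1, n_features);
partial_fit expects 2-D input, hence the reshape guard that follows.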
+            chunk = chunk.reshape(1, -1)
+        scaler_incr = scaler_incr.partial_fit(chunk)
+
+    # Regardless of magnitude, they must not differ by more than 6
+    # significant digits
+    tol = 10 ** (-6)
+    assert scaler.mean_ is not None
+    assert_allclose(scaler_incr.var_, scaler.var_, rtol=tol)
+    assert_allclose(scaler_incr.scale_, scaler.scale_, rtol=tol)
+
+
+@pytest.mark.parametrize("sample_weight", [True, None])
+@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
+def test_partial_fit_sparse_input(sample_weight, sparse_container):
+    # Check that sparsity is not destroyed
+    X = sparse_container(np.array([[1.0], [0.0], [0.0], [5.0]]))
+
+    if sample_weight:
+        sample_weight = rng.rand(X.shape[0])
+
+    null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
+    X_null = null_transform.partial_fit(X, sample_weight=sample_weight).transform(X)
+    assert_array_equal(X_null.toarray(), X.toarray())
+    X_orig = null_transform.inverse_transform(X_null)
+    assert_array_equal(X_orig.toarray(), X_null.toarray())
+    assert_array_equal(X_orig.toarray(), X.toarray())
+
+
+@pytest.mark.parametrize("sample_weight", [True, None])
+def test_standard_scaler_transform_with_partial_fit(sample_weight):
+    # Check some postconditions after applying partial_fit and transform
+    X = X_2d[:100, :]
+
+    if sample_weight:
+        sample_weight = rng.rand(X.shape[0])
+
+    scaler_incr = StandardScaler()
+    for i, batch in enumerate(gen_batches(X.shape[0], 1)):
+        X_sofar = X[: (i + 1), :]
+        chunks_copy = X_sofar.copy()
+        if sample_weight is None:
+            scaled_batch = StandardScaler().fit_transform(X_sofar)
+            scaler_incr = scaler_incr.partial_fit(X[batch])
+        else:
+            scaled_batch = StandardScaler().fit_transform(
+                X_sofar, sample_weight=sample_weight[: i + 1]
+            )
+            scaler_incr = scaler_incr.partial_fit(
+                X[batch], sample_weight=sample_weight[batch]
+            )
+        scaled_incr = scaler_incr.transform(X_sofar)
+
+        assert_array_almost_equal(scaled_batch, scaled_incr)
+        assert_array_almost_equal(X_sofar, chunks_copy)  # No change
+        right_input = scaler_incr.inverse_transform(scaled_incr)
+        assert_array_almost_equal(X_sofar, right_input)
+
+        zero = np.zeros(X.shape[1])
+        epsilon = np.finfo(float).eps
+        # effectively non-strict inequalities, thanks to epsilon
+        assert_array_less(zero, scaler_incr.var_ + epsilon)
+        assert_array_less(zero, scaler_incr.scale_ + epsilon)
+        if sample_weight is None:
+            # (i+1) because the scaler has already been fitted
+            assert (i + 1) == scaler_incr.n_samples_seen_
+        else:
+            assert np.sum(sample_weight[: i + 1]) == pytest.approx(
+                scaler_incr.n_samples_seen_
+            )
+
+
+def test_standard_check_array_of_inverse_transform():
+    # Check that StandardScaler's inverse_transform converts an integer
+    # array to float
+    x = np.array(
+        [
+            [1, 1, 1, 0, 1, 0],
+            [1, 1, 1, 0, 1, 0],
+            [0, 8, 0, 1, 0, 0],
+            [1, 4, 1, 1, 0, 0],
+            [0, 1, 0, 0, 1, 0],
+            [0, 4, 0, 1, 0, 1],
+        ],
+        dtype=np.int32,
+    )
+
+    scaler = StandardScaler()
+    scaler.fit(x)
+
+    # The input of inverse_transform should be converted
+    # to a float array.
+    # If not, X *= self.scale_ will fail.
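A minimal demonstration of that failure (the `a_demo` name is illustrative):

    import numpy as np

    a_demo = np.array([1, 2], dtype=np.int32)
    try:
        a_demo *= 0.5  # in-place multiply cannot cast the float64 result back
    except TypeError:
        pass  # numpy refuses the same-kind cast, as the comment above warns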
+    scaler.inverse_transform(x)
+
+
+@pytest.mark.parametrize(
+    "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations()
+)
+@pytest.mark.parametrize(
+    "check",
+    [check_array_api_input_and_values],
+    ids=_get_check_estimator_ids,
+)
+@pytest.mark.parametrize(
+    "estimator",
+    [
+        MaxAbsScaler(),
+        MinMaxScaler(),
+        MinMaxScaler(clip=True),
+        KernelCenterer(),
+        Normalizer(norm="l1"),
+        Normalizer(norm="l2"),
+        Normalizer(norm="max"),
+    ],
+    ids=_get_check_estimator_ids,
+)
+def test_scaler_array_api_compliance(
+    estimator, check, array_namespace, device, dtype_name
+):
+    name = estimator.__class__.__name__
+    check(name, estimator, array_namespace, device=device, dtype_name=dtype_name)
+
+
+def test_min_max_scaler_iris():
+    X = iris.data
+    scaler = MinMaxScaler()
+    # default params
+    X_trans = scaler.fit_transform(X)
+    assert_array_almost_equal(X_trans.min(axis=0), 0)
+    assert_array_almost_equal(X_trans.max(axis=0), 1)
+    X_trans_inv = scaler.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+
+    # not default params: min=1, max=2
+    scaler = MinMaxScaler(feature_range=(1, 2))
+    X_trans = scaler.fit_transform(X)
+    assert_array_almost_equal(X_trans.min(axis=0), 1)
+    assert_array_almost_equal(X_trans.max(axis=0), 2)
+    X_trans_inv = scaler.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+
+    # min=-.5, max=.6
+    scaler = MinMaxScaler(feature_range=(-0.5, 0.6))
+    X_trans = scaler.fit_transform(X)
+    assert_array_almost_equal(X_trans.min(axis=0), -0.5)
+    assert_array_almost_equal(X_trans.max(axis=0), 0.6)
+    X_trans_inv = scaler.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+
+    # raises on invalid range
+    scaler = MinMaxScaler(feature_range=(2, 1))
+    with pytest.raises(ValueError):
+        scaler.fit(X)
+
+
+def test_min_max_scaler_zero_variance_features():
+    # Check min max scaler on toy data with zero variance features
+    X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]]
+
+    X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]]
+
+    # default params
+    scaler = MinMaxScaler()
+    X_trans = scaler.fit_transform(X)
+    X_expected_0_1 = [[0.0, 0.0, 0.5], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0]]
+    assert_array_almost_equal(X_trans, X_expected_0_1)
+    X_trans_inv = scaler.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+
+    X_trans_new = scaler.transform(X_new)
+    X_expected_0_1_new = [[+0.0, 1.0, 0.500], [-1.0, 0.0, 0.083], [+0.0, 0.0, 1.333]]
+    assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2)
+
+    # not default params
+    scaler = MinMaxScaler(feature_range=(1, 2))
+    X_trans = scaler.fit_transform(X)
+    X_expected_1_2 = [[1.0, 1.0, 1.5], [1.0, 1.0, 1.0], [1.0, 1.0, 2.0]]
+    assert_array_almost_equal(X_trans, X_expected_1_2)
+
+    # function interface
+    X_trans = minmax_scale(X)
+    assert_array_almost_equal(X_trans, X_expected_0_1)
+    X_trans = minmax_scale(X, feature_range=(1, 2))
+    assert_array_almost_equal(X_trans, X_expected_1_2)
+
+
+def test_minmax_scale_axis1():
+    X = iris.data
+    X_trans = minmax_scale(X, axis=1)
+    assert_array_almost_equal(np.min(X_trans, axis=1), 0)
+    assert_array_almost_equal(np.max(X_trans, axis=1), 1)
+
+
+def test_min_max_scaler_1d():
+    # Test scaling of dataset along single axis
+    for X in [X_1row, X_1col, X_list_1row, X_list_1col]:
+        scaler = MinMaxScaler(copy=True)
+        X_scaled = scaler.fit(X).transform(X)
+
+        if isinstance(X, list):
+            X = np.array(X)  # cast only after scaling done
+
+        if _check_dim_1axis(X) == 1:
assert_array_almost_equal(X_scaled.min(axis=0), np.zeros(n_features))
+            assert_array_almost_equal(X_scaled.max(axis=0), np.zeros(n_features))
+        else:
+            assert_array_almost_equal(X_scaled.min(axis=0), 0.0)
+            assert_array_almost_equal(X_scaled.max(axis=0), 1.0)
+        assert scaler.n_samples_seen_ == X.shape[0]
+
+        # check inverse transform
+        X_scaled_back = scaler.inverse_transform(X_scaled)
+        assert_array_almost_equal(X_scaled_back, X)
+
+    # Constant feature
+    X = np.ones((5, 1))
+    scaler = MinMaxScaler()
+    X_scaled = scaler.fit(X).transform(X)
+    assert X_scaled.min() >= 0.0
+    assert X_scaled.max() <= 1.0
+    assert scaler.n_samples_seen_ == X.shape[0]
+
+    # Function interface
+    X_1d = X_1row.ravel()
+    min_ = X_1d.min()
+    max_ = X_1d.max()
+    assert_array_almost_equal(
+        (X_1d - min_) / (max_ - min_), minmax_scale(X_1d, copy=True)
+    )
+
+
+@pytest.mark.parametrize("sample_weight", [True, None])
+@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
+def test_scaler_without_centering(sample_weight, sparse_container):
+    rng = np.random.RandomState(42)
+    X = rng.randn(4, 5)
+    X[:, 0] = 0.0  # first feature is always zero
+    X_sparse = sparse_container(X)
+
+    if sample_weight:
+        sample_weight = rng.rand(X.shape[0])
+
+    with pytest.raises(ValueError):
+        StandardScaler().fit(X_sparse)
+
+    scaler = StandardScaler(with_mean=False).fit(X, sample_weight=sample_weight)
+    X_scaled = scaler.transform(X, copy=True)
+    assert not np.any(np.isnan(X_scaled))
+
+    scaler_sparse = StandardScaler(with_mean=False).fit(
+        X_sparse, sample_weight=sample_weight
+    )
+    X_sparse_scaled = scaler_sparse.transform(X_sparse, copy=True)
+    assert not np.any(np.isnan(X_sparse_scaled.data))
+
+    assert_array_almost_equal(scaler.mean_, scaler_sparse.mean_)
+    assert_array_almost_equal(scaler.var_, scaler_sparse.var_)
+    assert_array_almost_equal(scaler.scale_, scaler_sparse.scale_)
+    assert_array_almost_equal(scaler.n_samples_seen_, scaler_sparse.n_samples_seen_)
+
+    if sample_weight is None:
+        assert_array_almost_equal(
+            X_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2
+        )
+        assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
+
+    X_sparse_scaled_mean, X_sparse_scaled_var = mean_variance_axis(X_sparse_scaled, 0)
+    assert_array_almost_equal(X_sparse_scaled_mean, X_scaled.mean(axis=0))
+    assert_array_almost_equal(X_sparse_scaled_var, X_scaled.var(axis=0))
+
+    # Check that X has not been modified (copy)
+    assert X_scaled is not X
+    assert X_sparse_scaled is not X_sparse
+
+    X_scaled_back = scaler.inverse_transform(X_scaled)
+    assert X_scaled_back is not X
+    assert X_scaled_back is not X_scaled
+    assert_array_almost_equal(X_scaled_back, X)
+
+    X_sparse_scaled_back = scaler_sparse.inverse_transform(X_sparse_scaled)
+    assert X_sparse_scaled_back is not X_sparse
+    assert X_sparse_scaled_back is not X_sparse_scaled
+    assert_array_almost_equal(X_sparse_scaled_back.toarray(), X)
+
+    if sparse_container in CSR_CONTAINERS:
+        null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
+        X_null = null_transform.fit_transform(X_sparse)
+        assert_array_equal(X_null.data, X_sparse.data)
+        X_orig = null_transform.inverse_transform(X_null)
+        assert_array_equal(X_orig.data, X_sparse.data)
+
+
+@pytest.mark.parametrize("with_mean", [True, False])
+@pytest.mark.parametrize("with_std", [True, False])
+@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS)
+def test_scaler_n_samples_seen_with_nan(with_mean, with_std, sparse_container):
+    X = np.array(
[[0, 1, 3], [np.nan, 6, 10], [5, 4, np.nan], [8, 0, np.nan]], dtype=np.float64
+    )
+    if sparse_container is not None:
+        X = sparse_container(X)
+
+    if sparse.issparse(X) and with_mean:
+        pytest.skip("'with_mean=True' cannot be used with sparse matrix.")
+
+    transformer = StandardScaler(with_mean=with_mean, with_std=with_std)
+    transformer.fit(X)
+
+    assert_array_equal(transformer.n_samples_seen_, np.array([3, 4, 2]))
+
+
+def _check_identity_scalers_attributes(scaler_1, scaler_2):
+    assert scaler_1.mean_ is scaler_2.mean_ is None
+    assert scaler_1.var_ is scaler_2.var_ is None
+    assert scaler_1.scale_ is scaler_2.scale_ is None
+    assert scaler_1.n_samples_seen_ == scaler_2.n_samples_seen_
+
+
+@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
+def test_scaler_return_identity(sparse_container):
+    # test that the scaler returns the identity when with_mean and with_std
+    # are False
+    X_dense = np.array([[0, 1, 3], [5, 6, 0], [8, 0, 10]], dtype=np.float64)
+    X_sparse = sparse_container(X_dense)
+
+    transformer_dense = StandardScaler(with_mean=False, with_std=False)
+    X_trans_dense = transformer_dense.fit_transform(X_dense)
+    assert_allclose(X_trans_dense, X_dense)
+
+    transformer_sparse = clone(transformer_dense)
+    X_trans_sparse = transformer_sparse.fit_transform(X_sparse)
+    assert_allclose_dense_sparse(X_trans_sparse, X_sparse)
+
+    _check_identity_scalers_attributes(transformer_dense, transformer_sparse)
+
+    transformer_dense.partial_fit(X_dense)
+    transformer_sparse.partial_fit(X_sparse)
+    _check_identity_scalers_attributes(transformer_dense, transformer_sparse)
+
+    transformer_dense.fit(X_dense)
+    transformer_sparse.fit(X_sparse)
+    _check_identity_scalers_attributes(transformer_dense, transformer_sparse)
+
+
+@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
+def test_scaler_int(sparse_container):
+    # test that the scaler converts integer input to floating point
+    # for both sparse and dense matrices
+    rng = np.random.RandomState(42)
+    X = rng.randint(20, size=(4, 5))
+    X[:, 0] = 0  # first feature is always zero
+    X_sparse = sparse_container(X)
+
+    with warnings.catch_warnings(record=True):
+        scaler = StandardScaler(with_mean=False).fit(X)
+        X_scaled = scaler.transform(X, copy=True)
+    assert not np.any(np.isnan(X_scaled))
+
+    with warnings.catch_warnings(record=True):
+        scaler_sparse = StandardScaler(with_mean=False).fit(X_sparse)
+        X_sparse_scaled = scaler_sparse.transform(X_sparse, copy=True)
+    assert not np.any(np.isnan(X_sparse_scaled.data))
+
+    assert_array_almost_equal(scaler.mean_, scaler_sparse.mean_)
+    assert_array_almost_equal(scaler.var_, scaler_sparse.var_)
+    assert_array_almost_equal(scaler.scale_, scaler_sparse.scale_)
+
+    assert_array_almost_equal(
+        X_scaled.mean(axis=0), [0.0, 1.109, 1.856, 21.0, 1.559], 2
+    )
+    assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
+
+    X_sparse_scaled_mean, X_sparse_scaled_std = mean_variance_axis(
+        X_sparse_scaled.astype(float), 0
+    )
+    assert_array_almost_equal(X_sparse_scaled_mean, X_scaled.mean(axis=0))
+    assert_array_almost_equal(X_sparse_scaled_std, X_scaled.std(axis=0))
+
+    # Check that X has not been modified (copy)
+    assert X_scaled is not X
+    assert X_sparse_scaled is not X_sparse
+
+    X_scaled_back = scaler.inverse_transform(X_scaled)
+    assert X_scaled_back is not X
+    assert X_scaled_back is not X_scaled
+    assert_array_almost_equal(X_scaled_back, X)
+
+    X_sparse_scaled_back = scaler_sparse.inverse_transform(X_sparse_scaled)
+    assert X_sparse_scaled_back is not
X_sparse
+    assert X_sparse_scaled_back is not X_sparse_scaled
+    assert_array_almost_equal(X_sparse_scaled_back.toarray(), X)
+
+    if sparse_container in CSR_CONTAINERS:
+        null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
+        with warnings.catch_warnings(record=True):
+            X_null = null_transform.fit_transform(X_sparse)
+        assert_array_equal(X_null.data, X_sparse.data)
+        X_orig = null_transform.inverse_transform(X_null)
+        assert_array_equal(X_orig.data, X_sparse.data)
+
+
+@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
+def test_scaler_without_copy(sparse_container):
+    # Check that StandardScaler.fit does not change input
+    rng = np.random.RandomState(42)
+    X = rng.randn(4, 5)
+    X[:, 0] = 0.0  # first feature is always zero
+    X_sparse = sparse_container(X)
+
+    X_copy = X.copy()
+    StandardScaler(copy=False).fit(X)
+    assert_array_equal(X, X_copy)
+
+    X_sparse_copy = X_sparse.copy()
+    StandardScaler(with_mean=False, copy=False).fit(X_sparse)
+    assert_array_equal(X_sparse.toarray(), X_sparse_copy.toarray())
+
+
+@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
+def test_scale_sparse_with_mean_raise_exception(sparse_container):
+    rng = np.random.RandomState(42)
+    X = rng.randn(4, 5)
+    X_sparse = sparse_container(X)
+
+    # check scaling and fit with direct calls on sparse data
+    with pytest.raises(ValueError):
+        scale(X_sparse, with_mean=True)
+    with pytest.raises(ValueError):
+        StandardScaler(with_mean=True).fit(X_sparse)
+
+    # check transform and inverse_transform after a fit on a dense array
+    scaler = StandardScaler(with_mean=True).fit(X)
+    with pytest.raises(ValueError):
+        scaler.transform(X_sparse)
+
+    X_transformed_sparse = sparse_container(scaler.transform(X))
+    with pytest.raises(ValueError):
+        scaler.inverse_transform(X_transformed_sparse)
+
+
+def test_scale_input_finiteness_validation():
+    # Check that non-finite inputs raise ValueError
+    X = [[np.inf, 5, 6, 7, 8]]
+    with pytest.raises(
+        ValueError, match="Input contains infinity or a value too large"
+    ):
+        scale(X)
+
+
+def test_robust_scaler_error_sparse():
+    X_sparse = sparse.rand(1000, 10)
+    scaler = RobustScaler(with_centering=True)
+    err_msg = "Cannot center sparse matrices"
+    with pytest.raises(ValueError, match=err_msg):
+        scaler.fit(X_sparse)
+
+
+@pytest.mark.parametrize("with_centering", [True, False])
+@pytest.mark.parametrize("with_scaling", [True, False])
+@pytest.mark.parametrize("X", [np.random.randn(10, 3), sparse.rand(10, 3, density=0.5)])
+def test_robust_scaler_attributes(X, with_centering, with_scaling):
+    # check consistent type of attributes
+    if with_centering and sparse.issparse(X):
+        pytest.skip("RobustScaler cannot center sparse matrix")
+
+    scaler = RobustScaler(with_centering=with_centering, with_scaling=with_scaling)
+    scaler.fit(X)
+
+    if with_centering:
+        assert isinstance(scaler.center_, np.ndarray)
+    else:
+        assert scaler.center_ is None
+    if with_scaling:
+        assert isinstance(scaler.scale_, np.ndarray)
+    else:
+        assert scaler.scale_ is None
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_robust_scaler_col_zero_sparse(csr_container):
+    # check that the scaler works when there is no data materialized in a
+    # column of a sparse matrix
+    X = np.random.randn(10, 5)
+    X[:, 0] = 0
+    X = csr_container(X)
+
+    scaler = RobustScaler(with_centering=False)
+    scaler.fit(X)
+    assert scaler.scale_[0] == pytest.approx(1)
+
+    X_trans = scaler.transform(X)
+    assert_allclose(X[:, [0]].toarray(), X_trans[:,
[0]].toarray())
+
+
+def test_robust_scaler_2d_arrays():
+    # Test robust scaling of 2d array along first axis
+    rng = np.random.RandomState(0)
+    X = rng.randn(4, 5)
+    X[:, 0] = 0.0  # first feature is always zero
+
+    scaler = RobustScaler()
+    X_scaled = scaler.fit(X).transform(X)
+
+    assert_array_almost_equal(np.median(X_scaled, axis=0), 5 * [0.0])
+    assert_array_almost_equal(X_scaled.std(axis=0)[0], 0)
+
+
+@pytest.mark.parametrize("density", [0, 0.05, 0.1, 0.5, 1])
+@pytest.mark.parametrize("strictly_signed", ["positive", "negative", "zeros", None])
+def test_robust_scaler_equivalence_dense_sparse(density, strictly_signed):
+    # Check the equivalence of the fitting with dense and sparse matrices
+    X_sparse = sparse.rand(1000, 5, density=density).tocsc()
+    if strictly_signed == "positive":
+        X_sparse.data = np.abs(X_sparse.data)
+    elif strictly_signed == "negative":
+        X_sparse.data = -np.abs(X_sparse.data)
+    elif strictly_signed == "zeros":
+        X_sparse.data = np.zeros(X_sparse.data.shape, dtype=np.float64)
+    X_dense = X_sparse.toarray()
+
+    scaler_sparse = RobustScaler(with_centering=False)
+    scaler_dense = RobustScaler(with_centering=False)
+
+    scaler_sparse.fit(X_sparse)
+    scaler_dense.fit(X_dense)
+
+    assert_allclose(scaler_sparse.scale_, scaler_dense.scale_)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_robust_scaler_transform_one_row_csr(csr_container):
+    # Check RobustScaler on transforming a csr matrix with one row
+    rng = np.random.RandomState(0)
+    X = rng.randn(4, 5)
+    single_row = np.array([[0.1, 1.0, 2.0, 0.0, -1.0]])
+    scaler = RobustScaler(with_centering=False)
+    scaler = scaler.fit(X)
+    row_trans = scaler.transform(csr_container(single_row))
+    row_expected = single_row / scaler.scale_
+    assert_array_almost_equal(row_trans.toarray(), row_expected)
+    row_scaled_back = scaler.inverse_transform(row_trans)
+    assert_array_almost_equal(single_row, row_scaled_back.toarray())
+
+
+def test_robust_scaler_iris():
+    X = iris.data
+    scaler = RobustScaler()
+    X_trans = scaler.fit_transform(X)
+    assert_array_almost_equal(np.median(X_trans, axis=0), 0)
+    X_trans_inv = scaler.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+    q = np.percentile(X_trans, q=(25, 75), axis=0)
+    iqr = q[1] - q[0]
+    assert_array_almost_equal(iqr, 1)
+
+
+def test_robust_scaler_iris_quantiles():
+    X = iris.data
+    scaler = RobustScaler(quantile_range=(10, 90))
+    X_trans = scaler.fit_transform(X)
+    assert_array_almost_equal(np.median(X_trans, axis=0), 0)
+    X_trans_inv = scaler.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+    q = np.percentile(X_trans, q=(10, 90), axis=0)
+    q_range = q[1] - q[0]
+    assert_array_almost_equal(q_range, 1)
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_quantile_transform_iris(csc_container):
+    X = iris.data
+    # uniform output distribution
+    transformer = QuantileTransformer(n_quantiles=30)
+    X_trans = transformer.fit_transform(X)
+    X_trans_inv = transformer.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+    # normal output distribution
+    transformer = QuantileTransformer(n_quantiles=30, output_distribution="normal")
+    X_trans = transformer.fit_transform(X)
+    X_trans_inv = transformer.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+    # make sure it is possible to take the inverse of a sparse matrix
+    # which contains negative values; this is the case in the iris dataset
+    X_sparse = csc_container(X)
+    X_sparse_tran = transformer.fit_transform(X_sparse)
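Roughly how this works (a hedged sketch, not a claim about the exact
implementation): QuantileTransformer learns per-feature landmarks
`quantiles_` aligned with uniform `references_`, and transform is
approximately a per-feature interpolation of the empirical CDF, i.e.
something like

    np.interp(x_col, transformer.quantiles_[:, j], transformer.references_)

for feature j (with `x_col` and `j` as loose illustrative names), with the
inverse mapping interpolating the other way around, which is why
`inverse_transform` can recover the original values up to quantile resolution.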
+    X_sparse_tran_inv = transformer.inverse_transform(X_sparse_tran)
+    assert_array_almost_equal(X_sparse.toarray(), X_sparse_tran_inv.toarray())
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_quantile_transform_check_error(csc_container):
+    X = np.transpose(
+        [
+            [0, 25, 50, 0, 0, 0, 75, 0, 0, 100],
+            [2, 4, 0, 0, 6, 8, 0, 10, 0, 0],
+            [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1],
+        ]
+    )
+    X = csc_container(X)
+    X_neg = np.transpose(
+        [
+            [0, 25, 50, 0, 0, 0, 75, 0, 0, 100],
+            [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0],
+            [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1],
+        ]
+    )
+    X_neg = csc_container(X_neg)
+
+    err_msg = (
+        "The number of quantiles cannot be greater than "
+        "the number of samples used. Got 1000 quantiles "
+        "and 10 samples."
+    )
+    with pytest.raises(ValueError, match=err_msg):
+        QuantileTransformer(subsample=10).fit(X)
+
+    transformer = QuantileTransformer(n_quantiles=10)
+    err_msg = "QuantileTransformer only accepts non-negative sparse matrices."
+    with pytest.raises(ValueError, match=err_msg):
+        transformer.fit(X_neg)
+    transformer.fit(X)
+    err_msg = "QuantileTransformer only accepts non-negative sparse matrices."
+    with pytest.raises(ValueError, match=err_msg):
+        transformer.transform(X_neg)
+
+    X_bad_feat = np.transpose(
+        [[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]
+    )
+    err_msg = (
+        "X has 2 features, but QuantileTransformer is expecting 3 features as input."
+    )
+    with pytest.raises(ValueError, match=err_msg):
+        transformer.inverse_transform(X_bad_feat)
+
+    transformer = QuantileTransformer(n_quantiles=10).fit(X)
+    # check that an error is raised if input is scalar
+    with pytest.raises(ValueError, match="Expected 2D array, got scalar array instead"):
+        transformer.transform(10)
+    # check that a warning is raised if n_quantiles > n_samples
+    transformer = QuantileTransformer(n_quantiles=100)
+    warn_msg = "n_quantiles is set to n_samples"
+    with pytest.warns(UserWarning, match=warn_msg) as record:
+        transformer.fit(X)
+    assert len(record) == 1
+    assert transformer.n_quantiles_ == X.shape[0]
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_quantile_transform_sparse_ignore_zeros(csc_container):
+    X = np.array([[0, 1], [0, 0], [0, 2], [0, 2], [0, 1]])
+    X_sparse = csc_container(X)
+    transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5)
+
+    # dense case -> warning raised
+    warning_message = (
+        "'ignore_implicit_zeros' takes effect"
+        " only with sparse matrix. This parameter has no"
+        " effect."
+    )
+    with pytest.warns(UserWarning, match=warning_message):
+        transformer.fit(X)
+
+    X_expected = np.array([[0, 0], [0, 0], [0, 1], [0, 1], [0, 0]])
+    X_trans = transformer.fit_transform(X_sparse)
+    assert_almost_equal(X_expected, X_trans.toarray())
+
+    # consider the case where sparse entries are missing values and user-given
+    # zeros are to be considered
+    X_data = np.array([0, 0, 1, 0, 2, 2, 1, 0, 1, 2, 0])
+    X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])
+    X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6, 7, 8])
+    X_sparse = csc_container((X_data, (X_row, X_col)))
+    X_trans = transformer.fit_transform(X_sparse)
+    X_expected = np.array(
+        [
+            [0.0, 0.5],
+            [0.0, 0.0],
+            [0.0, 1.0],
+            [0.0, 1.0],
+            [0.0, 0.5],
+            [0.0, 0.0],
+            [0.0, 0.5],
+            [0.0, 1.0],
+            [0.0, 0.0],
+        ]
+    )
+    assert_almost_equal(X_expected, X_trans.toarray())
+
+    transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5)
+    X_data = np.array([-1, -1, 1, 0, 0, 0, 1, -1, 1])
+    X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1])
+    X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6])
+    X_sparse = csc_container((X_data, (X_row, X_col)))
+    X_trans = transformer.fit_transform(X_sparse)
+    X_expected = np.array(
+        [[0, 1], [0, 0.375], [0, 0.375], [0, 0.375], [0, 1], [0, 0], [0, 1]]
+    )
+    assert_almost_equal(X_expected, X_trans.toarray())
+    assert_almost_equal(
+        X_sparse.toarray(), transformer.inverse_transform(X_trans).toarray()
+    )
+
+    # check in conjunction with subsampling
+    transformer = QuantileTransformer(
+        ignore_implicit_zeros=True, n_quantiles=5, subsample=8, random_state=0
+    )
+    X_trans = transformer.fit_transform(X_sparse)
+    assert_almost_equal(X_expected, X_trans.toarray())
+    assert_almost_equal(
+        X_sparse.toarray(), transformer.inverse_transform(X_trans).toarray()
+    )
+
+
+def test_quantile_transform_dense_toy():
+    X = np.array(
+        [[0, 2, 2.6], [25, 4, 4.1], [50, 6, 2.3], [75, 8, 9.5], [100, 10, 0.1]]
+    )
+
+    transformer = QuantileTransformer(n_quantiles=5)
+    transformer.fit(X)
+
+    # using a uniform output, each entry of X should be mapped between 0 and 1
+    # and equally spaced
+    X_trans = transformer.fit_transform(X)
+    X_expected = np.tile(np.linspace(0, 1, num=5), (3, 1)).T
+    assert_almost_equal(np.sort(X_trans, axis=0), X_expected)
+
+    X_test = np.array(
+        [
+            [-1, 1, 0],
+            [101, 11, 10],
+        ]
+    )
+    X_expected = np.array(
+        [
+            [0, 0, 0],
+            [1, 1, 1],
+        ]
+    )
+    assert_array_almost_equal(transformer.transform(X_test), X_expected)
+
+    X_trans_inv = transformer.inverse_transform(X_trans)
+    assert_array_almost_equal(X, X_trans_inv)
+
+
+def test_quantile_transform_subsampling():
+    # Test that subsampling the input yields consistent results. We check
+    # that the computed quantiles are almost mapped to a [0, 1] vector of
+    # equally spaced values. The infinity norm is checked to be smaller
+    # than a given threshold. This is repeated 5 times.
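A hedged back-of-the-envelope for the thresholds asserted below: by the
Dvoretzky-Kiefer-Wolfowitz inequality, an empirical CDF built from m i.i.d.
subsamples deviates from the true CDF by more than eps with probability at
most 2 * exp(-2 * m * eps**2); with m = 100000 (n_samples // 10) and
eps = 1e-2 this is about 2 * exp(-20) ~ 4e-9, so the 1e-2 bound on the
infinity norm is comfortably safe.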
+
+    # dense support
+    n_samples = 1000000
+    n_quantiles = 1000
+    X = np.sort(np.random.sample((n_samples, 1)), axis=0)
+    ROUND = 5
+    inf_norm_arr = []
+    for random_state in range(ROUND):
+        transformer = QuantileTransformer(
+            random_state=random_state,
+            n_quantiles=n_quantiles,
+            subsample=n_samples // 10,
+        )
+        transformer.fit(X)
+        diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_)
+        inf_norm = np.max(np.abs(diff))
+        assert inf_norm < 1e-2
+        inf_norm_arr.append(inf_norm)
+    # each random subsampling yields a unique approximation of the expected
+    # linspace CDF
+    assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr)
+
+    # sparse support
+
+    X = sparse.rand(n_samples, 1, density=0.99, format="csc", random_state=0)
+    inf_norm_arr = []
+    for random_state in range(ROUND):
+        transformer = QuantileTransformer(
+            random_state=random_state,
+            n_quantiles=n_quantiles,
+            subsample=n_samples // 10,
+        )
+        transformer.fit(X)
+        diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_)
+        inf_norm = np.max(np.abs(diff))
+        assert inf_norm < 1e-1
+        inf_norm_arr.append(inf_norm)
+    # each random subsampling yields a unique approximation of the expected
+    # linspace CDF
+    assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr)
+
+
+def test_quantile_transform_subsampling_disabled():
+    """Check the behaviour of `QuantileTransformer` when `subsample=None`."""
+    X = np.random.RandomState(0).normal(size=(200, 1))
+
+    n_quantiles = 5
+    transformer = QuantileTransformer(n_quantiles=n_quantiles, subsample=None).fit(X)
+
+    expected_references = np.linspace(0, 1, n_quantiles)
+    assert_allclose(transformer.references_, expected_references)
+    expected_quantiles = np.quantile(X.ravel(), expected_references)
+    assert_allclose(transformer.quantiles_.ravel(), expected_quantiles)
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_quantile_transform_sparse_toy(csc_container):
+    X = np.array(
+        [
+            [0.0, 2.0, 0.0],
+            [25.0, 4.0, 0.0],
+            [50.0, 0.0, 2.6],
+            [0.0, 0.0, 4.1],
+            [0.0, 6.0, 0.0],
+            [0.0, 8.0, 0.0],
+            [75.0, 0.0, 2.3],
+            [0.0, 10.0, 0.0],
+            [0.0, 0.0, 9.5],
+            [100.0, 0.0, 0.1],
+        ]
+    )
+
+    X = csc_container(X)
+
+    transformer = QuantileTransformer(n_quantiles=10)
+    transformer.fit(X)
+
+    X_trans = transformer.fit_transform(X)
+    assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.0)
+    assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.0)
+
+    X_trans_inv = transformer.inverse_transform(X_trans)
+    assert_array_almost_equal(X.toarray(), X_trans_inv.toarray())
+
+    transformer_dense = QuantileTransformer(n_quantiles=10).fit(X.toarray())
+
+    X_trans = transformer_dense.transform(X)
+    assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.0)
+    assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.0)
+
+    X_trans_inv = transformer_dense.inverse_transform(X_trans)
+    assert_array_almost_equal(X.toarray(), X_trans_inv.toarray())
+
+
+def test_quantile_transform_axis1():
+    X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]])
+
+    X_trans_a0 = quantile_transform(X.T, axis=0, n_quantiles=5)
+    X_trans_a1 = quantile_transform(X, axis=1, n_quantiles=5)
+    assert_array_almost_equal(X_trans_a0, X_trans_a1.T)
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_quantile_transform_bounds(csc_container):
+    # Lower and upper bounds are manually mapped. We check that in the case
+    # of a constant feature and a binary feature, the bounds are properly mapped.
+    X_dense = np.array([[0, 0], [0, 0], [1, 0]])
+    X_sparse = csc_container(X_dense)
+
+    # check sparse and dense are consistent
+    X_trans = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform(X_dense)
+    assert_array_almost_equal(X_trans, X_dense)
+    X_trans_sp = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform(
+        X_sparse
+    )
+    assert_array_almost_equal(X_trans_sp.toarray(), X_dense)
+    assert_array_almost_equal(X_trans, X_trans_sp.toarray())
+
+    # check the consistency of the bounds by learning on 1 matrix
+    # and transforming another
+    X = np.array([[0, 1], [0, 0.5], [1, 0]])
+    X1 = np.array([[0, 0.1], [0, 0.5], [1, 0.1]])
+    transformer = QuantileTransformer(n_quantiles=3).fit(X)
+    X_trans = transformer.transform(X1)
+    assert_array_almost_equal(X_trans, X1)
+
+    # check that values outside of the range learned will be mapped properly.
+    X = np.random.random((1000, 1))
+    transformer = QuantileTransformer()
+    transformer.fit(X)
+    assert transformer.transform([[-10]]) == transformer.transform([[np.min(X)]])
+    assert transformer.transform([[10]]) == transformer.transform([[np.max(X)]])
+    assert transformer.inverse_transform([[-10]]) == transformer.inverse_transform(
+        [[np.min(transformer.references_)]]
+    )
+    assert transformer.inverse_transform([[10]]) == transformer.inverse_transform(
+        [[np.max(transformer.references_)]]
+    )
+
+
+def test_quantile_transform_and_inverse():
+    X_1 = iris.data
+    X_2 = np.array([[0.0], [BOUNDS_THRESHOLD / 10], [1.5], [2], [3], [3], [4]])
+    for X in [X_1, X_2]:
+        transformer = QuantileTransformer(n_quantiles=1000, random_state=0)
+        X_trans = transformer.fit_transform(X)
+        X_trans_inv = transformer.inverse_transform(X_trans)
+        assert_array_almost_equal(X, X_trans_inv, decimal=9)
+
+
+def test_quantile_transform_nan():
+    X = np.array([[np.nan, 0, 0, 1], [np.nan, np.nan, 0, 0.5], [np.nan, 1, 1, 0]])
+
+    transformer = QuantileTransformer(n_quantiles=10, random_state=42)
+    transformer.fit_transform(X)
+
+    # check that the quantiles of the first column are all NaN
+    assert np.isnan(transformer.quantiles_[:, 0]).all()
+    # all other columns should not contain NaN
+    assert not np.isnan(transformer.quantiles_[:, 1:]).any()
+
+
+@pytest.mark.parametrize("array_type", ["array", "sparse"])
+def test_quantile_transformer_sorted_quantiles(array_type):
+    # Non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/15733
+    # Taken from upstream bug report:
+    # https://github.com/numpy/numpy/issues/14685
+    X = np.array([0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 1, 1, 9, 9, 9, 8, 8, 7] * 10)
+    X = 0.1 * X.reshape(-1, 1)
+    X = _convert_container(X, array_type)
+
+    n_quantiles = 100
+    qt = QuantileTransformer(n_quantiles=n_quantiles).fit(X)
+
+    # Check that the estimated quantile thresholds are monotonically
+    # increasing:
+    quantiles = qt.quantiles_[:, 0]
+    assert len(quantiles) == 100
+    assert all(np.diff(quantiles) >= 0)
+
+
+def test_robust_scaler_invalid_range():
+    for range_ in [
+        (-1, 90),
+        (-2, -3),
+        (10, 101),
+        (100.5, 101),
+        (90, 50),
+    ]:
+        scaler = RobustScaler(quantile_range=range_)
+
+        with pytest.raises(ValueError, match=r"Invalid quantile range: \("):
+            scaler.fit(iris.data)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_scale_function_without_centering(csr_container):
+    rng = np.random.RandomState(42)
+    X = rng.randn(4, 5)
+    X[:, 0] = 0.0  # first feature is always zero
+    X_csr = csr_container(X)
+
+    X_scaled = scale(X, with_mean=False)
+    assert not np.any(np.isnan(X_scaled))
+
+    X_csr_scaled =
scale(X_csr, with_mean=False)
+    assert not np.any(np.isnan(X_csr_scaled.data))
+
+    # test csc has same outcome
+    X_csc_scaled = scale(X_csr.tocsc(), with_mean=False)
+    assert_array_almost_equal(X_scaled, X_csc_scaled.toarray())
+
+    # raises value error on axis != 0
+    with pytest.raises(ValueError):
+        scale(X_csr, with_mean=False, axis=1)
+
+    assert_array_almost_equal(
+        X_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2
+    )
+    assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
+    # Check that X has not been modified (copy)
+    assert X_scaled is not X
+
+    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0)
+    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
+    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))
+
+    # null scale
+    X_csr_scaled = scale(X_csr, with_mean=False, with_std=False, copy=True)
+    assert_array_almost_equal(X_csr.toarray(), X_csr_scaled.toarray())
+
+
+def test_robust_scale_axis1():
+    X = iris.data
+    X_trans = robust_scale(X, axis=1)
+    assert_array_almost_equal(np.median(X_trans, axis=1), 0)
+    q = np.percentile(X_trans, q=(25, 75), axis=1)
+    iqr = q[1] - q[0]
+    assert_array_almost_equal(iqr, 1)
+
+
+def test_robust_scale_1d_array():
+    X = iris.data[:, 1]
+    X_trans = robust_scale(X)
+    assert_array_almost_equal(np.median(X_trans), 0)
+    q = np.percentile(X_trans, q=(25, 75))
+    iqr = q[1] - q[0]
+    assert_array_almost_equal(iqr, 1)
+
+
+def test_robust_scaler_zero_variance_features():
+    # Check RobustScaler on toy data with zero variance features
+    X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]]
+
+    scaler = RobustScaler()
+    X_trans = scaler.fit_transform(X)
+
+    # NOTE: for such a small sample size, what we expect in the third column
+    # depends HEAVILY on the method used to calculate quantiles. The values
+    # here were calculated to fit the quantiles produced by np.percentile
+    # using numpy 1.9. Calculating quantiles with
+    # scipy.stats.mstats.scoreatquantile or scipy.stats.mstats.mquantiles
+    # would yield very different results!
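Worked arithmetic for the third column (using np.percentile's default linear
interpolation): the values [0.5, -0.1, 1.1] have median 0.5, q25 = 0.2 and
q75 = 0.8, hence IQR = 0.6 and the scaled column is
(x - 0.5) / 0.6 = [0.0, -1.0, +1.0], matching X_expected below.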
+ X_expected = [[0.0, 0.0, +0.0], [0.0, 0.0, -1.0], [0.0, 0.0, +1.0]] + assert_array_almost_equal(X_trans, X_expected) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # make sure new data gets transformed correctly + X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] + X_trans_new = scaler.transform(X_new) + X_expected_new = [[+0.0, 1.0, +0.0], [-1.0, 0.0, -0.83333], [+0.0, 0.0, +1.66667]] + assert_array_almost_equal(X_trans_new, X_expected_new, decimal=3) + + +def test_robust_scaler_unit_variance(): + # Check RobustScaler with unit_variance=True on standard normal data with + # outliers + rng = np.random.RandomState(42) + X = rng.randn(1000000, 1) + X_with_outliers = np.vstack([X, np.ones((100, 1)) * 100, np.ones((100, 1)) * -100]) + + quantile_range = (1, 99) + robust_scaler = RobustScaler(quantile_range=quantile_range, unit_variance=True).fit( + X_with_outliers + ) + X_trans = robust_scaler.transform(X) + + assert robust_scaler.center_ == pytest.approx(0, abs=1e-3) + assert robust_scaler.scale_ == pytest.approx(1, abs=1e-2) + assert X_trans.std() == pytest.approx(1, abs=1e-2) + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_maxabs_scaler_zero_variance_features(sparse_container): + # Check MaxAbsScaler on toy data with zero variance features + X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.3], [0.0, 1.0, +1.5], [0.0, 0.0, +0.0]] + + scaler = MaxAbsScaler() + X_trans = scaler.fit_transform(X) + X_expected = [ + [0.0, 1.0, 1.0 / 3.0], + [0.0, 1.0, -0.2], + [0.0, 1.0, 1.0], + [0.0, 0.0, 0.0], + ] + assert_array_almost_equal(X_trans, X_expected) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # make sure new data gets transformed correctly + X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] + X_trans_new = scaler.transform(X_new) + X_expected_new = [[+0.0, 2.0, 1.0 / 3.0], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.0]] + + assert_array_almost_equal(X_trans_new, X_expected_new, decimal=2) + + # function interface + X_trans = maxabs_scale(X) + assert_array_almost_equal(X_trans, X_expected) + + # sparse data + X_sparse = sparse_container(X) + X_trans_sparse = scaler.fit_transform(X_sparse) + X_expected = [ + [0.0, 1.0, 1.0 / 3.0], + [0.0, 1.0, -0.2], + [0.0, 1.0, 1.0], + [0.0, 0.0, 0.0], + ] + assert_array_almost_equal(X_trans_sparse.toarray(), X_expected) + X_trans_sparse_inv = scaler.inverse_transform(X_trans_sparse) + assert_array_almost_equal(X, X_trans_sparse_inv.toarray()) + + +def test_maxabs_scaler_large_negative_value(): + # Check MaxAbsScaler on toy data with a large negative value + X = [ + [0.0, 1.0, +0.5, -1.0], + [0.0, 1.0, -0.3, -0.5], + [0.0, 1.0, -100.0, 0.0], + [0.0, 0.0, +0.0, -2.0], + ] + + scaler = MaxAbsScaler() + X_trans = scaler.fit_transform(X) + X_expected = [ + [0.0, 1.0, 0.005, -0.5], + [0.0, 1.0, -0.003, -0.25], + [0.0, 1.0, -1.0, 0.0], + [0.0, 0.0, 0.0, -1.0], + ] + assert_array_almost_equal(X_trans, X_expected) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_maxabs_scaler_transform_one_row_csr(csr_container): + # Check MaxAbsScaler on transforming csr matrix with one row + X = csr_container([[0.5, 1.0, 1.0]]) + scaler = MaxAbsScaler() + scaler = scaler.fit(X) + X_trans = scaler.transform(X) + X_expected = csr_container([[1.0, 1.0, 1.0]]) + assert_array_almost_equal(X_trans.toarray(), X_expected.toarray()) + X_scaled_back = scaler.inverse_transform(X_trans) + 
assert_array_almost_equal(X.toarray(), X_scaled_back.toarray())
+
+
+def test_maxabs_scaler_1d():
+    # Test scaling of dataset along single axis
+    for X in [X_1row, X_1col, X_list_1row, X_list_1col]:
+        scaler = MaxAbsScaler(copy=True)
+        X_scaled = scaler.fit(X).transform(X)
+
+        if isinstance(X, list):
+            X = np.array(X)  # cast only after scaling done
+
+        if _check_dim_1axis(X) == 1:
+            assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), np.ones(n_features))
+        else:
+            assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.0)
+        assert scaler.n_samples_seen_ == X.shape[0]
+
+        # check inverse transform
+        X_scaled_back = scaler.inverse_transform(X_scaled)
+        assert_array_almost_equal(X_scaled_back, X)
+
+    # Constant feature
+    X = np.ones((5, 1))
+    scaler = MaxAbsScaler()
+    X_scaled = scaler.fit(X).transform(X)
+    assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.0)
+    assert scaler.n_samples_seen_ == X.shape[0]
+
+    # function interface
+    X_1d = X_1row.ravel()
+    max_abs = np.abs(X_1d).max()
+    assert_array_almost_equal(X_1d / max_abs, maxabs_scale(X_1d, copy=True))
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_maxabs_scaler_partial_fit(csr_container):
+    # Test that partial_fit, run over many batches of varying size,
+    # gives the same results as a single call to fit
+    X = X_2d[:100, :]
+    n = X.shape[0]
+
+    for chunk_size in [1, 2, 50, n, n + 42]:
+        # Test max_abs at the end of the process
+        scaler_batch = MaxAbsScaler().fit(X)
+
+        scaler_incr = MaxAbsScaler()
+        scaler_incr_csr = MaxAbsScaler()
+        scaler_incr_csc = MaxAbsScaler()
+        for batch in gen_batches(n, chunk_size):
+            scaler_incr = scaler_incr.partial_fit(X[batch])
+            X_csr = csr_container(X[batch])
+            scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr)
+            X_csc = csr_container(X[batch]).tocsc()
+            scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc)
+
+        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
+        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csr.max_abs_)
+        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csc.max_abs_)
+        assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
+        assert scaler_batch.n_samples_seen_ == scaler_incr_csr.n_samples_seen_
+        assert scaler_batch.n_samples_seen_ == scaler_incr_csc.n_samples_seen_
+        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
+        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csr.scale_)
+        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csc.scale_)
+        assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X))
+
+        # Test max_abs after 1 step
+        batch0 = slice(0, chunk_size)
+        scaler_batch = MaxAbsScaler().fit(X[batch0])
+        scaler_incr = MaxAbsScaler().partial_fit(X[batch0])
+
+        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
+        assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
+        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
+        assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X))
+
+        # Test max_abs until the end of the partial fits
+        scaler_batch = MaxAbsScaler().fit(X)
+        scaler_incr = MaxAbsScaler()  # Clean estimator
+        for i, batch in enumerate(gen_batches(n, chunk_size)):
+            scaler_incr = scaler_incr.partial_fit(X[batch])
+            assert_correct_incr(
+                i,
+                batch_start=batch.start,
+                batch_stop=batch.stop,
+                n=n,
+                chunk_size=chunk_size,
+                n_samples_seen=scaler_incr.n_samples_seen_,
+            )
+
+
+def check_normalizer(norm, X_norm):
+    """
+    Convenient checking function for `test_normalizer_l1_l2_max` and
`test_normalizer_l1_l2_max_non_csr` + """ + if norm == "l1": + row_sums = np.abs(X_norm).sum(axis=1) + for i in range(3): + assert_almost_equal(row_sums[i], 1.0) + assert_almost_equal(row_sums[3], 0.0) + elif norm == "l2": + for i in range(3): + assert_almost_equal(la.norm(X_norm[i]), 1.0) + assert_almost_equal(la.norm(X_norm[3]), 0.0) + elif norm == "max": + row_maxs = abs(X_norm).max(axis=1) + for i in range(3): + assert_almost_equal(row_maxs[i], 1.0) + assert_almost_equal(row_maxs[3], 0.0) + + +@pytest.mark.parametrize("norm", ["l1", "l2", "max"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_normalizer_l1_l2_max(norm, csr_container): + rng = np.random.RandomState(0) + X_dense = rng.randn(4, 5) + X_sparse_unpruned = csr_container(X_dense) + + # set the row number 3 to zero + X_dense[3, :] = 0.0 + + # set the row number 3 to zero without pruning (can happen in real life) + indptr_3 = X_sparse_unpruned.indptr[3] + indptr_4 = X_sparse_unpruned.indptr[4] + X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0 + + # build the pruned variant using the regular constructor + X_sparse_pruned = csr_container(X_dense) + + # check inputs that support the no-copy optim + for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): + normalizer = Normalizer(norm=norm, copy=True) + X_norm1 = normalizer.transform(X) + assert X_norm1 is not X + X_norm1 = toarray(X_norm1) + + normalizer = Normalizer(norm=norm, copy=False) + X_norm2 = normalizer.transform(X) + assert X_norm2 is X + X_norm2 = toarray(X_norm2) + + for X_norm in (X_norm1, X_norm2): + check_normalizer(norm, X_norm) + + +@pytest.mark.parametrize("norm", ["l1", "l2", "max"]) +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + LIL_CONTAINERS +) +def test_normalizer_l1_l2_max_non_csr(norm, sparse_container): + rng = np.random.RandomState(0) + X_dense = rng.randn(4, 5) + + # set the row number 3 to zero + X_dense[3, :] = 0.0 + + X = sparse_container(X_dense) + X_norm = Normalizer(norm=norm, copy=False).transform(X) + + assert X_norm is not X + assert sparse.issparse(X_norm) and X_norm.format == "csr" + + X_norm = toarray(X_norm) + check_normalizer(norm, X_norm) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_normalizer_max_sign(csr_container): + # check that we normalize by a positive number even for negative data + rng = np.random.RandomState(0) + X_dense = rng.randn(4, 5) + # set the row number 3 to zero + X_dense[3, :] = 0.0 + # check for mixed data where the value with + # largest magnitude is negative + X_dense[2, abs(X_dense[2, :]).argmax()] *= -1 + X_all_neg = -np.abs(X_dense) + X_all_neg_sparse = csr_container(X_all_neg) + + for X in (X_dense, X_all_neg, X_all_neg_sparse): + normalizer = Normalizer(norm="max") + X_norm = normalizer.transform(X) + assert X_norm is not X + X_norm = toarray(X_norm) + assert_array_equal(np.sign(X_norm), np.sign(toarray(X))) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_normalize(csr_container): + # Test normalize function + # Only tests functionality not used by the tests for Normalizer. 
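+    # Note (illustrative): normalizing the rows of X (the default axis=1) is
+    # the same as normalizing the columns of X.T, which is what the first
+    # assertion below exercises via normalize(X.T, axis=0).T.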
+ X = np.random.RandomState(37).randn(3, 2) + assert_array_equal(normalize(X, copy=False), normalize(X.T, axis=0, copy=False).T) + + rs = np.random.RandomState(0) + X_dense = rs.randn(10, 5) + X_sparse = csr_container(X_dense) + ones = np.ones((10)) + for X in (X_dense, X_sparse): + for dtype in (np.float32, np.float64): + for norm in ("l1", "l2"): + X = X.astype(dtype) + X_norm = normalize(X, norm=norm) + assert X_norm.dtype == dtype + + X_norm = toarray(X_norm) + if norm == "l1": + row_sums = np.abs(X_norm).sum(axis=1) + else: + X_norm_squared = X_norm**2 + row_sums = X_norm_squared.sum(axis=1) + + assert_array_almost_equal(row_sums, ones) + + # Test return_norm + X_dense = np.array([[3.0, 0, 4.0], [1.0, 0.0, 0.0], [2.0, 3.0, 0.0]]) + for norm in ("l1", "l2", "max"): + _, norms = normalize(X_dense, norm=norm, return_norm=True) + if norm == "l1": + assert_array_almost_equal(norms, np.array([7.0, 1.0, 5.0])) + elif norm == "l2": + assert_array_almost_equal(norms, np.array([5.0, 1.0, 3.60555127])) + else: + assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0])) + + X_sparse = csr_container(X_dense) + for norm in ("l1", "l2"): + with pytest.raises(NotImplementedError): + normalize(X_sparse, norm=norm, return_norm=True) + _, norms = normalize(X_sparse, norm="max", return_norm=True) + assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0])) + + +@pytest.mark.parametrize( + "constructor", [np.array, list] + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_binarizer(constructor): + X_ = np.array([[1, 0, 5], [2, 3, -1]]) + X = constructor(X_.copy()) + + binarizer = Binarizer(threshold=2.0, copy=True) + X_bin = toarray(binarizer.transform(X)) + assert np.sum(X_bin == 0) == 4 + assert np.sum(X_bin == 1) == 2 + X_bin = binarizer.transform(X) + assert sparse.issparse(X) == sparse.issparse(X_bin) + + binarizer = Binarizer(copy=True).fit(X) + X_bin = toarray(binarizer.transform(X)) + assert X_bin is not X + assert np.sum(X_bin == 0) == 2 + assert np.sum(X_bin == 1) == 4 + + binarizer = Binarizer(copy=True) + X_bin = binarizer.transform(X) + assert X_bin is not X + X_bin = toarray(X_bin) + assert np.sum(X_bin == 0) == 2 + assert np.sum(X_bin == 1) == 4 + + binarizer = Binarizer(copy=False) + X_bin = binarizer.transform(X) + if constructor is not list: + assert X_bin is X + + binarizer = Binarizer(copy=False) + X_float = np.array([[1, 0, 5], [2, 3, -1]], dtype=np.float64) + X_bin = binarizer.transform(X_float) + if constructor is not list: + assert X_bin is X_float + + X_bin = toarray(X_bin) + assert np.sum(X_bin == 0) == 2 + assert np.sum(X_bin == 1) == 4 + + binarizer = Binarizer(threshold=-0.5, copy=True) + if constructor in (np.array, list): + X = constructor(X_.copy()) + + X_bin = toarray(binarizer.transform(X)) + assert np.sum(X_bin == 0) == 1 + assert np.sum(X_bin == 1) == 5 + X_bin = binarizer.transform(X) + + # Cannot use threshold < 0 for sparse + if constructor in CSC_CONTAINERS: + with pytest.raises(ValueError): + binarizer.transform(constructor(X)) + + +def test_center_kernel(): + # Test that KernelCenterer is equivalent to StandardScaler + # in feature space + rng = np.random.RandomState(0) + X_fit = rng.random_sample((5, 4)) + scaler = StandardScaler(with_std=False) + scaler.fit(X_fit) + X_fit_centered = scaler.transform(X_fit) + K_fit = np.dot(X_fit, X_fit.T) + + # center fit time matrix + centerer = KernelCenterer() + K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T) + K_fit_centered2 = centerer.fit_transform(K_fit) + assert_array_almost_equal(K_fit_centered, K_fit_centered2) 
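+    # Note (illustrative): for a linear kernel K = X X^T, centering the
+    # features (x_i - mean) expands to the double-centering of the Gram
+    # matrix, K - 1_M K - K 1_M + 1_M K 1_M; this identity is asserted
+    # explicitly against equation (B.3) further below.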
+
+    # center predict time matrix
+    X_pred = rng.random_sample((2, 4))
+    K_pred = np.dot(X_pred, X_fit.T)
+    X_pred_centered = scaler.transform(X_pred)
+    K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T)
+    K_pred_centered2 = centerer.transform(K_pred)
+    assert_array_almost_equal(K_pred_centered, K_pred_centered2)
+
+    # check the results coherence with the method proposed in:
+    # B. Schölkopf, A. Smola, and K.R. Müller,
+    # "Nonlinear component analysis as a kernel eigenvalue problem"
+    # equation (B.3)
+
+    # K_centered3 = (I - 1_M) K (I - 1_M)
+    #             = K - 1_M K - K 1_M + 1_M K 1_M
+    ones_M = np.ones_like(K_fit) / K_fit.shape[0]
+    K_fit_centered3 = K_fit - ones_M @ K_fit - K_fit @ ones_M + ones_M @ K_fit @ ones_M
+    assert_allclose(K_fit_centered, K_fit_centered3)
+
+    # K_test_centered3 = (K_test - 1'_M K)(I - 1_M)
+    #                  = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M
+    ones_prime_M = np.ones_like(K_pred) / K_fit.shape[0]
+    K_pred_centered3 = (
+        K_pred - ones_prime_M @ K_fit - K_pred @ ones_M + ones_prime_M @ K_fit @ ones_M
+    )
+    assert_allclose(K_pred_centered, K_pred_centered3)
+
+
+def test_kernelcenterer_non_linear_kernel():
+    """Check kernel centering for a non-linear kernel."""
+    rng = np.random.RandomState(0)
+    X, X_test = rng.randn(100, 50), rng.randn(20, 50)
+
+    def phi(X):
+        """Our mapping function phi."""
+        return np.vstack(
+            [
+                np.clip(X, a_min=0, a_max=None),
+                -np.clip(X, a_min=None, a_max=0),
+            ]
+        )
+
+    phi_X = phi(X)
+    phi_X_test = phi(X_test)
+
+    # center the projection
+    scaler = StandardScaler(with_std=False)
+    phi_X_center = scaler.fit_transform(phi_X)
+    phi_X_test_center = scaler.transform(phi_X_test)
+
+    # create the different kernels
+    K = phi_X @ phi_X.T
+    K_test = phi_X_test @ phi_X.T
+    K_center = phi_X_center @ phi_X_center.T
+    K_test_center = phi_X_test_center @ phi_X_center.T
+
+    kernel_centerer = KernelCenterer()
+    kernel_centerer.fit(K)
+
+    assert_allclose(kernel_centerer.transform(K), K_center)
+    assert_allclose(kernel_centerer.transform(K_test), K_test_center)
+
+    # check the results coherence with the method proposed in:
+    # B. Schölkopf, A. Smola, and K.R. Müller,
+    # "Nonlinear component analysis as a kernel eigenvalue problem"
+    # equation (B.3)
+
+    # K_centered = (I - 1_M) K (I - 1_M)
+    #            = K - 1_M K - K 1_M + 1_M K 1_M
+    ones_M = np.ones_like(K) / K.shape[0]
+    K_centered = K - ones_M @ K - K @ ones_M + ones_M @ K @ ones_M
+    assert_allclose(kernel_centerer.transform(K), K_centered)
+
+    # K_test_centered = (K_test - 1'_M K)(I - 1_M)
+    #                 = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M
+    ones_prime_M = np.ones_like(K_test) / K.shape[0]
+    K_test_centered = (
+        K_test - ones_prime_M @ K - K_test @ ones_M + ones_prime_M @ K @ ones_M
+    )
+    assert_allclose(kernel_centerer.transform(K_test), K_test_centered)
+
+
+def test_cv_pipeline_precomputed():
+    # Cross-validate a regression on four coplanar points with the same
+    # value. Use precomputed kernel to ensure Pipeline with KernelCenterer
+    # is treated as a pairwise operation.
+    X = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]])
+    y_true = np.ones((4,))
+    K = X.dot(X.T)
+    kcent = KernelCenterer()
+    pipeline = Pipeline([("kernel_centerer", kcent), ("svr", SVR())])
+
+    # did the pipeline set the pairwise attribute?
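+    # Note (illustrative): the pairwise input tag tells cross-validation
+    # utilities that X is an (n_samples, n_samples) kernel matrix, so
+    # train/test splits must index both rows and columns of K, not rows only.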
+ assert pipeline.__sklearn_tags__().input_tags.pairwise + + # test cross-validation, score should be almost perfect + # NB: this test is pretty vacuous -- it's mainly to test integration + # of Pipeline and KernelCenterer + y_pred = cross_val_predict(pipeline, K, y_true, cv=2) + assert_array_almost_equal(y_true, y_pred) + + +def test_fit_transform(): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + for obj in (StandardScaler(), Normalizer(), Binarizer()): + X_transformed = obj.fit(X).transform(X) + X_transformed2 = obj.fit_transform(X) + assert_array_equal(X_transformed, X_transformed2) + + +def test_add_dummy_feature(): + X = [[1, 0], [0, 1], [0, 1]] + X = add_dummy_feature(X) + assert_array_equal(X, [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) + + +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_add_dummy_feature_sparse(sparse_container): + X = sparse_container([[1, 0], [0, 1], [0, 1]]) + desired_format = X.format + X = add_dummy_feature(X) + assert sparse.issparse(X) and X.format == desired_format, X + assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) + + +def test_fit_cold_start(): + X = iris.data + X_2d = X[:, :2] + + # Scalers that have a partial_fit method + scalers = [ + StandardScaler(with_mean=False, with_std=False), + MinMaxScaler(), + MaxAbsScaler(), + ] + + for scaler in scalers: + scaler.fit_transform(X) + # with a different shape, this may break the scaler unless the internal + # state is reset + scaler.fit_transform(X_2d) + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +def test_power_transformer_notfitted(method): + pt = PowerTransformer(method=method) + X = np.abs(X_1col) + with pytest.raises(NotFittedError): + pt.transform(X) + with pytest.raises(NotFittedError): + pt.inverse_transform(X) + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +@pytest.mark.parametrize("standardize", [True, False]) +@pytest.mark.parametrize("X", [X_1col, X_2d]) +def test_power_transformer_inverse(method, standardize, X): + # Make sure we get the original input when applying transform and then + # inverse transform + X = np.abs(X) if method == "box-cox" else X + pt = PowerTransformer(method=method, standardize=standardize) + X_trans = pt.fit_transform(X) + assert_almost_equal(X, pt.inverse_transform(X_trans)) + + +def test_power_transformer_1d(): + X = np.abs(X_1col) + + for standardize in [True, False]: + pt = PowerTransformer(method="box-cox", standardize=standardize) + + X_trans = pt.fit_transform(X) + X_trans_func = power_transform(X, method="box-cox", standardize=standardize) + + X_expected, lambda_expected = stats.boxcox(X.flatten()) + + if standardize: + X_expected = scale(X_expected) + + assert_almost_equal(X_expected.reshape(-1, 1), X_trans) + assert_almost_equal(X_expected.reshape(-1, 1), X_trans_func) + + assert_almost_equal(X, pt.inverse_transform(X_trans)) + assert_almost_equal(lambda_expected, pt.lambdas_[0]) + + assert len(pt.lambdas_) == X.shape[1] + assert isinstance(pt.lambdas_, np.ndarray) + + +def test_power_transformer_2d(): + X = np.abs(X_2d) + + for standardize in [True, False]: + pt = PowerTransformer(method="box-cox", standardize=standardize) + + X_trans_class = pt.fit_transform(X) + X_trans_func = power_transform(X, method="box-cox", standardize=standardize) + + for X_trans in [X_trans_class, X_trans_func]: + for j in range(X_trans.shape[1]): + X_expected, lmbda = stats.boxcox(X[:, j].flatten()) + + if standardize: + X_expected = 
scale(X_expected)
+
+                assert_almost_equal(X_trans[:, j], X_expected)
+                assert_almost_equal(lmbda, pt.lambdas_[j])
+
+            # Test inverse transformation
+            X_inv = pt.inverse_transform(X_trans)
+            assert_array_almost_equal(X_inv, X)
+
+        assert len(pt.lambdas_) == X.shape[1]
+        assert isinstance(pt.lambdas_, np.ndarray)
+
+
+def test_power_transformer_boxcox_strictly_positive_exception():
+    # Exceptions should be raised for negative arrays and zero arrays when
+    # method is boxcox
+
+    pt = PowerTransformer(method="box-cox")
+    pt.fit(np.abs(X_2d))
+    X_with_negatives = X_2d
+    not_positive_message = "strictly positive"
+
+    with pytest.raises(ValueError, match=not_positive_message):
+        pt.transform(X_with_negatives)
+
+    with pytest.raises(ValueError, match=not_positive_message):
+        pt.fit(X_with_negatives)
+
+    with pytest.raises(ValueError, match=not_positive_message):
+        power_transform(X_with_negatives, method="box-cox")
+
+    with pytest.raises(ValueError, match=not_positive_message):
+        pt.transform(np.zeros(X_2d.shape))
+
+    with pytest.raises(ValueError, match=not_positive_message):
+        pt.fit(np.zeros(X_2d.shape))
+
+    with pytest.raises(ValueError, match=not_positive_message):
+        power_transform(np.zeros(X_2d.shape), method="box-cox")
+
+
+@pytest.mark.parametrize("X", [X_2d, np.abs(X_2d), -np.abs(X_2d), np.zeros(X_2d.shape)])
+def test_power_transformer_yeojohnson_any_input(X):
+    # Yeo-Johnson method should support any kind of input
+    power_transform(X, method="yeo-johnson")
+
+
+@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
+def test_power_transformer_shape_exception(method):
+    pt = PowerTransformer(method=method)
+    X = np.abs(X_2d)
+    pt.fit(X)
+
+    # Exceptions should be raised for arrays with different num_columns
+    # than during fitting
+    wrong_shape_message = (
+        r"X has \d+ features, but PowerTransformer is " r"expecting \d+ features"
+    )
+
+    with pytest.raises(ValueError, match=wrong_shape_message):
+        pt.transform(X[:, 0:1])
+
+    with pytest.raises(ValueError, match=wrong_shape_message):
+        pt.inverse_transform(X[:, 0:1])
+
+
+def test_power_transformer_lambda_zero():
+    pt = PowerTransformer(method="box-cox", standardize=False)
+    X = np.abs(X_2d)[:, 0:1]
+
+    # Test the lambda = 0 case
+    pt.lambdas_ = np.array([0])
+    X_trans = pt.transform(X)
+    assert_array_almost_equal(pt.inverse_transform(X_trans), X)
+
+
+def test_power_transformer_lambda_one():
+    # Make sure lambda = 1 corresponds to the identity for yeo-johnson
+    pt = PowerTransformer(method="yeo-johnson", standardize=False)
+    X = np.abs(X_2d)[:, 0:1]
+
+    pt.lambdas_ = np.array([1])
+    X_trans = pt.transform(X)
+    assert_array_almost_equal(X_trans, X)
+
+
+@pytest.mark.parametrize(
+    "method, lmbda",
+    [
+        ("box-cox", 0.1),
+        ("box-cox", 0.5),
+        ("yeo-johnson", 0.1),
+        ("yeo-johnson", 0.5),
+        ("yeo-johnson", 1.0),
+    ],
+)
+def test_optimization_power_transformer(method, lmbda):
+    # Test the optimization procedure:
+    # - set a predefined value for lambda
+    # - apply inverse_transform to a normal dist (we get X_inv)
+    # - apply fit_transform to X_inv (we get X_inv_trans)
+    # - check that X_inv_trans is roughly equal to X
+
+    rng = np.random.RandomState(0)
+    n_samples = 20000
+    X = rng.normal(loc=0, scale=1, size=(n_samples, 1))
+
+    if method == "box-cox":
+        # For box-cox, the inverse transform requires lmbda * y + 1 > 0,
+        # i.e. y > -1 / lmbda. Clip the data here to make sure the
+        # inequality is valid.
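+        # Note (illustrative): for lmbda != 0 the box-cox inverse is
+        # x = (lmbda * y + 1) ** (1 / lmbda), which is real-valued only
+        # when lmbda * y + 1 > 0, hence the clipping just below.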
+        X = np.clip(X, -1 / lmbda + 1e-5, None)
+
+    pt = PowerTransformer(method=method, standardize=False)
+    pt.lambdas_ = [lmbda]
+    X_inv = pt.inverse_transform(X)
+
+    pt = PowerTransformer(method=method, standardize=False)
+    X_inv_trans = pt.fit_transform(X_inv)
+
+    assert_almost_equal(0, np.linalg.norm(X - X_inv_trans) / n_samples, decimal=2)
+    assert_almost_equal(0, X_inv_trans.mean(), decimal=1)
+    assert_almost_equal(1, X_inv_trans.std(), decimal=1)
+
+
+def test_inverse_box_cox():
+    # The inverse transform should output nan if the input is invalid
+    pt = PowerTransformer(method="box-cox", standardize=False)
+    pt.lambdas_ = [0.5]
+    X_inv = pt.inverse_transform([[-2.1]])
+    assert np.isnan(X_inv)
+
+
+def test_yeo_johnson_darwin_example():
+    # Test from the original paper "A new family of power transformations to
+    # improve normality or symmetry" by Yeo and Johnson.
+    X = [6.1, -8.4, 1.0, 2.0, 0.7, 2.9, 3.5, 5.1, 1.8, 3.6, 7.0, 3.0, 9.3, 7.5, -6.0]
+    X = np.array(X).reshape(-1, 1)
+    lmbda = PowerTransformer(method="yeo-johnson").fit(X).lambdas_
+    assert np.allclose(lmbda, 1.305, atol=1e-3)
+
+
+@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
+def test_power_transformer_nans(method):
+    # Make sure lambda estimation is not influenced by NaN values
+    # and that transform() supports NaN silently
+
+    X = np.abs(X_1col)
+    pt = PowerTransformer(method=method)
+    pt.fit(X)
+    lmbda_no_nans = pt.lambdas_[0]
+
+    # concat nans at the end and check lambda stays the same
+    X = np.concatenate([X, np.full_like(X, np.nan)])
+    X = shuffle(X, random_state=0)
+
+    pt.fit(X)
+    lmbda_nans = pt.lambdas_[0]
+
+    assert_almost_equal(lmbda_no_nans, lmbda_nans, decimal=5)
+
+    X_trans = pt.transform(X)
+    assert_array_equal(np.isnan(X_trans), np.isnan(X))
+
+
+@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
+@pytest.mark.parametrize("standardize", [True, False])
+def test_power_transformer_fit_transform(method, standardize):
+    # check that fit_transform() and fit().transform() return the same values
+    X = X_1col
+    if method == "box-cox":
+        X = np.abs(X)
+
+    pt = PowerTransformer(method, standardize=standardize)
+    assert_array_almost_equal(pt.fit(X).transform(X), pt.fit_transform(X))
+
+
+@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
+@pytest.mark.parametrize("standardize", [True, False])
+def test_power_transformer_copy_True(method, standardize):
+    # Check that neither fit, transform, fit_transform nor inverse_transform
+    # modify X inplace when copy=True
+    X = X_1col
+    if method == "box-cox":
+        X = np.abs(X)
+
+    X_original = X.copy()
+    assert X is not X_original  # sanity checks
+    assert_array_almost_equal(X, X_original)
+
+    pt = PowerTransformer(method, standardize=standardize, copy=True)
+
+    pt.fit(X)
+    assert_array_almost_equal(X, X_original)
+    X_trans = pt.transform(X)
+    assert X_trans is not X
+
+    X_trans = pt.fit_transform(X)
+    assert_array_almost_equal(X, X_original)
+    assert X_trans is not X
+
+    X_inv_trans = pt.inverse_transform(X_trans)
+    assert X_trans is not X_inv_trans
+
+
+@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
+@pytest.mark.parametrize("standardize", [True, False])
+def test_power_transformer_copy_False(method, standardize):
+    # check that when copy=False fit doesn't change X inplace, but transform,
+    # fit_transform and inverse_transform do.
+    X = X_1col
+    if method == "box-cox":
+        X = np.abs(X)
+
+    X_original = X.copy()
+    assert X is not X_original  # sanity checks
+    assert_array_almost_equal(X, X_original)
+
+    pt = PowerTransformer(method, standardize=standardize, copy=False)
+
+    pt.fit(X)
+    assert_array_almost_equal(X, X_original)  # fit didn't change X
+
+    X_trans = pt.transform(X)
+    assert X_trans is X
+
+    if method == "box-cox":
+        X = np.abs(X)
+    X_trans = pt.fit_transform(X)
+    assert X_trans is X
+
+    X_inv_trans = pt.inverse_transform(X_trans)
+    assert X_trans is X_inv_trans
+
+
+def test_power_transformer_box_cox_raise_all_nans_col():
+    """Check that box-cox raises an informative error when a column contains
+    all nans.
+
+    Non-regression test for gh-26303
+    """
+    X = rng.random_sample((4, 5))
+    X[:, 0] = np.nan
+
+    err_msg = "Column must not be all nan."
+
+    pt = PowerTransformer(method="box-cox")
+    with pytest.raises(ValueError, match=err_msg):
+        pt.fit_transform(X)
+
+
+@pytest.mark.parametrize(
+    "X_2",
+    [sparse.random(10, 1, density=0.8, random_state=0)]
+    + [
+        csr_container(np.full((10, 1), fill_value=np.nan))
+        for csr_container in CSR_CONTAINERS
+    ],
+)
+def test_standard_scaler_sparse_partial_fit_finite_variance(X_2):
+    # non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/16448
+    X_1 = sparse.random(5, 1, density=0.8)
+    scaler = StandardScaler(with_mean=False)
+    scaler.fit(X_1).partial_fit(X_2)
+    assert np.isfinite(scaler.var_[0])
+
+
+@pytest.mark.parametrize("feature_range", [(0, 1), (-10, 10)])
+def test_minmax_scaler_clip(feature_range):
+    # test behaviour of the parameter 'clip' in MinMaxScaler
+    X = iris.data
+    scaler = MinMaxScaler(feature_range=feature_range, clip=True).fit(X)
+    X_min, X_max = np.min(X, axis=0), np.max(X, axis=0)
+    X_test = [np.r_[X_min[:2] - 10, X_max[2:] + 10]]
+    X_transformed = scaler.transform(X_test)
+    assert_allclose(
+        X_transformed,
+        [[feature_range[0], feature_range[0], feature_range[1], feature_range[1]]],
+    )
+
+
+def test_standard_scaler_raise_error_for_1d_input():
+    """Check that `inverse_transform` from `StandardScaler` raises an error
+    with 1D array.
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/19518
+    """
+    scaler = StandardScaler().fit(X_2d)
+    err_msg = "Expected 2D array, got 1D array instead"
+    with pytest.raises(ValueError, match=err_msg):
+        scaler.inverse_transform(X_2d[:, 0])
+
+
+def test_power_transformer_significantly_non_gaussian():
+    """Check that significantly non-Gaussian data is transformed correctly.
+
+    For some explored lambdas, the transformed data may be constant and will
+    be rejected.
Non-regression test for
+    https://github.com/scikit-learn/scikit-learn/issues/14959
+    """
+
+    X_non_gaussian = 1e6 * np.array(
+        [0.6, 2.0, 3.0, 4.0] * 4 + [11, 12, 12, 16, 17, 20, 85, 90], dtype=np.float64
+    ).reshape(-1, 1)
+    pt = PowerTransformer()
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("error", RuntimeWarning)
+        X_trans = pt.fit_transform(X_non_gaussian)
+
+    assert not np.any(np.isnan(X_trans))
+    assert X_trans.mean() == pytest.approx(0.0)
+    assert X_trans.std() == pytest.approx(1.0)
+    assert X_trans.min() > -2
+    assert X_trans.max() < 2
+
+
+@pytest.mark.parametrize(
+    "Transformer",
+    [
+        MinMaxScaler,
+        MaxAbsScaler,
+        RobustScaler,
+        StandardScaler,
+        QuantileTransformer,
+        PowerTransformer,
+    ],
+)
+def test_one_to_one_features(Transformer):
+    """Check one-to-one transformers give correct feature names."""
+    tr = Transformer().fit(iris.data)
+    names_out = tr.get_feature_names_out(iris.feature_names)
+    assert_array_equal(names_out, iris.feature_names)
+
+
+@pytest.mark.parametrize(
+    "Transformer",
+    [
+        MinMaxScaler,
+        MaxAbsScaler,
+        RobustScaler,
+        StandardScaler,
+        QuantileTransformer,
+        PowerTransformer,
+        Normalizer,
+        Binarizer,
+    ],
+)
+def test_one_to_one_features_pandas(Transformer):
+    """Check one-to-one transformers give correct feature names with pandas."""
+    pd = pytest.importorskip("pandas")
+
+    df = pd.DataFrame(iris.data, columns=iris.feature_names)
+    tr = Transformer().fit(df)
+
+    names_out_df_default = tr.get_feature_names_out()
+    assert_array_equal(names_out_df_default, iris.feature_names)
+
+    names_out_df_valid_in = tr.get_feature_names_out(iris.feature_names)
+    assert_array_equal(names_out_df_valid_in, iris.feature_names)
+
+    msg = re.escape("input_features is not equal to feature_names_in_")
+    with pytest.raises(ValueError, match=msg):
+        invalid_names = list("abcd")
+        tr.get_feature_names_out(invalid_names)
+
+
+def test_kernel_centerer_feature_names_out():
+    """Check `get_feature_names_out` for `KernelCenterer`."""
+
+    rng = np.random.RandomState(0)
+    X = rng.random_sample((6, 4))
+    X_pairwise = linear_kernel(X)
+    centerer = KernelCenterer().fit(X_pairwise)
+
+    names_out = centerer.get_feature_names_out()
+    samples_out2 = X_pairwise.shape[1]
+    assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)])
+
+
+@pytest.mark.parametrize("standardize", [True, False])
+def test_power_transformer_constant_feature(standardize):
+    """Check that PowerTransformer leaves constant features unchanged."""
+    X = [[-2, 0, 2], [-2, 0, 2], [-2, 0, 2]]
+
+    pt = PowerTransformer(method="yeo-johnson", standardize=standardize).fit(X)
+
+    assert_allclose(pt.lambdas_, [1, 1, 1])
+
+    Xft = pt.fit_transform(X)
+    Xt = pt.transform(X)
+
+    for Xt_ in [Xft, Xt]:
+        if standardize:
+            assert_allclose(Xt_, np.zeros_like(X))
+        else:
+            assert_allclose(Xt_, X)
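+
+
+# Illustrative sketch (an addition, not part of the upstream scikit-learn test
+# suite): the yeo-johnson transform used above reduces to the identity at
+# lambda = 1, which is why the constant-feature test expects
+# lambdas_ == [1, 1, 1]. The helper `_yeo_johnson_reference` below is a
+# minimal reimplementation assuming the standard definition from
+# Yeo & Johnson (2000), for lmbda not in {0, 2}.
+def _yeo_johnson_reference(x, lmbda):
+    # x >= 0: ((x + 1) ** lmbda - 1) / lmbda                   (lmbda != 0)
+    # x <  0: -(((1 - x) ** (2 - lmbda)) - 1) / (2 - lmbda)    (lmbda != 2)
+    x = np.asarray(x, dtype=float)
+    out = np.empty_like(x)
+    pos = x >= 0
+    out[pos] = ((x[pos] + 1) ** lmbda - 1) / lmbda
+    out[~pos] = -(((1 - x[~pos]) ** (2 - lmbda)) - 1) / (2 - lmbda)
+    return out
+
+
+def test_yeo_johnson_reference_identity_at_lambda_one():
+    # At lambda = 1 both branches simplify to x, so the transform is the
+    # identity; cross-check against PowerTransformer with lambdas_ forced,
+    # mirroring the pattern used in test_power_transformer_lambda_one.
+    X = np.array([[-2.0], [-0.5], [0.0], [1.5], [3.0]])
+    assert_allclose(_yeo_johnson_reference(X, 1.0), X)
+
+    pt = PowerTransformer(method="yeo-johnson", standardize=False)
+    pt.lambdas_ = np.array([1.0])
+    assert_allclose(pt.transform(X), X)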