diff --git a/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_1119/api-v1-jdf-1119.json.gz b/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_1119/api-v1-jdf-1119.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..cfe21c720a6a6f97d6857de1d0cf268ab20dda53 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_1119/api-v1-jdf-1119.json.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82f899edc59cb41fdd671b256a228e5e06dfc5e24c92712e75005b251b000865 +size 1108 diff --git a/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_1119/api-v1-jdl-dn-adult-census-l-2-s-act-.json.gz b/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_1119/api-v1-jdl-dn-adult-census-l-2-s-act-.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..7b7718d29ecb2075088f54c5f2c5fc0d01d9404b --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_1119/api-v1-jdl-dn-adult-census-l-2-s-act-.json.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ec0955788914fa81f698e97a4d1aff773d7a125ed6e769c6271a0b48fc4011d +size 363 diff --git a/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_1119/api-v1-jdq-1119.json.gz b/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_1119/api-v1-jdq-1119.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..3265a7d933efe836193228b86e84c6c7a8b45afd --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_1119/api-v1-jdq-1119.json.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef7cbcb58c2edcfea45c058b751faf7783e710462a924e9aacad8d47a7e9f94b +size 1549 diff --git a/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_1119/data-v1-dl-54002.arff.gz b/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_1119/data-v1-dl-54002.arff.gz new file mode 100644 index 0000000000000000000000000000000000000000..8f610044b5cc550df4d4ef18cd2131306dba05be --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_1119/data-v1-dl-54002.arff.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6931af256195fcdd2e47dd8b0f9edf16fbf03b198e77b70e3dfd9877cdf09515 +size 1190 diff --git a/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_2/api-v1-jdq-2.json.gz b/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_2/api-v1-jdq-2.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..08e36a9fb7d7eb1d95b74eebf7c1b870d4a052c1 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/datasets/tests/data/openml/id_2/api-v1-jdq-2.json.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c46f6c5f221d877de604b906403b20cbdf674f1225bcdbb3e15bd1882a69a471 +size 1501 diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/__init__.cpython-39.pyc b/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0cea88a8108e7c0c65504c7c20ee9648c7a7f38 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/__init__.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_data.cpython-39.pyc b/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_data.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..92b5a2ae00e06f5e34b8f0f928cfb6a54c1f581a Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_data.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_discretization.cpython-39.pyc b/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_discretization.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8f1bf8e07e48854f71219e8d57e65e04421adb4 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_discretization.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_encoders.cpython-39.pyc b/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_encoders.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..673bfab9c9680e0a09239382e3959966d9ac86ec Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_encoders.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_function_transformer.cpython-39.pyc b/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_function_transformer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93fe7febf23b2fb5238eaa513caa8e0cb4c36c9b Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_function_transformer.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_label.cpython-39.pyc b/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_label.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36fc1c01e52563baa131527cec48817da42e23ae Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_label.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_polynomial.cpython-39.pyc b/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_polynomial.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ff486e5112680c566941d3b47e35942e43204d0 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_polynomial.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_target_encoder.cpython-39.pyc b/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_target_encoder.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5cbff0038b957aac33a47b7dddffef44be6a520 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/preprocessing/__pycache__/_target_encoder.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/tests/__init__.py b/.venv/Lib/site-packages/sklearn/preprocessing/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_discretization.py b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_discretization.py new file mode 100644 index 0000000000000000000000000000000000000000..69a0fc5ad9df1f908bcbe46bd8f20af7bca83d86 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_discretization.py @@ -0,0 +1,500 @@ +import warnings + +import numpy as np +import pytest +import scipy.sparse as sp + +from sklearn import clone +from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder +from 
sklearn.utils._testing import ( + assert_allclose, + assert_allclose_dense_sparse, + assert_array_almost_equal, + assert_array_equal, +) + +X = [[-2, 1.5, -4, -1], [-1, 2.5, -3, -0.5], [0, 3.5, -2, 0.5], [1, 4.5, -1, 2]] + + +@pytest.mark.parametrize( + "strategy, expected, sample_weight", + [ + ("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]], None), + ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]], None), + ("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], None), + ( + "quantile", + [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], + [1, 1, 2, 1], + ), + ( + "quantile", + [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], + [1, 1, 1, 1], + ), + ( + "quantile", + [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]], + [0, 1, 1, 1], + ), + ( + "kmeans", + [[0, 0, 0, 0], [1, 1, 1, 0], [1, 1, 1, 1], [2, 2, 2, 2]], + [1, 0, 3, 1], + ), + ( + "kmeans", + [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]], + [1, 1, 1, 1], + ), + ], +) +def test_fit_transform(strategy, expected, sample_weight): + est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy) + est.fit(X, sample_weight=sample_weight) + assert_array_equal(expected, est.transform(X)) + + +def test_valid_n_bins(): + KBinsDiscretizer(n_bins=2).fit_transform(X) + KBinsDiscretizer(n_bins=np.array([2])[0]).fit_transform(X) + assert KBinsDiscretizer(n_bins=2).fit(X).n_bins_.dtype == np.dtype(int) + + +@pytest.mark.parametrize("strategy", ["uniform"]) +def test_kbinsdiscretizer_wrong_strategy_with_weights(strategy): + """Check that we raise an error when the wrong strategy is used.""" + sample_weight = np.ones(shape=(len(X))) + est = KBinsDiscretizer(n_bins=3, strategy=strategy) + err_msg = ( + "`sample_weight` was provided but it cannot be used with strategy='uniform'." + ) + with pytest.raises(ValueError, match=err_msg): + est.fit(X, sample_weight=sample_weight) + + +def test_invalid_n_bins_array(): + # Bad shape + n_bins = np.full((2, 4), 2.0) + est = KBinsDiscretizer(n_bins=n_bins) + err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)." + with pytest.raises(ValueError, match=err_msg): + est.fit_transform(X) + + # Incorrect number of features + n_bins = [1, 2, 2] + est = KBinsDiscretizer(n_bins=n_bins) + err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)." + with pytest.raises(ValueError, match=err_msg): + est.fit_transform(X) + + # Bad bin values + n_bins = [1, 2, 2, 1] + est = KBinsDiscretizer(n_bins=n_bins) + err_msg = ( + "KBinsDiscretizer received an invalid number of bins " + "at indices 0, 3. Number of bins must be at least 2, " + "and must be an int." + ) + with pytest.raises(ValueError, match=err_msg): + est.fit_transform(X) + + # Float bin values + n_bins = [2.1, 2, 2.1, 2] + est = KBinsDiscretizer(n_bins=n_bins) + err_msg = ( + "KBinsDiscretizer received an invalid number of bins " + "at indices 0, 2. Number of bins must be at least 2, " + "and must be an int." 
+ ) + with pytest.raises(ValueError, match=err_msg): + est.fit_transform(X) + + +@pytest.mark.parametrize( + "strategy, expected, sample_weight", + [ + ("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]], None), + ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]], None), + ("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], None), + ( + "quantile", + [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], + [1, 1, 3, 1], + ), + ( + "quantile", + [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]], + [0, 1, 3, 1], + ), + # ( + # "quantile", + # [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], + # [1, 1, 1, 1], + # ), + # + # TODO: This test case above aims to test if the case where an array of + # ones passed in sample_weight parameter is equal to the case when + # sample_weight is None. + # Unfortunately, the behavior of `_weighted_percentile` when + # `sample_weight = [1, 1, 1, 1]` are currently not equivalent. + # This problem has been addressed in issue : + # https://github.com/scikit-learn/scikit-learn/issues/17370 + ( + "kmeans", + [[0, 0, 0, 0], [0, 1, 1, 0], [1, 1, 1, 1], [1, 2, 2, 2]], + [1, 0, 3, 1], + ), + ], +) +def test_fit_transform_n_bins_array(strategy, expected, sample_weight): + est = KBinsDiscretizer( + n_bins=[2, 3, 3, 3], encode="ordinal", strategy=strategy + ).fit(X, sample_weight=sample_weight) + assert_array_equal(expected, est.transform(X)) + + # test the shape of bin_edges_ + n_features = np.array(X).shape[1] + assert est.bin_edges_.shape == (n_features,) + for bin_edges, n_bins in zip(est.bin_edges_, est.n_bins_): + assert bin_edges.shape == (n_bins + 1,) + + +@pytest.mark.filterwarnings("ignore: Bins whose width are too small") +def test_kbinsdiscretizer_effect_sample_weight(): + """Check the impact of `sample_weight` one computed quantiles.""" + X = np.array([[-2], [-1], [1], [3], [500], [1000]]) + # add a large number of bins such that each sample with a non-null weight + # will be used as bin edge + est = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile") + est.fit(X, sample_weight=[1, 1, 1, 1, 0, 0]) + assert_allclose(est.bin_edges_[0], [-2, -1, 1, 3]) + assert_allclose(est.transform(X), [[0.0], [1.0], [2.0], [2.0], [2.0], [2.0]]) + + +@pytest.mark.parametrize("strategy", ["kmeans", "quantile"]) +def test_kbinsdiscretizer_no_mutating_sample_weight(strategy): + """Make sure that `sample_weight` is not changed in place.""" + est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy) + sample_weight = np.array([1, 3, 1, 2], dtype=np.float64) + sample_weight_copy = np.copy(sample_weight) + est.fit(X, sample_weight=sample_weight) + assert_allclose(sample_weight, sample_weight_copy) + + +@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"]) +def test_same_min_max(strategy): + warnings.simplefilter("always") + X = np.array([[1, -2], [1, -1], [1, 0], [1, 1]]) + est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode="ordinal") + warning_message = "Feature 0 is constant and will be replaced with 0." 
+ with pytest.warns(UserWarning, match=warning_message): + est.fit(X) + assert est.n_bins_[0] == 1 + # replace the feature with zeros + Xt = est.transform(X) + assert_array_equal(Xt[:, 0], np.zeros(X.shape[0])) + + +def test_transform_1d_behavior(): + X = np.arange(4) + est = KBinsDiscretizer(n_bins=2) + with pytest.raises(ValueError): + est.fit(X) + + est = KBinsDiscretizer(n_bins=2) + est.fit(X.reshape(-1, 1)) + with pytest.raises(ValueError): + est.transform(X) + + +@pytest.mark.parametrize("i", range(1, 9)) +def test_numeric_stability(i): + X_init = np.array([2.0, 4.0, 6.0, 8.0, 10.0]).reshape(-1, 1) + Xt_expected = np.array([0, 0, 1, 1, 1]).reshape(-1, 1) + + # Test up to discretizing nano units + X = X_init / 10**i + Xt = KBinsDiscretizer(n_bins=2, encode="ordinal").fit_transform(X) + assert_array_equal(Xt_expected, Xt) + + +def test_encode_options(): + est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode="ordinal").fit(X) + Xt_1 = est.transform(X) + est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode="onehot-dense").fit(X) + Xt_2 = est.transform(X) + assert not sp.issparse(Xt_2) + assert_array_equal( + OneHotEncoder( + categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse_output=False + ).fit_transform(Xt_1), + Xt_2, + ) + est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode="onehot").fit(X) + Xt_3 = est.transform(X) + assert sp.issparse(Xt_3) + assert_array_equal( + OneHotEncoder( + categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse_output=True + ) + .fit_transform(Xt_1) + .toarray(), + Xt_3.toarray(), + ) + + +@pytest.mark.parametrize( + "strategy, expected_2bins, expected_3bins, expected_5bins", + [ + ("uniform", [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]), + ("kmeans", [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]), + ("quantile", [0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2], [0, 1, 2, 3, 4, 4]), + ], +) +def test_nonuniform_strategies( + strategy, expected_2bins, expected_3bins, expected_5bins +): + X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1) + + # with 2 bins + est = KBinsDiscretizer(n_bins=2, strategy=strategy, encode="ordinal") + Xt = est.fit_transform(X) + assert_array_equal(expected_2bins, Xt.ravel()) + + # with 3 bins + est = KBinsDiscretizer(n_bins=3, strategy=strategy, encode="ordinal") + Xt = est.fit_transform(X) + assert_array_equal(expected_3bins, Xt.ravel()) + + # with 5 bins + est = KBinsDiscretizer(n_bins=5, strategy=strategy, encode="ordinal") + Xt = est.fit_transform(X) + assert_array_equal(expected_5bins, Xt.ravel()) + + +@pytest.mark.parametrize( + "strategy, expected_inv", + [ + ( + "uniform", + [ + [-1.5, 2.0, -3.5, -0.5], + [-0.5, 3.0, -2.5, -0.5], + [0.5, 4.0, -1.5, 0.5], + [0.5, 4.0, -1.5, 1.5], + ], + ), + ( + "kmeans", + [ + [-1.375, 2.125, -3.375, -0.5625], + [-1.375, 2.125, -3.375, -0.5625], + [-0.125, 3.375, -2.125, 0.5625], + [0.75, 4.25, -1.25, 1.625], + ], + ), + ( + "quantile", + [ + [-1.5, 2.0, -3.5, -0.75], + [-0.5, 3.0, -2.5, 0.0], + [0.5, 4.0, -1.5, 1.25], + [0.5, 4.0, -1.5, 1.25], + ], + ), + ], +) +@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"]) +def test_inverse_transform(strategy, encode, expected_inv): + kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode) + Xt = kbd.fit_transform(X) + Xinv = kbd.inverse_transform(Xt) + assert_array_almost_equal(expected_inv, Xinv) + + +@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"]) +def test_transform_outside_fit_range(strategy): + X = np.array([0, 1, 2, 3])[:, None] + kbd = KBinsDiscretizer(n_bins=4, 
strategy=strategy, encode="ordinal") + kbd.fit(X) + + X2 = np.array([-2, 5])[:, None] + X2t = kbd.transform(X2) + assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_) + assert_array_equal(X2t.min(axis=0), [0]) + + +def test_overwrite(): + X = np.array([0, 1, 2, 3])[:, None] + X_before = X.copy() + + est = KBinsDiscretizer(n_bins=3, encode="ordinal") + Xt = est.fit_transform(X) + assert_array_equal(X, X_before) + + Xt_before = Xt.copy() + Xinv = est.inverse_transform(Xt) + assert_array_equal(Xt, Xt_before) + assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]])) + + +@pytest.mark.parametrize( + "strategy, expected_bin_edges", [("quantile", [0, 1, 3]), ("kmeans", [0, 1.5, 3])] +) +def test_redundant_bins(strategy, expected_bin_edges): + X = [[0], [0], [0], [0], [3], [3]] + kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, subsample=None) + warning_message = "Consider decreasing the number of bins." + with pytest.warns(UserWarning, match=warning_message): + kbd.fit(X) + assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges) + + +def test_percentile_numeric_stability(): + X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1) + bin_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95]) + Xt = np.array([0, 0, 4]).reshape(-1, 1) + kbd = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile") + warning_message = "Consider decreasing the number of bins." + with pytest.warns(UserWarning, match=warning_message): + kbd.fit(X) + + assert_array_almost_equal(kbd.bin_edges_[0], bin_edges) + assert_array_almost_equal(kbd.transform(X), Xt) + + +@pytest.mark.parametrize("in_dtype", [np.float16, np.float32, np.float64]) +@pytest.mark.parametrize("out_dtype", [None, np.float32, np.float64]) +@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"]) +def test_consistent_dtype(in_dtype, out_dtype, encode): + X_input = np.array(X, dtype=in_dtype) + kbd = KBinsDiscretizer(n_bins=3, encode=encode, dtype=out_dtype) + kbd.fit(X_input) + + # test output dtype + if out_dtype is not None: + expected_dtype = out_dtype + elif out_dtype is None and X_input.dtype == np.float16: + # wrong numeric input dtype are cast in np.float64 + expected_dtype = np.float64 + else: + expected_dtype = X_input.dtype + Xt = kbd.transform(X_input) + assert Xt.dtype == expected_dtype + + +@pytest.mark.parametrize("input_dtype", [np.float16, np.float32, np.float64]) +@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"]) +def test_32_equal_64(input_dtype, encode): + # TODO this check is redundant with common checks and can be removed + # once #16290 is merged + X_input = np.array(X, dtype=input_dtype) + + # 32 bit output + kbd_32 = KBinsDiscretizer(n_bins=3, encode=encode, dtype=np.float32) + kbd_32.fit(X_input) + Xt_32 = kbd_32.transform(X_input) + + # 64 bit output + kbd_64 = KBinsDiscretizer(n_bins=3, encode=encode, dtype=np.float64) + kbd_64.fit(X_input) + Xt_64 = kbd_64.transform(X_input) + + assert_allclose_dense_sparse(Xt_32, Xt_64) + + +def test_kbinsdiscretizer_subsample_default(): + # Since the size of X is small (< 2e5), subsampling will not take place. 
+ X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1) + kbd_default = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile") + kbd_default.fit(X) + + kbd_without_subsampling = clone(kbd_default) + kbd_without_subsampling.set_params(subsample=None) + kbd_without_subsampling.fit(X) + + for bin_kbd_default, bin_kbd_with_subsampling in zip( + kbd_default.bin_edges_[0], kbd_without_subsampling.bin_edges_[0] + ): + np.testing.assert_allclose(bin_kbd_default, bin_kbd_with_subsampling) + assert kbd_default.bin_edges_.shape == kbd_without_subsampling.bin_edges_.shape + + +@pytest.mark.parametrize( + "encode, expected_names", + [ + ( + "onehot", + [ + f"feat{col_id}_{float(bin_id)}" + for col_id in range(3) + for bin_id in range(4) + ], + ), + ( + "onehot-dense", + [ + f"feat{col_id}_{float(bin_id)}" + for col_id in range(3) + for bin_id in range(4) + ], + ), + ("ordinal", [f"feat{col_id}" for col_id in range(3)]), + ], +) +def test_kbinsdiscrtizer_get_feature_names_out(encode, expected_names): + """Check get_feature_names_out for different settings. + Non-regression test for #22731 + """ + X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]] + + kbd = KBinsDiscretizer(n_bins=4, encode=encode).fit(X) + Xt = kbd.transform(X) + + input_features = [f"feat{i}" for i in range(3)] + output_names = kbd.get_feature_names_out(input_features) + assert Xt.shape[1] == output_names.shape[0] + + assert_array_equal(output_names, expected_names) + + +@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"]) +def test_kbinsdiscretizer_subsample(strategy, global_random_seed): + # Check that the bin edges are almost the same when subsampling is used. + X = np.random.RandomState(global_random_seed).random_sample((100000, 1)) + 1 + + kbd_subsampling = KBinsDiscretizer( + strategy=strategy, subsample=50000, random_state=global_random_seed + ) + kbd_subsampling.fit(X) + + kbd_no_subsampling = clone(kbd_subsampling) + kbd_no_subsampling.set_params(subsample=None) + kbd_no_subsampling.fit(X) + + # We use a large tolerance because we can't expect the bin edges to be exactly the + # same when subsampling is used. + assert_allclose( + kbd_subsampling.bin_edges_[0], kbd_no_subsampling.bin_edges_[0], rtol=1e-2 + ) + + +# TODO(1.7): remove this test +def test_KBD_inverse_transform_Xt_deprecation(): + X = np.arange(10)[:, None] + kbd = KBinsDiscretizer() + X = kbd.fit_transform(X) + + with pytest.raises(TypeError, match="Missing required positional argument"): + kbd.inverse_transform() + + with pytest.raises(TypeError, match="Cannot use both X and Xt. 
Use X only"): + kbd.inverse_transform(X=X, Xt=X) + + with warnings.catch_warnings(record=True): + warnings.simplefilter("error") + kbd.inverse_transform(X) + + with pytest.warns(FutureWarning, match="Xt was renamed X in version 1.5"): + kbd.inverse_transform(Xt=X) diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_encoders.py b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_encoders.py new file mode 100644 index 0000000000000000000000000000000000000000..04c45cb7b6aa5e79a0806f9c497a43397ec6a28d --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_encoders.py @@ -0,0 +1,2367 @@ +import re +import warnings + +import numpy as np +import pytest +from scipy import sparse + +from sklearn.exceptions import NotFittedError +from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder +from sklearn.utils._missing import is_scalar_nan +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS + + +def test_one_hot_encoder_sparse_dense(): + # check that sparse and dense will give the same results + + X = np.array([[3, 2, 1], [0, 1, 1]]) + enc_sparse = OneHotEncoder() + enc_dense = OneHotEncoder(sparse_output=False) + + X_trans_sparse = enc_sparse.fit_transform(X) + X_trans_dense = enc_dense.fit_transform(X) + + assert X_trans_sparse.shape == (2, 5) + assert X_trans_dense.shape == (2, 5) + + assert sparse.issparse(X_trans_sparse) + assert not sparse.issparse(X_trans_dense) + + # check outcome + assert_array_equal( + X_trans_sparse.toarray(), [[0.0, 1.0, 0.0, 1.0, 1.0], [1.0, 0.0, 1.0, 0.0, 1.0]] + ) + assert_array_equal(X_trans_sparse.toarray(), X_trans_dense) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +def test_one_hot_encoder_handle_unknown(handle_unknown): + X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) + X2 = np.array([[4, 1, 1]]) + + # Test that one hot encoder raises error for unknown features + # present during transform. 
+ oh = OneHotEncoder(handle_unknown="error") + oh.fit(X) + with pytest.raises(ValueError, match="Found unknown categories"): + oh.transform(X2) + + # Test the ignore option, ignores unknown features (giving all 0's) + oh = OneHotEncoder(handle_unknown=handle_unknown) + oh.fit(X) + X2_passed = X2.copy() + assert_array_equal( + oh.transform(X2_passed).toarray(), + np.array([[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]]), + ) + # ensure transformed data was not modified in place + assert_allclose(X2, X2_passed) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +def test_one_hot_encoder_handle_unknown_strings(handle_unknown): + X = np.array(["11111111", "22", "333", "4444"]).reshape((-1, 1)) + X2 = np.array(["55555", "22"]).reshape((-1, 1)) + # Non Regression test for the issue #12470 + # Test the ignore option, when categories are numpy string dtype + # particularly when the known category strings are larger + # than the unknown category strings + oh = OneHotEncoder(handle_unknown=handle_unknown) + oh.fit(X) + X2_passed = X2.copy() + assert_array_equal( + oh.transform(X2_passed).toarray(), + np.array([[0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]]), + ) + # ensure transformed data was not modified in place + assert_array_equal(X2, X2_passed) + + +@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64]) +@pytest.mark.parametrize("input_dtype", [np.int32, np.float32, np.float64]) +def test_one_hot_encoder_dtype(input_dtype, output_dtype): + X = np.asarray([[0, 1]], dtype=input_dtype).T + X_expected = np.asarray([[1, 0], [0, 1]], dtype=output_dtype) + + oh = OneHotEncoder(categories="auto", dtype=output_dtype) + assert_array_equal(oh.fit_transform(X).toarray(), X_expected) + assert_array_equal(oh.fit(X).transform(X).toarray(), X_expected) + + oh = OneHotEncoder(categories="auto", dtype=output_dtype, sparse_output=False) + assert_array_equal(oh.fit_transform(X), X_expected) + assert_array_equal(oh.fit(X).transform(X), X_expected) + + +@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64]) +def test_one_hot_encoder_dtype_pandas(output_dtype): + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]}) + X_expected = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=output_dtype) + + oh = OneHotEncoder(dtype=output_dtype) + assert_array_equal(oh.fit_transform(X_df).toarray(), X_expected) + assert_array_equal(oh.fit(X_df).transform(X_df).toarray(), X_expected) + + oh = OneHotEncoder(dtype=output_dtype, sparse_output=False) + assert_array_equal(oh.fit_transform(X_df), X_expected) + assert_array_equal(oh.fit(X_df).transform(X_df), X_expected) + + +def test_one_hot_encoder_feature_names(): + enc = OneHotEncoder() + X = [ + ["Male", 1, "girl", 2, 3], + ["Female", 41, "girl", 1, 10], + ["Male", 51, "boy", 12, 3], + ["Male", 91, "girl", 21, 30], + ] + + enc.fit(X) + feature_names = enc.get_feature_names_out() + + assert_array_equal( + [ + "x0_Female", + "x0_Male", + "x1_1", + "x1_41", + "x1_51", + "x1_91", + "x2_boy", + "x2_girl", + "x3_1", + "x3_2", + "x3_12", + "x3_21", + "x4_3", + "x4_10", + "x4_30", + ], + feature_names, + ) + + feature_names2 = enc.get_feature_names_out(["one", "two", "three", "four", "five"]) + + assert_array_equal( + [ + "one_Female", + "one_Male", + "two_1", + "two_41", + "two_51", + "two_91", + "three_boy", + "three_girl", + "four_1", + "four_2", + "four_12", + "four_21", + "five_3", + "five_10", + "five_30", + ], + feature_names2, + ) + + with pytest.raises(ValueError, 
match="input_features should have length"): + enc.get_feature_names_out(["one", "two"]) + + +def test_one_hot_encoder_feature_names_unicode(): + enc = OneHotEncoder() + X = np.array([["c❤t1", "dat2"]], dtype=object).T + enc.fit(X) + feature_names = enc.get_feature_names_out() + assert_array_equal(["x0_c❤t1", "x0_dat2"], feature_names) + feature_names = enc.get_feature_names_out(input_features=["n👍me"]) + assert_array_equal(["n👍me_c❤t1", "n👍me_dat2"], feature_names) + + +def test_one_hot_encoder_custom_feature_name_combiner(): + """Check the behaviour of `feature_name_combiner` as a callable.""" + + def name_combiner(feature, category): + return feature + "_" + repr(category) + + enc = OneHotEncoder(feature_name_combiner=name_combiner) + X = np.array([["None", None]], dtype=object).T + enc.fit(X) + feature_names = enc.get_feature_names_out() + assert_array_equal(["x0_'None'", "x0_None"], feature_names) + feature_names = enc.get_feature_names_out(input_features=["a"]) + assert_array_equal(["a_'None'", "a_None"], feature_names) + + def wrong_combiner(feature, category): + # we should be returning a Python string + return 0 + + enc = OneHotEncoder(feature_name_combiner=wrong_combiner).fit(X) + err_msg = ( + "When `feature_name_combiner` is a callable, it should return a Python string." + ) + with pytest.raises(TypeError, match=err_msg): + enc.get_feature_names_out() + + +def test_one_hot_encoder_set_params(): + X = np.array([[1, 2]]).T + oh = OneHotEncoder() + # set params on not yet fitted object + oh.set_params(categories=[[0, 1, 2, 3]]) + assert oh.get_params()["categories"] == [[0, 1, 2, 3]] + assert oh.fit_transform(X).toarray().shape == (2, 4) + # set params on already fitted object + oh.set_params(categories=[[0, 1, 2, 3, 4]]) + assert oh.fit_transform(X).toarray().shape == (2, 5) + + +def check_categorical_onehot(X): + enc = OneHotEncoder(categories="auto") + Xtr1 = enc.fit_transform(X) + + enc = OneHotEncoder(categories="auto", sparse_output=False) + Xtr2 = enc.fit_transform(X) + + assert_allclose(Xtr1.toarray(), Xtr2) + + assert sparse.issparse(Xtr1) and Xtr1.format == "csr" + return Xtr1.toarray() + + +@pytest.mark.parametrize( + "X", + [ + [["def", 1, 55], ["abc", 2, 55]], + np.array([[10, 1, 55], [5, 2, 55]]), + np.array([["b", "A", "cat"], ["a", "B", "cat"]], dtype=object), + np.array([["b", 1, "cat"], ["a", np.nan, "cat"]], dtype=object), + np.array([["b", 1, "cat"], ["a", float("nan"), "cat"]], dtype=object), + np.array([[None, 1, "cat"], ["a", 2, "cat"]], dtype=object), + np.array([[None, 1, None], ["a", np.nan, None]], dtype=object), + np.array([[None, 1, None], ["a", float("nan"), None]], dtype=object), + ], + ids=[ + "mixed", + "numeric", + "object", + "mixed-nan", + "mixed-float-nan", + "mixed-None", + "mixed-None-nan", + "mixed-None-float-nan", + ], +) +def test_one_hot_encoder(X): + Xtr = check_categorical_onehot(np.array(X)[:, [0]]) + assert_allclose(Xtr, [[0, 1], [1, 0]]) + + Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]]) + assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]]) + + Xtr = OneHotEncoder(categories="auto").fit_transform(X) + assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]]) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +@pytest.mark.parametrize("sparse_", [False, True]) +@pytest.mark.parametrize("drop", [None, "first"]) +def test_one_hot_encoder_inverse(handle_unknown, sparse_, drop): + X = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]] + enc = OneHotEncoder(sparse_output=sparse_, 
drop=drop) + X_tr = enc.fit_transform(X) + exp = np.array(X, dtype=object) + assert_array_equal(enc.inverse_transform(X_tr), exp) + + X = [[2, 55], [1, 55], [3, 55]] + enc = OneHotEncoder(sparse_output=sparse_, categories="auto", drop=drop) + X_tr = enc.fit_transform(X) + exp = np.array(X) + assert_array_equal(enc.inverse_transform(X_tr), exp) + + if drop is None: + # with unknown categories + # drop is incompatible with handle_unknown=ignore + X = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]] + enc = OneHotEncoder( + sparse_output=sparse_, + handle_unknown=handle_unknown, + categories=[["abc", "def"], [1, 2], [54, 55, 56]], + ) + X_tr = enc.fit_transform(X) + exp = np.array(X, dtype=object) + exp[2, 1] = None + assert_array_equal(enc.inverse_transform(X_tr), exp) + + # with an otherwise numerical output, still object if unknown + X = [[2, 55], [1, 55], [3, 55]] + enc = OneHotEncoder( + sparse_output=sparse_, + categories=[[1, 2], [54, 56]], + handle_unknown=handle_unknown, + ) + X_tr = enc.fit_transform(X) + exp = np.array(X, dtype=object) + exp[2, 0] = None + exp[:, 1] = None + assert_array_equal(enc.inverse_transform(X_tr), exp) + + # incorrect shape raises + X_tr = np.array([[0, 1, 1], [1, 0, 1]]) + msg = re.escape("Shape of the passed X data is not correct") + with pytest.raises(ValueError, match=msg): + enc.inverse_transform(X_tr) + + +@pytest.mark.parametrize("sparse_", [False, True]) +@pytest.mark.parametrize( + "X, X_trans", + [ + ([[2, 55], [1, 55], [2, 55]], [[0, 1, 1], [0, 0, 0], [0, 1, 1]]), + ( + [["one", "a"], ["two", "a"], ["three", "b"], ["two", "a"]], + [[0, 0, 0, 0, 0], [0, 0, 0, 0, 1], [0, 1, 0, 0, 0]], + ), + ], +) +def test_one_hot_encoder_inverse_transform_raise_error_with_unknown( + X, X_trans, sparse_ +): + """Check that `inverse_transform` raise an error with unknown samples, no + dropped feature, and `handle_unknow="error`. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/14934 + """ + enc = OneHotEncoder(sparse_output=sparse_).fit(X) + msg = ( + r"Samples \[(\d )*\d\] can not be inverted when drop=None and " + r"handle_unknown='error' because they contain all zeros" + ) + + if sparse_: + # emulate sparse data transform by a one-hot encoder sparse. 
+ X_trans = _convert_container(X_trans, "sparse") + with pytest.raises(ValueError, match=msg): + enc.inverse_transform(X_trans) + + +def test_one_hot_encoder_inverse_if_binary(): + X = np.array([["Male", 1], ["Female", 3], ["Female", 2]], dtype=object) + ohe = OneHotEncoder(drop="if_binary", sparse_output=False) + X_tr = ohe.fit_transform(X) + assert_array_equal(ohe.inverse_transform(X_tr), X) + + +@pytest.mark.parametrize("drop", ["if_binary", "first", None]) +@pytest.mark.parametrize("reset_drop", ["if_binary", "first", None]) +def test_one_hot_encoder_drop_reset(drop, reset_drop): + # check that resetting drop option without refitting does not throw an error + X = np.array([["Male", 1], ["Female", 3], ["Female", 2]], dtype=object) + ohe = OneHotEncoder(drop=drop, sparse_output=False) + ohe.fit(X) + X_tr = ohe.transform(X) + feature_names = ohe.get_feature_names_out() + ohe.set_params(drop=reset_drop) + assert_array_equal(ohe.inverse_transform(X_tr), X) + assert_allclose(ohe.transform(X), X_tr) + assert_array_equal(ohe.get_feature_names_out(), feature_names) + + +@pytest.mark.parametrize("method", ["fit", "fit_transform"]) +@pytest.mark.parametrize("X", [[1, 2], np.array([3.0, 4.0])]) +def test_X_is_not_1D(X, method): + oh = OneHotEncoder() + + msg = "Expected 2D array, got 1D array instead" + with pytest.raises(ValueError, match=msg): + getattr(oh, method)(X) + + +@pytest.mark.parametrize("method", ["fit", "fit_transform"]) +def test_X_is_not_1D_pandas(method): + pd = pytest.importorskip("pandas") + X = pd.Series([6, 3, 4, 6]) + oh = OneHotEncoder() + + msg = f"Expected a 2-dimensional container but got {type(X)} instead." + with pytest.raises(ValueError, match=msg): + getattr(oh, method)(X) + + +@pytest.mark.parametrize( + "X, cat_exp, cat_dtype", + [ + ([["abc", 55], ["def", 55]], [["abc", "def"], [55]], np.object_), + (np.array([[1, 2], [3, 2]]), [[1, 3], [2]], np.integer), + ( + np.array([["A", "cat"], ["B", "cat"]], dtype=object), + [["A", "B"], ["cat"]], + np.object_, + ), + (np.array([["A", "cat"], ["B", "cat"]]), [["A", "B"], ["cat"]], np.str_), + (np.array([[1, 2], [np.nan, 2]]), [[1, np.nan], [2]], np.float64), + ( + np.array([["A", np.nan], [None, np.nan]], dtype=object), + [["A", None], [np.nan]], + np.object_, + ), + ( + np.array([["A", float("nan")], [None, float("nan")]], dtype=object), + [["A", None], [float("nan")]], + np.object_, + ), + ], + ids=[ + "mixed", + "numeric", + "object", + "string", + "missing-float", + "missing-np.nan-object", + "missing-float-nan-object", + ], +) +def test_one_hot_encoder_categories(X, cat_exp, cat_dtype): + # order of categories should not depend on order of samples + for Xi in [X, X[::-1]]: + enc = OneHotEncoder(categories="auto") + enc.fit(Xi) + # assert enc.categories == 'auto' + assert isinstance(enc.categories_, list) + for res, exp in zip(enc.categories_, cat_exp): + res_list = res.tolist() + if is_scalar_nan(exp[-1]): + assert is_scalar_nan(res_list[-1]) + assert res_list[:-1] == exp[:-1] + else: + assert res.tolist() == exp + assert np.issubdtype(res.dtype, cat_dtype) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +@pytest.mark.parametrize( + "X, X2, cats, cat_dtype", + [ + ( + np.array([["a", "b"]], dtype=object).T, + np.array([["a", "d"]], dtype=object).T, + [["a", "b", "c"]], + np.object_, + ), + ( + np.array([[1, 2]], dtype="int64").T, + np.array([[1, 4]], dtype="int64").T, + [[1, 2, 3]], + np.int64, + ), + ( + np.array([["a", "b"]], dtype=object).T, + np.array([["a", "d"]], 
dtype=object).T, + [np.array(["a", "b", "c"])], + np.object_, + ), + ( + np.array([[None, "a"]], dtype=object).T, + np.array([[None, "b"]], dtype=object).T, + [[None, "a", "z"]], + object, + ), + ( + np.array([["a", "b"]], dtype=object).T, + np.array([["a", np.nan]], dtype=object).T, + [["a", "b", "z"]], + object, + ), + ( + np.array([["a", None]], dtype=object).T, + np.array([["a", np.nan]], dtype=object).T, + [["a", None, "z"]], + object, + ), + ], + ids=[ + "object", + "numeric", + "object-string", + "object-string-none", + "object-string-nan", + "object-None-and-nan", + ], +) +def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype, handle_unknown): + enc = OneHotEncoder(categories=cats) + exp = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) + assert_array_equal(enc.fit_transform(X).toarray(), exp) + assert list(enc.categories[0]) == list(cats[0]) + assert enc.categories_[0].tolist() == list(cats[0]) + # manually specified categories should have same dtype as + # the data when coerced from lists + assert enc.categories_[0].dtype == cat_dtype + + # when specifying categories manually, unknown categories should already + # raise when fitting + enc = OneHotEncoder(categories=cats) + with pytest.raises(ValueError, match="Found unknown categories"): + enc.fit(X2) + enc = OneHotEncoder(categories=cats, handle_unknown=handle_unknown) + exp = np.array([[1.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) + assert_array_equal(enc.fit(X2).transform(X2).toarray(), exp) + + +def test_one_hot_encoder_unsorted_categories(): + X = np.array([["a", "b"]], dtype=object).T + + enc = OneHotEncoder(categories=[["b", "a", "c"]]) + exp = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0]]) + assert_array_equal(enc.fit(X).transform(X).toarray(), exp) + assert_array_equal(enc.fit_transform(X).toarray(), exp) + assert enc.categories_[0].tolist() == ["b", "a", "c"] + assert np.issubdtype(enc.categories_[0].dtype, np.object_) + + # unsorted passed categories still raise for numerical values + X = np.array([[1, 2]]).T + enc = OneHotEncoder(categories=[[2, 1, 3]]) + msg = "Unsorted categories are not supported" + with pytest.raises(ValueError, match=msg): + enc.fit_transform(X) + + +@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder]) +def test_encoder_nan_ending_specified_categories(Encoder): + """Test encoder for specified categories that nan is at the end. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27088 + """ + cats = [np.array([0, np.nan, 1])] + enc = Encoder(categories=cats) + X = np.array([[0, 1]], dtype=object).T + with pytest.raises(ValueError, match="Nan should be the last element"): + enc.fit(X) + + +def test_one_hot_encoder_specified_categories_mixed_columns(): + # multiple columns + X = np.array([["a", "b"], [0, 2]], dtype=object).T + enc = OneHotEncoder(categories=[["a", "b", "c"], [0, 1, 2]]) + exp = np.array([[1.0, 0.0, 0.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 1.0]]) + assert_array_equal(enc.fit_transform(X).toarray(), exp) + assert enc.categories_[0].tolist() == ["a", "b", "c"] + assert np.issubdtype(enc.categories_[0].dtype, np.object_) + assert enc.categories_[1].tolist() == [0, 1, 2] + # integer categories but from object dtype data + assert np.issubdtype(enc.categories_[1].dtype, np.object_) + + +def test_one_hot_encoder_pandas(): + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]}) + + Xtr = check_categorical_onehot(X_df) + assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]]) + + +@pytest.mark.parametrize( + "drop, expected_names", + [ + ("first", ["x0_c", "x2_b"]), + ("if_binary", ["x0_c", "x1_2", "x2_b"]), + (["c", 2, "b"], ["x0_b", "x2_a"]), + ], + ids=["first", "binary", "manual"], +) +def test_one_hot_encoder_feature_names_drop(drop, expected_names): + X = [["c", 2, "a"], ["b", 2, "b"]] + + ohe = OneHotEncoder(drop=drop) + ohe.fit(X) + feature_names = ohe.get_feature_names_out() + assert_array_equal(expected_names, feature_names) + + +def test_one_hot_encoder_drop_equals_if_binary(): + # Canonical case + X = [[10, "yes"], [20, "no"], [30, "yes"]] + expected = np.array( + [[1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 1.0]] + ) + expected_drop_idx = np.array([None, 0]) + + ohe = OneHotEncoder(drop="if_binary", sparse_output=False) + result = ohe.fit_transform(X) + assert_array_equal(ohe.drop_idx_, expected_drop_idx) + assert_allclose(result, expected) + + # with only one cat, the behaviour is equivalent to drop=None + X = [["true", "a"], ["false", "a"], ["false", "a"]] + expected = np.array([[1.0, 1.0], [0.0, 1.0], [0.0, 1.0]]) + expected_drop_idx = np.array([0, None]) + + ohe = OneHotEncoder(drop="if_binary", sparse_output=False) + result = ohe.fit_transform(X) + assert_array_equal(ohe.drop_idx_, expected_drop_idx) + assert_allclose(result, expected) + + +@pytest.mark.parametrize( + "X", + [ + [["abc", 2, 55], ["def", 1, 55]], + np.array([[10, 2, 55], [20, 1, 55]]), + np.array([["a", "B", "cat"], ["b", "A", "cat"]], dtype=object), + ], + ids=["mixed", "numeric", "object"], +) +def test_ordinal_encoder(X): + enc = OrdinalEncoder() + exp = np.array([[0, 1, 0], [1, 0, 0]], dtype="int64") + assert_array_equal(enc.fit_transform(X), exp.astype("float64")) + enc = OrdinalEncoder(dtype="int64") + assert_array_equal(enc.fit_transform(X), exp) + + +@pytest.mark.parametrize( + "X, X2, cats, cat_dtype", + [ + ( + np.array([["a", "b"]], dtype=object).T, + np.array([["a", "d"]], dtype=object).T, + [["a", "b", "c"]], + np.object_, + ), + ( + np.array([[1, 2]], dtype="int64").T, + np.array([[1, 4]], dtype="int64").T, + [[1, 2, 3]], + np.int64, + ), + ( + np.array([["a", "b"]], dtype=object).T, + np.array([["a", "d"]], dtype=object).T, + [np.array(["a", "b", "c"])], + np.object_, + ), + ], + ids=["object", "numeric", "object-string-cat"], +) +def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype): + enc = 
OrdinalEncoder(categories=cats) + exp = np.array([[0.0], [1.0]]) + assert_array_equal(enc.fit_transform(X), exp) + assert list(enc.categories[0]) == list(cats[0]) + assert enc.categories_[0].tolist() == list(cats[0]) + # manually specified categories should have same dtype as + # the data when coerced from lists + assert enc.categories_[0].dtype == cat_dtype + + # when specifying categories manually, unknown categories should already + # raise when fitting + enc = OrdinalEncoder(categories=cats) + with pytest.raises(ValueError, match="Found unknown categories"): + enc.fit(X2) + + +def test_ordinal_encoder_inverse(): + X = [["abc", 2, 55], ["def", 1, 55]] + enc = OrdinalEncoder() + X_tr = enc.fit_transform(X) + exp = np.array(X, dtype=object) + assert_array_equal(enc.inverse_transform(X_tr), exp) + + # incorrect shape raises + X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]]) + msg = re.escape("Shape of the passed X data is not correct") + with pytest.raises(ValueError, match=msg): + enc.inverse_transform(X_tr) + + +def test_ordinal_encoder_handle_unknowns_string(): + enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-2) + X_fit = np.array([["a", "x"], ["b", "y"], ["c", "z"]], dtype=object) + X_trans = np.array([["c", "xy"], ["bla", "y"], ["a", "x"]], dtype=object) + enc.fit(X_fit) + + X_trans_enc = enc.transform(X_trans) + exp = np.array([[2, -2], [-2, 1], [0, 0]], dtype="int64") + assert_array_equal(X_trans_enc, exp) + + X_trans_inv = enc.inverse_transform(X_trans_enc) + inv_exp = np.array([["c", None], [None, "y"], ["a", "x"]], dtype=object) + assert_array_equal(X_trans_inv, inv_exp) + + +@pytest.mark.parametrize("dtype", [float, int]) +def test_ordinal_encoder_handle_unknowns_numeric(dtype): + enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999) + X_fit = np.array([[1, 7], [2, 8], [3, 9]], dtype=dtype) + X_trans = np.array([[3, 12], [23, 8], [1, 7]], dtype=dtype) + enc.fit(X_fit) + + X_trans_enc = enc.transform(X_trans) + exp = np.array([[2, -999], [-999, 1], [0, 0]], dtype="int64") + assert_array_equal(X_trans_enc, exp) + + X_trans_inv = enc.inverse_transform(X_trans_enc) + inv_exp = np.array([[3, None], [None, 8], [1, 7]], dtype=object) + assert_array_equal(X_trans_inv, inv_exp) + + +def test_ordinal_encoder_handle_unknowns_nan(): + # Make sure unknown_value=np.nan properly works + + enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan) + + X_fit = np.array([[1], [2], [3]]) + enc.fit(X_fit) + X_trans = enc.transform([[1], [2], [4]]) + assert_array_equal(X_trans, [[0], [1], [np.nan]]) + + +def test_ordinal_encoder_handle_unknowns_nan_non_float_dtype(): + # Make sure an error is raised when unknown_value=np.nan and the dtype + # isn't a float dtype + enc = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=np.nan, dtype=int + ) + + X_fit = np.array([[1], [2], [3]]) + with pytest.raises(ValueError, match="dtype parameter should be a float dtype"): + enc.fit(X_fit) + + +def test_ordinal_encoder_raise_categories_shape(): + X = np.array([["Low", "Medium", "High", "Medium", "Low"]], dtype=object).T + cats = ["Low", "Medium", "High"] + enc = OrdinalEncoder(categories=cats) + msg = "Shape mismatch: if categories is an array," + + with pytest.raises(ValueError, match=msg): + enc.fit(X) + + +def test_encoder_dtypes(): + # check that dtypes are preserved when determining categories + enc = OneHotEncoder(categories="auto") + exp = np.array([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0]], dtype="float64") + + for X in [ 
+ np.array([[1, 2], [3, 4]], dtype="int64"), + np.array([[1, 2], [3, 4]], dtype="float64"), + np.array([["a", "b"], ["c", "d"]]), # str dtype + np.array([[b"a", b"b"], [b"c", b"d"]]), # bytes dtype + np.array([[1, "a"], [3, "b"]], dtype="object"), + ]: + enc.fit(X) + assert all([enc.categories_[i].dtype == X.dtype for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + X = [[1, 2], [3, 4]] + enc.fit(X) + assert all([np.issubdtype(enc.categories_[i].dtype, np.integer) for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + X = [[1, "a"], [3, "b"]] + enc.fit(X) + assert all([enc.categories_[i].dtype == "object" for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + +def test_encoder_dtypes_pandas(): + # check dtype (similar to test_categorical_encoder_dtypes for dataframes) + pd = pytest.importorskip("pandas") + + enc = OneHotEncoder(categories="auto") + exp = np.array( + [[1.0, 0.0, 1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0, 0.0, 1.0]], + dtype="float64", + ) + + X = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}, dtype="int64") + enc.fit(X) + assert all([enc.categories_[i].dtype == "int64" for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + X = pd.DataFrame({"A": [1, 2], "B": ["a", "b"], "C": [3.0, 4.0]}) + X_type = [X["A"].dtype, X["B"].dtype, X["C"].dtype] + enc.fit(X) + assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + +def test_one_hot_encoder_warning(): + enc = OneHotEncoder() + X = [["Male", 1], ["Female", 3]] + with warnings.catch_warnings(): + warnings.simplefilter("error") + enc.fit_transform(X) + + +@pytest.mark.parametrize("drop", ["if_binary", "first"]) +def test_ohe_handle_unknown_warn(drop): + """Check handle_unknown='warn' works correctly.""" + + X = [["a", 0], ["b", 2], ["b", 1]] + + ohe = OneHotEncoder( + drop=drop, + sparse_output=False, + handle_unknown="warn", + categories=[["b", "a"], [1, 2]], + ) + ohe.fit(X) + + X_test = [["c", 1]] + X_expected = np.array([[0, 0]]) + + warn_msg = ( + r"Found unknown categories in columns \[0\] during transform. 
" + r"These unknown categories will be encoded as all zeros" + ) + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + +@pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")]) +def test_one_hot_encoder_drop_manual(missing_value): + cats_to_drop = ["def", 12, 3, 56, missing_value] + enc = OneHotEncoder(drop=cats_to_drop) + X = [ + ["abc", 12, 2, 55, "a"], + ["def", 12, 1, 55, "a"], + ["def", 12, 3, 56, missing_value], + ] + trans = enc.fit_transform(X).toarray() + exp = [[1, 0, 1, 1, 1], [0, 1, 0, 1, 1], [0, 0, 0, 0, 0]] + assert_array_equal(trans, exp) + assert enc.drop is cats_to_drop + + dropped_cats = [ + cat[feature] for cat, feature in zip(enc.categories_, enc.drop_idx_) + ] + X_inv_trans = enc.inverse_transform(trans) + X_array = np.array(X, dtype=object) + + # last value is np.nan + if is_scalar_nan(cats_to_drop[-1]): + assert_array_equal(dropped_cats[:-1], cats_to_drop[:-1]) + assert is_scalar_nan(dropped_cats[-1]) + assert is_scalar_nan(cats_to_drop[-1]) + # do not include the last column which includes missing values + assert_array_equal(X_array[:, :-1], X_inv_trans[:, :-1]) + + # check last column is the missing value + assert_array_equal(X_array[-1, :-1], X_inv_trans[-1, :-1]) + assert is_scalar_nan(X_array[-1, -1]) + assert is_scalar_nan(X_inv_trans[-1, -1]) + else: + assert_array_equal(dropped_cats, cats_to_drop) + assert_array_equal(X_array, X_inv_trans) + + +@pytest.mark.parametrize("drop", [["abc", 3], ["abc", 3, 41, "a"]]) +def test_invalid_drop_length(drop): + enc = OneHotEncoder(drop=drop) + err_msg = "`drop` should have length equal to the number" + with pytest.raises(ValueError, match=err_msg): + enc.fit([["abc", 2, 55], ["def", 1, 55], ["def", 3, 59]]) + + +@pytest.mark.parametrize("density", [True, False], ids=["sparse", "dense"]) +@pytest.mark.parametrize("drop", ["first", ["a", 2, "b"]], ids=["first", "manual"]) +def test_categories(density, drop): + ohe_base = OneHotEncoder(sparse_output=density) + ohe_test = OneHotEncoder(sparse_output=density, drop=drop) + X = [["c", 1, "a"], ["a", 2, "b"]] + ohe_base.fit(X) + ohe_test.fit(X) + assert_array_equal(ohe_base.categories_, ohe_test.categories_) + if drop == "first": + assert_array_equal(ohe_test.drop_idx_, 0) + else: + for drop_cat, drop_idx, cat_list in zip( + drop, ohe_test.drop_idx_, ohe_test.categories_ + ): + assert cat_list[int(drop_idx)] == drop_cat + assert isinstance(ohe_test.drop_idx_, np.ndarray) + assert ohe_test.drop_idx_.dtype == object + + +@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder]) +def test_encoders_has_categorical_tags(Encoder): + assert Encoder().__sklearn_tags__().input_tags.categorical + + +@pytest.mark.parametrize( + "kwargs", + [ + {"max_categories": 2}, + {"min_frequency": 11}, + {"min_frequency": 0.29}, + {"max_categories": 2, "min_frequency": 6}, + {"max_categories": 4, "min_frequency": 12}, + ], +) +@pytest.mark.parametrize("categories", ["auto", [["a", "b", "c", "d"]]]) +def test_ohe_infrequent_two_levels(kwargs, categories): + """Test that different parameters for combine 'a', 'c', and 'd' into + the infrequent category works as expected.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + categories=categories, + handle_unknown="infrequent_if_exist", + sparse_output=False, + **kwargs, + ).fit(X_train) + assert_array_equal(ohe.infrequent_categories_, [["a", "c", "d"]]) + + X_test = [["b"], ["a"], ["c"], ["d"], ["e"]] + 
expected = np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + expected_inv = [[col] for col in ["b"] + ["infrequent_sklearn"] * 4] + X_inv = ohe.inverse_transform(X_trans) + assert_array_equal(expected_inv, X_inv) + + feature_names = ohe.get_feature_names_out() + assert_array_equal(["x0_b", "x0_infrequent_sklearn"], feature_names) + + +@pytest.mark.parametrize("drop", ["if_binary", "first", ["b"]]) +def test_ohe_infrequent_two_levels_drop_frequent(drop): + """Test two levels and dropping the frequent category.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", + sparse_output=False, + max_categories=2, + drop=drop, + ).fit(X_train) + assert ohe.categories_[0][ohe.drop_idx_[0]] == "b" + + X_test = np.array([["b"], ["c"]]) + X_trans = ohe.transform(X_test) + assert_allclose([[0], [1]], X_trans) + + feature_names = ohe.get_feature_names_out() + assert_array_equal(["x0_infrequent_sklearn"], feature_names) + + X_inverse = ohe.inverse_transform(X_trans) + assert_array_equal([["b"], ["infrequent_sklearn"]], X_inverse) + + +@pytest.mark.parametrize("drop", [["a"], ["d"]]) +def test_ohe_infrequent_two_levels_drop_infrequent_errors(drop): + """Test two levels and dropping any infrequent category removes the + whole infrequent category.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", + sparse_output=False, + max_categories=2, + drop=drop, + ) + + msg = f"Unable to drop category {drop[0]!r} from feature 0 because it is infrequent" + with pytest.raises(ValueError, match=msg): + ohe.fit(X_train) + + +@pytest.mark.parametrize( + "kwargs", + [ + {"max_categories": 3}, + {"min_frequency": 6}, + {"min_frequency": 9}, + {"min_frequency": 0.24}, + {"min_frequency": 0.16}, + {"max_categories": 3, "min_frequency": 8}, + {"max_categories": 4, "min_frequency": 6}, + ], +) +def test_ohe_infrequent_three_levels(kwargs): + """Test that different parameters for combing 'a', and 'd' into + the infrequent category works as expected.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", sparse_output=False, **kwargs + ).fit(X_train) + assert_array_equal(ohe.infrequent_categories_, [["a", "d"]]) + + X_test = [["b"], ["a"], ["c"], ["d"], ["e"]] + expected = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1], [0, 0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + expected_inv = [ + ["b"], + ["infrequent_sklearn"], + ["c"], + ["infrequent_sklearn"], + ["infrequent_sklearn"], + ] + X_inv = ohe.inverse_transform(X_trans) + assert_array_equal(expected_inv, X_inv) + + feature_names = ohe.get_feature_names_out() + assert_array_equal(["x0_b", "x0_c", "x0_infrequent_sklearn"], feature_names) + + +@pytest.mark.parametrize("drop", ["first", ["b"]]) +def test_ohe_infrequent_three_levels_drop_frequent(drop): + """Test three levels and dropping the frequent category.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", + sparse_output=False, + max_categories=3, + drop=drop, + ).fit(X_train) + + X_test = np.array([["b"], ["c"], ["d"]]) + assert_allclose([[0, 0], [1, 0], [0, 1]], ohe.transform(X_test)) + + # Check handle_unknown="ignore" + 
ohe.set_params(handle_unknown="ignore").fit(X_train) + msg = "Found unknown categories" + with pytest.warns(UserWarning, match=msg): + X_trans = ohe.transform([["b"], ["e"]]) + + assert_allclose([[0, 0], [0, 0]], X_trans) + + +@pytest.mark.parametrize("drop", [["a"], ["d"]]) +def test_ohe_infrequent_three_levels_drop_infrequent_errors(drop): + """Test three levels and dropping the infrequent category.""" + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", + sparse_output=False, + max_categories=3, + drop=drop, + ) + + msg = f"Unable to drop category {drop[0]!r} from feature 0 because it is infrequent" + with pytest.raises(ValueError, match=msg): + ohe.fit(X_train) + + +def test_ohe_infrequent_handle_unknown_error(): + """Test that different parameters for combining 'a', and 'd' into + the infrequent category works as expected.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="error", sparse_output=False, max_categories=3 + ).fit(X_train) + assert_array_equal(ohe.infrequent_categories_, [["a", "d"]]) + + # all categories are known + X_test = [["b"], ["a"], ["c"], ["d"]] + expected = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + # 'bad' is not known and will error + X_test = [["bad"]] + msg = r"Found unknown categories \['bad'\] in column 0" + with pytest.raises(ValueError, match=msg): + ohe.transform(X_test) + + +@pytest.mark.parametrize( + "kwargs", [{"max_categories": 3, "min_frequency": 1}, {"min_frequency": 4}] +) +def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs): + """'a' is the only frequent category, all other categories are infrequent.""" + + X_train = np.array([["a"] * 5 + ["e"] * 30], dtype=object).T + ohe = OneHotEncoder( + categories=[["c", "d", "a", "b"]], + sparse_output=False, + handle_unknown="infrequent_if_exist", + **kwargs, + ).fit(X_train) + + X_test = [["a"], ["b"], ["c"], ["d"], ["e"]] + expected = np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + # 'a' is dropped + drops = ["first", "if_binary", ["a"]] + X_test = [["a"], ["c"]] + for drop in drops: + ohe.set_params(drop=drop).fit(X_train) + assert_allclose([[0], [1]], ohe.transform(X_test)) + + +def test_ohe_infrequent_two_levels_user_cats(): + """Test that the order of the categories provided by a user is respected.""" + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object + ).T + ohe = OneHotEncoder( + categories=[["c", "d", "a", "b"]], + sparse_output=False, + handle_unknown="infrequent_if_exist", + max_categories=2, + ).fit(X_train) + + assert_array_equal(ohe.infrequent_categories_, [["c", "d", "a"]]) + + X_test = [["b"], ["a"], ["c"], ["d"], ["e"]] + expected = np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + # 'infrequent' is used to denote the infrequent categories for + # `inverse_transform` + expected_inv = [[col] for col in ["b"] + ["infrequent_sklearn"] * 4] + X_inv = ohe.inverse_transform(X_trans) + assert_array_equal(expected_inv, X_inv) + + +def test_ohe_infrequent_three_levels_user_cats(): + """Test that the order of the categories provided by a user is respected. 
+ In this case 'c' is encoded as the first category and 'b' is encoded + as the second one.""" + + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object + ).T + ohe = OneHotEncoder( + categories=[["c", "d", "b", "a"]], + sparse_output=False, + handle_unknown="infrequent_if_exist", + max_categories=3, + ).fit(X_train) + + assert_array_equal(ohe.infrequent_categories_, [["d", "a"]]) + + X_test = [["b"], ["a"], ["c"], ["d"], ["e"]] + expected = np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0], [0, 0, 1], [0, 0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + # 'infrequent' is used to denote the infrequent categories for + # `inverse_transform` + expected_inv = [ + ["b"], + ["infrequent_sklearn"], + ["c"], + ["infrequent_sklearn"], + ["infrequent_sklearn"], + ] + X_inv = ohe.inverse_transform(X_trans) + assert_array_equal(expected_inv, X_inv) + + +def test_ohe_infrequent_mixed(): + """Test infrequent categories where feature 0 has infrequent categories, + and feature 1 does not.""" + + # X[:, 0] 1 and 2 are infrequent + # X[:, 1] nothing is infrequent + X = np.c_[[0, 1, 3, 3, 3, 3, 2, 0, 3], [0, 0, 0, 0, 1, 1, 1, 1, 1]] + + ohe = OneHotEncoder(max_categories=3, drop="if_binary", sparse_output=False) + ohe.fit(X) + + X_test = [[3, 0], [1, 1]] + X_trans = ohe.transform(X_test) + + # feature 1 is binary so it drops a category 0 + assert_allclose(X_trans, [[0, 1, 0, 0], [0, 0, 1, 1]]) + + +def test_ohe_infrequent_multiple_categories(): + """Test infrequent categories with feature matrix with 3 features.""" + + X = np.c_[ + [0, 1, 3, 3, 3, 3, 2, 0, 3], + [0, 0, 5, 1, 1, 10, 5, 5, 0], + [1, 0, 1, 0, 1, 0, 1, 0, 1], + ] + + ohe = OneHotEncoder( + categories="auto", max_categories=3, handle_unknown="infrequent_if_exist" + ) + # X[:, 0] 1 and 2 are infrequent + # X[:, 1] 1 and 10 are infrequent + # X[:, 2] nothing is infrequent + + X_trans = ohe.fit_transform(X).toarray() + assert_array_equal(ohe.infrequent_categories_[0], [1, 2]) + assert_array_equal(ohe.infrequent_categories_[1], [1, 10]) + assert_array_equal(ohe.infrequent_categories_[2], None) + + # 'infrequent' is used to denote the infrequent categories + # For the first column, 1 and 2 have the same frequency. 
In this case, + 1 will be chosen to be the feature name because it is smaller lexicographically + feature_names = ohe.get_feature_names_out() + assert_array_equal( + [ + "x0_0", + "x0_3", + "x0_infrequent_sklearn", + "x1_0", + "x1_5", + "x1_infrequent_sklearn", + "x2_0", + "x2_1", + ], + feature_names, + ) + + expected = [ + [1, 0, 0, 1, 0, 0, 0, 1], + [0, 0, 1, 1, 0, 0, 1, 0], + [0, 1, 0, 0, 1, 0, 0, 1], + [0, 1, 0, 0, 0, 1, 1, 0], + [0, 1, 0, 0, 0, 1, 0, 1], + [0, 1, 0, 0, 0, 1, 1, 0], + [0, 0, 1, 0, 1, 0, 0, 1], + [1, 0, 0, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 0, 0, 1], + ] + + assert_allclose(expected, X_trans) + + X_test = [[3, 1, 2], [4, 0, 3]] + + X_test_trans = ohe.transform(X_test) + + # X[:, 2] does not have an infrequent category, thus it is encoded as all + # zeros + expected = [[0, 1, 0, 0, 0, 1, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0]] + assert_allclose(expected, X_test_trans.toarray()) + + X_inv = ohe.inverse_transform(X_test_trans) + expected_inv = np.array( + [[3, "infrequent_sklearn", None], ["infrequent_sklearn", 0, None]], dtype=object + ) + assert_array_equal(expected_inv, X_inv) + + # error for unknown categories + ohe = OneHotEncoder( + categories="auto", max_categories=3, handle_unknown="error" + ).fit(X) + with pytest.raises(ValueError, match="Found unknown categories"): + ohe.transform(X_test) + + # only infrequent or known categories + X_test = [[1, 1, 1], [3, 10, 0]] + X_test_trans = ohe.transform(X_test) + + expected = [[0, 0, 1, 0, 0, 1, 0, 1], [0, 1, 0, 0, 0, 1, 1, 0]] + assert_allclose(expected, X_test_trans.toarray()) + + X_inv = ohe.inverse_transform(X_test_trans) + + expected_inv = np.array( + [["infrequent_sklearn", "infrequent_sklearn", 1], [3, "infrequent_sklearn", 0]], + dtype=object, + ) + assert_array_equal(expected_inv, X_inv) + + +def test_ohe_infrequent_multiple_categories_dtypes(): + """Test infrequent categories with a pandas dataframe with multiple dtypes.""" + + pd = pytest.importorskip("pandas") + X = pd.DataFrame( + { + "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"], + "int": [5, 3, 0, 10, 10, 12, 0, 3, 5], + }, + columns=["str", "int"], + ) + + ohe = OneHotEncoder( + categories="auto", max_categories=3, handle_unknown="infrequent_if_exist" + ) + # X[:, 0] 'a', 'b', 'c' have the same frequency. 'a' and 'b' will be + # considered infrequent because they are greater + + # X[:, 1] 0, 3, 5, 10 have frequency 2 and 12 has frequency 1.
+ # 0, 3, 12 will be considered infrequent + + X_trans = ohe.fit_transform(X).toarray() + assert_array_equal(ohe.infrequent_categories_[0], ["a", "b"]) + assert_array_equal(ohe.infrequent_categories_[1], [0, 3, 12]) + + expected = [ + [0, 0, 1, 1, 0, 0], + [0, 1, 0, 0, 0, 1], + [1, 0, 0, 0, 0, 1], + [0, 1, 0, 0, 1, 0], + [0, 1, 0, 0, 1, 0], + [0, 0, 1, 0, 0, 1], + [1, 0, 0, 0, 0, 1], + [0, 0, 1, 0, 0, 1], + [0, 0, 1, 1, 0, 0], + ] + + assert_allclose(expected, X_trans) + + X_test = pd.DataFrame({"str": ["b", "f"], "int": [14, 12]}, columns=["str", "int"]) + + expected = [[0, 0, 1, 0, 0, 1], [0, 1, 0, 0, 0, 1]] + X_test_trans = ohe.transform(X_test) + assert_allclose(expected, X_test_trans.toarray()) + + X_inv = ohe.inverse_transform(X_test_trans) + expected_inv = np.array( + [["infrequent_sklearn", "infrequent_sklearn"], ["f", "infrequent_sklearn"]], + dtype=object, + ) + assert_array_equal(expected_inv, X_inv) + + # only infrequent or known categories + X_test = pd.DataFrame({"str": ["c", "b"], "int": [12, 5]}, columns=["str", "int"]) + X_test_trans = ohe.transform(X_test).toarray() + expected = [[1, 0, 0, 0, 0, 1], [0, 0, 1, 1, 0, 0]] + assert_allclose(expected, X_test_trans) + + X_inv = ohe.inverse_transform(X_test_trans) + expected_inv = np.array( + [["c", "infrequent_sklearn"], ["infrequent_sklearn", 5]], dtype=object + ) + assert_array_equal(expected_inv, X_inv) + + +@pytest.mark.parametrize("kwargs", [{"min_frequency": 21, "max_categories": 1}]) +def test_ohe_infrequent_one_level_errors(kwargs): + """All user provided categories are infrequent.""" + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 2]).T + + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", sparse_output=False, **kwargs + ) + ohe.fit(X_train) + + X_trans = ohe.transform([["a"]]) + assert_allclose(X_trans, [[1]]) + + +@pytest.mark.parametrize("kwargs", [{"min_frequency": 2, "max_categories": 3}]) +def test_ohe_infrequent_user_cats_unknown_training_errors(kwargs): + """All user provided categories are infrequent.""" + + X_train = np.array([["e"] * 3], dtype=object).T + ohe = OneHotEncoder( + categories=[["c", "d", "a", "b"]], + sparse_output=False, + handle_unknown="infrequent_if_exist", + **kwargs, + ).fit(X_train) + + X_trans = ohe.transform([["a"], ["e"]]) + assert_allclose(X_trans, [[1], [1]]) + + +# deliberately omit 'OS' as an invalid combo +@pytest.mark.parametrize( + "input_dtype, category_dtype", ["OO", "OU", "UO", "UU", "SO", "SU", "SS"] +) +@pytest.mark.parametrize("array_type", ["list", "array", "dataframe"]) +def test_encoders_string_categories(input_dtype, category_dtype, array_type): + """Check that encoding work with object, unicode, and byte string dtypes. 
+ Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/15616 + https://github.com/scikit-learn/scikit-learn/issues/15726 + https://github.com/scikit-learn/scikit-learn/issues/19677 + """ + + X = np.array([["b"], ["a"]], dtype=input_dtype) + categories = [np.array(["b", "a"], dtype=category_dtype)] + ohe = OneHotEncoder(categories=categories, sparse_output=False).fit(X) + + X_test = _convert_container( + [["a"], ["a"], ["b"], ["a"]], array_type, dtype=input_dtype + ) + X_trans = ohe.transform(X_test) + + expected = np.array([[0, 1], [0, 1], [1, 0], [0, 1]]) + assert_allclose(X_trans, expected) + + oe = OrdinalEncoder(categories=categories).fit(X) + X_trans = oe.transform(X_test) + + expected = np.array([[1], [1], [0], [1]]) + assert_array_equal(X_trans, expected) + + +def test_mixed_string_bytes_categoricals(): + """Check that this mixture of predefined categories and X raises an error. + + Categories defined as bytes can not easily be compared to data that is + a string. + """ + # data as unicode + X = np.array([["b"], ["a"]], dtype="U") + # predefined categories as bytes + categories = [np.array(["b", "a"], dtype="S")] + ohe = OneHotEncoder(categories=categories, sparse_output=False) + + msg = re.escape( + "In column 0, the predefined categories have type 'bytes' which is incompatible" + " with values of type 'str_'." + ) + + with pytest.raises(ValueError, match=msg): + ohe.fit(X) + + +@pytest.mark.parametrize("missing_value", [np.nan, None]) +def test_ohe_missing_values_get_feature_names(missing_value): + # encoder with missing values with object dtypes + X = np.array([["a", "b", missing_value, "a", missing_value]], dtype=object).T + ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore").fit(X) + names = ohe.get_feature_names_out() + assert_array_equal(names, ["x0_a", "x0_b", f"x0_{missing_value}"]) + + +def test_ohe_missing_value_support_pandas(): + # check support for pandas with mixed dtypes and missing values + pd = pytest.importorskip("pandas") + df = pd.DataFrame( + { + "col1": ["dog", "cat", None, "cat"], + "col2": np.array([3, 0, 4, np.nan], dtype=float), + }, + columns=["col1", "col2"], + ) + expected_df_trans = np.array( + [ + [0, 1, 0, 0, 1, 0, 0], + [1, 0, 0, 1, 0, 0, 0], + [0, 0, 1, 0, 0, 1, 0], + [1, 0, 0, 0, 0, 0, 1], + ] + ) + + Xtr = check_categorical_onehot(df) + assert_allclose(Xtr, expected_df_trans) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +@pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"]) +def test_ohe_missing_value_support_pandas_categorical(pd_nan_type, handle_unknown): + # checks pandas dataframe with categorical features + pd = pytest.importorskip("pandas") + + pd_missing_value = pd.NA if pd_nan_type == "pd.NA" else np.nan + + df = pd.DataFrame( + { + "col1": pd.Series(["c", "a", pd_missing_value, "b", "a"], dtype="category"), + } + ) + expected_df_trans = np.array( + [ + [0, 0, 1, 0], + [1, 0, 0, 0], + [0, 0, 0, 1], + [0, 1, 0, 0], + [1, 0, 0, 0], + ] + ) + + ohe = OneHotEncoder(sparse_output=False, handle_unknown=handle_unknown) + df_trans = ohe.fit_transform(df) + assert_allclose(expected_df_trans, df_trans) + + assert len(ohe.categories_) == 1 + assert_array_equal(ohe.categories_[0][:-1], ["a", "b", "c"]) + assert np.isnan(ohe.categories_[0][-1]) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +def test_ohe_drop_first_handle_unknown_ignore_warns(handle_unknown): + """Check drop='first' and 
handle_unknown='ignore'/'infrequent_if_exist' + during transform.""" + X = [["a", 0], ["b", 2], ["b", 1]] + + ohe = OneHotEncoder( + drop="first", sparse_output=False, handle_unknown=handle_unknown + ) + X_trans = ohe.fit_transform(X) + + X_expected = np.array( + [ + [0, 0, 0], + [1, 0, 1], + [1, 1, 0], + ] + ) + assert_allclose(X_trans, X_expected) + + # Both categories are unknown + X_test = [["c", 3]] + X_expected = np.array([[0, 0, 0]]) + + warn_msg = ( + r"Found unknown categories in columns \[0, 1\] during " + "transform. These unknown categories will be encoded as all " + "zeros" + ) + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + # inverse_transform maps to None + X_inv = ohe.inverse_transform(X_expected) + assert_array_equal(X_inv, np.array([["a", 0]], dtype=object)) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +def test_ohe_drop_if_binary_handle_unknown_ignore_warns(handle_unknown): + """Check drop='if_binary' and handle_unknown='ignore' during transform.""" + X = [["a", 0], ["b", 2], ["b", 1]] + + ohe = OneHotEncoder( + drop="if_binary", sparse_output=False, handle_unknown=handle_unknown + ) + X_trans = ohe.fit_transform(X) + + X_expected = np.array( + [ + [0, 1, 0, 0], + [1, 0, 0, 1], + [1, 0, 1, 0], + ] + ) + assert_allclose(X_trans, X_expected) + + # Both categories are unknown + X_test = [["c", 3]] + X_expected = np.array([[0, 0, 0, 0]]) + + warn_msg = ( + r"Found unknown categories in columns \[0, 1\] during " + "transform. These unknown categories will be encoded as all " + "zeros" + ) + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + # inverse_transform maps to None + X_inv = ohe.inverse_transform(X_expected) + assert_array_equal(X_inv, np.array([["a", None]], dtype=object)) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +def test_ohe_drop_first_explicit_categories(handle_unknown): + """Check drop='first' and handle_unknown='ignore'/'infrequent_if_exist' + during fit with categories passed in.""" + + X = [["a", 0], ["b", 2], ["b", 1]] + + ohe = OneHotEncoder( + drop="first", + sparse_output=False, + handle_unknown=handle_unknown, + categories=[["b", "a"], [1, 2]], + ) + ohe.fit(X) + + X_test = [["c", 1]] + X_expected = np.array([[0, 0]]) + + warn_msg = ( + r"Found unknown categories in columns \[0\] during transform. " + r"These unknown categories will be encoded as all zeros" + ) + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + +def test_ohe_more_informative_error_message(): + """Raise informative error message when pandas output and sparse_output=True.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame({"a": [1, 2, 3], "b": ["z", "b", "b"]}, columns=["a", "b"]) + + ohe = OneHotEncoder(sparse_output=True) + ohe.set_output(transform="pandas") + + msg = ( + "Pandas output does not support sparse data. 
Set " + "sparse_output=False to output pandas dataframes or disable Pandas output" + ) + with pytest.raises(ValueError, match=msg): + ohe.fit_transform(df) + + ohe.fit(df) + with pytest.raises(ValueError, match=msg): + ohe.transform(df) + + +def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype(): + """Test ordinal encoder with nan passthrough fails when dtype=np.int32.""" + + X = np.array([[np.nan, 3.0, 1.0, 3.0]]).T + oe = OrdinalEncoder(dtype=np.int32) + + msg = ( + r"There are missing values in features \[0\]. For OrdinalEncoder " + f"to encode missing values with dtype: {np.int32}" + ) + with pytest.raises(ValueError, match=msg): + oe.fit(X) + + +@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2]) +def test_ordinal_encoder_passthrough_missing_values_float(encoded_missing_value): + """Test ordinal encoder with nan on float dtypes.""" + + X = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T + oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(X) + + assert len(oe.categories_) == 1 + + assert_allclose(oe.categories_[0], [1.0, 3.0, np.nan]) + + X_trans = oe.transform(X) + assert_allclose(X_trans, [[encoded_missing_value], [1.0], [0.0], [1.0]]) + + X_inverse = oe.inverse_transform(X_trans) + assert_allclose(X_inverse, X) + + +@pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"]) +@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2]) +def test_ordinal_encoder_missing_value_support_pandas_categorical( + pd_nan_type, encoded_missing_value +): + """Check ordinal encoder is compatible with pandas.""" + # checks pandas dataframe with categorical features + pd = pytest.importorskip("pandas") + + pd_missing_value = pd.NA if pd_nan_type == "pd.NA" else np.nan + + df = pd.DataFrame( + { + "col1": pd.Series(["c", "a", pd_missing_value, "b", "a"], dtype="category"), + } + ) + + oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(df) + assert len(oe.categories_) == 1 + assert_array_equal(oe.categories_[0][:3], ["a", "b", "c"]) + assert np.isnan(oe.categories_[0][-1]) + + df_trans = oe.transform(df) + + assert_allclose(df_trans, [[2.0], [0.0], [encoded_missing_value], [1.0], [0.0]]) + + X_inverse = oe.inverse_transform(df_trans) + assert X_inverse.shape == (5, 1) + assert_array_equal(X_inverse[:2, 0], ["c", "a"]) + assert_array_equal(X_inverse[3:, 0], ["b", "a"]) + assert np.isnan(X_inverse[2, 0]) + + +@pytest.mark.parametrize( + "X, X2, cats, cat_dtype", + [ + ( + ( + np.array([["a", np.nan]], dtype=object).T, + np.array([["a", "b"]], dtype=object).T, + [np.array(["a", "d", np.nan], dtype=object)], + np.object_, + ) + ), + ( + ( + np.array([["a", np.nan]], dtype=object).T, + np.array([["a", "b"]], dtype=object).T, + [np.array(["a", "d", np.nan], dtype=object)], + np.object_, + ) + ), + ( + ( + np.array([[2.0, np.nan]], dtype=np.float64).T, + np.array([[3.0]], dtype=np.float64).T, + [np.array([2.0, 4.0, np.nan])], + np.float64, + ) + ), + ], + ids=[ + "object-None-missing-value", + "object-nan-missing_value", + "numeric-missing-value", + ], +) +def test_ordinal_encoder_specified_categories_missing_passthrough( + X, X2, cats, cat_dtype +): + """Test ordinal encoder for specified categories.""" + oe = OrdinalEncoder(categories=cats) + exp = np.array([[0.0], [np.nan]]) + assert_array_equal(oe.fit_transform(X), exp) + # manually specified categories should have same dtype as + # the data when coerced from lists + assert oe.categories_[0].dtype == cat_dtype + + # when specifying categories manually, unknown categories 
should already + # raise when fitting + oe = OrdinalEncoder(categories=cats) + with pytest.raises(ValueError, match="Found unknown categories"): + oe.fit(X2) + + +@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder]) +def test_encoder_duplicate_specified_categories(Encoder): + """Test encoder for specified categories have duplicate values. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27088 + """ + cats = [np.array(["a", "b", "a"], dtype=object)] + enc = Encoder(categories=cats) + X = np.array([["a", "b"]], dtype=object).T + with pytest.raises( + ValueError, match="the predefined categories contain duplicate elements." + ): + enc.fit(X) + + +@pytest.mark.parametrize( + "X, expected_X_trans, X_test", + [ + ( + np.array([[1.0, np.nan, 3.0]]).T, + np.array([[0.0, np.nan, 1.0]]).T, + np.array([[4.0]]), + ), + ( + np.array([[1.0, 4.0, 3.0]]).T, + np.array([[0.0, 2.0, 1.0]]).T, + np.array([[np.nan]]), + ), + ( + np.array([["c", np.nan, "b"]], dtype=object).T, + np.array([[1.0, np.nan, 0.0]]).T, + np.array([["d"]], dtype=object), + ), + ( + np.array([["c", "a", "b"]], dtype=object).T, + np.array([[2.0, 0.0, 1.0]]).T, + np.array([[np.nan]], dtype=object), + ), + ], +) +def test_ordinal_encoder_handle_missing_and_unknown(X, expected_X_trans, X_test): + """Test the interaction between missing values and handle_unknown""" + + oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1) + + X_trans = oe.fit_transform(X) + assert_allclose(X_trans, expected_X_trans) + + assert_allclose(oe.transform(X_test), [[-1.0]]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_ordinal_encoder_sparse(csr_container): + """Check that we raise proper error with sparse input in OrdinalEncoder. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/19878 + """ + X = np.array([[3, 2, 1], [0, 1, 1]]) + X_sparse = csr_container(X) + + encoder = OrdinalEncoder() + + err_msg = "Sparse data was passed, but dense data is required" + with pytest.raises(TypeError, match=err_msg): + encoder.fit(X_sparse) + with pytest.raises(TypeError, match=err_msg): + encoder.fit_transform(X_sparse) + + X_trans = encoder.fit_transform(X) + X_trans_sparse = csr_container(X_trans) + with pytest.raises(TypeError, match=err_msg): + encoder.inverse_transform(X_trans_sparse) + + +def test_ordinal_encoder_fit_with_unseen_category(): + """Check OrdinalEncoder.fit works with unseen category when + `handle_unknown="use_encoded_value"`. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/19872 + """ + X = np.array([0, 0, 1, 0, 2, 5])[:, np.newaxis] + oe = OrdinalEncoder( + categories=[[-1, 0, 1]], handle_unknown="use_encoded_value", unknown_value=-999 + ) + oe.fit(X) + + oe = OrdinalEncoder(categories=[[-1, 0, 1]], handle_unknown="error") + with pytest.raises(ValueError, match="Found unknown categories"): + oe.fit(X) + + +@pytest.mark.parametrize( + "X_train", + [ + [["AA", "B"]], + np.array([["AA", "B"]], dtype="O"), + np.array([["AA", "B"]], dtype="U"), + ], +) +@pytest.mark.parametrize( + "X_test", + [ + [["A", "B"]], + np.array([["A", "B"]], dtype="O"), + np.array([["A", "B"]], dtype="U"), + ], +) +def test_ordinal_encoder_handle_unknown_string_dtypes(X_train, X_test): + """Checks that `OrdinalEncoder` transforms string dtypes. 
+ Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/19872 + """ + enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-9) + enc.fit(X_train) + + X_trans = enc.transform(X_test) + assert_allclose(X_trans, [[-9, 0]]) + + +def test_ordinal_encoder_python_integer(): + """Check that `OrdinalEncoder` accepts Python integers that are potentially + larger than 64 bits. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/20721 + """ + X = np.array( + [ + 44253463435747313673, + 9867966753463435747313673, + 44253462342215747313673, + 442534634357764313673, + ] + ).reshape(-1, 1) + encoder = OrdinalEncoder().fit(X) + assert_array_equal(encoder.categories_, np.sort(X, axis=0).T) + X_trans = encoder.transform(X) + assert_array_equal(X_trans, [[0], [3], [2], [1]]) + + +def test_ordinal_encoder_features_names_out_pandas(): + """Check feature names out is same as the input.""" + pd = pytest.importorskip("pandas") + + names = ["b", "c", "a"] + X = pd.DataFrame([[1, 2, 3]], columns=names) + enc = OrdinalEncoder().fit(X) + + feature_names_out = enc.get_feature_names_out() + assert_array_equal(names, feature_names_out) + + +def test_ordinal_encoder_unknown_missing_interaction(): + """Check interactions between encode_unknown and missing value encoding.""" + + X = np.array([["a"], ["b"], [np.nan]], dtype=object) + + oe = OrdinalEncoder( + handle_unknown="use_encoded_value", + unknown_value=np.nan, + encoded_missing_value=-3, + ).fit(X) + + X_trans = oe.transform(X) + assert_allclose(X_trans, [[0], [1], [-3]]) + + # "c" is unknown and is mapped to np.nan + # "None" is a missing value and is set to -3 + X_test = np.array([["c"], [np.nan]], dtype=object) + X_test_trans = oe.transform(X_test) + assert_allclose(X_test_trans, [[np.nan], [-3]]) + + # Non-regression test for #24082 + X_roundtrip = oe.inverse_transform(X_test_trans) + + # np.nan is unknown so it maps to None + assert X_roundtrip[0][0] is None + + # -3 is the encoded missing value so it maps back to nan + assert np.isnan(X_roundtrip[1][0]) + + +@pytest.mark.parametrize("with_pandas", [True, False]) +def test_ordinal_encoder_encoded_missing_value_error(with_pandas): + """Check OrdinalEncoder errors when encoded_missing_value is used by + an known category.""" + X = np.array([["a", "dog"], ["b", "cat"], ["c", np.nan]], dtype=object) + + # The 0-th feature has no missing values so it is not included in the list of + # features + error_msg = ( + r"encoded_missing_value \(1\) is already used to encode a known category " + r"in features: " + ) + + if with_pandas: + pd = pytest.importorskip("pandas") + X = pd.DataFrame(X, columns=["letter", "pet"]) + error_msg = error_msg + r"\['pet'\]" + else: + error_msg = error_msg + r"\[1\]" + + oe = OrdinalEncoder(encoded_missing_value=1) + + with pytest.raises(ValueError, match=error_msg): + oe.fit(X) + + +@pytest.mark.parametrize( + "X_train, X_test_trans_expected, X_roundtrip_expected", + [ + ( + # missing value is not in training set + # inverse transform will considering encoded nan as unknown + np.array([["a"], ["1"]], dtype=object), + [[0], [np.nan], [np.nan]], + np.asarray([["1"], [None], [None]], dtype=object), + ), + ( + # missing value in training set, + # inverse transform will considering encoded nan as missing + np.array([[np.nan], ["1"], ["a"]], dtype=object), + [[0], [np.nan], [np.nan]], + np.asarray([["1"], [np.nan], [np.nan]], dtype=object), + ), + ], +) +def test_ordinal_encoder_unknown_missing_interaction_both_nan( + X_train, 
X_test_trans_expected, X_roundtrip_expected +): + """Check transform when unknown_value and encoded_missing_value is nan. + + Non-regression test for #24082. + """ + oe = OrdinalEncoder( + handle_unknown="use_encoded_value", + unknown_value=np.nan, + encoded_missing_value=np.nan, + ).fit(X_train) + + X_test = np.array([["1"], [np.nan], ["b"]]) + X_test_trans = oe.transform(X_test) + + # both nan and unknown are encoded as nan + assert_allclose(X_test_trans, X_test_trans_expected) + X_roundtrip = oe.inverse_transform(X_test_trans) + + n_samples = X_roundtrip_expected.shape[0] + for i in range(n_samples): + expected_val = X_roundtrip_expected[i, 0] + val = X_roundtrip[i, 0] + + if expected_val is None: + assert val is None + elif is_scalar_nan(expected_val): + assert np.isnan(val) + else: + assert val == expected_val + + +def test_one_hot_encoder_set_output(): + """Check OneHotEncoder works with set_output.""" + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]}) + ohe = OneHotEncoder() + + ohe.set_output(transform="pandas") + + match = "Pandas output does not support sparse data. Set sparse_output=False" + with pytest.raises(ValueError, match=match): + ohe.fit_transform(X_df) + + ohe_default = OneHotEncoder(sparse_output=False).set_output(transform="default") + ohe_pandas = OneHotEncoder(sparse_output=False).set_output(transform="pandas") + + X_default = ohe_default.fit_transform(X_df) + X_pandas = ohe_pandas.fit_transform(X_df) + + assert_allclose(X_pandas.to_numpy(), X_default) + assert_array_equal(ohe_pandas.get_feature_names_out(), X_pandas.columns) + + +def test_ordinal_set_output(): + """Check OrdinalEncoder works with set_output.""" + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]}) + + ord_default = OrdinalEncoder().set_output(transform="default") + ord_pandas = OrdinalEncoder().set_output(transform="pandas") + + X_default = ord_default.fit_transform(X_df) + X_pandas = ord_pandas.fit_transform(X_df) + + assert_allclose(X_pandas.to_numpy(), X_default) + assert_array_equal(ord_pandas.get_feature_names_out(), X_pandas.columns) + + +def test_predefined_categories_dtype(): + """Check that the categories_ dtype is `object` for string categories + + Regression test for gh-25171. + """ + categories = [["as", "mmas", "eas", "ras", "acs"], ["1", "2"]] + + enc = OneHotEncoder(categories=categories) + + enc.fit([["as", "1"]]) + + assert len(categories) == len(enc.categories_) + for n, cat in enumerate(enc.categories_): + assert cat.dtype == object + assert_array_equal(categories[n], cat) + + +def test_ordinal_encoder_missing_unknown_encoding_max(): + """Check missing value or unknown encoding can equal the cardinality.""" + X = np.array([["dog"], ["cat"], [np.nan]], dtype=object) + X_trans = OrdinalEncoder(encoded_missing_value=2).fit_transform(X) + assert_allclose(X_trans, [[1], [0], [2]]) + + enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=2).fit(X) + X_test = np.array([["snake"]]) + X_trans = enc.transform(X_test) + assert_allclose(X_trans, [[2]]) + + +def test_drop_idx_infrequent_categories(): + """Check drop_idx is defined correctly with infrequent categories. + + Non-regression test for gh-25550. 
+ """ + X = np.array( + [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object + ).T + ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X) + assert_array_equal( + ohe.get_feature_names_out(), ["x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"] + ) + assert ohe.categories_[0][ohe.drop_idx_[0]] == "b" + + X = np.array([["a"] * 2 + ["b"] * 2 + ["c"] * 10], dtype=object).T + ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="if_binary").fit(X) + assert_array_equal(ohe.get_feature_names_out(), ["x0_infrequent_sklearn"]) + assert ohe.categories_[0][ohe.drop_idx_[0]] == "c" + + X = np.array( + [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object + ).T + ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=["d"]).fit(X) + assert_array_equal( + ohe.get_feature_names_out(), ["x0_b", "x0_c", "x0_e", "x0_infrequent_sklearn"] + ) + assert ohe.categories_[0][ohe.drop_idx_[0]] == "d" + + ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=None).fit(X) + assert_array_equal( + ohe.get_feature_names_out(), + ["x0_b", "x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"], + ) + assert ohe.drop_idx_ is None + + +@pytest.mark.parametrize( + "kwargs", + [ + {"max_categories": 3}, + {"min_frequency": 6}, + {"min_frequency": 9}, + {"min_frequency": 0.24}, + {"min_frequency": 0.16}, + {"max_categories": 3, "min_frequency": 8}, + {"max_categories": 4, "min_frequency": 6}, + ], +) +def test_ordinal_encoder_infrequent_three_levels(kwargs): + """Test parameters for grouping 'a', and 'd' into the infrequent category.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ordinal = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1, **kwargs + ).fit(X_train) + assert_array_equal(ordinal.categories_, [["a", "b", "c", "d"]]) + assert_array_equal(ordinal.infrequent_categories_, [["a", "d"]]) + + X_test = [["a"], ["b"], ["c"], ["d"], ["z"]] + expected_trans = [[2], [0], [1], [2], [-1]] + + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, expected_trans) + + X_inverse = ordinal.inverse_transform(X_trans) + expected_inverse = [ + ["infrequent_sklearn"], + ["b"], + ["c"], + ["infrequent_sklearn"], + [None], + ] + assert_array_equal(X_inverse, expected_inverse) + + +def test_ordinal_encoder_infrequent_three_levels_user_cats(): + """Test that the order of the categories provided by a user is respected. + + In this case 'c' is encoded as the first category and 'b' is encoded + as the second one. 
+ """ + + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object + ).T + ordinal = OrdinalEncoder( + categories=[["c", "d", "b", "a"]], + max_categories=3, + handle_unknown="use_encoded_value", + unknown_value=-1, + ).fit(X_train) + assert_array_equal(ordinal.categories_, [["c", "d", "b", "a"]]) + assert_array_equal(ordinal.infrequent_categories_, [["d", "a"]]) + + X_test = [["a"], ["b"], ["c"], ["d"], ["z"]] + expected_trans = [[2], [1], [0], [2], [-1]] + + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, expected_trans) + + X_inverse = ordinal.inverse_transform(X_trans) + expected_inverse = [ + ["infrequent_sklearn"], + ["b"], + ["c"], + ["infrequent_sklearn"], + [None], + ] + assert_array_equal(X_inverse, expected_inverse) + + +def test_ordinal_encoder_infrequent_mixed(): + """Test when feature 0 has infrequent categories and feature 1 does not.""" + + X = np.column_stack(([0, 1, 3, 3, 3, 3, 2, 0, 3], [0, 0, 0, 0, 1, 1, 1, 1, 1])) + + ordinal = OrdinalEncoder(max_categories=3).fit(X) + + assert_array_equal(ordinal.infrequent_categories_[0], [1, 2]) + assert ordinal.infrequent_categories_[1] is None + + X_test = [[3, 0], [1, 1]] + expected_trans = [[1, 0], [2, 1]] + + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, expected_trans) + + X_inverse = ordinal.inverse_transform(X_trans) + expected_inverse = np.array([[3, 0], ["infrequent_sklearn", 1]], dtype=object) + assert_array_equal(X_inverse, expected_inverse) + + +def test_ordinal_encoder_infrequent_multiple_categories_dtypes(): + """Test infrequent categories with a pandas DataFrame with multiple dtypes.""" + + pd = pytest.importorskip("pandas") + categorical_dtype = pd.CategoricalDtype(["bird", "cat", "dog", "snake"]) + X = pd.DataFrame( + { + "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"], + "int": [5, 3, 0, 10, 10, 12, 0, 3, 5], + "categorical": pd.Series( + ["dog"] * 4 + ["cat"] * 3 + ["snake"] + ["bird"], + dtype=categorical_dtype, + ), + }, + columns=["str", "int", "categorical"], + ) + + ordinal = OrdinalEncoder(max_categories=3).fit(X) + # X[:, 0] 'a', 'b', 'c' have the same frequency. 'a' and 'b' will be + # considered infrequent because they appear first when sorted + + # X[:, 1] 0, 3, 5, 10 has frequency 2 and 12 has frequency 1. + # 0, 3, 12 will be considered infrequent because they appear first when + # sorted. 
+ + # X[:, 2] "snake" and "bird" are infrequent + + assert_array_equal(ordinal.infrequent_categories_[0], ["a", "b"]) + assert_array_equal(ordinal.infrequent_categories_[1], [0, 3, 12]) + assert_array_equal(ordinal.infrequent_categories_[2], ["bird", "snake"]) + + X_test = pd.DataFrame( + { + "str": ["a", "b", "f", "c"], + "int": [12, 0, 10, 5], + "categorical": pd.Series( + ["cat"] + ["snake"] + ["bird"] + ["dog"], + dtype=categorical_dtype, + ), + }, + columns=["str", "int", "categorical"], + ) + expected_trans = [[2, 2, 0], [2, 2, 2], [1, 1, 2], [0, 0, 1]] + + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, expected_trans) + + +def test_ordinal_encoder_infrequent_custom_mapping(): + """Check behavior of unknown_value and encoded_missing_value with infrequent.""" + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3 + [np.nan]], dtype=object + ).T + + ordinal = OrdinalEncoder( + handle_unknown="use_encoded_value", + unknown_value=2, + max_categories=2, + encoded_missing_value=3, + ).fit(X_train) + assert_array_equal(ordinal.infrequent_categories_, [["a", "c", "d"]]) + + X_test = np.array([["a"], ["b"], ["c"], ["d"], ["e"], [np.nan]], dtype=object) + expected_trans = [[1], [0], [1], [1], [2], [3]] + + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, expected_trans) + + +@pytest.mark.parametrize( + "kwargs", + [ + {"max_categories": 6}, + {"min_frequency": 2}, + ], +) +def test_ordinal_encoder_all_frequent(kwargs): + """All categories are considered frequent and have the same encoding as the default encoder.""" + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object + ).T + + adjusted_encoder = OrdinalEncoder( + **kwargs, handle_unknown="use_encoded_value", unknown_value=-1 + ).fit(X_train) + default_encoder = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1 + ).fit(X_train) + + X_test = [["a"], ["b"], ["c"], ["d"], ["e"]] + + assert_allclose( + adjusted_encoder.transform(X_test), default_encoder.transform(X_test) + ) + + +@pytest.mark.parametrize( + "kwargs", + [ + {"max_categories": 1}, + {"min_frequency": 100}, + ], +) +def test_ordinal_encoder_all_infrequent(kwargs): + """When all categories are infrequent, they are all encoded as zero.""" + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object + ).T + encoder = OrdinalEncoder( + **kwargs, handle_unknown="use_encoded_value", unknown_value=-1 + ).fit(X_train) + + X_test = [["a"], ["b"], ["c"], ["d"], ["e"]] + assert_allclose(encoder.transform(X_test), [[0], [0], [0], [0], [-1]]) + + +def test_ordinal_encoder_missing_appears_frequent(): + """Check behavior when missing value appears frequently.""" + X = np.array( + [[np.nan] * 20 + ["dog"] * 10 + ["cat"] * 5 + ["snake"] + ["deer"]], + dtype=object, + ).T + ordinal = OrdinalEncoder(max_categories=3).fit(X) + + X_test = np.array([["snake", "cat", "dog", np.nan]], dtype=object).T + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, [[2], [0], [1], [np.nan]]) + + +def test_ordinal_encoder_missing_appears_infrequent(): + """Check behavior when missing value appears infrequently.""" + + # feature 0 has infrequent categories + # feature 1 has no infrequent categories + X = np.array( + [ + [np.nan] + ["dog"] * 10 + ["cat"] * 5 + ["snake"] + ["deer"], + ["red"] * 9 + ["green"] * 9, + ], + dtype=object, + ).T + ordinal = OrdinalEncoder(min_frequency=4).fit(X) + + X_test = np.array( + [ + ["snake", "red"], + ["deer", "green"], + [np.nan, "green"], + ["dog", "green"],
+ ["cat", "red"], + ], + dtype=object, + ) + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, [[2, 1], [2, 0], [np.nan, 0], [1, 0], [0, 1]]) + + +@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder]) +def test_encoder_not_fitted(Encoder): + """Check that we raise a `NotFittedError` by calling transform before fit with + the encoders. + + One could expect that the passing the `categories` argument to the encoder + would make it stateless. However, `fit` is making a couple of check, such as the + position of `np.nan`. + """ + X = np.array([["A"], ["B"], ["C"]], dtype=object) + encoder = Encoder(categories=[["A", "B", "C"]]) + with pytest.raises(NotFittedError): + encoder.transform(X) diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_function_transformer.py b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_function_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..42ec20087900ad7fffabf7a675c0affaa897b96b --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_function_transformer.py @@ -0,0 +1,579 @@ +import warnings + +import numpy as np +import pytest + +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, StandardScaler +from sklearn.utils._testing import ( + _convert_container, + assert_allclose_dense_sparse, + assert_array_equal, +) +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS + + +def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X): + def _func(X, *args, **kwargs): + args_store.append(X) + args_store.extend(args) + kwargs_store.update(kwargs) + return func(X) + + return _func + + +def test_delegate_to_func(): + # (args|kwargs)_store will hold the positional and keyword arguments + # passed to the function inside the FunctionTransformer. + args_store = [] + kwargs_store = {} + X = np.arange(10).reshape((5, 2)) + assert_array_equal( + FunctionTransformer(_make_func(args_store, kwargs_store)).transform(X), + X, + "transform should have returned X unchanged", + ) + + # The function should only have received X. + assert args_store == [ + X + ], "Incorrect positional arguments passed to func: {args}".format(args=args_store) + + assert ( + not kwargs_store + ), "Unexpected keyword arguments passed to func: {args}".format(args=kwargs_store) + + # reset the argument stores. + args_store[:] = [] + kwargs_store.clear() + transformed = FunctionTransformer( + _make_func(args_store, kwargs_store), + ).transform(X) + + assert_array_equal( + transformed, X, err_msg="transform should have returned X unchanged" + ) + + # The function should have received X + assert args_store == [ + X + ], "Incorrect positional arguments passed to func: {args}".format(args=args_store) + + assert ( + not kwargs_store + ), "Unexpected keyword arguments passed to func: {args}".format(args=kwargs_store) + + +def test_np_log(): + X = np.arange(10).reshape((5, 2)) + + # Test that the numpy.log example still works. 
+ assert_array_equal( + FunctionTransformer(np.log1p).transform(X), + np.log1p(X), + ) + + +def test_kw_arg(): + X = np.linspace(0, 1, num=10).reshape((5, 2)) + + F = FunctionTransformer(np.around, kw_args=dict(decimals=3)) + + # Test that rounding is correct + assert_array_equal(F.transform(X), np.around(X, decimals=3)) + + +def test_kw_arg_update(): + X = np.linspace(0, 1, num=10).reshape((5, 2)) + + F = FunctionTransformer(np.around, kw_args=dict(decimals=3)) + + F.kw_args["decimals"] = 1 + + # Test that rounding is correct + assert_array_equal(F.transform(X), np.around(X, decimals=1)) + + +def test_kw_arg_reset(): + X = np.linspace(0, 1, num=10).reshape((5, 2)) + + F = FunctionTransformer(np.around, kw_args=dict(decimals=3)) + + F.kw_args = dict(decimals=1) + + # Test that rounding is correct + assert_array_equal(F.transform(X), np.around(X, decimals=1)) + + +def test_inverse_transform(): + X = np.array([1, 4, 9, 16]).reshape((2, 2)) + + # Test that inverse_transform works correctly + F = FunctionTransformer( + func=np.sqrt, + inverse_func=np.around, + inv_kw_args=dict(decimals=3), + ) + assert_array_equal( + F.inverse_transform(F.transform(X)), + np.around(np.sqrt(X), decimals=3), + ) + + +@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) +def test_check_inverse(sparse_container): + X = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2)) + if sparse_container is not None: + X = sparse_container(X) + + trans = FunctionTransformer( + func=np.sqrt, + inverse_func=np.around, + accept_sparse=sparse_container is not None, + check_inverse=True, + validate=True, + ) + warning_message = ( + "The provided functions are not strictly" + " inverse of each other. If you are sure you" + " want to proceed regardless, set" + " 'check_inverse=False'." + ) + with pytest.warns(UserWarning, match=warning_message): + trans.fit(X) + + trans = FunctionTransformer( + func=np.expm1, + inverse_func=np.log1p, + accept_sparse=sparse_container is not None, + check_inverse=True, + validate=True, + ) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + Xt = trans.fit_transform(X) + + assert_allclose_dense_sparse(X, trans.inverse_transform(Xt)) + + +def test_check_inverse_func_or_inverse_not_provided(): + # check that we don't check inverse when one of the func or inverse is not + # provided. 
+ X = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2)) + + trans = FunctionTransformer( + func=np.expm1, inverse_func=None, check_inverse=True, validate=True + ) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + trans.fit(X) + trans = FunctionTransformer( + func=None, inverse_func=np.expm1, check_inverse=True, validate=True + ) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + trans.fit(X) + + +def test_function_transformer_frame(): + pd = pytest.importorskip("pandas") + X_df = pd.DataFrame(np.random.randn(100, 10)) + transformer = FunctionTransformer() + X_df_trans = transformer.fit_transform(X_df) + assert hasattr(X_df_trans, "loc") + + +@pytest.mark.parametrize("X_type", ["array", "series"]) +def test_function_transformer_raise_error_with_mixed_dtype(X_type): + """Check that `FunctionTransformer.check_inverse` raises error on mixed dtype.""" + mapping = {"one": 1, "two": 2, "three": 3, 5: "five", 6: "six"} + inverse_mapping = {value: key for key, value in mapping.items()} + dtype = "object" + + data = ["one", "two", "three", "one", "one", 5, 6] + data = _convert_container(data, X_type, columns_name=["value"], dtype=dtype) + + def func(X): + return np.array([mapping[X[i]] for i in range(X.size)], dtype=object) + + def inverse_func(X): + return _convert_container( + [inverse_mapping[x] for x in X], + X_type, + columns_name=["value"], + dtype=dtype, + ) + + transformer = FunctionTransformer( + func=func, inverse_func=inverse_func, validate=False, check_inverse=True + ) + + msg = "'check_inverse' is only supported when all the elements in `X` is numerical." + with pytest.raises(ValueError, match=msg): + transformer.fit(data) + + +def test_function_transformer_support_all_nummerical_dataframes_check_inverse_True(): + """Check support for dataframes with only numerical values.""" + pd = pytest.importorskip("pandas") + + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + transformer = FunctionTransformer( + func=lambda x: x + 2, inverse_func=lambda x: x - 2, check_inverse=True + ) + + # Does not raise an error + df_out = transformer.fit_transform(df) + assert_allclose_dense_sparse(df_out, df + 2) + + +def test_function_transformer_with_dataframe_and_check_inverse_True(): + """Check error is raised when check_inverse=True. + + Non-regresion test for gh-25261. + """ + pd = pytest.importorskip("pandas") + transformer = FunctionTransformer( + func=lambda x: x, inverse_func=lambda x: x, check_inverse=True + ) + + df_mixed = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + msg = "'check_inverse' is only supported when all the elements in `X` is numerical." 
+ with pytest.raises(ValueError, match=msg): + transformer.fit(df_mixed) + + +@pytest.mark.parametrize( + "X, feature_names_out, input_features, expected", + [ + ( + # NumPy inputs, default behavior: generate names + np.random.rand(100, 3), + "one-to-one", + None, + ("x0", "x1", "x2"), + ), + ( + # Pandas input, default behavior: use input feature names + {"a": np.random.rand(100), "b": np.random.rand(100)}, + "one-to-one", + None, + ("a", "b"), + ), + ( + # NumPy input, feature_names_out=callable + np.random.rand(100, 3), + lambda transformer, input_features: ("a", "b"), + None, + ("a", "b"), + ), + ( + # Pandas input, feature_names_out=callable + {"a": np.random.rand(100), "b": np.random.rand(100)}, + lambda transformer, input_features: ("c", "d", "e"), + None, + ("c", "d", "e"), + ), + ( + # NumPy input, feature_names_out=callable – default input_features + np.random.rand(100, 3), + lambda transformer, input_features: tuple(input_features) + ("a",), + None, + ("x0", "x1", "x2", "a"), + ), + ( + # Pandas input, feature_names_out=callable – default input_features + {"a": np.random.rand(100), "b": np.random.rand(100)}, + lambda transformer, input_features: tuple(input_features) + ("c",), + None, + ("a", "b", "c"), + ), + ( + # NumPy input, input_features=list of names + np.random.rand(100, 3), + "one-to-one", + ("a", "b", "c"), + ("a", "b", "c"), + ), + ( + # Pandas input, input_features=list of names + {"a": np.random.rand(100), "b": np.random.rand(100)}, + "one-to-one", + ("a", "b"), # must match feature_names_in_ + ("a", "b"), + ), + ( + # NumPy input, feature_names_out=callable, input_features=list + np.random.rand(100, 3), + lambda transformer, input_features: tuple(input_features) + ("d",), + ("a", "b", "c"), + ("a", "b", "c", "d"), + ), + ( + # Pandas input, feature_names_out=callable, input_features=list + {"a": np.random.rand(100), "b": np.random.rand(100)}, + lambda transformer, input_features: tuple(input_features) + ("c",), + ("a", "b"), # must match feature_names_in_ + ("a", "b", "c"), + ), + ], +) +@pytest.mark.parametrize("validate", [True, False]) +def test_function_transformer_get_feature_names_out( + X, feature_names_out, input_features, expected, validate +): + if isinstance(X, dict): + pd = pytest.importorskip("pandas") + X = pd.DataFrame(X) + + transformer = FunctionTransformer( + feature_names_out=feature_names_out, validate=validate + ) + transformer.fit(X) + names = transformer.get_feature_names_out(input_features) + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, expected) + + +def test_function_transformer_get_feature_names_out_without_validation(): + transformer = FunctionTransformer(feature_names_out="one-to-one", validate=False) + X = np.random.rand(100, 2) + transformer.fit_transform(X) + + names = transformer.get_feature_names_out(("a", "b")) + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, ("a", "b")) + + +def test_function_transformer_feature_names_out_is_None(): + transformer = FunctionTransformer() + X = np.random.rand(100, 2) + transformer.fit_transform(X) + + msg = "This 'FunctionTransformer' has no attribute 'get_feature_names_out'" + with pytest.raises(AttributeError, match=msg): + transformer.get_feature_names_out() + + +def test_function_transformer_feature_names_out_uses_estimator(): + def add_n_random_features(X, n): + return np.concatenate([X, np.random.rand(len(X), n)], axis=1) + + def feature_names_out(transformer, input_features): + n = 
transformer.kw_args["n"] + return list(input_features) + [f"rnd{i}" for i in range(n)] + + transformer = FunctionTransformer( + func=add_n_random_features, + feature_names_out=feature_names_out, + kw_args=dict(n=3), + validate=True, + ) + pd = pytest.importorskip("pandas") + df = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)}) + transformer.fit_transform(df) + names = transformer.get_feature_names_out() + + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, ("a", "b", "rnd0", "rnd1", "rnd2")) + + +def test_function_transformer_validate_inverse(): + """Test that function transformer does not reset estimator in + `inverse_transform`.""" + + def add_constant_feature(X): + X_one = np.ones((X.shape[0], 1)) + return np.concatenate((X, X_one), axis=1) + + def inverse_add_constant(X): + return X[:, :-1] + + X = np.array([[1, 2], [3, 4], [3, 4]]) + trans = FunctionTransformer( + func=add_constant_feature, + inverse_func=inverse_add_constant, + validate=True, + ) + X_trans = trans.fit_transform(X) + assert trans.n_features_in_ == X.shape[1] + + trans.inverse_transform(X_trans) + assert trans.n_features_in_ == X.shape[1] + + +@pytest.mark.parametrize( + "feature_names_out, expected", + [ + ("one-to-one", ["pet", "color"]), + [lambda est, names: [f"{n}_out" for n in names], ["pet_out", "color_out"]], + ], +) +@pytest.mark.parametrize("in_pipeline", [True, False]) +def test_get_feature_names_out_dataframe_with_string_data( + feature_names_out, expected, in_pipeline +): + """Check that get_feature_names_out works with DataFrames with string data.""" + pd = pytest.importorskip("pandas") + X = pd.DataFrame({"pet": ["dog", "cat"], "color": ["red", "green"]}) + + def func(X): + if feature_names_out == "one-to-one": + return X + else: + name = feature_names_out(None, X.columns) + return X.rename(columns=dict(zip(X.columns, name))) + + transformer = FunctionTransformer(func=func, feature_names_out=feature_names_out) + if in_pipeline: + transformer = make_pipeline(transformer) + + X_trans = transformer.fit_transform(X) + assert isinstance(X_trans, pd.DataFrame) + + names = transformer.get_feature_names_out() + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, expected) + + +def test_set_output_func(): + """Check behavior of set_output with different settings.""" + pd = pytest.importorskip("pandas") + + X = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]}) + + ft = FunctionTransformer(np.log, feature_names_out="one-to-one") + + # no warning is raised when feature_names_out is defined + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + ft.set_output(transform="pandas") + + X_trans = ft.fit_transform(X) + assert isinstance(X_trans, pd.DataFrame) + assert_array_equal(X_trans.columns, ["a", "b"]) + + ft = FunctionTransformer(lambda x: 2 * x) + ft.set_output(transform="pandas") + + # no warning is raised when func returns a panda dataframe + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + X_trans = ft.fit_transform(X) + assert isinstance(X_trans, pd.DataFrame) + assert_array_equal(X_trans.columns, ["a", "b"]) + + # Warning is raised when func returns a ndarray + ft_np = FunctionTransformer(lambda x: np.asarray(x)) + + for transform in ("pandas", "polars"): + ft_np.set_output(transform=transform) + msg = ( + f"When `set_output` is configured to be '{transform}'.*{transform} " + "DataFrame.*" + ) + with pytest.warns(UserWarning, match=msg): + 
ft_np.fit_transform(X) + + # default transform does not warn + ft_np.set_output(transform="default") + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + ft_np.fit_transform(X) + + +def test_consistence_column_name_between_steps(): + """Check that we have a consistence between the feature names out of + `FunctionTransformer` and the feature names in of the next step in the pipeline. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27695 + """ + pd = pytest.importorskip("pandas") + + def with_suffix(_, names): + return [name + "__log" for name in names] + + pipeline = make_pipeline( + FunctionTransformer(np.log1p, feature_names_out=with_suffix), StandardScaler() + ) + + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["a", "b"]) + X_trans = pipeline.fit_transform(df) + assert pipeline.get_feature_names_out().tolist() == ["a__log", "b__log"] + # StandardScaler will convert to a numpy array + assert isinstance(X_trans, np.ndarray) + + +@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"]) +@pytest.mark.parametrize("transform_output", ["default", "pandas", "polars"]) +def test_function_transformer_overwrite_column_names(dataframe_lib, transform_output): + """Check that we overwrite the column names when we should.""" + lib = pytest.importorskip(dataframe_lib) + if transform_output != "numpy": + pytest.importorskip(transform_output) + + df = lib.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]}) + + def with_suffix(_, names): + return [name + "__log" for name in names] + + transformer = FunctionTransformer(feature_names_out=with_suffix).set_output( + transform=transform_output + ) + X_trans = transformer.fit_transform(df) + assert_array_equal(np.asarray(X_trans), np.asarray(df)) + + feature_names = transformer.get_feature_names_out() + assert list(X_trans.columns) == with_suffix(None, df.columns) + assert feature_names.tolist() == with_suffix(None, df.columns) + + +@pytest.mark.parametrize( + "feature_names_out", + ["one-to-one", lambda _, names: [f"{name}_log" for name in names]], +) +def test_function_transformer_overwrite_column_names_numerical(feature_names_out): + """Check the same as `test_function_transformer_overwrite_column_names` + but for the specific case of pandas where column names can be numerical.""" + pd = pytest.importorskip("pandas") + + df = pd.DataFrame({0: [1, 2, 3], 1: [10, 20, 100]}) + + transformer = FunctionTransformer(feature_names_out=feature_names_out) + X_trans = transformer.fit_transform(df) + assert_array_equal(np.asarray(X_trans), np.asarray(df)) + + feature_names = transformer.get_feature_names_out() + assert list(X_trans.columns) == list(feature_names) + + +@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"]) +@pytest.mark.parametrize( + "feature_names_out", + ["one-to-one", lambda _, names: [f"{name}_log" for name in names]], +) +def test_function_transformer_error_column_inconsistent( + dataframe_lib, feature_names_out +): + """Check that we raise an error when `func` returns a dataframe with new + column names that become inconsistent with `get_feature_names_out`.""" + lib = pytest.importorskip(dataframe_lib) + + df = lib.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]}) + + def func(df): + if dataframe_lib == "pandas": + return df.rename(columns={"a": "c"}) + else: + return df.rename({"a": "c"}) + + transformer = FunctionTransformer(func=func, feature_names_out=feature_names_out) + err_msg = "The output generated by `func` have different column names" + with 
pytest.raises(ValueError, match=err_msg): + transformer.fit_transform(df).columns diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_label.py b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_label.py new file mode 100644 index 0000000000000000000000000000000000000000..c0aa85b2cfe70bcd2fb85c54aca977450c0e32ec --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_label.py @@ -0,0 +1,745 @@ +import numpy as np +import pytest +from scipy.sparse import issparse + +from sklearn import config_context, datasets +from sklearn.preprocessing._label import ( + LabelBinarizer, + LabelEncoder, + MultiLabelBinarizer, + _inverse_binarize_multiclass, + _inverse_binarize_thresholding, + label_binarize, +) +from sklearn.utils._array_api import ( + _convert_to_numpy, + get_namespace, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._testing import ( + _array_api_for_tests, + assert_array_equal, +) +from sklearn.utils.fixes import ( + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) +from sklearn.utils.multiclass import type_of_target +from sklearn.utils.validation import _to_object_array + +iris = datasets.load_iris() + + +def toarray(a): + if hasattr(a, "toarray"): + a = a.toarray() + return a + + +def test_label_binarizer(): + # one-class case defaults to negative label + # For dense case: + inp = ["pos", "pos", "pos", "pos"] + lb = LabelBinarizer(sparse_output=False) + expected = np.array([[0, 0, 0, 0]]).T + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ["pos"]) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + # For sparse case: + lb = LabelBinarizer(sparse_output=True) + got = lb.fit_transform(inp) + assert issparse(got) + assert_array_equal(lb.classes_, ["pos"]) + assert_array_equal(expected, got.toarray()) + assert_array_equal(lb.inverse_transform(got.toarray()), inp) + + lb = LabelBinarizer(sparse_output=False) + # two-class case + inp = ["neg", "pos", "pos", "neg"] + expected = np.array([[0, 1, 1, 0]]).T + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ["neg", "pos"]) + assert_array_equal(expected, got) + + to_invert = np.array([[1, 0], [0, 1], [0, 1], [1, 0]]) + assert_array_equal(lb.inverse_transform(to_invert), inp) + + # multi-class case + inp = ["spam", "ham", "eggs", "ham", "0"] + expected = np.array( + [[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]] + ) + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ["0", "eggs", "ham", "spam"]) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + +def test_label_binarizer_unseen_labels(): + lb = LabelBinarizer() + + expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + got = lb.fit_transform(["b", "d", "e"]) + assert_array_equal(expected, got) + + expected = np.array( + [[0, 0, 0], [1, 0, 0], [0, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 0]] + ) + got = lb.transform(["a", "b", "c", "d", "e", "f"]) + assert_array_equal(expected, got) + + +def test_label_binarizer_set_label_encoding(): + lb = LabelBinarizer(neg_label=-2, pos_label=0) + + # two-class case with pos_label=0 + inp = np.array([0, 1, 1, 0]) + expected = np.array([[-2, 0, 0, -2]]).T + got = lb.fit_transform(inp) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + lb = LabelBinarizer(neg_label=-2, pos_label=2) + + # multi-class case + inp = np.array([3, 2, 1, 2, 0]) + expected = np.array( + [ + [-2, -2, 
-2, +2], + [-2, -2, +2, -2], + [-2, +2, -2, -2], + [-2, -2, +2, -2], + [+2, -2, -2, -2], + ] + ) + got = lb.fit_transform(inp) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + +@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) +@pytest.mark.parametrize("unique_first", [True, False]) +def test_label_binarizer_pandas_nullable(dtype, unique_first): + """Checks that LabelBinarizer works with pandas nullable dtypes. + + Non-regression test for gh-25637. + """ + pd = pytest.importorskip("pandas") + + y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype) + if unique_first: + # Calling unique creates a pandas array which has a different interface + # compared to a pandas Series. Specifically, pandas arrays do not have "iloc". + y_true = y_true.unique() + lb = LabelBinarizer().fit(y_true) + y_out = lb.transform([1, 0]) + + assert_array_equal(y_out, [[1], [0]]) + + +def test_label_binarizer_errors(): + # Check that invalid arguments yield ValueError + one_class = np.array([0, 0, 0, 0]) + lb = LabelBinarizer().fit(one_class) + + multi_label = [(2, 3), (0,), (0, 2)] + err_msg = "You appear to be using a legacy multi-label data representation." + with pytest.raises(ValueError, match=err_msg): + lb.transform(multi_label) + + lb = LabelBinarizer() + err_msg = "This LabelBinarizer instance is not fitted yet" + with pytest.raises(ValueError, match=err_msg): + lb.transform([]) + with pytest.raises(ValueError, match=err_msg): + lb.inverse_transform([]) + + input_labels = [0, 1, 0, 1] + err_msg = "neg_label=2 must be strictly less than pos_label=1." + lb = LabelBinarizer(neg_label=2, pos_label=1) + with pytest.raises(ValueError, match=err_msg): + lb.fit(input_labels) + err_msg = "neg_label=2 must be strictly less than pos_label=2." 
+ lb = LabelBinarizer(neg_label=2, pos_label=2) + with pytest.raises(ValueError, match=err_msg): + lb.fit(input_labels) + err_msg = ( + "Sparse binarization is only supported with non zero pos_label and zero " + "neg_label, got pos_label=2 and neg_label=1" + ) + lb = LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True) + with pytest.raises(ValueError, match=err_msg): + lb.fit(input_labels) + + # Sequence of seq type should raise ValueError + y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]] + err_msg = "You appear to be using a legacy multi-label data representation" + with pytest.raises(ValueError, match=err_msg): + LabelBinarizer().fit_transform(y_seq_of_seqs) + + # Fail on the dimension of 'binary' + err_msg = "output_type='binary', but y.shape" + with pytest.raises(ValueError, match=err_msg): + _inverse_binarize_thresholding( + y=np.array([[1, 2, 3], [2, 1, 3]]), + output_type="binary", + classes=[1, 2, 3], + threshold=0, + ) + + # Fail on multioutput data + err_msg = "Multioutput target data is not supported with label binarization" + with pytest.raises(ValueError, match=err_msg): + LabelBinarizer().fit(np.array([[1, 3], [2, 1]])) + with pytest.raises(ValueError, match=err_msg): + label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_label_binarizer_sparse_errors(csr_container): + # Fail on y_type + err_msg = "foo format is not supported" + with pytest.raises(ValueError, match=err_msg): + _inverse_binarize_thresholding( + y=csr_container([[1, 2], [2, 1]]), + output_type="foo", + classes=[1, 2], + threshold=0, + ) + + # Fail on the number of classes + err_msg = "The number of class is not equal to the number of dimension of y." + with pytest.raises(ValueError, match=err_msg): + _inverse_binarize_thresholding( + y=csr_container([[1, 2], [2, 1]]), + output_type="foo", + classes=[1, 2, 3], + threshold=0, + ) + + +@pytest.mark.parametrize( + "values, classes, unknown", + [ + ( + np.array([2, 1, 3, 1, 3], dtype="int64"), + np.array([1, 2, 3], dtype="int64"), + np.array([4], dtype="int64"), + ), + ( + np.array(["b", "a", "c", "a", "c"], dtype=object), + np.array(["a", "b", "c"], dtype=object), + np.array(["d"], dtype=object), + ), + ( + np.array(["b", "a", "c", "a", "c"]), + np.array(["a", "b", "c"]), + np.array(["d"]), + ), + ], + ids=["int64", "object", "str"], +) +def test_label_encoder(values, classes, unknown): + # Test LabelEncoder's transform, fit_transform and + # inverse_transform methods + le = LabelEncoder() + le.fit(values) + assert_array_equal(le.classes_, classes) + assert_array_equal(le.transform(values), [1, 0, 2, 0, 2]) + assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values) + le = LabelEncoder() + ret = le.fit_transform(values) + assert_array_equal(ret, [1, 0, 2, 0, 2]) + + with pytest.raises(ValueError, match="unseen labels"): + le.transform(unknown) + + +def test_label_encoder_negative_ints(): + le = LabelEncoder() + le.fit([1, 1, 4, 5, -1, 0]) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) + assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]), [1, 2, 3, 3, 4, 0, 0]) + assert_array_equal( + le.inverse_transform([1, 2, 3, 3, 4, 0, 0]), [0, 1, 4, 4, 5, -1, -1] + ) + with pytest.raises(ValueError): + le.transform([0, 6]) + + +@pytest.mark.parametrize("dtype", ["str", "object"]) +def test_label_encoder_str_bad_shape(dtype): + le = LabelEncoder() + le.fit(np.array(["apple", "orange"], dtype=dtype)) + msg = "should be a 1d array" + with pytest.raises(ValueError, 
match=msg): + le.transform("apple") + + +def test_label_encoder_errors(): + # Check that invalid arguments yield ValueError + le = LabelEncoder() + with pytest.raises(ValueError): + le.transform([]) + with pytest.raises(ValueError): + le.inverse_transform([]) + + # Fail on unseen labels + le = LabelEncoder() + le.fit([1, 2, 3, -1, 1]) + msg = "contains previously unseen labels" + with pytest.raises(ValueError, match=msg): + le.inverse_transform([-2]) + with pytest.raises(ValueError, match=msg): + le.inverse_transform([-2, -3, -4]) + + # Fail on inverse_transform("") + msg = r"should be a 1d array.+shape \(\)" + with pytest.raises(ValueError, match=msg): + le.inverse_transform("") + + +@pytest.mark.parametrize( + "values", + [ + np.array([2, 1, 3, 1, 3], dtype="int64"), + np.array(["b", "a", "c", "a", "c"], dtype=object), + np.array(["b", "a", "c", "a", "c"]), + ], + ids=["int64", "object", "str"], +) +def test_label_encoder_empty_array(values): + le = LabelEncoder() + le.fit(values) + # test empty transform + transformed = le.transform([]) + assert_array_equal(np.array([]), transformed) + # test empty inverse transform + inverse_transformed = le.inverse_transform([]) + assert_array_equal(np.array([]), inverse_transformed) + + +def test_sparse_output_multilabel_binarizer(): + # test input as iterable of iterables + inputs = [ + lambda: [(2, 3), (1,), (1, 2)], + lambda: ({2, 3}, {1}, {1, 2}), + lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]), + ] + indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]]) + + inverse = inputs[0]() + for sparse_output in [True, False]: + for inp in inputs: + # With fit_transform + mlb = MultiLabelBinarizer(sparse_output=sparse_output) + got = mlb.fit_transform(inp()) + assert issparse(got) == sparse_output + if sparse_output: + # verify CSR assumption that indices and indptr have same dtype + assert got.indices.dtype == got.indptr.dtype + got = got.toarray() + assert_array_equal(indicator_mat, got) + assert_array_equal([1, 2, 3], mlb.classes_) + assert mlb.inverse_transform(got) == inverse + + # With fit + mlb = MultiLabelBinarizer(sparse_output=sparse_output) + got = mlb.fit(inp()).transform(inp()) + assert issparse(got) == sparse_output + if sparse_output: + # verify CSR assumption that indices and indptr have same dtype + assert got.indices.dtype == got.indptr.dtype + got = got.toarray() + assert_array_equal(indicator_mat, got) + assert_array_equal([1, 2, 3], mlb.classes_) + assert mlb.inverse_transform(got) == inverse + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_output_multilabel_binarizer_errors(csr_container): + inp = iter([iter((2, 3)), iter((1,)), {1, 2}]) + mlb = MultiLabelBinarizer(sparse_output=False) + mlb.fit(inp) + with pytest.raises(ValueError): + mlb.inverse_transform( + csr_container(np.array([[0, 1, 1], [2, 0, 0], [1, 1, 0]])) + ) + + +def test_multilabel_binarizer(): + # test input as iterable of iterables + inputs = [ + lambda: [(2, 3), (1,), (1, 2)], + lambda: ({2, 3}, {1}, {1, 2}), + lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]), + ] + indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]]) + inverse = inputs[0]() + for inp in inputs: + # With fit_transform + mlb = MultiLabelBinarizer() + got = mlb.fit_transform(inp()) + assert_array_equal(indicator_mat, got) + assert_array_equal([1, 2, 3], mlb.classes_) + assert mlb.inverse_transform(got) == inverse + + # With fit + mlb = MultiLabelBinarizer() + got = mlb.fit(inp()).transform(inp()) + assert_array_equal(indicator_mat, got) + 
assert_array_equal([1, 2, 3], mlb.classes_) + assert mlb.inverse_transform(got) == inverse + + +def test_multilabel_binarizer_empty_sample(): + mlb = MultiLabelBinarizer() + y = [[1, 2], [1], []] + Y = np.array([[1, 1], [1, 0], [0, 0]]) + assert_array_equal(mlb.fit_transform(y), Y) + + +def test_multilabel_binarizer_unknown_class(): + mlb = MultiLabelBinarizer() + y = [[1, 2]] + Y = np.array([[1, 0], [0, 1]]) + warning_message = "unknown class.* will be ignored" + with pytest.warns(UserWarning, match=warning_message): + matrix = mlb.fit(y).transform([[4, 1], [2, 0]]) + + Y = np.array([[1, 0, 0], [0, 1, 0]]) + mlb = MultiLabelBinarizer(classes=[1, 2, 3]) + with pytest.warns(UserWarning, match=warning_message): + matrix = mlb.fit(y).transform([[4, 1], [2, 0]]) + assert_array_equal(matrix, Y) + + +def test_multilabel_binarizer_given_classes(): + inp = [(2, 3), (1,), (1, 2)] + indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]]) + # fit_transform() + mlb = MultiLabelBinarizer(classes=[1, 3, 2]) + assert_array_equal(mlb.fit_transform(inp), indicator_mat) + assert_array_equal(mlb.classes_, [1, 3, 2]) + + # fit().transform() + mlb = MultiLabelBinarizer(classes=[1, 3, 2]) + assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat) + assert_array_equal(mlb.classes_, [1, 3, 2]) + + # ensure works with extra class + mlb = MultiLabelBinarizer(classes=[4, 1, 3, 2]) + assert_array_equal( + mlb.fit_transform(inp), np.hstack(([[0], [0], [0]], indicator_mat)) + ) + assert_array_equal(mlb.classes_, [4, 1, 3, 2]) + + # ensure fit is no-op as iterable is not consumed + inp = iter(inp) + mlb = MultiLabelBinarizer(classes=[1, 3, 2]) + assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat) + + # ensure a ValueError is thrown if given duplicate classes + err_msg = ( + "The classes argument contains duplicate classes. Remove " + "these duplicates before passing them to MultiLabelBinarizer." 
+ ) + mlb = MultiLabelBinarizer(classes=[1, 3, 2, 3]) + with pytest.raises(ValueError, match=err_msg): + mlb.fit(inp) + + +def test_multilabel_binarizer_multiple_calls(): + inp = [(2, 3), (1,), (1, 2)] + indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]]) + + indicator_mat2 = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]]) + + # first call + mlb = MultiLabelBinarizer(classes=[1, 3, 2]) + assert_array_equal(mlb.fit_transform(inp), indicator_mat) + # second call change class + mlb.classes = [1, 2, 3] + assert_array_equal(mlb.fit_transform(inp), indicator_mat2) + + +def test_multilabel_binarizer_same_length_sequence(): + # Ensure sequences of the same length are not interpreted as a 2-d array + inp = [[1], [0], [2]] + indicator_mat = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]]) + # fit_transform() + mlb = MultiLabelBinarizer() + assert_array_equal(mlb.fit_transform(inp), indicator_mat) + assert_array_equal(mlb.inverse_transform(indicator_mat), inp) + + # fit().transform() + mlb = MultiLabelBinarizer() + assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat) + assert_array_equal(mlb.inverse_transform(indicator_mat), inp) + + +def test_multilabel_binarizer_non_integer_labels(): + tuple_classes = _to_object_array([(1,), (2,), (3,)]) + inputs = [ + ([("2", "3"), ("1",), ("1", "2")], ["1", "2", "3"]), + ([("b", "c"), ("a",), ("a", "b")], ["a", "b", "c"]), + ([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes), + ] + indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]]) + for inp, classes in inputs: + # fit_transform() + mlb = MultiLabelBinarizer() + inp = np.array(inp, dtype=object) + assert_array_equal(mlb.fit_transform(inp), indicator_mat) + assert_array_equal(mlb.classes_, classes) + indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object) + assert_array_equal(indicator_mat_inv, inp) + + # fit().transform() + mlb = MultiLabelBinarizer() + assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat) + assert_array_equal(mlb.classes_, classes) + indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object) + assert_array_equal(indicator_mat_inv, inp) + + mlb = MultiLabelBinarizer() + with pytest.raises(TypeError): + mlb.fit_transform([({}), ({}, {"a": "b"})]) + + +def test_multilabel_binarizer_non_unique(): + inp = [(1, 1, 1, 0)] + indicator_mat = np.array([[1, 1]]) + mlb = MultiLabelBinarizer() + assert_array_equal(mlb.fit_transform(inp), indicator_mat) + + +def test_multilabel_binarizer_inverse_validation(): + inp = [(1, 1, 1, 0)] + mlb = MultiLabelBinarizer() + mlb.fit_transform(inp) + # Not binary + with pytest.raises(ValueError): + mlb.inverse_transform(np.array([[1, 3]])) + # The following binary cases are fine, however + mlb.inverse_transform(np.array([[0, 0]])) + mlb.inverse_transform(np.array([[1, 1]])) + mlb.inverse_transform(np.array([[1, 0]])) + + # Wrong shape + with pytest.raises(ValueError): + mlb.inverse_transform(np.array([[1]])) + with pytest.raises(ValueError): + mlb.inverse_transform(np.array([[1, 1, 1]])) + + +def test_label_binarize_with_class_order(): + out = label_binarize([1, 6], classes=[1, 2, 4, 6]) + expected = np.array([[1, 0, 0, 0], [0, 0, 0, 1]]) + assert_array_equal(out, expected) + + # Modified class order + out = label_binarize([1, 6], classes=[1, 6, 4, 2]) + expected = np.array([[1, 0, 0, 0], [0, 1, 0, 0]]) + assert_array_equal(out, expected) + + out = label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1]) + expected = np.array([[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0]]) + 
assert_array_equal(out, expected) + + +def check_binarized_results(y, classes, pos_label, neg_label, expected): + for sparse_output in [True, False]: + if (pos_label == 0 or neg_label != 0) and sparse_output: + with pytest.raises(ValueError): + label_binarize( + y, + classes=classes, + neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output, + ) + continue + + # check label_binarize + binarized = label_binarize( + y, + classes=classes, + neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output, + ) + assert_array_equal(toarray(binarized), expected) + assert issparse(binarized) == sparse_output + + # check inverse + y_type = type_of_target(y) + if y_type == "multiclass": + inversed = _inverse_binarize_multiclass(binarized, classes=classes) + + else: + inversed = _inverse_binarize_thresholding( + binarized, + output_type=y_type, + classes=classes, + threshold=((neg_label + pos_label) / 2.0), + ) + + assert_array_equal(toarray(inversed), toarray(y)) + + # Check label binarizer + lb = LabelBinarizer( + neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output + ) + binarized = lb.fit_transform(y) + assert_array_equal(toarray(binarized), expected) + assert issparse(binarized) == sparse_output + inverse_output = lb.inverse_transform(binarized) + assert_array_equal(toarray(inverse_output), toarray(y)) + assert issparse(inverse_output) == issparse(y) + + +def test_label_binarize_binary(): + y = [0, 1, 0] + classes = [0, 1] + pos_label = 2 + neg_label = -1 + expected = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1)) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + # Binary case where sparse_output = True will not result in a ValueError + y = [0, 1, 0] + classes = [0, 1] + pos_label = 3 + neg_label = 0 + expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1)) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + +def test_label_binarize_multiclass(): + y = [0, 1, 2] + classes = [0, 1, 2] + pos_label = 2 + neg_label = 0 + expected = 2 * np.eye(3) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + with pytest.raises(ValueError): + label_binarize( + y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True + ) + + +@pytest.mark.parametrize( + "arr_type", + [np.array] + + COO_CONTAINERS + + CSC_CONTAINERS + + CSR_CONTAINERS + + DOK_CONTAINERS + + LIL_CONTAINERS, +) +def test_label_binarize_multilabel(arr_type): + y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]]) + classes = [0, 1, 2] + pos_label = 2 + neg_label = 0 + expected = pos_label * y_ind + y = arr_type(y_ind) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + with pytest.raises(ValueError): + label_binarize( + y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True + ) + + +def test_invalid_input_label_binarize(): + with pytest.raises(ValueError): + label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1) + with pytest.raises(ValueError, match="continuous target data is not "): + label_binarize([1.2, 2.7], classes=[0, 1]) + with pytest.raises(ValueError, match="mismatch with the labels"): + label_binarize([[1, 3]], classes=[1, 2, 3]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_inverse_binarize_multiclass(csr_container): + got = _inverse_binarize_multiclass( + csr_container([[0, 1, 0], [-1, 0, -1], [0, 0, 0]]), np.arange(3) + ) + assert_array_equal(got, np.array([1, 1, 0])) + + +def 
test_nan_label_encoder(): + """Check that label encoder encodes nans in transform. + + Non-regression test for #22628. + """ + le = LabelEncoder() + le.fit(["a", "a", "b", np.nan]) + + y_trans = le.transform([np.nan]) + assert_array_equal(y_trans, [2]) + + +@pytest.mark.parametrize( + "encoder", [LabelEncoder(), LabelBinarizer(), MultiLabelBinarizer()] +) +def test_label_encoders_do_not_have_set_output(encoder): + """Check that label encoders do not define set_output and work with y as a kwarg. + + Non-regression test for #26854. + """ + assert not hasattr(encoder, "set_output") + y_encoded_with_kwarg = encoder.fit_transform(y=["a", "b", "c"]) + y_encoded_positional = encoder.fit_transform(["a", "b", "c"]) + assert_array_equal(y_encoded_with_kwarg, y_encoded_positional) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype", yield_namespace_device_dtype_combinations() +) +@pytest.mark.parametrize( + "y", + [ + np.array([2, 1, 3, 1, 3]), + np.array([1, 1, 4, 5, -1, 0]), + np.array([3, 5, 9, 5, 9, 3]), + ], +) +def test_label_encoder_array_api_compliance(y, array_namespace, device, dtype): + xp = _array_api_for_tests(array_namespace, device) + xp_y = xp.asarray(y, device=device) + with config_context(array_api_dispatch=True): + xp_label = LabelEncoder() + np_label = LabelEncoder() + xp_label = xp_label.fit(xp_y) + xp_transformed = xp_label.transform(xp_y) + xp_inv_transformed = xp_label.inverse_transform(xp_transformed) + np_label = np_label.fit(y) + np_transformed = np_label.transform(y) + assert get_namespace(xp_transformed)[0].__name__ == xp.__name__ + assert get_namespace(xp_inv_transformed)[0].__name__ == xp.__name__ + assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__ + assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed) + assert_array_equal(_convert_to_numpy(xp_inv_transformed, xp), y) + assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_) + + xp_label = LabelEncoder() + np_label = LabelEncoder() + xp_transformed = xp_label.fit_transform(xp_y) + np_transformed = np_label.fit_transform(y) + assert get_namespace(xp_transformed)[0].__name__ == xp.__name__ + assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__ + assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed) + assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_) diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_polynomial.py b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_polynomial.py new file mode 100644 index 0000000000000000000000000000000000000000..c83e5e35232c894839a047a16aaf3adc36a2f633 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_polynomial.py @@ -0,0 +1,1260 @@ +import sys + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal +from scipy import sparse +from scipy.interpolate import BSpline +from scipy.sparse import random as sparse_random + +from sklearn.linear_model import LinearRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import ( + KBinsDiscretizer, + PolynomialFeatures, + SplineTransformer, +) +from sklearn.preprocessing._csr_polynomial_expansion import ( + _calc_expanded_nnz, + _calc_total_nnz, + _get_sizeof_LARGEST_INT_t, +) +from sklearn.utils._testing import assert_array_almost_equal +from sklearn.utils.fixes import ( + CSC_CONTAINERS, + CSR_CONTAINERS, + parse_version, + sp_version, +) + + +@pytest.mark.parametrize("est", 
(PolynomialFeatures, SplineTransformer)) +def test_polynomial_and_spline_array_order(est): + """Test that output array has the given order.""" + X = np.arange(10).reshape(5, 2) + + def is_c_contiguous(a): + return np.isfortran(a.T) + + assert is_c_contiguous(est().fit_transform(X)) + assert is_c_contiguous(est(order="C").fit_transform(X)) + assert np.isfortran(est(order="F").fit_transform(X)) + + +@pytest.mark.parametrize( + "params, err_msg", + [ + ({"knots": [[1]]}, r"Number of knots, knots.shape\[0\], must be >= 2."), + ({"knots": [[1, 1], [2, 2]]}, r"knots.shape\[1\] == n_features is violated"), + ({"knots": [[1], [0]]}, "knots must be sorted without duplicates."), + ], +) +def test_spline_transformer_input_validation(params, err_msg): + """Test that we raise errors for invalid input in SplineTransformer.""" + X = [[1], [2]] + + with pytest.raises(ValueError, match=err_msg): + SplineTransformer(**params).fit(X) + + +@pytest.mark.parametrize("extrapolation", ["continue", "periodic"]) +def test_spline_transformer_integer_knots(extrapolation): + """Test that SplineTransformer accepts integer value knot positions.""" + X = np.arange(20).reshape(10, 2) + knots = [[0, 1], [1, 2], [5, 5], [11, 10], [12, 11]] + _ = SplineTransformer( + degree=3, knots=knots, extrapolation=extrapolation + ).fit_transform(X) + + +def test_spline_transformer_feature_names(): + """Test that SplineTransformer generates correct features name.""" + X = np.arange(20).reshape(10, 2) + splt = SplineTransformer(n_knots=3, degree=3, include_bias=True).fit(X) + feature_names = splt.get_feature_names_out() + assert_array_equal( + feature_names, + [ + "x0_sp_0", + "x0_sp_1", + "x0_sp_2", + "x0_sp_3", + "x0_sp_4", + "x1_sp_0", + "x1_sp_1", + "x1_sp_2", + "x1_sp_3", + "x1_sp_4", + ], + ) + + splt = SplineTransformer(n_knots=3, degree=3, include_bias=False).fit(X) + feature_names = splt.get_feature_names_out(["a", "b"]) + assert_array_equal( + feature_names, + [ + "a_sp_0", + "a_sp_1", + "a_sp_2", + "a_sp_3", + "b_sp_0", + "b_sp_1", + "b_sp_2", + "b_sp_3", + ], + ) + + +@pytest.mark.parametrize( + "extrapolation", + ["constant", "linear", "continue", "periodic"], +) +@pytest.mark.parametrize("degree", [2, 3]) +def test_split_transform_feature_names_extrapolation_degree(extrapolation, degree): + """Test feature names are correct for different extrapolations and degree. + + Non-regression test for gh-25292. + """ + X = np.arange(20).reshape(10, 2) + splt = SplineTransformer(degree=degree, extrapolation=extrapolation).fit(X) + feature_names = splt.get_feature_names_out(["a", "b"]) + assert len(feature_names) == splt.n_features_out_ + + X_trans = splt.transform(X) + assert X_trans.shape[1] == len(feature_names) + + +@pytest.mark.parametrize("degree", range(1, 5)) +@pytest.mark.parametrize("n_knots", range(3, 5)) +@pytest.mark.parametrize("knots", ["uniform", "quantile"]) +@pytest.mark.parametrize("extrapolation", ["constant", "periodic"]) +def test_spline_transformer_unity_decomposition(degree, n_knots, knots, extrapolation): + """Test that B-splines are indeed a decomposition of unity. + + Splines basis functions must sum up to 1 per row, if we stay in between boundaries. + """ + X = np.linspace(0, 1, 100)[:, None] + # make the boundaries 0 and 1 part of X_train, for sure. 
+ X_train = np.r_[[[0]], X[::2, :], [[1]]] + X_test = X[1::2, :] + + if extrapolation == "periodic": + n_knots = n_knots + degree # periodic splines require degree < n_knots + + splt = SplineTransformer( + n_knots=n_knots, + degree=degree, + knots=knots, + include_bias=True, + extrapolation=extrapolation, + ) + splt.fit(X_train) + for X in [X_train, X_test]: + assert_allclose(np.sum(splt.transform(X), axis=1), 1) + + +@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)]) +def test_spline_transformer_linear_regression(bias, intercept): + """Test that B-splines fit a sinusodial curve pretty well.""" + X = np.linspace(0, 10, 100)[:, None] + y = np.sin(X[:, 0]) + 2 # +2 to avoid the value 0 in assert_allclose + pipe = Pipeline( + steps=[ + ( + "spline", + SplineTransformer( + n_knots=15, + degree=3, + include_bias=bias, + extrapolation="constant", + ), + ), + ("ols", LinearRegression(fit_intercept=intercept)), + ] + ) + pipe.fit(X, y) + assert_allclose(pipe.predict(X), y, rtol=1e-3) + + +@pytest.mark.parametrize( + ["knots", "n_knots", "sample_weight", "expected_knots"], + [ + ("uniform", 3, None, np.array([[0, 2], [3, 8], [6, 14]])), + ( + "uniform", + 3, + np.array([0, 0, 1, 1, 0, 3, 1]), + np.array([[2, 2], [4, 8], [6, 14]]), + ), + ("uniform", 4, None, np.array([[0, 2], [2, 6], [4, 10], [6, 14]])), + ("quantile", 3, None, np.array([[0, 2], [3, 3], [6, 14]])), + ( + "quantile", + 3, + np.array([0, 0, 1, 1, 0, 3, 1]), + np.array([[2, 2], [5, 8], [6, 14]]), + ), + ], +) +def test_spline_transformer_get_base_knot_positions( + knots, n_knots, sample_weight, expected_knots +): + """Check the behaviour to find knot positions with and without sample_weight.""" + X = np.array([[0, 2], [0, 2], [2, 2], [3, 3], [4, 6], [5, 8], [6, 14]]) + base_knots = SplineTransformer._get_base_knot_positions( + X=X, knots=knots, n_knots=n_knots, sample_weight=sample_weight + ) + assert_allclose(base_knots, expected_knots) + + +@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)]) +def test_spline_transformer_periodic_linear_regression(bias, intercept): + """Test that B-splines fit a periodic curve pretty well.""" + + # "+ 3" to avoid the value 0 in assert_allclose + def f(x): + return np.sin(2 * np.pi * x) - np.sin(8 * np.pi * x) + 3 + + X = np.linspace(0, 1, 101)[:, None] + pipe = Pipeline( + steps=[ + ( + "spline", + SplineTransformer( + n_knots=20, + degree=3, + include_bias=bias, + extrapolation="periodic", + ), + ), + ("ols", LinearRegression(fit_intercept=intercept)), + ] + ) + pipe.fit(X, f(X[:, 0])) + + # Generate larger array to check periodic extrapolation + X_ = np.linspace(-1, 2, 301)[:, None] + predictions = pipe.predict(X_) + assert_allclose(predictions, f(X_[:, 0]), atol=0.01, rtol=0.01) + assert_allclose(predictions[0:100], predictions[100:200], rtol=1e-3) + + +def test_spline_transformer_periodic_spline_backport(): + """Test that the backport of extrapolate="periodic" works correctly""" + X = np.linspace(-2, 3.5, 10)[:, None] + degree = 2 + + # Use periodic extrapolation backport in SplineTransformer + transformer = SplineTransformer( + degree=degree, extrapolation="periodic", knots=[[-1.0], [0.0], [1.0]] + ) + Xt = transformer.fit_transform(X) + + # Use periodic extrapolation in BSpline + coef = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0]]) + spl = BSpline(np.arange(-3, 4), coef, degree, "periodic") + Xspl = spl(X[:, 0]) + assert_allclose(Xt, Xspl) + + +def test_spline_transformer_periodic_splines_periodicity(): + """Test if shifted 
knots result in the same transformation up to permutation.""" + X = np.linspace(0, 10, 101)[:, None] + + transformer_1 = SplineTransformer( + degree=3, + extrapolation="periodic", + knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]], + ) + + transformer_2 = SplineTransformer( + degree=3, + extrapolation="periodic", + knots=[[1.0], [3.0], [4.0], [5.0], [8.0], [9.0]], + ) + + Xt_1 = transformer_1.fit_transform(X) + Xt_2 = transformer_2.fit_transform(X) + + assert_allclose(Xt_1, Xt_2[:, [4, 0, 1, 2, 3]]) + + +@pytest.mark.parametrize("degree", [3, 5]) +def test_spline_transformer_periodic_splines_smoothness(degree): + """Test that spline transformation is smooth at first / last knot.""" + X = np.linspace(-2, 10, 10_000)[:, None] + + transformer = SplineTransformer( + degree=degree, + extrapolation="periodic", + knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]], + ) + Xt = transformer.fit_transform(X) + + delta = (X.max() - X.min()) / len(X) + tol = 10 * delta + + dXt = Xt + # We expect splines of degree `degree` to be (`degree`-1) times + # continuously differentiable. I.e. for d = 0, ..., `degree` - 1 the d-th + # derivative should be continuous. This is the case if the (d+1)-th + # numerical derivative is reasonably small (smaller than `tol` in absolute + # value). We thus compute d-th numeric derivatives for d = 1, ..., `degree` + # and compare them to `tol`. + # + # Note that the 0-th derivative is the function itself, such that we are + # also checking its continuity. + for d in range(1, degree + 1): + # Check continuity of the (d-1)-th derivative + diff = np.diff(dXt, axis=0) + assert np.abs(diff).max() < tol + # Compute d-th numeric derivative + dXt = diff / delta + + # As degree `degree` splines are not `degree` times continuously + # differentiable at the knots, the `degree + 1`-th numeric derivative + # should have spikes at the knots. 
+ diff = np.diff(dXt, axis=0) + assert np.abs(diff).max() > 1 + + +@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)]) +@pytest.mark.parametrize("degree", [1, 2, 3, 4, 5]) +def test_spline_transformer_extrapolation(bias, intercept, degree): + """Test that B-spline extrapolation works correctly.""" + # we use a straight line for that + X = np.linspace(-1, 1, 100)[:, None] + y = X.squeeze() + + # 'constant' + pipe = Pipeline( + [ + [ + "spline", + SplineTransformer( + n_knots=4, + degree=degree, + include_bias=bias, + extrapolation="constant", + ), + ], + ["ols", LinearRegression(fit_intercept=intercept)], + ] + ) + pipe.fit(X, y) + assert_allclose(pipe.predict([[-10], [5]]), [-1, 1]) + + # 'linear' + pipe = Pipeline( + [ + [ + "spline", + SplineTransformer( + n_knots=4, + degree=degree, + include_bias=bias, + extrapolation="linear", + ), + ], + ["ols", LinearRegression(fit_intercept=intercept)], + ] + ) + pipe.fit(X, y) + assert_allclose(pipe.predict([[-10], [5]]), [-10, 5]) + + # 'error' + splt = SplineTransformer( + n_knots=4, degree=degree, include_bias=bias, extrapolation="error" + ) + splt.fit(X) + msg = "X contains values beyond the limits of the knots" + with pytest.raises(ValueError, match=msg): + splt.transform([[-10]]) + with pytest.raises(ValueError, match=msg): + splt.transform([[5]]) + + +def test_spline_transformer_kbindiscretizer(): + """Test that a B-spline of degree=0 is equivalent to KBinsDiscretizer.""" + rng = np.random.RandomState(97531) + X = rng.randn(200).reshape(200, 1) + n_bins = 5 + n_knots = n_bins + 1 + + splt = SplineTransformer( + n_knots=n_knots, degree=0, knots="quantile", include_bias=True + ) + splines = splt.fit_transform(X) + + kbd = KBinsDiscretizer(n_bins=n_bins, encode="onehot-dense", strategy="quantile") + kbins = kbd.fit_transform(X) + + # Though they should be exactly equal, we test approximately with high + # accuracy. 
+ assert_allclose(splines, kbins, rtol=1e-13) + + +@pytest.mark.skipif( + sp_version < parse_version("1.8.0"), + reason="The option `sparse_output` is available as of scipy 1.8.0", +) +@pytest.mark.parametrize("degree", range(1, 3)) +@pytest.mark.parametrize("knots", ["uniform", "quantile"]) +@pytest.mark.parametrize( + "extrapolation", ["error", "constant", "linear", "continue", "periodic"] +) +@pytest.mark.parametrize("include_bias", [False, True]) +def test_spline_transformer_sparse_output( + degree, knots, extrapolation, include_bias, global_random_seed +): + rng = np.random.RandomState(global_random_seed) + X = rng.randn(200).reshape(40, 5) + + splt_dense = SplineTransformer( + degree=degree, + knots=knots, + extrapolation=extrapolation, + include_bias=include_bias, + sparse_output=False, + ) + splt_sparse = SplineTransformer( + degree=degree, + knots=knots, + extrapolation=extrapolation, + include_bias=include_bias, + sparse_output=True, + ) + + splt_dense.fit(X) + splt_sparse.fit(X) + + X_trans_sparse = splt_sparse.transform(X) + X_trans_dense = splt_dense.transform(X) + assert sparse.issparse(X_trans_sparse) and X_trans_sparse.format == "csr" + assert_allclose(X_trans_dense, X_trans_sparse.toarray()) + + # extrapolation regime + X_min = np.amin(X, axis=0) + X_max = np.amax(X, axis=0) + X_extra = np.r_[ + np.linspace(X_min - 5, X_min, 10), np.linspace(X_max, X_max + 5, 10) + ] + if extrapolation == "error": + msg = "X contains values beyond the limits of the knots" + with pytest.raises(ValueError, match=msg): + splt_dense.transform(X_extra) + msg = "Out of bounds" + with pytest.raises(ValueError, match=msg): + splt_sparse.transform(X_extra) + else: + assert_allclose( + splt_dense.transform(X_extra), splt_sparse.transform(X_extra).toarray() + ) + + +@pytest.mark.skipif( + sp_version >= parse_version("1.8.0"), + reason="The option `sparse_output` is available as of scipy 1.8.0", +) +def test_spline_transformer_sparse_output_raise_error_for_old_scipy(): + """Test that SplineTransformer with sparse=True raises for scipy<1.8.0.""" + X = [[1], [2]] + with pytest.raises(ValueError, match="scipy>=1.8.0"): + SplineTransformer(sparse_output=True).fit(X) + + +@pytest.mark.parametrize("n_knots", [5, 10]) +@pytest.mark.parametrize("include_bias", [True, False]) +@pytest.mark.parametrize("degree", [3, 4]) +@pytest.mark.parametrize( + "extrapolation", ["error", "constant", "linear", "continue", "periodic"] +) +@pytest.mark.parametrize("sparse_output", [False, True]) +def test_spline_transformer_n_features_out( + n_knots, include_bias, degree, extrapolation, sparse_output +): + """Test that transform results in n_features_out_ features.""" + if sparse_output and sp_version < parse_version("1.8.0"): + pytest.skip("The option `sparse_output` is available as of scipy 1.8.0") + + splt = SplineTransformer( + n_knots=n_knots, + degree=degree, + include_bias=include_bias, + extrapolation=extrapolation, + sparse_output=sparse_output, + ) + X = np.linspace(0, 1, 10)[:, None] + splt.fit(X) + + assert splt.transform(X).shape[1] == splt.n_features_out_ + + +@pytest.mark.parametrize( + "params, err_msg", + [ + ({"degree": (-1, 2)}, r"degree=\(min_degree, max_degree\) must"), + ({"degree": (0, 1.5)}, r"degree=\(min_degree, max_degree\) must"), + ({"degree": (3, 2)}, r"degree=\(min_degree, max_degree\) must"), + ({"degree": (1, 2, 3)}, r"int or tuple \(min_degree, max_degree\)"), + ], +) +def test_polynomial_features_input_validation(params, err_msg): + """Test that we raise errors for invalid input in 
PolynomialFeatures.""" + X = [[1], [2]] + + with pytest.raises(ValueError, match=err_msg): + PolynomialFeatures(**params).fit(X) + + +@pytest.fixture() +def single_feature_degree3(): + X = np.arange(6)[:, np.newaxis] + P = np.hstack([np.ones_like(X), X, X**2, X**3]) + return X, P + + +@pytest.mark.parametrize( + "degree, include_bias, interaction_only, indices", + [ + (3, True, False, slice(None, None)), + (3, False, False, slice(1, None)), + (3, True, True, [0, 1]), + (3, False, True, [1]), + ((2, 3), True, False, [0, 2, 3]), + ((2, 3), False, False, [2, 3]), + ((2, 3), True, True, [0]), + ((2, 3), False, True, []), + ], +) +@pytest.mark.parametrize("X_container", [None] + CSR_CONTAINERS + CSC_CONTAINERS) +def test_polynomial_features_one_feature( + single_feature_degree3, + degree, + include_bias, + interaction_only, + indices, + X_container, +): + """Test PolynomialFeatures on single feature up to degree 3.""" + X, P = single_feature_degree3 + if X_container is not None: + X = X_container(X) + tf = PolynomialFeatures( + degree=degree, include_bias=include_bias, interaction_only=interaction_only + ).fit(X) + out = tf.transform(X) + if X_container is not None: + out = out.toarray() + assert_allclose(out, P[:, indices]) + if tf.n_output_features_ > 0: + assert tf.powers_.shape == (tf.n_output_features_, tf.n_features_in_) + + +@pytest.fixture() +def two_features_degree3(): + X = np.arange(6).reshape((3, 2)) + x1 = X[:, :1] + x2 = X[:, 1:] + P = np.hstack( + [ + x1**0 * x2**0, # 0 + x1**1 * x2**0, # 1 + x1**0 * x2**1, # 2 + x1**2 * x2**0, # 3 + x1**1 * x2**1, # 4 + x1**0 * x2**2, # 5 + x1**3 * x2**0, # 6 + x1**2 * x2**1, # 7 + x1**1 * x2**2, # 8 + x1**0 * x2**3, # 9 + ] + ) + return X, P + + +@pytest.mark.parametrize( + "degree, include_bias, interaction_only, indices", + [ + (2, True, False, slice(0, 6)), + (2, False, False, slice(1, 6)), + (2, True, True, [0, 1, 2, 4]), + (2, False, True, [1, 2, 4]), + ((2, 2), True, False, [0, 3, 4, 5]), + ((2, 2), False, False, [3, 4, 5]), + ((2, 2), True, True, [0, 4]), + ((2, 2), False, True, [4]), + (3, True, False, slice(None, None)), + (3, False, False, slice(1, None)), + (3, True, True, [0, 1, 2, 4]), + (3, False, True, [1, 2, 4]), + ((2, 3), True, False, [0, 3, 4, 5, 6, 7, 8, 9]), + ((2, 3), False, False, slice(3, None)), + ((2, 3), True, True, [0, 4]), + ((2, 3), False, True, [4]), + ((3, 3), True, False, [0, 6, 7, 8, 9]), + ((3, 3), False, False, [6, 7, 8, 9]), + ((3, 3), True, True, [0]), + ((3, 3), False, True, []), # would need 3 input features + ], +) +@pytest.mark.parametrize("X_container", [None] + CSR_CONTAINERS + CSC_CONTAINERS) +def test_polynomial_features_two_features( + two_features_degree3, + degree, + include_bias, + interaction_only, + indices, + X_container, +): + """Test PolynomialFeatures on 2 features up to degree 3.""" + X, P = two_features_degree3 + if X_container is not None: + X = X_container(X) + tf = PolynomialFeatures( + degree=degree, include_bias=include_bias, interaction_only=interaction_only + ).fit(X) + out = tf.transform(X) + if X_container is not None: + out = out.toarray() + assert_allclose(out, P[:, indices]) + if tf.n_output_features_ > 0: + assert tf.powers_.shape == (tf.n_output_features_, tf.n_features_in_) + + +def test_polynomial_feature_names(): + X = np.arange(30).reshape(10, 3) + poly = PolynomialFeatures(degree=2, include_bias=True).fit(X) + feature_names = poly.get_feature_names_out() + assert_array_equal( + ["1", "x0", "x1", "x2", "x0^2", "x0 x1", "x0 x2", "x1^2", "x1 x2", "x2^2"], + feature_names, 
+ ) + assert len(feature_names) == poly.transform(X).shape[1] + + poly = PolynomialFeatures(degree=3, include_bias=False).fit(X) + feature_names = poly.get_feature_names_out(["a", "b", "c"]) + assert_array_equal( + [ + "a", + "b", + "c", + "a^2", + "a b", + "a c", + "b^2", + "b c", + "c^2", + "a^3", + "a^2 b", + "a^2 c", + "a b^2", + "a b c", + "a c^2", + "b^3", + "b^2 c", + "b c^2", + "c^3", + ], + feature_names, + ) + assert len(feature_names) == poly.transform(X).shape[1] + + poly = PolynomialFeatures(degree=(2, 3), include_bias=False).fit(X) + feature_names = poly.get_feature_names_out(["a", "b", "c"]) + assert_array_equal( + [ + "a^2", + "a b", + "a c", + "b^2", + "b c", + "c^2", + "a^3", + "a^2 b", + "a^2 c", + "a b^2", + "a b c", + "a c^2", + "b^3", + "b^2 c", + "b c^2", + "c^3", + ], + feature_names, + ) + assert len(feature_names) == poly.transform(X).shape[1] + + poly = PolynomialFeatures( + degree=(3, 3), include_bias=True, interaction_only=True + ).fit(X) + feature_names = poly.get_feature_names_out(["a", "b", "c"]) + assert_array_equal(["1", "a b c"], feature_names) + assert len(feature_names) == poly.transform(X).shape[1] + + # test some unicode + poly = PolynomialFeatures(degree=1, include_bias=True).fit(X) + feature_names = poly.get_feature_names_out(["\u0001F40D", "\u262e", "\u05d0"]) + assert_array_equal(["1", "\u0001F40D", "\u262e", "\u05d0"], feature_names) + + +@pytest.mark.parametrize( + ["deg", "include_bias", "interaction_only", "dtype"], + [ + (1, True, False, int), + (2, True, False, int), + (2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64), + (4, False, False, np.float64), + (4, False, True, np.float64), + ], +) +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_polynomial_features_csc_X( + deg, include_bias, interaction_only, dtype, csc_container +): + rng = np.random.RandomState(0) + X = rng.randint(0, 2, (100, 2)) + X_csc = csc_container(X) + + est = PolynomialFeatures( + deg, include_bias=include_bias, interaction_only=interaction_only + ) + Xt_csc = est.fit_transform(X_csc.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype)) + + assert sparse.issparse(Xt_csc) and Xt_csc.format == "csc" + assert Xt_csc.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csc.toarray(), Xt_dense) + + +@pytest.mark.parametrize( + ["deg", "include_bias", "interaction_only", "dtype"], + [ + (1, True, False, int), + (2, True, False, int), + (2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64), + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_polynomial_features_csr_X( + deg, include_bias, interaction_only, dtype, csr_container +): + rng = np.random.RandomState(0) + X = rng.randint(0, 2, (100, 2)) + X_csr = csr_container(X) + + est = PolynomialFeatures( + deg, include_bias=include_bias, interaction_only=interaction_only + ) + Xt_csr = est.fit_transform(X_csr.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype, copy=False)) + + assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr" + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.toarray(), Xt_dense) + + +@pytest.mark.parametrize("n_features", [1, 4, 5]) +@pytest.mark.parametrize( + "min_degree, max_degree", [(0, 1), (0, 2), (1, 3), (0, 4), (3, 4)] +) +@pytest.mark.parametrize("interaction_only", [True, False]) +@pytest.mark.parametrize("include_bias", [True, False]) 
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_num_combinations( + n_features, min_degree, max_degree, interaction_only, include_bias, csr_container +): + """ + Test that n_output_features_ is calculated correctly. + """ + x = csr_container(([1], ([0], [n_features - 1]))) + est = PolynomialFeatures( + degree=max_degree, + interaction_only=interaction_only, + include_bias=include_bias, + ) + est.fit(x) + num_combos = est.n_output_features_ + + combos = PolynomialFeatures._combinations( + n_features=n_features, + min_degree=0, + max_degree=max_degree, + interaction_only=interaction_only, + include_bias=include_bias, + ) + assert num_combos == sum([1 for _ in combos]) + + +@pytest.mark.parametrize( + ["deg", "include_bias", "interaction_only", "dtype"], + [ + (2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64), + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_polynomial_features_csr_X_floats( + deg, include_bias, interaction_only, dtype, csr_container +): + X_csr = csr_container(sparse_random(1000, 10, 0.5, random_state=0)) + X = X_csr.toarray() + + est = PolynomialFeatures( + deg, include_bias=include_bias, interaction_only=interaction_only + ) + Xt_csr = est.fit_transform(X_csr.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype)) + + assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr" + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.toarray(), Xt_dense) + + +@pytest.mark.parametrize( + ["zero_row_index", "deg", "interaction_only"], + [ + (0, 2, True), + (1, 2, True), + (2, 2, True), + (0, 3, True), + (1, 3, True), + (2, 3, True), + (0, 2, False), + (1, 2, False), + (2, 2, False), + (0, 3, False), + (1, 3, False), + (2, 3, False), + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_polynomial_features_csr_X_zero_row( + zero_row_index, deg, interaction_only, csr_container +): + X_csr = csr_container(sparse_random(3, 10, 1.0, random_state=0)) + X_csr[zero_row_index, :] = 0.0 + X = X_csr.toarray() + + est = PolynomialFeatures(deg, include_bias=False, interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr" + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.toarray(), Xt_dense) + + +# This degree should always be one more than the highest degree supported by +# _csr_expansion. 
+@pytest.mark.parametrize( + ["include_bias", "interaction_only"], + [(True, True), (True, False), (False, True), (False, False)], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_polynomial_features_csr_X_degree_4( + include_bias, interaction_only, csr_container +): + X_csr = csr_container(sparse_random(1000, 10, 0.5, random_state=0)) + X = X_csr.toarray() + + est = PolynomialFeatures( + 4, include_bias=include_bias, interaction_only=interaction_only + ) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr" + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.toarray(), Xt_dense) + + +@pytest.mark.parametrize( + ["deg", "dim", "interaction_only"], + [ + (2, 1, True), + (2, 2, True), + (3, 1, True), + (3, 2, True), + (3, 3, True), + (2, 1, False), + (2, 2, False), + (3, 1, False), + (3, 2, False), + (3, 3, False), + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_polynomial_features_csr_X_dim_edges(deg, dim, interaction_only, csr_container): + X_csr = csr_container(sparse_random(1000, dim, 0.5, random_state=0)) + X = X_csr.toarray() + + est = PolynomialFeatures(deg, interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr" + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.toarray(), Xt_dense) + + +@pytest.mark.parametrize("interaction_only", [True, False]) +@pytest.mark.parametrize("include_bias", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_csr_polynomial_expansion_index_overflow_non_regression( + interaction_only, include_bias, csr_container +): + """Check the automatic index dtype promotion to `np.int64` when needed. + + This ensures that sufficiently large input configurations get + properly promoted to use `np.int64` for index and indptr representation + while preserving data integrity. Non-regression test for gh-16803. + + Note that this is only possible for Python runtimes with a 64 bit address + space. On 32 bit platforms, a `ValueError` is raised instead. + """ + + def degree_2_calc(d, i, j): + if interaction_only: + return d * i - (i**2 + 3 * i) // 2 - 1 + j + else: + return d * i - (i**2 + i) // 2 + j + + n_samples = 13 + n_features = 120001 + data_dtype = np.float32 + data = np.arange(1, 5, dtype=np.int64) + row = np.array([n_samples - 2, n_samples - 2, n_samples - 1, n_samples - 1]) + # An int64 dtype is required to avoid overflow error on Windows within the + # `degree_2_calc` function. + col = np.array( + [n_features - 2, n_features - 1, n_features - 2, n_features - 1], dtype=np.int64 + ) + X = csr_container( + (data, (row, col)), + shape=(n_samples, n_features), + dtype=data_dtype, + ) + pf = PolynomialFeatures( + interaction_only=interaction_only, include_bias=include_bias, degree=2 + ) + + # Calculate the number of combinations a-priori, and if needed check for + # the correct ValueError and terminate the test early. 
+ num_combinations = pf._num_combinations( + n_features=n_features, + min_degree=0, + max_degree=2, + interaction_only=pf.interaction_only, + include_bias=pf.include_bias, + ) + if num_combinations > np.iinfo(np.intp).max: + msg = ( + r"The output that would result from the current configuration would have" + r" \d* features which is too large to be indexed" + ) + with pytest.raises(ValueError, match=msg): + pf.fit(X) + return + X_trans = pf.fit_transform(X) + row_nonzero, col_nonzero = X_trans.nonzero() + n_degree_1_features_out = n_features + include_bias + max_degree_2_idx = ( + degree_2_calc(n_features, col[int(not interaction_only)], col[1]) + + n_degree_1_features_out + ) + + # Account for bias of all samples except last one which will be handled + # separately since there are distinct data values before it + data_target = [1] * (n_samples - 2) if include_bias else [] + col_nonzero_target = [0] * (n_samples - 2) if include_bias else [] + + for i in range(2): + x = data[2 * i] + y = data[2 * i + 1] + x_idx = col[2 * i] + y_idx = col[2 * i + 1] + if include_bias: + data_target.append(1) + col_nonzero_target.append(0) + data_target.extend([x, y]) + col_nonzero_target.extend( + [x_idx + int(include_bias), y_idx + int(include_bias)] + ) + if not interaction_only: + data_target.extend([x * x, x * y, y * y]) + col_nonzero_target.extend( + [ + degree_2_calc(n_features, x_idx, x_idx) + n_degree_1_features_out, + degree_2_calc(n_features, x_idx, y_idx) + n_degree_1_features_out, + degree_2_calc(n_features, y_idx, y_idx) + n_degree_1_features_out, + ] + ) + else: + data_target.extend([x * y]) + col_nonzero_target.append( + degree_2_calc(n_features, x_idx, y_idx) + n_degree_1_features_out + ) + + nnz_per_row = int(include_bias) + 3 + 2 * int(not interaction_only) + + assert pf.n_output_features_ == max_degree_2_idx + 1 + assert X_trans.dtype == data_dtype + assert X_trans.shape == (n_samples, max_degree_2_idx + 1) + assert X_trans.indptr.dtype == X_trans.indices.dtype == np.int64 + # Ensure that dtype promotion was actually required: + assert X_trans.indices.max() > np.iinfo(np.int32).max + + row_nonzero_target = list(range(n_samples - 2)) if include_bias else [] + row_nonzero_target.extend( + [n_samples - 2] * nnz_per_row + [n_samples - 1] * nnz_per_row + ) + + assert_allclose(X_trans.data, data_target) + assert_array_equal(row_nonzero, row_nonzero_target) + assert_array_equal(col_nonzero, col_nonzero_target) + + +@pytest.mark.parametrize( + "degree, n_features", + [ + # Needs promotion to int64 when interaction_only=False + (2, 65535), + (3, 2344), + # This guarantees that the intermediate operation when calculating + # output columns would overflow a C-long, hence checks that python- + # longs are being used. + (2, int(np.sqrt(np.iinfo(np.int64).max) + 1)), + (3, 65535), + # This case tests the second clause of the overflow check which + # takes into account the value of `n_features` itself. + (2, int(np.sqrt(np.iinfo(np.int64).max))), + ], +) +@pytest.mark.parametrize("interaction_only", [True, False]) +@pytest.mark.parametrize("include_bias", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_csr_polynomial_expansion_index_overflow( + degree, n_features, interaction_only, include_bias, csr_container +): + """Tests known edge-cases to the dtype promotion strategy and custom + Cython code, including a current bug in the upstream + `scipy.sparse.hstack`. 
+ """ + data = [1.0] + # Use int32 indices as much as we can + indices_dtype = np.int32 if n_features - 1 <= np.iinfo(np.int32).max else np.int64 + row = np.array([0], dtype=indices_dtype) + col = np.array([n_features - 1], dtype=indices_dtype) + + # First degree index + expected_indices = [ + n_features - 1 + int(include_bias), + ] + # Second degree index + expected_indices.append(n_features * (n_features + 1) // 2 + expected_indices[0]) + # Third degree index + expected_indices.append( + n_features * (n_features + 1) * (n_features + 2) // 6 + expected_indices[1] + ) + + X = csr_container((data, (row, col))) + pf = PolynomialFeatures( + interaction_only=interaction_only, include_bias=include_bias, degree=degree + ) + + # Calculate the number of combinations a-priori, and if needed check for + # the correct ValueError and terminate the test early. + num_combinations = pf._num_combinations( + n_features=n_features, + min_degree=0, + max_degree=degree, + interaction_only=pf.interaction_only, + include_bias=pf.include_bias, + ) + if num_combinations > np.iinfo(np.intp).max: + msg = ( + r"The output that would result from the current configuration would have" + r" \d* features which is too large to be indexed" + ) + with pytest.raises(ValueError, match=msg): + pf.fit(X) + return + + # In SciPy < 1.8, a bug occurs when an intermediate matrix in + # `to_stack` in `hstack` fits within int32 however would require int64 when + # combined with all previous matrices in `to_stack`. + if sp_version < parse_version("1.8.0"): + has_bug = False + max_int32 = np.iinfo(np.int32).max + cumulative_size = n_features + include_bias + for deg in range(2, degree + 1): + max_indptr = _calc_total_nnz(X.indptr, interaction_only, deg) + max_indices = _calc_expanded_nnz(n_features, interaction_only, deg) - 1 + cumulative_size += max_indices + 1 + needs_int64 = max(max_indices, max_indptr) > max_int32 + has_bug |= not needs_int64 and cumulative_size > max_int32 + if has_bug: + msg = r"In scipy versions `<1.8.0`, the function `scipy.sparse.hstack`" + with pytest.raises(ValueError, match=msg): + X_trans = pf.fit_transform(X) + return + + # When `n_features>=65535`, `scipy.sparse.hstack` may not use the right + # dtype for representing indices and indptr if `n_features` is still + # small enough so that each block matrix's indices and indptr arrays + # can be represented with `np.int32`. We test `n_features==65535` + # since it is guaranteed to run into this bug. 
+ if ( + sp_version < parse_version("1.9.2") + and n_features == 65535 + and degree == 2 + and not interaction_only + ): # pragma: no cover + msg = r"In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`" + with pytest.raises(ValueError, match=msg): + X_trans = pf.fit_transform(X) + return + X_trans = pf.fit_transform(X) + + expected_dtype = np.int64 if num_combinations > np.iinfo(np.int32).max else np.int32 + # Terms higher than first degree + non_bias_terms = 1 + (degree - 1) * int(not interaction_only) + expected_nnz = int(include_bias) + non_bias_terms + assert X_trans.dtype == X.dtype + assert X_trans.shape == (1, pf.n_output_features_) + assert X_trans.indptr.dtype == X_trans.indices.dtype == expected_dtype + assert X_trans.nnz == expected_nnz + + if include_bias: + assert X_trans[0, 0] == pytest.approx(1.0) + for idx in range(non_bias_terms): + assert X_trans[0, expected_indices[idx]] == pytest.approx(1.0) + + offset = interaction_only * n_features + if degree == 3: + offset *= 1 + n_features + assert pf.n_output_features_ == expected_indices[degree - 1] + 1 - offset + + +@pytest.mark.parametrize("interaction_only", [True, False]) +@pytest.mark.parametrize("include_bias", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_csr_polynomial_expansion_too_large_to_index( + interaction_only, include_bias, csr_container +): + n_features = np.iinfo(np.int64).max // 2 + data = [1.0] + row = [0] + col = [n_features - 1] + X = csr_container((data, (row, col))) + pf = PolynomialFeatures( + interaction_only=interaction_only, include_bias=include_bias, degree=(2, 2) + ) + msg = ( + r"The output that would result from the current configuration would have \d*" + r" features which is too large to be indexed" + ) + with pytest.raises(ValueError, match=msg): + pf.fit(X) + with pytest.raises(ValueError, match=msg): + pf.fit_transform(X) + + +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +def test_polynomial_features_behaviour_on_zero_degree(sparse_container): + """Check that PolynomialFeatures raises error when degree=0 and include_bias=False, + and output a single constant column when include_bias=True + """ + X = np.ones((10, 2)) + poly = PolynomialFeatures(degree=0, include_bias=False) + err_msg = ( + "Setting degree to zero and include_bias to False would result in" + " an empty output array." + ) + with pytest.raises(ValueError, match=err_msg): + poly.fit_transform(X) + + poly = PolynomialFeatures(degree=(0, 0), include_bias=False) + err_msg = ( + "Setting both min_degree and max_degree to zero and include_bias to" + " False would result in an empty output array." 
+ ) + with pytest.raises(ValueError, match=err_msg): + poly.fit_transform(X) + + for _X in [X, sparse_container(X)]: + poly = PolynomialFeatures(degree=0, include_bias=True) + output = poly.fit_transform(_X) + # convert to dense array if needed + if sparse.issparse(output): + output = output.toarray() + assert_array_equal(output, np.ones((X.shape[0], 1))) + + +def test_sizeof_LARGEST_INT_t(): + # On Windows, scikit-learn is typically compiled with MSVC that + # does not support int128 arithmetic (at the time of writing): + # https://stackoverflow.com/a/6761962/163740 + if sys.platform == "win32" or ( + sys.maxsize <= 2**32 and sys.platform != "emscripten" + ): + expected_size = 8 + else: + expected_size = 16 + + assert _get_sizeof_LARGEST_INT_t() == expected_size + + +@pytest.mark.xfail( + sys.platform == "win32", + reason=( + "On Windows, scikit-learn is typically compiled with MSVC that does not support" + " int128 arithmetic (at the time of writing)" + ), + run=True, +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_csr_polynomial_expansion_windows_fail(csr_container): + # Minimum needed to ensure integer overflow occurs while guaranteeing an + # int64-indexable output. + n_features = int(np.iinfo(np.int64).max ** (1 / 3) + 3) + data = [1.0] + row = [0] + col = [n_features - 1] + + # First degree index + expected_indices = [ + n_features - 1, + ] + # Second degree index + expected_indices.append( + int(n_features * (n_features + 1) // 2 + expected_indices[0]) + ) + # Third degree index + expected_indices.append( + int(n_features * (n_features + 1) * (n_features + 2) // 6 + expected_indices[1]) + ) + + X = csr_container((data, (row, col))) + pf = PolynomialFeatures(interaction_only=False, include_bias=False, degree=3) + if sys.maxsize <= 2**32: + msg = ( + r"The output that would result from the current configuration would" + r" have \d*" + r" features which is too large to be indexed" + ) + with pytest.raises(ValueError, match=msg): + pf.fit_transform(X) + else: + X_trans = pf.fit_transform(X) + for idx in range(3): + assert X_trans[0, expected_indices[idx]] == pytest.approx(1.0) diff --git a/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_target_encoder.py b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_target_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..0f6fff581f39f6bc9430f5e631d6d361fb6bc4ae --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/preprocessing/tests/test_target_encoder.py @@ -0,0 +1,714 @@ +import re + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import Ridge +from sklearn.model_selection import ( + KFold, + ShuffleSplit, + StratifiedKFold, + cross_val_score, + train_test_split, +) +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import ( + KBinsDiscretizer, + LabelBinarizer, + LabelEncoder, + TargetEncoder, +) + + +def _encode_target(X_ordinal, y_numeric, n_categories, smooth): + """Simple Python implementation of target encoding.""" + cur_encodings = np.zeros(n_categories, dtype=np.float64) + y_mean = np.mean(y_numeric) + + if smooth == "auto": + y_variance = np.var(y_numeric) + for c in range(n_categories): + y_subset = y_numeric[X_ordinal == c] + n_i = y_subset.shape[0] + + if n_i == 0: + cur_encodings[c] = y_mean + continue + + y_subset_variance = np.var(y_subset) + m = y_subset_variance / y_variance + lambda_ = n_i / (n_i + m) + + 
cur_encodings[c] = lambda_ * np.mean(y_subset) + (1 - lambda_) * y_mean + return cur_encodings + else: # float + for c in range(n_categories): + y_subset = y_numeric[X_ordinal == c] + current_sum = np.sum(y_subset) + y_mean * smooth + current_cnt = y_subset.shape[0] + smooth + cur_encodings[c] = current_sum / current_cnt + return cur_encodings + + +@pytest.mark.parametrize( + "categories, unknown_value", + [ + ([np.array([0, 1, 2], dtype=np.int64)], 4), + ([np.array([1.0, 3.0, np.nan], dtype=np.float64)], 6.0), + ([np.array(["cat", "dog", "snake"], dtype=object)], "bear"), + ("auto", 3), + ], +) +@pytest.mark.parametrize("smooth", [5.0, "auto"]) +@pytest.mark.parametrize("target_type", ["binary", "continuous"]) +def test_encoding(categories, unknown_value, global_random_seed, smooth, target_type): + """Check encoding for binary and continuous targets. + + Compare the values returned by `TargetEncoder.fit_transform` against the + expected encodings for cv splits from a naive reference Python + implementation in _encode_target. + """ + + n_categories = 3 + X_train_int_array = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=np.int64).T + X_test_int_array = np.array([[0, 1, 2]], dtype=np.int64).T + n_samples = X_train_int_array.shape[0] + + if categories == "auto": + X_train = X_train_int_array + X_test = X_test_int_array + else: + X_train = categories[0][X_train_int_array] + X_test = categories[0][X_test_int_array] + + X_test = np.concatenate((X_test, [[unknown_value]])) + + data_rng = np.random.RandomState(global_random_seed) + n_splits = 3 + if target_type == "binary": + y_numeric = data_rng.randint(low=0, high=2, size=n_samples) + target_names = np.array(["cat", "dog"], dtype=object) + y_train = target_names[y_numeric] + + else: + assert target_type == "continuous" + y_numeric = data_rng.uniform(low=-10, high=20, size=n_samples) + y_train = y_numeric + + shuffled_idx = data_rng.permutation(n_samples) + X_train_int_array = X_train_int_array[shuffled_idx] + X_train = X_train[shuffled_idx] + y_train = y_train[shuffled_idx] + y_numeric = y_numeric[shuffled_idx] + + # Define our CV splitting strategy + if target_type == "binary": + cv = StratifiedKFold( + n_splits=n_splits, random_state=global_random_seed, shuffle=True + ) + else: + cv = KFold(n_splits=n_splits, random_state=global_random_seed, shuffle=True) + + # Compute the expected values using our reference Python implementation of + # target encoding: + expected_X_fit_transform = np.empty_like(X_train_int_array, dtype=np.float64) + + for train_idx, test_idx in cv.split(X_train_int_array, y_train): + X_, y_ = X_train_int_array[train_idx, 0], y_numeric[train_idx] + cur_encodings = _encode_target(X_, y_, n_categories, smooth) + expected_X_fit_transform[test_idx, 0] = cur_encodings[ + X_train_int_array[test_idx, 0] + ] + + # Check that we can obtain the same encodings by calling `fit_transform` on + # the estimator with the same CV parameters: + target_encoder = TargetEncoder( + smooth=smooth, + categories=categories, + cv=n_splits, + random_state=global_random_seed, + ) + + X_fit_transform = target_encoder.fit_transform(X_train, y_train) + + assert target_encoder.target_type_ == target_type + assert_allclose(X_fit_transform, expected_X_fit_transform) + assert len(target_encoder.encodings_) == 1 + if target_type == "binary": + assert_array_equal(target_encoder.classes_, target_names) + else: + assert target_encoder.classes_ is None + + # compute encodings for all data to validate `transform` + y_mean = np.mean(y_numeric) + expected_encodings 
= _encode_target( + X_train_int_array[:, 0], y_numeric, n_categories, smooth + ) + assert_allclose(target_encoder.encodings_[0], expected_encodings) + assert target_encoder.target_mean_ == pytest.approx(y_mean) + + # Transform on test data, the last value is unknown so it is encoded as the target + # mean + expected_X_test_transform = np.concatenate( + (expected_encodings, np.array([y_mean])) + ).reshape(-1, 1) + + X_test_transform = target_encoder.transform(X_test) + assert_allclose(X_test_transform, expected_X_test_transform) + + +@pytest.mark.parametrize( + "categories, unknown_values", + [ + ([np.array([0, 1, 2], dtype=np.int64)], "auto"), + ([np.array(["cat", "dog", "snake"], dtype=object)], ["bear", "rabbit"]), + ], +) +@pytest.mark.parametrize( + "target_labels", [np.array([1, 2, 3]), np.array(["a", "b", "c"])] +) +@pytest.mark.parametrize("smooth", [5.0, "auto"]) +def test_encoding_multiclass( + global_random_seed, categories, unknown_values, target_labels, smooth +): + """Check encoding for multiclass targets.""" + rng = np.random.RandomState(global_random_seed) + + n_samples = 80 + n_features = 2 + feat_1_int = np.array(rng.randint(low=0, high=2, size=n_samples)) + feat_2_int = np.array(rng.randint(low=0, high=3, size=n_samples)) + feat_1 = categories[0][feat_1_int] + feat_2 = categories[0][feat_2_int] + X_train = np.column_stack((feat_1, feat_2)) + X_train_int = np.column_stack((feat_1_int, feat_2_int)) + categories_ = [[0, 1], [0, 1, 2]] + + n_classes = 3 + y_train_int = np.array(rng.randint(low=0, high=n_classes, size=n_samples)) + y_train = target_labels[y_train_int] + y_train_enc = LabelBinarizer().fit_transform(y_train) + + n_splits = 3 + cv = StratifiedKFold( + n_splits=n_splits, random_state=global_random_seed, shuffle=True + ) + + # Manually compute encodings for cv splits to validate `fit_transform` + expected_X_fit_transform = np.empty( + (X_train_int.shape[0], X_train_int.shape[1] * n_classes), + dtype=np.float64, + ) + for f_idx, cats in enumerate(categories_): + for c_idx in range(n_classes): + for train_idx, test_idx in cv.split(X_train, y_train): + y_class = y_train_enc[:, c_idx] + X_, y_ = X_train_int[train_idx, f_idx], y_class[train_idx] + current_encoding = _encode_target(X_, y_, len(cats), smooth) + # f_idx: 0, 0, 0, 1, 1, 1 + # c_idx: 0, 1, 2, 0, 1, 2 + # exp_idx: 0, 1, 2, 3, 4, 5 + exp_idx = c_idx + (f_idx * n_classes) + expected_X_fit_transform[test_idx, exp_idx] = current_encoding[ + X_train_int[test_idx, f_idx] + ] + + target_encoder = TargetEncoder( + smooth=smooth, + cv=n_splits, + random_state=global_random_seed, + ) + X_fit_transform = target_encoder.fit_transform(X_train, y_train) + + assert target_encoder.target_type_ == "multiclass" + assert_allclose(X_fit_transform, expected_X_fit_transform) + + # Manually compute encoding to validate `transform` + expected_encodings = [] + for f_idx, cats in enumerate(categories_): + for c_idx in range(n_classes): + y_class = y_train_enc[:, c_idx] + current_encoding = _encode_target( + X_train_int[:, f_idx], y_class, len(cats), smooth + ) + expected_encodings.append(current_encoding) + + assert len(target_encoder.encodings_) == n_features * n_classes + for i in range(n_features * n_classes): + assert_allclose(target_encoder.encodings_[i], expected_encodings[i]) + assert_array_equal(target_encoder.classes_, target_labels) + + # Include unknown values at the end + X_test_int = np.array([[0, 1], [1, 2], [4, 5]]) + if unknown_values == "auto": + X_test = X_test_int + else: + X_test = np.empty_like(X_test_int[:-1, :], 
dtype=object) + for column_idx in range(X_test_int.shape[1]): + X_test[:, column_idx] = categories[0][X_test_int[:-1, column_idx]] + # Add unknown values at end + X_test = np.vstack((X_test, unknown_values)) + + y_mean = np.mean(y_train_enc, axis=0) + expected_X_test_transform = np.empty( + (X_test_int.shape[0], X_test_int.shape[1] * n_classes), + dtype=np.float64, + ) + n_rows = X_test_int.shape[0] + f_idx = [0, 0, 0, 1, 1, 1] + # Last row are unknowns, dealt with later + for row_idx in range(n_rows - 1): + for i, enc in enumerate(expected_encodings): + expected_X_test_transform[row_idx, i] = enc[X_test_int[row_idx, f_idx[i]]] + + # Unknowns encoded as target mean for each class + # `y_mean` contains target mean for each class, thus cycle through mean of + # each class, `n_features` times + mean_idx = [0, 1, 2, 0, 1, 2] + for i in range(n_classes * n_features): + expected_X_test_transform[n_rows - 1, i] = y_mean[mean_idx[i]] + + X_test_transform = target_encoder.transform(X_test) + assert_allclose(X_test_transform, expected_X_test_transform) + + +@pytest.mark.parametrize( + "X, categories", + [ + ( + np.array([[0] * 10 + [1] * 10 + [3]], dtype=np.int64).T, # 3 is unknown + [[0, 1, 2]], + ), + ( + np.array( + [["cat"] * 10 + ["dog"] * 10 + ["snake"]], dtype=object + ).T, # snake is unknown + [["dog", "cat", "cow"]], + ), + ], +) +@pytest.mark.parametrize("smooth", [4.0, "auto"]) +def test_custom_categories(X, categories, smooth): + """Custom categories with unknown categories that are not in training data.""" + rng = np.random.RandomState(0) + y = rng.uniform(low=-10, high=20, size=X.shape[0]) + enc = TargetEncoder(categories=categories, smooth=smooth, random_state=0).fit(X, y) + + # The last element is unknown and encoded as the mean + y_mean = y.mean() + X_trans = enc.transform(X[-1:]) + assert X_trans[0, 0] == pytest.approx(y_mean) + + assert len(enc.encodings_) == 1 + # custom category that is not in training data + assert enc.encodings_[0][-1] == pytest.approx(y_mean) + + +@pytest.mark.parametrize( + "y, msg", + [ + ([1, 2, 0, 1], "Found input variables with inconsistent"), + ( + np.array([[1, 2, 0], [1, 2, 3]]).T, + "Target type was inferred to be 'multiclass-multioutput'", + ), + ], +) +def test_errors(y, msg): + """Check invalidate input.""" + X = np.array([[1, 0, 1]]).T + + enc = TargetEncoder() + with pytest.raises(ValueError, match=msg): + enc.fit_transform(X, y) + + +def test_use_regression_target(): + """Check inferred and specified `target_type` on regression target.""" + X = np.array([[0, 1, 0, 1, 0, 1]]).T + y = np.array([1.0, 2.0, 3.0, 2.0, 3.0, 4.0]) + + enc = TargetEncoder(cv=2) + with pytest.warns( + UserWarning, + match=re.escape( + "The least populated class in y has only 1 members, which is less than" + " n_splits=2." 
+ ), + ): + enc.fit_transform(X, y) + assert enc.target_type_ == "multiclass" + + enc = TargetEncoder(cv=2, target_type="continuous") + enc.fit_transform(X, y) + assert enc.target_type_ == "continuous" + + +@pytest.mark.parametrize( + "y, feature_names", + [ + ([1, 2] * 10, ["A", "B"]), + ([1, 2, 3] * 6 + [1, 2], ["A_1", "A_2", "A_3", "B_1", "B_2", "B_3"]), + ( + ["y1", "y2", "y3"] * 6 + ["y1", "y2"], + ["A_y1", "A_y2", "A_y3", "B_y1", "B_y2", "B_y3"], + ), + ], +) +def test_feature_names_out_set_output(y, feature_names): + """Check TargetEncoder works with set_output.""" + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"A": ["a", "b"] * 10, "B": [1, 2] * 10}) + + enc_default = TargetEncoder(cv=2, smooth=3.0, random_state=0) + enc_default.set_output(transform="default") + enc_pandas = TargetEncoder(cv=2, smooth=3.0, random_state=0) + enc_pandas.set_output(transform="pandas") + + X_default = enc_default.fit_transform(X_df, y) + X_pandas = enc_pandas.fit_transform(X_df, y) + + assert_allclose(X_pandas.to_numpy(), X_default) + assert_array_equal(enc_pandas.get_feature_names_out(), feature_names) + assert_array_equal(enc_pandas.get_feature_names_out(), X_pandas.columns) + + +@pytest.mark.parametrize("to_pandas", [True, False]) +@pytest.mark.parametrize("smooth", [1.0, "auto"]) +@pytest.mark.parametrize("target_type", ["binary-ints", "binary-str", "continuous"]) +def test_multiple_features_quick(to_pandas, smooth, target_type): + """Check target encoder with multiple features.""" + X_ordinal = np.array( + [[1, 1], [0, 1], [1, 1], [2, 1], [1, 0], [0, 1], [1, 0], [0, 0]], dtype=np.int64 + ) + if target_type == "binary-str": + y_train = np.array(["a", "b", "a", "a", "b", "b", "a", "b"]) + y_integer = LabelEncoder().fit_transform(y_train) + cv = StratifiedKFold(2, random_state=0, shuffle=True) + elif target_type == "binary-ints": + y_train = np.array([3, 4, 3, 3, 3, 4, 4, 4]) + y_integer = LabelEncoder().fit_transform(y_train) + cv = StratifiedKFold(2, random_state=0, shuffle=True) + else: + y_train = np.array([3.0, 5.1, 2.4, 3.5, 4.1, 5.5, 10.3, 7.3], dtype=np.float32) + y_integer = y_train + cv = KFold(2, random_state=0, shuffle=True) + y_mean = np.mean(y_integer) + categories = [[0, 1, 2], [0, 1]] + + X_test = np.array( + [ + [0, 1], + [3, 0], # 3 is unknown + [1, 10], # 10 is unknown + ], + dtype=np.int64, + ) + + if to_pandas: + pd = pytest.importorskip("pandas") + # convert second feature to an object + X_train = pd.DataFrame( + { + "feat0": X_ordinal[:, 0], + "feat1": np.array(["cat", "dog"], dtype=object)[X_ordinal[:, 1]], + } + ) + # "snake" is unknown + X_test = pd.DataFrame({"feat0": X_test[:, 0], "feat1": ["dog", "cat", "snake"]}) + else: + X_train = X_ordinal + + # manually compute encoding for fit_transform + expected_X_fit_transform = np.empty_like(X_ordinal, dtype=np.float64) + for f_idx, cats in enumerate(categories): + for train_idx, test_idx in cv.split(X_ordinal, y_integer): + X_, y_ = X_ordinal[train_idx, f_idx], y_integer[train_idx] + current_encoding = _encode_target(X_, y_, len(cats), smooth) + expected_X_fit_transform[test_idx, f_idx] = current_encoding[ + X_ordinal[test_idx, f_idx] + ] + + # manually compute encoding for transform + expected_encodings = [] + for f_idx, cats in enumerate(categories): + current_encoding = _encode_target( + X_ordinal[:, f_idx], y_integer, len(cats), smooth + ) + expected_encodings.append(current_encoding) + + expected_X_test_transform = np.array( + [ + [expected_encodings[0][0], expected_encodings[1][1]], + [y_mean, 
expected_encodings[1][0]], + [expected_encodings[0][1], y_mean], + ], + dtype=np.float64, + ) + + enc = TargetEncoder(smooth=smooth, cv=2, random_state=0) + X_fit_transform = enc.fit_transform(X_train, y_train) + assert_allclose(X_fit_transform, expected_X_fit_transform) + + assert len(enc.encodings_) == 2 + for i in range(2): + assert_allclose(enc.encodings_[i], expected_encodings[i]) + + X_test_transform = enc.transform(X_test) + assert_allclose(X_test_transform, expected_X_test_transform) + + +@pytest.mark.parametrize( + "y, y_mean", + [ + (np.array([3.4] * 20), 3.4), + (np.array([0] * 20), 0), + (np.array(["a"] * 20, dtype=object), 0), + ], + ids=["continuous", "binary", "binary-string"], +) +@pytest.mark.parametrize("smooth", ["auto", 4.0, 0.0]) +def test_constant_target_and_feature(y, y_mean, smooth): + """Check edge case where feature and target is constant.""" + X = np.array([[1] * 20]).T + n_samples = X.shape[0] + + enc = TargetEncoder(cv=2, smooth=smooth, random_state=0) + X_trans = enc.fit_transform(X, y) + assert_allclose(X_trans, np.repeat([[y_mean]], n_samples, axis=0)) + assert enc.encodings_[0][0] == pytest.approx(y_mean) + assert enc.target_mean_ == pytest.approx(y_mean) + + X_test = np.array([[1], [0]]) + X_test_trans = enc.transform(X_test) + assert_allclose(X_test_trans, np.repeat([[y_mean]], 2, axis=0)) + + +def test_fit_transform_not_associated_with_y_if_ordinal_categorical_is_not( + global_random_seed, +): + cardinality = 30 # not too large, otherwise we need a very large n_samples + n_samples = 3000 + rng = np.random.RandomState(global_random_seed) + y_train = rng.normal(size=n_samples) + X_train = rng.randint(0, cardinality, size=n_samples).reshape(-1, 1) + + # Sort by y_train to attempt to cause a leak + y_sorted_indices = y_train.argsort() + y_train = y_train[y_sorted_indices] + X_train = X_train[y_sorted_indices] + + target_encoder = TargetEncoder(shuffle=True, random_state=global_random_seed) + X_encoded_train_shuffled = target_encoder.fit_transform(X_train, y_train) + + target_encoder = TargetEncoder(shuffle=False) + X_encoded_train_no_shuffled = target_encoder.fit_transform(X_train, y_train) + + # Check that no information about y_train has leaked into X_train: + regressor = RandomForestRegressor( + n_estimators=10, min_samples_leaf=20, random_state=global_random_seed + ) + + # It's impossible to learn a good predictive model on the training set when + # using the original representation X_train or the target encoded + # representation with shuffled inner CV. For the latter, no information + # about y_train has inadvertently leaked into the prior used to generate + # `X_encoded_train_shuffled`: + cv = ShuffleSplit(n_splits=50, random_state=global_random_seed) + assert cross_val_score(regressor, X_train, y_train, cv=cv).mean() < 0.1 + assert ( + cross_val_score(regressor, X_encoded_train_shuffled, y_train, cv=cv).mean() + < 0.1 + ) + + # Without the inner CV shuffling, a lot of information about y_train goes into the + # the per-fold y_train.mean() priors: shrinkage is no longer effective in this + # case and would no longer be able to prevent downstream over-fitting. 
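+ # (Illustrative note: the rows were sorted by y_train above, so with
+ # shuffle=False each contiguous KFold split has its own target mean; the
+ # out-of-fold encoding of a row then depends on which fold it fell in,
+ # i.e. on its rank in y_train, which is what the regressor picks up on.)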
+ assert ( + cross_val_score(regressor, X_encoded_train_no_shuffled, y_train, cv=cv).mean() + > 0.5 + ) + + +def test_smooth_zero(): + """Check edge case with zero smoothing and cv does not contain category.""" + X = np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]).T + y = np.array([2.1, 4.3, 1.2, 3.1, 1.0, 9.0, 10.3, 14.2, 13.3, 15.0]) + + enc = TargetEncoder(smooth=0.0, shuffle=False, cv=2) + X_trans = enc.fit_transform(X, y) + + # With cv = 2, category 0 does not exist in the second half, thus + # it will be encoded as the mean of the second half + assert_allclose(X_trans[0], np.mean(y[5:])) + + # category 1 does not exist in the first half, thus it will be encoded as + # the mean of the first half + assert_allclose(X_trans[-1], np.mean(y[:5])) + + +@pytest.mark.parametrize("smooth", [0.0, 1e3, "auto"]) +def test_invariance_of_encoding_under_label_permutation(smooth, global_random_seed): + # Check that the encoding does not depend on the integer of the value of + # the integer labels. This is quite a trivial property but it is helpful + # to understand the following test. + rng = np.random.RandomState(global_random_seed) + + # Random y and informative categorical X to make the test non-trivial when + # using smoothing. + y = rng.normal(size=1000) + n_categories = 30 + X = KBinsDiscretizer(n_bins=n_categories, encode="ordinal").fit_transform( + y.reshape(-1, 1) + ) + + X_train, X_test, y_train, y_test = train_test_split( + X, y, random_state=global_random_seed + ) + + # Shuffle the labels to make sure that the encoding is invariant to the + # permutation of the labels + permutated_labels = rng.permutation(n_categories) + X_train_permuted = permutated_labels[X_train.astype(np.int32)] + X_test_permuted = permutated_labels[X_test.astype(np.int32)] + + target_encoder = TargetEncoder(smooth=smooth, random_state=global_random_seed) + X_train_encoded = target_encoder.fit_transform(X_train, y_train) + X_test_encoded = target_encoder.transform(X_test) + + X_train_permuted_encoded = target_encoder.fit_transform(X_train_permuted, y_train) + X_test_permuted_encoded = target_encoder.transform(X_test_permuted) + + assert_allclose(X_train_encoded, X_train_permuted_encoded) + assert_allclose(X_test_encoded, X_test_permuted_encoded) + + +@pytest.mark.parametrize("smooth", [0.0, "auto"]) +def test_target_encoding_for_linear_regression(smooth, global_random_seed): + # Check some expected statistical properties when fitting a linear + # regression model on target encoded features depending on their relation + # with that target. + + # In this test, we use the Ridge class with the "lsqr" solver and a little + # bit of regularization to implement a linear regression model that + # converges quickly for large `n_samples` and robustly in case of + # correlated features. Since we will fit this model on a mean centered + # target, we do not need to fit an intercept and this will help simplify + # the analysis with respect to the expected coefficients. + linear_regression = Ridge(alpha=1e-6, solver="lsqr", fit_intercept=False) + + # Construct a random target variable. We need a large number of samples for + # this test to be stable across all values of the random seed. + n_samples = 50_000 + rng = np.random.RandomState(global_random_seed) + y = rng.randn(n_samples) + + # Generate a single informative ordinal feature with medium cardinality. + # Inject some irreducible noise to make it harder for a multivariate model + # to identify the informative feature from other pure noise features. 
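+ # (Illustrative note: KBinsDiscretizer with strategy="uniform" maps y + noise
+ # into 100 ordinal bins, so the bin index is monotonically related to y up to
+ # the injected noise.)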
+ noise = 0.8 * rng.randn(n_samples) + n_categories = 100 + X_informative = KBinsDiscretizer( + n_bins=n_categories, + encode="ordinal", + strategy="uniform", + random_state=rng, + ).fit_transform((y + noise).reshape(-1, 1)) + + # Let's permute the labels to hide the fact that this feature is + # informative to naive linear regression model trained on the raw ordinal + # values. As highlighted in the previous test, the target encoding should be + # invariant to such a permutation. + permutated_labels = rng.permutation(n_categories) + X_informative = permutated_labels[X_informative.astype(np.int32)] + + # Generate a shuffled copy of the informative feature to destroy the + # relationship with the target. + X_shuffled = rng.permutation(X_informative) + + # Also include a very high cardinality categorical feature that is by + # itself independent of the target variable: target encoding such a feature + # without internal cross-validation should cause catastrophic overfitting + # for the downstream regressor, even with shrinkage. This kind of features + # typically represents near unique identifiers of samples. In general they + # should be removed from a machine learning datasets but here we want to + # study the ability of the default behavior of TargetEncoder to mitigate + # them automatically. + X_near_unique_categories = rng.choice( + int(0.9 * n_samples), size=n_samples, replace=True + ).reshape(-1, 1) + + # Assemble the dataset and do a train-test split: + X = np.concatenate( + [X_informative, X_shuffled, X_near_unique_categories], + axis=1, + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + # Let's first check that a linear regression model trained on the raw + # features underfits because of the meaning-less ordinal encoding of the + # labels. + raw_model = linear_regression.fit(X_train, y_train) + assert raw_model.score(X_train, y_train) < 0.1 + assert raw_model.score(X_test, y_test) < 0.1 + + # Now do the same with target encoding using the internal CV mechanism + # implemented when using fit_transform. + model_with_cv = make_pipeline( + TargetEncoder(smooth=smooth, random_state=rng), linear_regression + ).fit(X_train, y_train) + + # This model should be able to fit the data well and also generalise to the + # test data (assuming that the binning is fine-grained enough). The R2 + # scores are not perfect because of the noise injected during the + # generation of the unique informative feature. + coef = model_with_cv[-1].coef_ + assert model_with_cv.score(X_train, y_train) > 0.5, coef + assert model_with_cv.score(X_test, y_test) > 0.5, coef + + # The target encoder recovers the linear relationship with slope 1 between + # the target encoded unique informative predictor and the target. Since the + # target encoding of the 2 other features is not informative thanks to the + # use of internal cross-validation, the multivariate linear regressor + # assigns a coef of 1 to the first feature and 0 to the other 2. 
+ assert coef[0] == pytest.approx(1, abs=1e-2) + assert (np.abs(coef[1:]) < 0.2).all() + + # Let's now disable the internal cross-validation by calling fit and then + # transform separately on the training set: + target_encoder = TargetEncoder(smooth=smooth, random_state=rng).fit( + X_train, y_train + ) + X_enc_no_cv_train = target_encoder.transform(X_train) + X_enc_no_cv_test = target_encoder.transform(X_test) + model_no_cv = linear_regression.fit(X_enc_no_cv_train, y_train) + + # The linear regression model should always overfit because it assigns + # too much weight to the extremely high cardinality feature relatively to + # the informative feature. Note that this is the case even when using + # the empirical Bayes smoothing which is not enough to prevent such + # overfitting alone. + coef = model_no_cv.coef_ + assert model_no_cv.score(X_enc_no_cv_train, y_train) > 0.7, coef + assert model_no_cv.score(X_enc_no_cv_test, y_test) < 0.5, coef + + # The model overfits because it assigns too much weight to the high + # cardinality yet non-informative feature instead of the lower + # cardinality yet informative feature: + assert abs(coef[0]) < abs(coef[2]) + + +def test_pandas_copy_on_write(): + """ + Test target-encoder cython code when y is read-only. + + The numpy array underlying df["y"] is read-only when copy-on-write is enabled. + Non-regression test for gh-27879. + """ + pd = pytest.importorskip("pandas", minversion="2.0") + with pd.option_context("mode.copy_on_write", True): + df = pd.DataFrame({"x": ["a", "b", "b"], "y": [4.0, 5.0, 6.0]}) + TargetEncoder(target_type="continuous").fit(df[["x"]], df["y"]) diff --git a/.venv/Lib/site-packages/sklearn/semi_supervised/__init__.py b/.venv/Lib/site-packages/sklearn/semi_supervised/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..790c51e98185099ae22267463f6492d840087fd0 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/semi_supervised/__init__.py @@ -0,0 +1,13 @@ +"""Semi-supervised learning algorithms. + +These algorithms utilize small amounts of labeled data and large amounts of unlabeled +data for classification tasks. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._label_propagation import LabelPropagation, LabelSpreading +from ._self_training import SelfTrainingClassifier + +__all__ = ["SelfTrainingClassifier", "LabelPropagation", "LabelSpreading"] diff --git a/.venv/Lib/site-packages/sklearn/semi_supervised/_label_propagation.py b/.venv/Lib/site-packages/sklearn/semi_supervised/_label_propagation.py new file mode 100644 index 0000000000000000000000000000000000000000..40e1be15ee5897312cc2e14febdce220d542df6f --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/semi_supervised/_label_propagation.py @@ -0,0 +1,625 @@ +# coding=utf8 +""" +Label propagation in the context of this module refers to a set of +semi-supervised classification algorithms. At a high level, these algorithms +work by forming a fully-connected graph between all points given and solving +for the steady-state distribution of labels at each point. + +These algorithms perform very well in practice. The cost of running can be very +expensive, at approximately O(N^3) where N is the number of (labeled and +unlabeled) points. The theory (why they perform so well) is motivated by +intuitions from random walk algorithms and geometric relationships in the data. +For more information see the references below. 
+ +Model Features +-------------- +Label clamping: + The algorithm tries to learn distributions of labels over the dataset given + label assignments over an initial subset. In one variant, the algorithm does + not allow for any errors in the initial assignment (hard-clamping) while + in another variant, the algorithm allows for some wiggle room for the initial + assignments, allowing them to change by a fraction alpha in each iteration + (soft-clamping). + +Kernel: + A function which projects a vector into some higher dimensional space. This + implementation supports RBF and KNN kernels. Using the RBF kernel generates + a dense matrix of size O(N^2). KNN kernel will generate a sparse matrix of + size O(k*N) which will run much faster. See the documentation for SVMs for + more info on kernels. + +Examples +-------- +>>> import numpy as np +>>> from sklearn import datasets +>>> from sklearn.semi_supervised import LabelPropagation +>>> label_prop_model = LabelPropagation() +>>> iris = datasets.load_iris() +>>> rng = np.random.RandomState(42) +>>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3 +>>> labels = np.copy(iris.target) +>>> labels[random_unlabeled_points] = -1 +>>> label_prop_model.fit(iris.data, labels) +LabelPropagation(...) + +Notes +----- +References: +[1] Yoshua Bengio, Olivier Delalleau, Nicolas Le Roux. In Semi-Supervised +Learning (2006), pp. 193-216 + +[2] Olivier Delalleau, Yoshua Bengio, Nicolas Le Roux. Efficient +Non-Parametric Function Induction in Semi-Supervised Learning. AISTAT 2005 +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real + +import numpy as np +from scipy import sparse + +from ..base import BaseEstimator, ClassifierMixin, _fit_context +from ..exceptions import ConvergenceWarning +from ..metrics.pairwise import rbf_kernel +from ..neighbors import NearestNeighbors +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import safe_sparse_dot +from ..utils.fixes import laplacian as csgraph_laplacian +from ..utils.multiclass import check_classification_targets +from ..utils.validation import check_is_fitted, validate_data + + +class BaseLabelPropagation(ClassifierMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for label propagation module. + + Parameters + ---------- + kernel : {'knn', 'rbf'} or callable, default='rbf' + String identifier for kernel function to use or the kernel function + itself. Only 'rbf' and 'knn' strings are valid inputs. The function + passed should take two inputs, each of shape (n_samples, n_features), + and return a (n_samples, n_samples) shaped weight matrix. + + gamma : float, default=20 + Parameter for rbf kernel. + + n_neighbors : int, default=7 + Parameter for knn kernel. Need to be strictly positive. + + alpha : float, default=1.0 + Clamping factor. + + max_iter : int, default=30 + Change maximum number of iterations allowed. + + tol : float, default=1e-3 + Convergence tolerance: threshold to consider the system at steady + state. + + n_jobs : int, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. 
+ """ + + _parameter_constraints: dict = { + "kernel": [StrOptions({"knn", "rbf"}), callable], + "gamma": [Interval(Real, 0, None, closed="left")], + "n_neighbors": [Interval(Integral, 0, None, closed="neither")], + "alpha": [None, Interval(Real, 0, 1, closed="neither")], + "max_iter": [Interval(Integral, 0, None, closed="neither")], + "tol": [Interval(Real, 0, None, closed="left")], + "n_jobs": [None, Integral], + } + + def __init__( + self, + kernel="rbf", + *, + gamma=20, + n_neighbors=7, + alpha=1, + max_iter=30, + tol=1e-3, + n_jobs=None, + ): + self.max_iter = max_iter + self.tol = tol + + # kernel parameters + self.kernel = kernel + self.gamma = gamma + self.n_neighbors = n_neighbors + + # clamping factor + self.alpha = alpha + + self.n_jobs = n_jobs + + def _get_kernel(self, X, y=None): + if self.kernel == "rbf": + if y is None: + return rbf_kernel(X, X, gamma=self.gamma) + else: + return rbf_kernel(X, y, gamma=self.gamma) + elif self.kernel == "knn": + if self.nn_fit is None: + self.nn_fit = NearestNeighbors( + n_neighbors=self.n_neighbors, n_jobs=self.n_jobs + ).fit(X) + if y is None: + return self.nn_fit.kneighbors_graph( + self.nn_fit._fit_X, self.n_neighbors, mode="connectivity" + ) + else: + return self.nn_fit.kneighbors(y, return_distance=False) + elif callable(self.kernel): + if y is None: + return self.kernel(X, X) + else: + return self.kernel(X, y) + + @abstractmethod + def _build_graph(self): + raise NotImplementedError( + "Graph construction must be implemented to fit a label propagation model." + ) + + def predict(self, X): + """Perform inductive inference across the model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + y : ndarray of shape (n_samples,) + Predictions for input data. + """ + # Note: since `predict` does not accept semi-supervised labels as input, + # `fit(X, y).predict(X) != fit(X, y).transduction_`. + # Hence, `fit_predict` is not implemented. + # See https://github.com/scikit-learn/scikit-learn/pull/24898 + probas = self.predict_proba(X) + return self.classes_[np.argmax(probas, axis=1)].ravel() + + def predict_proba(self, X): + """Predict probability for each possible outcome. + + Compute the probability estimates for each single sample in X + and each possible outcome seen during training (categorical + distribution). + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + probabilities : ndarray of shape (n_samples, n_classes) + Normalized probability distributions across + class labels. + """ + check_is_fitted(self) + + X_2d = validate_data( + self, + X, + accept_sparse=["csc", "csr", "coo", "dok", "bsr", "lil", "dia"], + reset=False, + ) + weight_matrices = self._get_kernel(self.X_, X_2d) + if self.kernel == "knn": + probabilities = np.array( + [ + np.sum(self.label_distributions_[weight_matrix], axis=0) + for weight_matrix in weight_matrices + ] + ) + else: + weight_matrices = weight_matrices.T + probabilities = safe_sparse_dot(weight_matrices, self.label_distributions_) + normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T + probabilities /= normalizer + return probabilities + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit a semi-supervised label propagation model to X. + + The input samples (labeled and unlabeled) are provided by matrix X, + and target labels are provided by matrix y. 
We conventionally apply the + label -1 to unlabeled samples in matrix y in a semi-supervised + classification. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target class values with unlabeled points marked as -1. + All unlabeled samples will be transductively assigned labels + internally, which are stored in `transduction_`. + + Returns + ------- + self : object + Returns the instance itself. + """ + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc"], + reset=True, + ) + self.X_ = X + check_classification_targets(y) + + # actual graph construction (implementations should override this) + graph_matrix = self._build_graph() + + # label construction + # construct a categorical distribution for classification only + classes = np.unique(y) + classes = classes[classes != -1] + self.classes_ = classes + + n_samples, n_classes = len(y), len(classes) + + y = np.asarray(y) + unlabeled = y == -1 + + # initialize distributions + self.label_distributions_ = np.zeros((n_samples, n_classes)) + for label in classes: + self.label_distributions_[y == label, classes == label] = 1 + + y_static = np.copy(self.label_distributions_) + if self._variant == "propagation": + # LabelPropagation + y_static[unlabeled] = 0 + else: + # LabelSpreading + y_static *= 1 - self.alpha + + l_previous = np.zeros((self.X_.shape[0], n_classes)) + + unlabeled = unlabeled[:, np.newaxis] + if sparse.issparse(graph_matrix): + graph_matrix = graph_matrix.tocsr() + + for self.n_iter_ in range(self.max_iter): + if np.abs(self.label_distributions_ - l_previous).sum() < self.tol: + break + + l_previous = self.label_distributions_ + self.label_distributions_ = safe_sparse_dot( + graph_matrix, self.label_distributions_ + ) + + if self._variant == "propagation": + normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis] + normalizer[normalizer == 0] = 1 + self.label_distributions_ /= normalizer + self.label_distributions_ = np.where( + unlabeled, self.label_distributions_, y_static + ) + else: + # clamp + self.label_distributions_ = ( + np.multiply(self.alpha, self.label_distributions_) + y_static + ) + else: + warnings.warn( + "max_iter=%d was reached without convergence." % self.max_iter, + category=ConvergenceWarning, + ) + self.n_iter_ += 1 + + normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis] + normalizer[normalizer == 0] = 1 + self.label_distributions_ /= normalizer + + # set the transduction item + transduction = self.classes_[np.argmax(self.label_distributions_, axis=1)] + self.transduction_ = transduction.ravel() + return self + + +class LabelPropagation(BaseLabelPropagation): + """Label Propagation classifier. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + kernel : {'knn', 'rbf'} or callable, default='rbf' + String identifier for kernel function to use or the kernel function + itself. Only 'rbf' and 'knn' strings are valid inputs. The function + passed should take two inputs, each of shape (n_samples, n_features), + and return a (n_samples, n_samples) shaped weight matrix. + + gamma : float, default=20 + Parameter for rbf kernel. + + n_neighbors : int, default=7 + Parameter for knn kernel which need to be strictly positive. + + max_iter : int, default=1000 + Change maximum number of iterations allowed. 
+ + tol : float, 1e-3 + Convergence tolerance: threshold to consider the system at steady + state. + + n_jobs : int, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + X_ : {array-like, sparse matrix} of shape (n_samples, n_features) + Input array. + + classes_ : ndarray of shape (n_classes,) + The distinct labels used in classifying instances. + + label_distributions_ : ndarray of shape (n_samples, n_classes) + Categorical distribution for each item. + + transduction_ : ndarray of shape (n_samples) + Label assigned to each item during :term:`fit`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run. + + See Also + -------- + LabelSpreading : Alternate label propagation strategy more robust to noise. + + References + ---------- + Xiaojin Zhu and Zoubin Ghahramani. Learning from labeled and unlabeled data + with label propagation. Technical Report CMU-CALD-02-107, Carnegie Mellon + University, 2002 http://pages.cs.wisc.edu/~jerryzhu/pub/CMU-CALD-02-107.pdf + + Examples + -------- + >>> import numpy as np + >>> from sklearn import datasets + >>> from sklearn.semi_supervised import LabelPropagation + >>> label_prop_model = LabelPropagation() + >>> iris = datasets.load_iris() + >>> rng = np.random.RandomState(42) + >>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3 + >>> labels = np.copy(iris.target) + >>> labels[random_unlabeled_points] = -1 + >>> label_prop_model.fit(iris.data, labels) + LabelPropagation(...) + """ + + _variant = "propagation" + + _parameter_constraints: dict = {**BaseLabelPropagation._parameter_constraints} + _parameter_constraints.pop("alpha") + + def __init__( + self, + kernel="rbf", + *, + gamma=20, + n_neighbors=7, + max_iter=1000, + tol=1e-3, + n_jobs=None, + ): + super().__init__( + kernel=kernel, + gamma=gamma, + n_neighbors=n_neighbors, + max_iter=max_iter, + tol=tol, + n_jobs=n_jobs, + alpha=None, + ) + + def _build_graph(self): + """Matrix representing a fully connected graph between each sample + + This basic implementation creates a non-stochastic affinity matrix, so + class distributions will exceed 1 (normalization may be desired). + """ + if self.kernel == "knn": + self.nn_fit = None + affinity_matrix = self._get_kernel(self.X_) + normalizer = affinity_matrix.sum(axis=0) + if sparse.issparse(affinity_matrix): + affinity_matrix.data /= np.diag(np.array(normalizer)) + else: + affinity_matrix /= normalizer[:, np.newaxis] + return affinity_matrix + + def fit(self, X, y): + """Fit a semi-supervised label propagation model to X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target class values with unlabeled points marked as -1. + All unlabeled samples will be transductively assigned labels + internally, which are stored in `transduction_`. + + Returns + ------- + self : object + Returns the instance itself. 
+ """ + return super().fit(X, y) + + +class LabelSpreading(BaseLabelPropagation): + """LabelSpreading model for semi-supervised learning. + + This model is similar to the basic Label Propagation algorithm, + but uses affinity matrix based on the normalized graph Laplacian + and soft clamping across the labels. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + kernel : {'knn', 'rbf'} or callable, default='rbf' + String identifier for kernel function to use or the kernel function + itself. Only 'rbf' and 'knn' strings are valid inputs. The function + passed should take two inputs, each of shape (n_samples, n_features), + and return a (n_samples, n_samples) shaped weight matrix. + + gamma : float, default=20 + Parameter for rbf kernel. + + n_neighbors : int, default=7 + Parameter for knn kernel which is a strictly positive integer. + + alpha : float, default=0.2 + Clamping factor. A value in (0, 1) that specifies the relative amount + that an instance should adopt the information from its neighbors as + opposed to its initial label. + alpha=0 means keeping the initial label information; alpha=1 means + replacing all initial information. + + max_iter : int, default=30 + Maximum number of iterations allowed. + + tol : float, default=1e-3 + Convergence tolerance: threshold to consider the system at steady + state. + + n_jobs : int, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + X_ : ndarray of shape (n_samples, n_features) + Input array. + + classes_ : ndarray of shape (n_classes,) + The distinct labels used in classifying instances. + + label_distributions_ : ndarray of shape (n_samples, n_classes) + Categorical distribution for each item. + + transduction_ : ndarray of shape (n_samples,) + Label assigned to each item during :term:`fit`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run. + + See Also + -------- + LabelPropagation : Unregularized graph based semi-supervised learning. + + References + ---------- + `Dengyong Zhou, Olivier Bousquet, Thomas Navin Lal, Jason Weston, + Bernhard Schoelkopf. Learning with local and global consistency (2004) + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn import datasets + >>> from sklearn.semi_supervised import LabelSpreading + >>> label_prop_model = LabelSpreading() + >>> iris = datasets.load_iris() + >>> rng = np.random.RandomState(42) + >>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3 + >>> labels = np.copy(iris.target) + >>> labels[random_unlabeled_points] = -1 + >>> label_prop_model.fit(iris.data, labels) + LabelSpreading(...) 
+ """ + + _variant = "spreading" + + _parameter_constraints: dict = {**BaseLabelPropagation._parameter_constraints} + _parameter_constraints["alpha"] = [Interval(Real, 0, 1, closed="neither")] + + def __init__( + self, + kernel="rbf", + *, + gamma=20, + n_neighbors=7, + alpha=0.2, + max_iter=30, + tol=1e-3, + n_jobs=None, + ): + # this one has different base parameters + super().__init__( + kernel=kernel, + gamma=gamma, + n_neighbors=n_neighbors, + alpha=alpha, + max_iter=max_iter, + tol=tol, + n_jobs=n_jobs, + ) + + def _build_graph(self): + """Graph matrix for Label Spreading computes the graph laplacian""" + # compute affinity matrix (or gram matrix) + if self.kernel == "knn": + self.nn_fit = None + n_samples = self.X_.shape[0] + affinity_matrix = self._get_kernel(self.X_) + laplacian = csgraph_laplacian(affinity_matrix, normed=True) + laplacian = -laplacian + if sparse.issparse(laplacian): + diag_mask = laplacian.row == laplacian.col + laplacian.data[diag_mask] = 0.0 + else: + laplacian.flat[:: n_samples + 1] = 0.0 # set diag to 0.0 + return laplacian diff --git a/.venv/Lib/site-packages/sklearn/semi_supervised/_self_training.py b/.venv/Lib/site-packages/sklearn/semi_supervised/_self_training.py new file mode 100644 index 0000000000000000000000000000000000000000..3d0627d36ac853f0ead43c237658c7cd800c01f4 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/semi_supervised/_self_training.py @@ -0,0 +1,615 @@ +import warnings +from numbers import Integral, Real +from warnings import warn + +import numpy as np + +from sklearn.base import ClassifierMixin + +from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone +from ..utils import Bunch, safe_mask +from ..utils._param_validation import HasMethods, Hidden, Interval, StrOptions +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.validation import _estimator_has, check_is_fitted, validate_data + +__all__ = ["SelfTrainingClassifier"] + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +class SelfTrainingClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): + """Self-training classifier. + + This :term:`metaestimator` allows a given supervised classifier to function as a + semi-supervised classifier, allowing it to learn from unlabeled data. It + does this by iteratively predicting pseudo-labels for the unlabeled data + and adding them to the training set. + + The classifier will continue iterating until either max_iter is reached, or + no pseudo-labels were added to the training set in the previous iteration. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object + An estimator object implementing `fit` and `predict_proba`. + Invoking the `fit` method will fit a clone of the passed estimator, + which will be stored in the `estimator_` attribute. + + .. versionadded:: 1.6 + `estimator` was added to replace `base_estimator`. + + base_estimator : estimator object + An estimator object implementing `fit` and `predict_proba`. + Invoking the `fit` method will fit a clone of the passed estimator, + which will be stored in the `estimator_` attribute. + + .. deprecated:: 1.6 + `base_estimator` was deprecated in 1.6 and will be removed in 1.8. + Use `estimator` instead. + + threshold : float, default=0.75 + The decision threshold for use with `criterion='threshold'`. + Should be in [0, 1). 
When using the `'threshold'` criterion, a + :ref:`well calibrated classifier ` should be used. + + criterion : {'threshold', 'k_best'}, default='threshold' + The selection criterion used to select which labels to add to the + training set. If `'threshold'`, pseudo-labels with prediction + probabilities above `threshold` are added to the dataset. If `'k_best'`, + the `k_best` pseudo-labels with highest prediction probabilities are + added to the dataset. When using the 'threshold' criterion, a + :ref:`well calibrated classifier ` should be used. + + k_best : int, default=10 + The amount of samples to add in each iteration. Only used when + `criterion='k_best'`. + + max_iter : int or None, default=10 + Maximum number of iterations allowed. Should be greater than or equal + to 0. If it is `None`, the classifier will continue to predict labels + until no new pseudo-labels are added, or all unlabeled samples have + been labeled. + + verbose : bool, default=False + Enable verbose output. + + Attributes + ---------- + estimator_ : estimator object + The fitted estimator. + + classes_ : ndarray or list of ndarray of shape (n_classes,) + Class labels for each output. (Taken from the trained + `estimator_`). + + transduction_ : ndarray of shape (n_samples,) + The labels used for the final fit of the classifier, including + pseudo-labels added during fit. + + labeled_iter_ : ndarray of shape (n_samples,) + The iteration in which each sample was labeled. When a sample has + iteration 0, the sample was already labeled in the original dataset. + When a sample has iteration -1, the sample was not labeled in any + iteration. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + The number of rounds of self-training, that is the number of times the + base estimator is fitted on relabeled variants of the training set. + + termination_condition_ : {'max_iter', 'no_change', 'all_labeled'} + The reason that fitting was stopped. + + - `'max_iter'`: `n_iter_` reached `max_iter`. + - `'no_change'`: no new labels were predicted. + - `'all_labeled'`: all unlabeled samples were labeled before `max_iter` + was reached. + + See Also + -------- + LabelPropagation : Label propagation classifier. + LabelSpreading : Label spreading model for semi-supervised learning. + + References + ---------- + :doi:`David Yarowsky. 1995. Unsupervised word sense disambiguation rivaling + supervised methods. In Proceedings of the 33rd annual meeting on + Association for Computational Linguistics (ACL '95). Association for + Computational Linguistics, Stroudsburg, PA, USA, 189-196. + <10.3115/981658.981684>` + + Examples + -------- + >>> import numpy as np + >>> from sklearn import datasets + >>> from sklearn.semi_supervised import SelfTrainingClassifier + >>> from sklearn.svm import SVC + >>> rng = np.random.RandomState(42) + >>> iris = datasets.load_iris() + >>> random_unlabeled_points = rng.rand(iris.target.shape[0]) < 0.3 + >>> iris.target[random_unlabeled_points] = -1 + >>> svc = SVC(probability=True, gamma="auto") + >>> self_training_model = SelfTrainingClassifier(svc) + >>> self_training_model.fit(iris.data, iris.target) + SelfTrainingClassifier(...) 
+ """ + + _parameter_constraints: dict = { + # We don't require `predic_proba` here to allow passing a meta-estimator + # that only exposes `predict_proba` after fitting. + # TODO(1.8) remove None option + "estimator": [None, HasMethods(["fit"])], + # TODO(1.8) remove + "base_estimator": [ + HasMethods(["fit"]), + Hidden(StrOptions({"deprecated"})), + ], + "threshold": [Interval(Real, 0.0, 1.0, closed="left")], + "criterion": [StrOptions({"threshold", "k_best"})], + "k_best": [Interval(Integral, 1, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left"), None], + "verbose": ["verbose"], + } + + def __init__( + self, + estimator=None, + base_estimator="deprecated", + threshold=0.75, + criterion="threshold", + k_best=10, + max_iter=10, + verbose=False, + ): + self.estimator = estimator + self.threshold = threshold + self.criterion = criterion + self.k_best = k_best + self.max_iter = max_iter + self.verbose = verbose + + # TODO(1.8) remove + self.base_estimator = base_estimator + + def _get_estimator(self): + """Get the estimator. + + Returns + ------- + estimator_ : estimator object + The cloned estimator object. + """ + # TODO(1.8): remove and only keep clone(self.estimator) + if self.estimator is None and self.base_estimator != "deprecated": + estimator_ = clone(self.base_estimator) + + warn( + ( + "`base_estimator` has been deprecated in 1.6 and will be removed" + " in 1.8. Please use `estimator` instead." + ), + FutureWarning, + ) + # TODO(1.8) remove + elif self.estimator is None and self.base_estimator == "deprecated": + raise ValueError( + "You must pass an estimator to SelfTrainingClassifier." + " Use `estimator`." + ) + elif self.estimator is not None and self.base_estimator != "deprecated": + raise ValueError( + "You must pass only one estimator to SelfTrainingClassifier." + " Use `estimator`." + ) + else: + estimator_ = clone(self.estimator) + return estimator_ + + @_fit_context( + # SelfTrainingClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **params): + """ + Fit self-training classifier using `X`, `y` as training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + y : {array-like, sparse matrix} of shape (n_samples,) + Array representing the labels. Unlabeled samples should have the + label -1. + + **params : dict + Parameters to pass to the underlying estimators. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. + """ + _raise_for_params(params, self, "fit") + + self.estimator_ = self._get_estimator() + + # we need row slicing support for sparse matrices, but costly finiteness check + # can be delegated to the base estimator. + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc", "lil", "dok"], + ensure_all_finite=False, + ) + + if y.dtype.kind in ["U", "S"]: + raise ValueError( + "y has dtype string. If you wish to predict on " + "string targets, use dtype object, and use -1" + " as the label for unlabeled samples." 
+ ) + + has_label = y != -1 + + if np.all(has_label): + warnings.warn("y contains no unlabeled samples", UserWarning) + + if self.criterion == "k_best" and ( + self.k_best > X.shape[0] - np.sum(has_label) + ): + warnings.warn( + ( + "k_best is larger than the amount of unlabeled " + "samples. All unlabeled samples will be labeled in " + "the first iteration" + ), + UserWarning, + ) + + if _routing_enabled(): + routed_params = process_routing(self, "fit", **params) + else: + routed_params = Bunch(estimator=Bunch(fit={})) + + self.transduction_ = np.copy(y) + self.labeled_iter_ = np.full_like(y, -1) + self.labeled_iter_[has_label] = 0 + + self.n_iter_ = 0 + + while not np.all(has_label) and ( + self.max_iter is None or self.n_iter_ < self.max_iter + ): + self.n_iter_ += 1 + self.estimator_.fit( + X[safe_mask(X, has_label)], + self.transduction_[has_label], + **routed_params.estimator.fit, + ) + + # Predict on the unlabeled samples + prob = self.estimator_.predict_proba(X[safe_mask(X, ~has_label)]) + pred = self.estimator_.classes_[np.argmax(prob, axis=1)] + max_proba = np.max(prob, axis=1) + + # Select new labeled samples + if self.criterion == "threshold": + selected = max_proba > self.threshold + else: + n_to_select = min(self.k_best, max_proba.shape[0]) + if n_to_select == max_proba.shape[0]: + selected = np.ones_like(max_proba, dtype=bool) + else: + # NB these are indices, not a mask + selected = np.argpartition(-max_proba, n_to_select)[:n_to_select] + + # Map selected indices into original array + selected_full = np.nonzero(~has_label)[0][selected] + + # Add newly labeled confident predictions to the dataset + self.transduction_[selected_full] = pred[selected] + has_label[selected_full] = True + self.labeled_iter_[selected_full] = self.n_iter_ + + if selected_full.shape[0] == 0: + # no changed labels + self.termination_condition_ = "no_change" + break + + if self.verbose: + print( + f"End of iteration {self.n_iter_}," + f" added {selected_full.shape[0]} new labels." + ) + + if self.n_iter_ == self.max_iter: + self.termination_condition_ = "max_iter" + if np.all(has_label): + self.termination_condition_ = "all_labeled" + + self.estimator_.fit( + X[safe_mask(X, has_label)], + self.transduction_[has_label], + **routed_params.estimator.fit, + ) + self.classes_ = self.estimator_.classes_ + return self + + @available_if(_estimator_has("predict")) + def predict(self, X, **params): + """Predict the classes of `X`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + **params : dict of str -> object + Parameters to pass to the underlying estimator's ``predict`` method. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + y : ndarray of shape (n_samples,) + Array with predicted labels. + """ + check_is_fitted(self) + _raise_for_params(params, self, "predict") + + if _routing_enabled(): + # metadata routing is enabled. 
+ routed_params = process_routing(self, "predict", **params) + else: + routed_params = Bunch(estimator=Bunch(predict={})) + + X = validate_data( + self, + X, + accept_sparse=True, + ensure_all_finite=False, + reset=False, + ) + return self.estimator_.predict(X, **routed_params.estimator.predict) + + @available_if(_estimator_has("predict_proba")) + def predict_proba(self, X, **params): + """Predict probability for each possible outcome. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + **params : dict of str -> object + Parameters to pass to the underlying estimator's + ``predict_proba`` method. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + y : ndarray of shape (n_samples, n_features) + Array with prediction probabilities. + """ + check_is_fitted(self) + _raise_for_params(params, self, "predict_proba") + + if _routing_enabled(): + # metadata routing is enabled. + routed_params = process_routing(self, "predict_proba", **params) + else: + routed_params = Bunch(estimator=Bunch(predict_proba={})) + + X = validate_data( + self, + X, + accept_sparse=True, + ensure_all_finite=False, + reset=False, + ) + return self.estimator_.predict_proba(X, **routed_params.estimator.predict_proba) + + @available_if(_estimator_has("decision_function")) + def decision_function(self, X, **params): + """Call decision function of the `estimator`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + **params : dict of str -> object + Parameters to pass to the underlying estimator's + ``decision_function`` method. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + y : ndarray of shape (n_samples, n_features) + Result of the decision function of the `estimator`. + """ + check_is_fitted(self) + _raise_for_params(params, self, "decision_function") + + if _routing_enabled(): + # metadata routing is enabled. + routed_params = process_routing(self, "decision_function", **params) + else: + routed_params = Bunch(estimator=Bunch(decision_function={})) + + X = validate_data( + self, + X, + accept_sparse=True, + ensure_all_finite=False, + reset=False, + ) + return self.estimator_.decision_function( + X, **routed_params.estimator.decision_function + ) + + @available_if(_estimator_has("predict_log_proba")) + def predict_log_proba(self, X, **params): + """Predict log probability for each possible outcome. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + **params : dict of str -> object + Parameters to pass to the underlying estimator's + ``predict_log_proba`` method. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + y : ndarray of shape (n_samples, n_features) + Array with log prediction probabilities. 
+ """ + check_is_fitted(self) + _raise_for_params(params, self, "predict_log_proba") + + if _routing_enabled(): + # metadata routing is enabled. + routed_params = process_routing(self, "predict_log_proba", **params) + else: + routed_params = Bunch(estimator=Bunch(predict_log_proba={})) + + X = validate_data( + self, + X, + accept_sparse=True, + ensure_all_finite=False, + reset=False, + ) + return self.estimator_.predict_log_proba( + X, **routed_params.estimator.predict_log_proba + ) + + @available_if(_estimator_has("score")) + def score(self, X, y, **params): + """Call score on the `estimator`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + y : array-like of shape (n_samples,) + Array representing the labels. + + **params : dict of str -> object + Parameters to pass to the underlying estimator's ``score`` method. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + score : float + Result of calling score on the `estimator`. + """ + check_is_fitted(self) + _raise_for_params(params, self, "score") + + if _routing_enabled(): + # metadata routing is enabled. + routed_params = process_routing(self, "score", **params) + else: + routed_params = Bunch(estimator=Bunch(score={})) + + X = validate_data( + self, + X, + accept_sparse=True, + ensure_all_finite=False, + reset=False, + ) + return self.estimator_.score(X, y, **routed_params.estimator.score) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.6 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
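A minimal sketch of how the routing described here is used from the caller's
side, assuming a scikit-learn build in which `SelfTrainingClassifier` supports
metadata routing (1.6+ per the `.. versionadded::` notes above):

from sklearn import set_config
from sklearn.svm import SVC
from sklearn.semi_supervised import SelfTrainingClassifier

set_config(enable_metadata_routing=True)
# Request that `sample_weight` passed to SelfTrainingClassifier.fit be routed
# to the wrapped estimator's fit (SVC.fit accepts sample_weight).
inner = SVC(probability=True, gamma="auto").set_fit_request(sample_weight=True)
model = SelfTrainingClassifier(inner)
# model.fit(X, y, sample_weight=w) would now forward sample_weight to SVC.fit.
set_config(enable_metadata_routing=False)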
+ """ + router = MetadataRouter(owner=self.__class__.__name__) + router.add( + estimator=self.estimator, + method_mapping=( + MethodMapping() + .add(callee="fit", caller="fit") + .add(callee="score", caller="fit") + .add(callee="predict", caller="predict") + .add(callee="predict_proba", caller="predict_proba") + .add(callee="decision_function", caller="decision_function") + .add(callee="predict_log_proba", caller="predict_log_proba") + .add(callee="score", caller="score") + ), + ) + return router diff --git a/.venv/Lib/site-packages/sklearn/semi_supervised/tests/__init__.py b/.venv/Lib/site-packages/sklearn/semi_supervised/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/Lib/site-packages/sklearn/semi_supervised/tests/test_label_propagation.py b/.venv/Lib/site-packages/sklearn/semi_supervised/tests/test_label_propagation.py new file mode 100644 index 0000000000000000000000000000000000000000..b012c5bf7755a0b13ddc095ada57e036cb15b705 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/semi_supervised/tests/test_label_propagation.py @@ -0,0 +1,238 @@ +"""test the label propagation module""" + +import warnings + +import numpy as np +import pytest +from scipy.sparse import issparse + +from sklearn.datasets import make_classification +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics.pairwise import rbf_kernel +from sklearn.model_selection import train_test_split +from sklearn.neighbors import NearestNeighbors +from sklearn.semi_supervised import _label_propagation as label_propagation +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_array_equal, +) + +CONSTRUCTOR_TYPES = ("array", "sparse_csr", "sparse_csc") + +ESTIMATORS = [ + (label_propagation.LabelPropagation, {"kernel": "rbf"}), + (label_propagation.LabelPropagation, {"kernel": "knn", "n_neighbors": 2}), + ( + label_propagation.LabelPropagation, + {"kernel": lambda x, y: rbf_kernel(x, y, gamma=20)}, + ), + (label_propagation.LabelSpreading, {"kernel": "rbf"}), + (label_propagation.LabelSpreading, {"kernel": "knn", "n_neighbors": 2}), + ( + label_propagation.LabelSpreading, + {"kernel": lambda x, y: rbf_kernel(x, y, gamma=20)}, + ), +] + + +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_fit_transduction(global_dtype, Estimator, parameters): + samples = np.asarray([[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]], dtype=global_dtype) + labels = [0, 1, -1] + clf = Estimator(**parameters).fit(samples, labels) + assert clf.transduction_[2] == 1 + + +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_distribution(global_dtype, Estimator, parameters): + if parameters["kernel"] == "knn": + pytest.skip( + "Unstable test for this configuration: changes in k-NN ordering break it." 
+ ) + samples = np.asarray([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]], dtype=global_dtype) + labels = [0, 1, -1] + clf = Estimator(**parameters).fit(samples, labels) + assert_allclose(clf.label_distributions_[2], [0.5, 0.5], atol=1e-2) + + +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_predict(global_dtype, Estimator, parameters): + samples = np.asarray([[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]], dtype=global_dtype) + labels = [0, 1, -1] + clf = Estimator(**parameters).fit(samples, labels) + assert_array_equal(clf.predict([[0.5, 2.5]]), np.array([1])) + + +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_predict_proba(global_dtype, Estimator, parameters): + samples = np.asarray([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]], dtype=global_dtype) + labels = [0, 1, -1] + clf = Estimator(**parameters).fit(samples, labels) + assert_allclose(clf.predict_proba([[1.0, 1.0]]), np.array([[0.5, 0.5]])) + + +@pytest.mark.parametrize("alpha", [0.1, 0.3, 0.5, 0.7, 0.9]) +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_label_spreading_closed_form(global_dtype, Estimator, parameters, alpha): + n_classes = 2 + X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0) + X = X.astype(global_dtype, copy=False) + y[::3] = -1 + + gamma = 0.1 + clf = label_propagation.LabelSpreading(gamma=gamma).fit(X, y) + # adopting notation from Zhou et al (2004): + S = clf._build_graph() + Y = np.zeros((len(y), n_classes + 1), dtype=X.dtype) + Y[np.arange(len(y)), y] = 1 + Y = Y[:, :-1] + + expected = np.dot(np.linalg.inv(np.eye(len(S), dtype=S.dtype) - alpha * S), Y) + expected /= expected.sum(axis=1)[:, np.newaxis] + + clf = label_propagation.LabelSpreading( + max_iter=100, alpha=alpha, tol=1e-10, gamma=gamma + ) + clf.fit(X, y) + + assert_allclose(expected, clf.label_distributions_) + + +def test_label_propagation_closed_form(global_dtype): + n_classes = 2 + X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0) + X = X.astype(global_dtype, copy=False) + y[::3] = -1 + Y = np.zeros((len(y), n_classes + 1)) + Y[np.arange(len(y)), y] = 1 + unlabelled_idx = Y[:, (-1,)].nonzero()[0] + labelled_idx = (Y[:, (-1,)] == 0).nonzero()[0] + + clf = label_propagation.LabelPropagation(max_iter=100, tol=1e-10, gamma=0.1) + clf.fit(X, y) + # adopting notation from Zhu et al 2002 + T_bar = clf._build_graph() + Tuu = T_bar[tuple(np.meshgrid(unlabelled_idx, unlabelled_idx, indexing="ij"))] + Tul = T_bar[tuple(np.meshgrid(unlabelled_idx, labelled_idx, indexing="ij"))] + Y = Y[:, :-1] + Y_l = Y[labelled_idx, :] + Y_u = np.dot(np.dot(np.linalg.inv(np.eye(Tuu.shape[0]) - Tuu), Tul), Y_l) + + expected = Y.copy() + expected[unlabelled_idx, :] = Y_u + expected /= expected.sum(axis=1)[:, np.newaxis] + + assert_allclose(expected, clf.label_distributions_, atol=1e-4) + + +@pytest.mark.parametrize("accepted_sparse_type", ["sparse_csr", "sparse_csc"]) +@pytest.mark.parametrize("index_dtype", [np.int32, np.int64]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_sparse_input_types( + accepted_sparse_type, index_dtype, dtype, Estimator, parameters +): + # This is non-regression test for #17085 + X = _convert_container([[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]], accepted_sparse_type) + X.data = X.data.astype(dtype, copy=False) + X.indices = X.indices.astype(index_dtype, copy=False) + X.indptr = X.indptr.astype(index_dtype, copy=False) + labels = [0, 1, -1] + clf = 
Estimator(**parameters).fit(X, labels) + assert_array_equal(clf.predict([[0.5, 2.5]]), np.array([1])) + + +@pytest.mark.parametrize("constructor_type", CONSTRUCTOR_TYPES) +def test_convergence_speed(constructor_type): + # This is a non-regression test for #5774 + X = _convert_container([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]], constructor_type) + y = np.array([0, 1, -1]) + mdl = label_propagation.LabelSpreading(kernel="rbf", max_iter=5000) + mdl.fit(X, y) + + # this should converge quickly: + assert mdl.n_iter_ < 10 + assert_array_equal(mdl.predict(X), [0, 1, 1]) + + +def test_convergence_warning(): + # This is a non-regression test for #5774 + X = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]]) + y = np.array([0, 1, -1]) + mdl = label_propagation.LabelSpreading(kernel="rbf", max_iter=1) + warn_msg = "max_iter=1 was reached without convergence." + with pytest.warns(ConvergenceWarning, match=warn_msg): + mdl.fit(X, y) + assert mdl.n_iter_ == mdl.max_iter + + mdl = label_propagation.LabelPropagation(kernel="rbf", max_iter=1) + with pytest.warns(ConvergenceWarning, match=warn_msg): + mdl.fit(X, y) + assert mdl.n_iter_ == mdl.max_iter + + mdl = label_propagation.LabelSpreading(kernel="rbf", max_iter=500) + with warnings.catch_warnings(): + warnings.simplefilter("error", ConvergenceWarning) + mdl.fit(X, y) + + mdl = label_propagation.LabelPropagation(kernel="rbf", max_iter=500) + with warnings.catch_warnings(): + warnings.simplefilter("error", ConvergenceWarning) + mdl.fit(X, y) + + +@pytest.mark.parametrize( + "LabelPropagationCls", + [label_propagation.LabelSpreading, label_propagation.LabelPropagation], +) +def test_label_propagation_non_zero_normalizer(LabelPropagationCls): + # check that we don't divide by zero in case of null normalizer + # non-regression test for + # https://github.com/scikit-learn/scikit-learn/pull/15946 + # https://github.com/scikit-learn/scikit-learn/issues/9292 + X = np.array([[100.0, 100.0], [100.0, 100.0], [0.0, 0.0], [0.0, 0.0]]) + y = np.array([0, 1, -1, -1]) + mdl = LabelPropagationCls(kernel="knn", max_iter=100, n_neighbors=1) + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + mdl.fit(X, y) + + +def test_predict_sparse_callable_kernel(global_dtype): + # This is a non-regression test for #15866 + + # Custom sparse kernel (top-K RBF) + def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5): + nn = NearestNeighbors(n_neighbors=10, metric="euclidean", n_jobs=2) + nn.fit(X) + W = -1 * nn.kneighbors_graph(Y, mode="distance").power(2) * gamma + np.exp(W.data, out=W.data) + assert issparse(W) + return W.T + + n_classes = 4 + n_samples = 500 + n_test = 10 + X, y = make_classification( + n_classes=n_classes, + n_samples=n_samples, + n_features=20, + n_informative=20, + n_redundant=0, + n_repeated=0, + random_state=0, + ) + X = X.astype(global_dtype) + + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=n_test, random_state=0 + ) + + model = label_propagation.LabelSpreading(kernel=topk_rbf) + model.fit(X_train, y_train) + assert model.score(X_test, y_test) >= 0.9 + + model = label_propagation.LabelPropagation(kernel=topk_rbf) + model.fit(X_train, y_train) + assert model.score(X_test, y_test) >= 0.9 diff --git a/.venv/Lib/site-packages/sklearn/semi_supervised/tests/test_self_training.py b/.venv/Lib/site-packages/sklearn/semi_supervised/tests/test_self_training.py new file mode 100644 index 0000000000000000000000000000000000000000..983355b32ab4572d266519b98350107665d032c3 --- /dev/null +++ 
b/.venv/Lib/site-packages/sklearn/semi_supervised/tests/test_self_training.py @@ -0,0 +1,395 @@ +from math import ceil + +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +from sklearn.datasets import load_iris, make_blobs +from sklearn.ensemble import StackingClassifier +from sklearn.exceptions import NotFittedError +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsClassifier +from sklearn.semi_supervised import SelfTrainingClassifier +from sklearn.svm import SVC +from sklearn.tests.test_pipeline import SimpleEstimator +from sklearn.tree import DecisionTreeClassifier + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# load the iris dataset and randomly permute it +iris = load_iris() +X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=0 +) + +n_labeled_samples = 50 + +y_train_missing_labels = y_train.copy() +y_train_missing_labels[n_labeled_samples:] = -1 +mapping = {0: "A", 1: "B", 2: "C", -1: "-1"} +y_train_missing_strings = np.vectorize(mapping.get)(y_train_missing_labels).astype( + object +) +y_train_missing_strings[y_train_missing_labels == -1] = -1 + + +def test_warns_k_best(): + st = SelfTrainingClassifier(KNeighborsClassifier(), criterion="k_best", k_best=1000) + with pytest.warns(UserWarning, match="k_best is larger than"): + st.fit(X_train, y_train_missing_labels) + + assert st.termination_condition_ == "all_labeled" + + +@pytest.mark.parametrize( + "estimator", + [KNeighborsClassifier(), SVC(gamma="scale", probability=True, random_state=0)], +) +@pytest.mark.parametrize("selection_crit", ["threshold", "k_best"]) +def test_classification(estimator, selection_crit): + # Check classification for various parameter settings. + # Also assert that predictions for strings and numerical labels are equal. 
+ # Also test for multioutput classification + threshold = 0.75 + max_iter = 10 + st = SelfTrainingClassifier( + estimator, max_iter=max_iter, threshold=threshold, criterion=selection_crit + ) + st.fit(X_train, y_train_missing_labels) + pred = st.predict(X_test) + proba = st.predict_proba(X_test) + + st_string = SelfTrainingClassifier( + estimator, max_iter=max_iter, criterion=selection_crit, threshold=threshold + ) + st_string.fit(X_train, y_train_missing_strings) + pred_string = st_string.predict(X_test) + proba_string = st_string.predict_proba(X_test) + + assert_array_equal(np.vectorize(mapping.get)(pred), pred_string) + assert_array_equal(proba, proba_string) + + assert st.termination_condition_ == st_string.termination_condition_ + # Check consistency between labeled_iter, n_iter and max_iter + labeled = y_train_missing_labels != -1 + # assert that labeled samples have labeled_iter = 0 + assert_array_equal(st.labeled_iter_ == 0, labeled) + # assert that labeled samples do not change label during training + assert_array_equal(y_train_missing_labels[labeled], st.transduction_[labeled]) + + # assert that the max of the iterations is less than the total amount of + # iterations + assert np.max(st.labeled_iter_) <= st.n_iter_ <= max_iter + assert np.max(st_string.labeled_iter_) <= st_string.n_iter_ <= max_iter + + # check shapes + assert st.labeled_iter_.shape == st.transduction_.shape + assert st_string.labeled_iter_.shape == st_string.transduction_.shape + + +def test_k_best(): + st = SelfTrainingClassifier( + KNeighborsClassifier(n_neighbors=1), + criterion="k_best", + k_best=10, + max_iter=None, + ) + y_train_only_one_label = np.copy(y_train) + y_train_only_one_label[1:] = -1 + n_samples = y_train.shape[0] + + n_expected_iter = ceil((n_samples - 1) / 10) + st.fit(X_train, y_train_only_one_label) + assert st.n_iter_ == n_expected_iter + + # Check labeled_iter_ + assert np.sum(st.labeled_iter_ == 0) == 1 + for i in range(1, n_expected_iter): + assert np.sum(st.labeled_iter_ == i) == 10 + assert np.sum(st.labeled_iter_ == n_expected_iter) == (n_samples - 1) % 10 + assert st.termination_condition_ == "all_labeled" + + +def test_sanity_classification(): + estimator = SVC(gamma="scale", probability=True) + estimator.fit(X_train[n_labeled_samples:], y_train[n_labeled_samples:]) + + st = SelfTrainingClassifier(estimator) + st.fit(X_train, y_train_missing_labels) + + pred1, pred2 = estimator.predict(X_test), st.predict(X_test) + assert not np.array_equal(pred1, pred2) + score_supervised = accuracy_score(estimator.predict(X_test), y_test) + score_self_training = accuracy_score(st.predict(X_test), y_test) + + assert score_self_training > score_supervised + + +def test_none_iter(): + # Check that the all samples were labeled after a 'reasonable' number of + # iterations. + st = SelfTrainingClassifier(KNeighborsClassifier(), threshold=0.55, max_iter=None) + st.fit(X_train, y_train_missing_labels) + + assert st.n_iter_ < 10 + assert st.termination_condition_ == "all_labeled" + + +@pytest.mark.parametrize( + "estimator", + [KNeighborsClassifier(), SVC(gamma="scale", probability=True, random_state=0)], +) +@pytest.mark.parametrize("y", [y_train_missing_labels, y_train_missing_strings]) +def test_zero_iterations(estimator, y): + # Check classification for zero iterations. + # Fitting a SelfTrainingClassifier with zero iterations should give the + # same results as fitting a supervised classifier. + # This also asserts that string arrays work as expected. 
+ + clf1 = SelfTrainingClassifier(estimator, max_iter=0) + + clf1.fit(X_train, y) + + clf2 = estimator.fit(X_train[:n_labeled_samples], y[:n_labeled_samples]) + + assert_array_equal(clf1.predict(X_test), clf2.predict(X_test)) + assert clf1.termination_condition_ == "max_iter" + + +def test_prefitted_throws_error(): + # Test that passing a pre-fitted classifier and calling predict throws an + # error + knn = KNeighborsClassifier() + knn.fit(X_train, y_train) + st = SelfTrainingClassifier(knn) + with pytest.raises( + NotFittedError, + match="This SelfTrainingClassifier instance is not fitted yet", + ): + st.predict(X_train) + + +@pytest.mark.parametrize("max_iter", range(1, 5)) +def test_labeled_iter(max_iter): + # Check that the amount of datapoints labeled in iteration 0 is equal to + # the amount of labeled datapoints we passed. + st = SelfTrainingClassifier(KNeighborsClassifier(), max_iter=max_iter) + + st.fit(X_train, y_train_missing_labels) + amount_iter_0 = len(st.labeled_iter_[st.labeled_iter_ == 0]) + assert amount_iter_0 == n_labeled_samples + # Check that the max of the iterations is less than the total amount of + # iterations + assert np.max(st.labeled_iter_) <= st.n_iter_ <= max_iter + + +def test_no_unlabeled(): + # Test that training on a fully labeled dataset produces the same results + # as training the classifier by itself. + knn = KNeighborsClassifier() + knn.fit(X_train, y_train) + st = SelfTrainingClassifier(knn) + with pytest.warns(UserWarning, match="y contains no unlabeled samples"): + st.fit(X_train, y_train) + assert_array_equal(knn.predict(X_test), st.predict(X_test)) + # Assert that all samples were labeled in iteration 0 (since there were no + # unlabeled samples). + assert np.all(st.labeled_iter_ == 0) + assert st.termination_condition_ == "all_labeled" + + +def test_early_stopping(): + svc = SVC(gamma="scale", probability=True) + st = SelfTrainingClassifier(svc) + X_train_easy = [[1], [0], [1], [0.5]] + y_train_easy = [1, 0, -1, -1] + # X = [[0.5]] cannot be predicted on with a high confidence, so training + # stops early + st.fit(X_train_easy, y_train_easy) + assert st.n_iter_ == 1 + assert st.termination_condition_ == "no_change" + + +def test_strings_dtype(): + clf = SelfTrainingClassifier(KNeighborsClassifier()) + X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1) + labels_multiclass = ["one", "two", "three"] + + y_strings = np.take(labels_multiclass, y) + + with pytest.raises(ValueError, match="dtype"): + clf.fit(X, y_strings) + + +@pytest.mark.parametrize("verbose", [True, False]) +def test_verbose(capsys, verbose): + clf = SelfTrainingClassifier(KNeighborsClassifier(), verbose=verbose) + clf.fit(X_train, y_train_missing_labels) + + captured = capsys.readouterr() + + if verbose: + assert "iteration" in captured.out + else: + assert "iteration" not in captured.out + + +def test_verbose_k_best(capsys): + st = SelfTrainingClassifier( + KNeighborsClassifier(n_neighbors=1), + criterion="k_best", + k_best=10, + verbose=True, + max_iter=None, + ) + + y_train_only_one_label = np.copy(y_train) + y_train_only_one_label[1:] = -1 + n_samples = y_train.shape[0] + + n_expected_iter = ceil((n_samples - 1) / 10) + st.fit(X_train, y_train_only_one_label) + + captured = capsys.readouterr() + + msg = "End of iteration {}, added {} new labels." 
+ for i in range(1, n_expected_iter): + assert msg.format(i, 10) in captured.out + + assert msg.format(n_expected_iter, (n_samples - 1) % 10) in captured.out + + +def test_k_best_selects_best(): + # Tests that the labels added by st really are the 10 best labels. + svc = SVC(gamma="scale", probability=True, random_state=0) + st = SelfTrainingClassifier(svc, criterion="k_best", max_iter=1, k_best=10) + has_label = y_train_missing_labels != -1 + st.fit(X_train, y_train_missing_labels) + + got_label = ~has_label & (st.transduction_ != -1) + + svc.fit(X_train[has_label], y_train_missing_labels[has_label]) + pred = svc.predict_proba(X_train[~has_label]) + max_proba = np.max(pred, axis=1) + + most_confident_svc = X_train[~has_label][np.argsort(max_proba)[-10:]] + added_by_st = X_train[np.where(got_label)].tolist() + + for row in most_confident_svc.tolist(): + assert row in added_by_st + + +def test_estimator_meta_estimator(): + # Check that a meta-estimator relying on an estimator implementing + # `predict_proba` will work even if it does not expose this method before being + # fitted. + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/19119 + + estimator = StackingClassifier( + estimators=[ + ("svc_1", SVC(probability=True)), + ("svc_2", SVC(probability=True)), + ], + final_estimator=SVC(probability=True), + cv=2, + ) + + assert hasattr(estimator, "predict_proba") + clf = SelfTrainingClassifier(estimator=estimator) + clf.fit(X_train, y_train_missing_labels) + clf.predict_proba(X_test) + + estimator = StackingClassifier( + estimators=[ + ("svc_1", SVC(probability=False)), + ("svc_2", SVC(probability=False)), + ], + final_estimator=SVC(probability=False), + cv=2, + ) + + assert not hasattr(estimator, "predict_proba") + clf = SelfTrainingClassifier(estimator=estimator) + with pytest.raises(AttributeError): + clf.fit(X_train, y_train_missing_labels) + + +def test_self_training_estimator_attribute_error(): + """Check that we raise the proper AttributeErrors when the `estimator` + does not implement the `predict_proba` method, which is called from within + `fit`, or `decision_function`, which is decorated with `available_if`. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28108 + """ + # `SVC` with `probability=False` does not implement 'predict_proba' that + # is required internally in `fit` of `SelfTrainingClassifier`. We expect + # an AttributeError to be raised. 
+ estimator = SVC(probability=False, gamma="scale") + self_training = SelfTrainingClassifier(estimator) + + with pytest.raises(AttributeError, match="has no attribute 'predict_proba'"): + self_training.fit(X_train, y_train_missing_labels) + + # `DecisionTreeClassifier` does not implement 'decision_function' and + # should raise an AttributeError + self_training = SelfTrainingClassifier(estimator=DecisionTreeClassifier()) + + outer_msg = "This 'SelfTrainingClassifier' has no attribute 'decision_function'" + inner_msg = "'DecisionTreeClassifier' object has no attribute 'decision_function'" + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + self_training.fit(X_train, y_train_missing_labels).decision_function(X_train) + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + +# TODO(1.8): remove in 1.8 +def test_deprecation_warning_base_estimator(): + warn_msg = "`base_estimator` has been deprecated in 1.6 and will be removed" + with pytest.warns(FutureWarning, match=warn_msg): + SelfTrainingClassifier(base_estimator=DecisionTreeClassifier()).fit( + X_train, y_train_missing_labels + ) + + error_msg = "You must pass an estimator to SelfTrainingClassifier" + with pytest.raises(ValueError, match=error_msg): + SelfTrainingClassifier().fit(X_train, y_train_missing_labels) + + error_msg = "You must pass only one estimator to SelfTrainingClassifier." + with pytest.raises(ValueError, match=error_msg): + SelfTrainingClassifier( + base_estimator=DecisionTreeClassifier(), estimator=DecisionTreeClassifier() + ).fit(X_train, y_train_missing_labels) + + +# Metadata routing tests +# ================================================================= + + +@pytest.mark.filterwarnings("ignore:y contains no unlabeled samples:UserWarning") +@pytest.mark.parametrize( + "method", ["decision_function", "predict_log_proba", "predict_proba", "predict"] +) +def test_routing_passed_metadata_not_supported(method): + """Test that the right error message is raised when metadata is passed while + not supported when `enable_metadata_routing=False`.""" + est = SelfTrainingClassifier(estimator=SimpleEstimator()) + with pytest.raises( + ValueError, match="is only supported if enable_metadata_routing=True" + ): + est.fit([[1], [1]], [1, 1], sample_weight=[1], prop="a") + + est = SelfTrainingClassifier(estimator=SimpleEstimator()) + with pytest.raises( + ValueError, match="is only supported if enable_metadata_routing=True" + ): + # make sure that the estimator thinks it is already fitted + est.fitted_params_ = True + getattr(est, method)([[1]], sample_weight=[1], prop="a") + + +# End of routing tests +# ==================== diff --git a/.venv/Lib/site-packages/sklearn/svm/__init__.py b/.venv/Lib/site-packages/sklearn/svm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cffb8cf8ecc0914e08172716536fff5afc35383c --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/__init__.py @@ -0,0 +1,21 @@ +"""Support vector machine algorithms.""" + +# See http://scikit-learn.sourceforge.net/modules/svm.html for complete +# documentation. 
+ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._bounds import l1_min_c +from ._classes import SVC, SVR, LinearSVC, LinearSVR, NuSVC, NuSVR, OneClassSVM + +__all__ = [ + "LinearSVC", + "LinearSVR", + "NuSVC", + "NuSVR", + "OneClassSVM", + "SVC", + "SVR", + "l1_min_c", +] diff --git a/.venv/Lib/site-packages/sklearn/svm/_base.py b/.venv/Lib/site-packages/sklearn/svm/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..3a65beb0e48047f0daba1e3309c72f29f6951932 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/_base.py @@ -0,0 +1,1255 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real + +import numpy as np +import scipy.sparse as sp + +from ..base import BaseEstimator, ClassifierMixin, _fit_context +from ..exceptions import ConvergenceWarning, NotFittedError +from ..preprocessing import LabelEncoder +from ..utils import check_array, check_random_state, column_or_1d, compute_class_weight +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import safe_sparse_dot +from ..utils.metaestimators import available_if +from ..utils.multiclass import _ovr_decision_function, check_classification_targets +from ..utils.validation import ( + _check_large_sparse, + _check_sample_weight, + _num_samples, + check_consistent_length, + check_is_fitted, + validate_data, +) +from . import _liblinear as liblinear # type: ignore + +# mypy error: error: Module 'sklearn.svm' has no attribute '_libsvm' +# (and same for other imports) +from . import _libsvm as libsvm # type: ignore +from . import _libsvm_sparse as libsvm_sparse # type: ignore + +LIBSVM_IMPL = ["c_svc", "nu_svc", "one_class", "epsilon_svr", "nu_svr"] + + +def _one_vs_one_coef(dual_coef, n_support, support_vectors): + """Generate primal coefficients from dual coefficients + for the one-vs-one multi class LibSVM in the case + of a linear kernel.""" + + # get 1vs1 weights for all n*(n-1) classifiers. + # this is somewhat messy. + # shape of dual_coef_ is nSV * (n_classes -1) + # see docs for details + n_class = dual_coef.shape[0] + 1 + + # XXX we could do preallocation of coef but + # would have to take care in the sparse case + coef = [] + sv_locs = np.cumsum(np.hstack([[0], n_support])) + for class1 in range(n_class): + # SVs for class1: + sv1 = support_vectors[sv_locs[class1] : sv_locs[class1 + 1], :] + for class2 in range(class1 + 1, n_class): + # SVs for class1: + sv2 = support_vectors[sv_locs[class2] : sv_locs[class2 + 1], :] + + # dual coef for class1 SVs: + alpha1 = dual_coef[class2 - 1, sv_locs[class1] : sv_locs[class1 + 1]] + # dual coef for class2 SVs: + alpha2 = dual_coef[class1, sv_locs[class2] : sv_locs[class2 + 1]] + # build weight for class1 vs class2 + + coef.append(safe_sparse_dot(alpha1, sv1) + safe_sparse_dot(alpha2, sv2)) + return coef + + +class BaseLibSVM(BaseEstimator, metaclass=ABCMeta): + """Base class for estimators that use libsvm as backing library. + + This implements support vector machine classification and regression. + + Parameter documentation is in the derived `SVC` class. 
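As a brief illustration of the two derived uses mentioned above
(classification and regression), a minimal sketch with the public estimators
on toy data (illustrative only):

import numpy as np
from sklearn.svm import SVC, SVR

X = np.array([[0.0], [1.0], [2.0], [3.0]])
SVC(kernel="rbf", C=1.0, gamma="scale").fit(X, [0, 0, 1, 1]).predict([[1.5]])
SVR(kernel="rbf", C=1.0, gamma="scale").fit(X, [0.0, 1.0, 2.0, 3.0]).predict([[1.5]])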
+ """ + + _parameter_constraints: dict = { + "kernel": [ + StrOptions({"linear", "poly", "rbf", "sigmoid", "precomputed"}), + callable, + ], + "degree": [Interval(Integral, 0, None, closed="left")], + "gamma": [ + StrOptions({"scale", "auto"}), + Interval(Real, 0.0, None, closed="left"), + ], + "coef0": [Interval(Real, None, None, closed="neither")], + "tol": [Interval(Real, 0.0, None, closed="neither")], + "C": [Interval(Real, 0.0, None, closed="right")], + "nu": [Interval(Real, 0.0, 1.0, closed="right")], + "epsilon": [Interval(Real, 0.0, None, closed="left")], + "shrinking": ["boolean"], + "probability": ["boolean"], + "cache_size": [Interval(Real, 0, None, closed="neither")], + "class_weight": [StrOptions({"balanced"}), dict, None], + "verbose": ["verbose"], + "max_iter": [Interval(Integral, -1, None, closed="left")], + "random_state": ["random_state"], + } + + # The order of these must match the integer values in LibSVM. + # XXX These are actually the same in the dense case. Need to factor + # this out. + _sparse_kernels = ["linear", "poly", "rbf", "sigmoid", "precomputed"] + + @abstractmethod + def __init__( + self, + kernel, + degree, + gamma, + coef0, + tol, + C, + nu, + epsilon, + shrinking, + probability, + cache_size, + class_weight, + verbose, + max_iter, + random_state, + ): + if self._impl not in LIBSVM_IMPL: + raise ValueError( + "impl should be one of %s, %s was given" % (LIBSVM_IMPL, self._impl) + ) + + self.kernel = kernel + self.degree = degree + self.gamma = gamma + self.coef0 = coef0 + self.tol = tol + self.C = C + self.nu = nu + self.epsilon = epsilon + self.shrinking = shrinking + self.probability = probability + self.cache_size = cache_size + self.class_weight = class_weight + self.verbose = verbose + self.max_iter = max_iter + self.random_state = random_state + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # Used by cross_val_score. + tags.input_tags.pairwise = self.kernel == "precomputed" + return tags + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit the SVM model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) \ + or (n_samples, n_samples) + Training vectors, where `n_samples` is the number of samples + and `n_features` is the number of features. + For kernel="precomputed", the expected shape of X is + (n_samples, n_samples). + + y : array-like of shape (n_samples,) + Target values (class labels in classification, real numbers in + regression). + + sample_weight : array-like of shape (n_samples,), default=None + Per-sample weights. Rescale C per sample. Higher weights + force the classifier to put more emphasis on these points. + + Returns + ------- + self : object + Fitted estimator. + + Notes + ----- + If X and y are not C-ordered and contiguous arrays of np.float64 and + X is not a scipy.sparse.csr_matrix, X and/or y may be copied. + + If X is a dense array, then the other methods will not support sparse + matrices as input. 
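A small sketch of the sparse path mentioned in the Notes above (illustrative;
any CSR input follows the same route):

import numpy as np
import scipy.sparse as sp
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X = sp.csr_matrix(rng.rand(20, 5))   # CSR input avoids the extra copy noted above
y = np.array([0, 1] * 10)
clf = SVC(kernel="rbf", gamma="scale").fit(X, y)
clf.predict(X[:3])                   # the fitted model keeps using the sparse code path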
+ """ + rnd = check_random_state(self.random_state) + + sparse = sp.issparse(X) + if sparse and self.kernel == "precomputed": + raise TypeError("Sparse precomputed kernels are not supported.") + self._sparse = sparse and not callable(self.kernel) + + if callable(self.kernel): + check_consistent_length(X, y) + else: + X, y = validate_data( + self, + X, + y, + dtype=np.float64, + order="C", + accept_sparse="csr", + accept_large_sparse=False, + ) + + y = self._validate_targets(y) + + sample_weight = np.asarray( + [] if sample_weight is None else sample_weight, dtype=np.float64 + ) + solver_type = LIBSVM_IMPL.index(self._impl) + + # input validation + n_samples = _num_samples(X) + if solver_type != 2 and n_samples != y.shape[0]: + raise ValueError( + "X and y have incompatible shapes.\n" + + "X has %s samples, but y has %s." % (n_samples, y.shape[0]) + ) + + if self.kernel == "precomputed" and n_samples != X.shape[1]: + raise ValueError( + "Precomputed matrix must be a square matrix." + " Input is a {}x{} matrix.".format(X.shape[0], X.shape[1]) + ) + + if sample_weight.shape[0] > 0 and sample_weight.shape[0] != n_samples: + raise ValueError( + "sample_weight and X have incompatible shapes: " + "%r vs %r\n" + "Note: Sparse matrices cannot be indexed w/" + "boolean masks (use `indices=True` in CV)." + % (sample_weight.shape, X.shape) + ) + + kernel = "precomputed" if callable(self.kernel) else self.kernel + + if kernel == "precomputed": + # unused but needs to be a float for cython code that ignores + # it anyway + self._gamma = 0.0 + elif isinstance(self.gamma, str): + if self.gamma == "scale": + # var = E[X^2] - E[X]^2 if sparse + X_var = (X.multiply(X)).mean() - (X.mean()) ** 2 if sparse else X.var() + self._gamma = 1.0 / (X.shape[1] * X_var) if X_var != 0 else 1.0 + elif self.gamma == "auto": + self._gamma = 1.0 / X.shape[1] + elif isinstance(self.gamma, Real): + self._gamma = self.gamma + + fit = self._sparse_fit if self._sparse else self._dense_fit + if self.verbose: + print("[LibSVM]", end="") + + seed = rnd.randint(np.iinfo("i").max) + fit(X, y, sample_weight, solver_type, kernel, random_seed=seed) + # see comment on the other call to np.iinfo in this file + + self.shape_fit_ = X.shape if hasattr(X, "shape") else (n_samples,) + + # In binary case, we need to flip the sign of coef, intercept and + # decision function. Use self._intercept_ and self._dual_coef_ + # internally. + self._intercept_ = self.intercept_.copy() + self._dual_coef_ = self.dual_coef_ + if self._impl in ["c_svc", "nu_svc"] and len(self.classes_) == 2: + self.intercept_ *= -1 + self.dual_coef_ = -self.dual_coef_ + + dual_coef = self._dual_coef_.data if self._sparse else self._dual_coef_ + intercept_finiteness = np.isfinite(self._intercept_).all() + dual_coef_finiteness = np.isfinite(dual_coef).all() + if not (intercept_finiteness and dual_coef_finiteness): + raise ValueError( + "The dual coefficients or intercepts are not finite." + " The input data may contain large values and need to be" + " preprocessed." + ) + + # Since, in the case of SVC and NuSVC, the number of models optimized by + # libSVM could be greater than one (depending on the input), `n_iter_` + # stores an ndarray. + # For the other sub-classes (SVR, NuSVR, and OneClassSVM), the number of + # models optimized by libSVM is always one, so `n_iter_` stores an + # integer. 
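# For instance (illustrative): an SVC fitted on a 3-class problem optimizes the
# 3 one-vs-one sub-problems, so its `n_iter_` is an ndarray of shape (3,),
# whereas SVR or OneClassSVM expose `n_iter_` as a plain Python int.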
+ if self._impl in ["c_svc", "nu_svc"]: + self.n_iter_ = self._num_iter + else: + self.n_iter_ = self._num_iter.item() + + return self + + def _validate_targets(self, y): + """Validation of y and class_weight. + + Default implementation for SVR and one-class; overridden in BaseSVC. + """ + return column_or_1d(y, warn=True).astype(np.float64, copy=False) + + def _warn_from_fit_status(self): + assert self.fit_status_ in (0, 1) + if self.fit_status_ == 1: + warnings.warn( + "Solver terminated early (max_iter=%i)." + " Consider pre-processing your data with" + " StandardScaler or MinMaxScaler." % self.max_iter, + ConvergenceWarning, + ) + + def _dense_fit(self, X, y, sample_weight, solver_type, kernel, random_seed): + if callable(self.kernel): + # you must store a reference to X to compute the kernel in predict + # TODO: add keyword copy to copy on demand + self.__Xfit = X + X = self._compute_kernel(X) + + if X.shape[0] != X.shape[1]: + raise ValueError("X.shape[0] should be equal to X.shape[1]") + + libsvm.set_verbosity_wrap(self.verbose) + + # we don't pass **self.get_params() to allow subclasses to + # add other parameters to __init__ + ( + self.support_, + self.support_vectors_, + self._n_support, + self.dual_coef_, + self.intercept_, + self._probA, + self._probB, + self.fit_status_, + self._num_iter, + ) = libsvm.fit( + X, + y, + svm_type=solver_type, + sample_weight=sample_weight, + class_weight=getattr(self, "class_weight_", np.empty(0)), + kernel=kernel, + C=self.C, + nu=self.nu, + probability=self.probability, + degree=self.degree, + shrinking=self.shrinking, + tol=self.tol, + cache_size=self.cache_size, + coef0=self.coef0, + gamma=self._gamma, + epsilon=self.epsilon, + max_iter=self.max_iter, + random_seed=random_seed, + ) + + self._warn_from_fit_status() + + def _sparse_fit(self, X, y, sample_weight, solver_type, kernel, random_seed): + X.data = np.asarray(X.data, dtype=np.float64, order="C") + X.sort_indices() + + kernel_type = self._sparse_kernels.index(kernel) + + libsvm_sparse.set_verbosity_wrap(self.verbose) + + ( + self.support_, + self.support_vectors_, + dual_coef_data, + self.intercept_, + self._n_support, + self._probA, + self._probB, + self.fit_status_, + self._num_iter, + ) = libsvm_sparse.libsvm_sparse_train( + X.shape[1], + X.data, + X.indices, + X.indptr, + y, + solver_type, + kernel_type, + self.degree, + self._gamma, + self.coef0, + self.tol, + self.C, + getattr(self, "class_weight_", np.empty(0)), + sample_weight, + self.nu, + self.cache_size, + self.epsilon, + int(self.shrinking), + int(self.probability), + self.max_iter, + random_seed, + ) + + self._warn_from_fit_status() + + if hasattr(self, "classes_"): + n_class = len(self.classes_) - 1 + else: # regression + n_class = 1 + n_SV = self.support_vectors_.shape[0] + + dual_coef_indices = np.tile(np.arange(n_SV), n_class) + if not n_SV: + self.dual_coef_ = sp.csr_matrix([]) + else: + dual_coef_indptr = np.arange( + 0, dual_coef_indices.size + 1, dual_coef_indices.size / n_class + ) + self.dual_coef_ = sp.csr_matrix( + (dual_coef_data, dual_coef_indices, dual_coef_indptr), (n_class, n_SV) + ) + + def predict(self, X): + """Perform regression on samples in X. + + For an one-class model, +1 (inlier) or -1 (outlier) is returned. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + For kernel="precomputed", the expected shape of X is + (n_samples_test, n_samples_train). + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + The predicted values. 
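The one-class behaviour mentioned above, as a minimal sketch (illustrative;
the exact labels depend on the data and on `nu`):

import numpy as np
from sklearn.svm import OneClassSVM

rng = np.random.RandomState(0)
X_train = rng.normal(size=(50, 2))
oc = OneClassSVM(kernel="rbf", gamma="scale", nu=0.1).fit(X_train)
oc.predict(np.array([[0.0, 0.0], [8.0, 8.0]]))   # typically array([ 1, -1])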
+ """ + X = self._validate_for_predict(X) + predict = self._sparse_predict if self._sparse else self._dense_predict + return predict(X) + + def _dense_predict(self, X): + X = self._compute_kernel(X) + if X.ndim == 1: + X = check_array(X, order="C", accept_large_sparse=False) + + kernel = self.kernel + if callable(self.kernel): + kernel = "precomputed" + if X.shape[1] != self.shape_fit_[0]: + raise ValueError( + "X.shape[1] = %d should be equal to %d, " + "the number of samples at training time" + % (X.shape[1], self.shape_fit_[0]) + ) + + svm_type = LIBSVM_IMPL.index(self._impl) + + return libsvm.predict( + X, + self.support_, + self.support_vectors_, + self._n_support, + self._dual_coef_, + self._intercept_, + self._probA, + self._probB, + svm_type=svm_type, + kernel=kernel, + degree=self.degree, + coef0=self.coef0, + gamma=self._gamma, + cache_size=self.cache_size, + ) + + def _sparse_predict(self, X): + # Precondition: X is a csr_matrix of dtype np.float64. + kernel = self.kernel + if callable(kernel): + kernel = "precomputed" + + kernel_type = self._sparse_kernels.index(kernel) + + C = 0.0 # C is not useful here + + return libsvm_sparse.libsvm_sparse_predict( + X.data, + X.indices, + X.indptr, + self.support_vectors_.data, + self.support_vectors_.indices, + self.support_vectors_.indptr, + self._dual_coef_.data, + self._intercept_, + LIBSVM_IMPL.index(self._impl), + kernel_type, + self.degree, + self._gamma, + self.coef0, + self.tol, + C, + getattr(self, "class_weight_", np.empty(0)), + self.nu, + self.epsilon, + self.shrinking, + self.probability, + self._n_support, + self._probA, + self._probB, + ) + + def _compute_kernel(self, X): + """Return the data transformed by a callable kernel""" + if callable(self.kernel): + # in the case of precomputed kernel given as a function, we + # have to compute explicitly the kernel matrix + kernel = self.kernel(X, self.__Xfit) + if sp.issparse(kernel): + kernel = kernel.toarray() + X = np.asarray(kernel, dtype=np.float64, order="C") + return X + + def _decision_function(self, X): + """Evaluates the decision function for the samples in X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + Returns + ------- + X : array-like of shape (n_samples, n_class * (n_class-1) / 2) + Returns the decision function of the sample for each class + in the model. + """ + # NOTE: _validate_for_predict contains check for is_fitted + # hence must be placed before any other attributes are used. + X = self._validate_for_predict(X) + X = self._compute_kernel(X) + + if self._sparse: + dec_func = self._sparse_decision_function(X) + else: + dec_func = self._dense_decision_function(X) + + # In binary case, we need to flip the sign of coef, intercept and + # decision function. 
+ if self._impl in ["c_svc", "nu_svc"] and len(self.classes_) == 2: + return -dec_func.ravel() + + return dec_func + + def _dense_decision_function(self, X): + X = check_array(X, dtype=np.float64, order="C", accept_large_sparse=False) + + kernel = self.kernel + if callable(kernel): + kernel = "precomputed" + + return libsvm.decision_function( + X, + self.support_, + self.support_vectors_, + self._n_support, + self._dual_coef_, + self._intercept_, + self._probA, + self._probB, + svm_type=LIBSVM_IMPL.index(self._impl), + kernel=kernel, + degree=self.degree, + cache_size=self.cache_size, + coef0=self.coef0, + gamma=self._gamma, + ) + + def _sparse_decision_function(self, X): + X.data = np.asarray(X.data, dtype=np.float64, order="C") + + kernel = self.kernel + if hasattr(kernel, "__call__"): + kernel = "precomputed" + + kernel_type = self._sparse_kernels.index(kernel) + + return libsvm_sparse.libsvm_sparse_decision_function( + X.data, + X.indices, + X.indptr, + self.support_vectors_.data, + self.support_vectors_.indices, + self.support_vectors_.indptr, + self._dual_coef_.data, + self._intercept_, + LIBSVM_IMPL.index(self._impl), + kernel_type, + self.degree, + self._gamma, + self.coef0, + self.tol, + self.C, + getattr(self, "class_weight_", np.empty(0)), + self.nu, + self.epsilon, + self.shrinking, + self.probability, + self._n_support, + self._probA, + self._probB, + ) + + def _validate_for_predict(self, X): + check_is_fitted(self) + + if not callable(self.kernel): + X = validate_data( + self, + X, + accept_sparse="csr", + dtype=np.float64, + order="C", + accept_large_sparse=False, + reset=False, + ) + + if self._sparse and not sp.issparse(X): + X = sp.csr_matrix(X) + if self._sparse: + X.sort_indices() + + if sp.issparse(X) and not self._sparse and not callable(self.kernel): + raise ValueError( + "cannot use sparse input in %r trained on dense data" + % type(self).__name__ + ) + + if self.kernel == "precomputed": + if X.shape[1] != self.shape_fit_[0]: + raise ValueError( + "X.shape[1] = %d should be equal to %d, " + "the number of samples at training time" + % (X.shape[1], self.shape_fit_[0]) + ) + # Fixes https://nvd.nist.gov/vuln/detail/CVE-2020-28975 + # Check that _n_support is consistent with support_vectors + sv = self.support_vectors_ + if not self._sparse and sv.size > 0 and self.n_support_.sum() != sv.shape[0]: + raise ValueError( + f"The internal representation of {self.__class__.__name__} was altered" + ) + return X + + @property + def coef_(self): + """Weights assigned to the features when `kernel="linear"`. + + Returns + ------- + ndarray of shape (n_features, n_classes) + """ + if self.kernel != "linear": + raise AttributeError("coef_ is only available when using a linear kernel") + + coef = self._get_coef() + + # coef_ being a read-only property, it's better to mark the value as + # immutable to avoid hiding potential bugs for the unsuspecting user. 
+ if sp.issparse(coef): + # sparse matrix do not have global flags + coef.data.flags.writeable = False + else: + # regular dense array + coef.flags.writeable = False + return coef + + def _get_coef(self): + return safe_sparse_dot(self._dual_coef_, self.support_vectors_) + + @property + def n_support_(self): + """Number of support vectors for each class.""" + try: + check_is_fitted(self) + except NotFittedError: + raise AttributeError + + svm_type = LIBSVM_IMPL.index(self._impl) + if svm_type in (0, 1): + return self._n_support + else: + # SVR and OneClass + # _n_support has size 2, we make it size 1 + return np.array([self._n_support[0]]) + + +class BaseSVC(ClassifierMixin, BaseLibSVM, metaclass=ABCMeta): + """ABC for LibSVM-based classifiers.""" + + _parameter_constraints: dict = { + **BaseLibSVM._parameter_constraints, + "decision_function_shape": [StrOptions({"ovr", "ovo"})], + "break_ties": ["boolean"], + } + for unused_param in ["epsilon", "nu"]: + _parameter_constraints.pop(unused_param) + + @abstractmethod + def __init__( + self, + kernel, + degree, + gamma, + coef0, + tol, + C, + nu, + shrinking, + probability, + cache_size, + class_weight, + verbose, + max_iter, + decision_function_shape, + random_state, + break_ties, + ): + self.decision_function_shape = decision_function_shape + self.break_ties = break_ties + super().__init__( + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=nu, + epsilon=0.0, + shrinking=shrinking, + probability=probability, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + random_state=random_state, + ) + + def _validate_targets(self, y): + y_ = column_or_1d(y, warn=True) + check_classification_targets(y) + cls, y = np.unique(y_, return_inverse=True) + self.class_weight_ = compute_class_weight(self.class_weight, classes=cls, y=y_) + if len(cls) < 2: + raise ValueError( + "The number of classes has to be greater than one; got %d class" + % len(cls) + ) + + self.classes_ = cls + + return np.asarray(y, dtype=np.float64, order="C") + + def decision_function(self, X): + """Evaluate the decision function for the samples in X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + + Returns + ------- + X : ndarray of shape (n_samples, n_classes * (n_classes-1) / 2) + Returns the decision function of the sample for each class + in the model. + If decision_function_shape='ovr', the shape is (n_samples, + n_classes). + + Notes + ----- + If decision_function_shape='ovo', the function values are proportional + to the distance of the samples X to the separating hyperplane. If the + exact distances are required, divide the function values by the norm of + the weight vector (``coef_``). See also `this question + `_ for further details. + If decision_function_shape='ovr', the decision function is a monotonic + transformation of ovo decision function. + """ + dec = self._decision_function(X) + if self.decision_function_shape == "ovr" and len(self.classes_) > 2: + return _ovr_decision_function(dec < 0, -dec, len(self.classes_)) + return dec + + def predict(self, X): + """Perform classification on samples in X. + + For an one-class model, +1 or -1 is returned. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples_test, n_samples_train) + For kernel="precomputed", the expected shape of X is + (n_samples_test, n_samples_train). 
+ + Returns + ------- + y_pred : ndarray of shape (n_samples,) + Class labels for samples in X. + """ + check_is_fitted(self) + if self.break_ties and self.decision_function_shape == "ovo": + raise ValueError( + "break_ties must be False when decision_function_shape is 'ovo'" + ) + + if ( + self.break_ties + and self.decision_function_shape == "ovr" + and len(self.classes_) > 2 + ): + y = np.argmax(self.decision_function(X), axis=1) + else: + y = super().predict(X) + return self.classes_.take(np.asarray(y, dtype=np.intp)) + + # Hacky way of getting predict_proba to raise an AttributeError when + # probability=False using properties. Do not use this in new code; when + # probabilities are not available depending on a setting, introduce two + # estimators. + def _check_proba(self): + if not self.probability: + raise AttributeError( + "predict_proba is not available when probability=False" + ) + if self._impl not in ("c_svc", "nu_svc"): + raise AttributeError("predict_proba only implemented for SVC and NuSVC") + return True + + @available_if(_check_proba) + def predict_proba(self, X): + """Compute probabilities of possible outcomes for samples in X. + + The model needs to have probability information computed at training + time: fit with attribute `probability` set to True. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + For kernel="precomputed", the expected shape of X is + (n_samples_test, n_samples_train). + + Returns + ------- + T : ndarray of shape (n_samples, n_classes) + Returns the probability of the sample for each class in + the model. The columns correspond to the classes in sorted + order, as they appear in the attribute :term:`classes_`. + + Notes + ----- + The probability model is created using cross validation, so + the results can be slightly different than those obtained by + predict. Also, it will produce meaningless results on very small + datasets. + """ + X = self._validate_for_predict(X) + if self.probA_.size == 0 or self.probB_.size == 0: + raise NotFittedError( + "predict_proba is not available when fitted with probability=False" + ) + pred_proba = ( + self._sparse_predict_proba if self._sparse else self._dense_predict_proba + ) + return pred_proba(X) + + @available_if(_check_proba) + def predict_log_proba(self, X): + """Compute log probabilities of possible outcomes for samples in X. + + The model need to have probability information computed at training + time: fit with attribute `probability` set to True. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or \ + (n_samples_test, n_samples_train) + For kernel="precomputed", the expected shape of X is + (n_samples_test, n_samples_train). + + Returns + ------- + T : ndarray of shape (n_samples, n_classes) + Returns the log-probabilities of the sample for each class in + the model. The columns correspond to the classes in sorted + order, as they appear in the attribute :term:`classes_`. + + Notes + ----- + The probability model is created using cross validation, so + the results can be slightly different than those obtained by + predict. Also, it will produce meaningless results on very small + datasets. 
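A short sketch of the probability workflow described in the Notes
(illustrative only):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
clf = SVC(probability=True, gamma="scale", random_state=0).fit(X, y)
proba = clf.predict_proba(X[:5])
np.allclose(proba, np.exp(clf.predict_log_proba(X[:5])))   # True
# As the Notes warn, np.argmax(proba, axis=1) can occasionally disagree with
# clf.predict(X[:5]), because the probabilities come from internal
# cross-validated Platt scaling.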
+ """ + return np.log(self.predict_proba(X)) + + def _dense_predict_proba(self, X): + X = self._compute_kernel(X) + + kernel = self.kernel + if callable(kernel): + kernel = "precomputed" + + svm_type = LIBSVM_IMPL.index(self._impl) + pprob = libsvm.predict_proba( + X, + self.support_, + self.support_vectors_, + self._n_support, + self._dual_coef_, + self._intercept_, + self._probA, + self._probB, + svm_type=svm_type, + kernel=kernel, + degree=self.degree, + cache_size=self.cache_size, + coef0=self.coef0, + gamma=self._gamma, + ) + + return pprob + + def _sparse_predict_proba(self, X): + X.data = np.asarray(X.data, dtype=np.float64, order="C") + + kernel = self.kernel + if callable(kernel): + kernel = "precomputed" + + kernel_type = self._sparse_kernels.index(kernel) + + return libsvm_sparse.libsvm_sparse_predict_proba( + X.data, + X.indices, + X.indptr, + self.support_vectors_.data, + self.support_vectors_.indices, + self.support_vectors_.indptr, + self._dual_coef_.data, + self._intercept_, + LIBSVM_IMPL.index(self._impl), + kernel_type, + self.degree, + self._gamma, + self.coef0, + self.tol, + self.C, + getattr(self, "class_weight_", np.empty(0)), + self.nu, + self.epsilon, + self.shrinking, + self.probability, + self._n_support, + self._probA, + self._probB, + ) + + def _get_coef(self): + if self.dual_coef_.shape[0] == 1: + # binary classifier + coef = safe_sparse_dot(self.dual_coef_, self.support_vectors_) + else: + # 1vs1 classifier + coef = _one_vs_one_coef( + self.dual_coef_, self._n_support, self.support_vectors_ + ) + if sp.issparse(coef[0]): + coef = sp.vstack(coef).tocsr() + else: + coef = np.vstack(coef) + + return coef + + @property + def probA_(self): + """Parameter learned in Platt scaling when `probability=True`. + + Returns + ------- + ndarray of shape (n_classes * (n_classes - 1) / 2) + """ + return self._probA + + @property + def probB_(self): + """Parameter learned in Platt scaling when `probability=True`. + + Returns + ------- + ndarray of shape (n_classes * (n_classes - 1) / 2) + """ + return self._probB + + +def _get_liblinear_solver_type(multi_class, penalty, loss, dual): + """Find the liblinear magic number for the solver. + + This number depends on the values of the following attributes: + - multi_class + - penalty + - loss + - dual + + The same number is also internally used by LibLinear to determine + which solver to use. 
+ """ + # nested dicts containing level 1: available loss functions, + # level2: available penalties for the given loss function, + # level3: whether the dual solver is available for the specified + # combination of loss function and penalty + _solver_type_dict = { + "logistic_regression": {"l1": {False: 6}, "l2": {False: 0, True: 7}}, + "hinge": {"l2": {True: 3}}, + "squared_hinge": {"l1": {False: 5}, "l2": {False: 2, True: 1}}, + "epsilon_insensitive": {"l2": {True: 13}}, + "squared_epsilon_insensitive": {"l2": {False: 11, True: 12}}, + "crammer_singer": 4, + } + + if multi_class == "crammer_singer": + return _solver_type_dict[multi_class] + elif multi_class != "ovr": + raise ValueError( + "`multi_class` must be one of `ovr`, `crammer_singer`, got %r" % multi_class + ) + + _solver_pen = _solver_type_dict.get(loss, None) + if _solver_pen is None: + error_string = "loss='%s' is not supported" % loss + else: + _solver_dual = _solver_pen.get(penalty, None) + if _solver_dual is None: + error_string = ( + "The combination of penalty='%s' and loss='%s' is not supported" + % (penalty, loss) + ) + else: + solver_num = _solver_dual.get(dual, None) + if solver_num is None: + error_string = ( + "The combination of penalty='%s' and " + "loss='%s' are not supported when dual=%s" % (penalty, loss, dual) + ) + else: + return solver_num + raise ValueError( + "Unsupported set of arguments: %s, Parameters: penalty=%r, loss=%r, dual=%r" + % (error_string, penalty, loss, dual) + ) + + +def _fit_liblinear( + X, + y, + C, + fit_intercept, + intercept_scaling, + class_weight, + penalty, + dual, + verbose, + max_iter, + tol, + random_state=None, + multi_class="ovr", + loss="logistic_regression", + epsilon=0.1, + sample_weight=None, +): + """Used by Logistic Regression (and CV) and LinearSVC/LinearSVR. + + Preprocessing is done in this function before supplying it to liblinear. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X + + C : float + Inverse of cross-validation parameter. The lower the C, the higher + the penalization. + + fit_intercept : bool + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: ``[x_1, ..., x_n, 1]``, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). + + intercept_scaling : float + Liblinear internally penalizes the intercept, treating it like any + other term in the feature vector. To reduce the impact of the + regularization on the intercept, the `intercept_scaling` parameter can + be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. + + class_weight : dict or 'balanced', default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. 
+ + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + + penalty : {'l1', 'l2'} + The norm of the penalty used in regularization. + + dual : bool + Dual or primal formulation, + + verbose : int + Set verbose to any positive number for verbosity. + + max_iter : int + Number of iterations. + + tol : float + Stopping condition. + + random_state : int, RandomState instance or None, default=None + Controls the pseudo random number generation for shuffling the data. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + multi_class : {'ovr', 'crammer_singer'}, default='ovr' + `ovr` trains n_classes one-vs-rest classifiers, while `crammer_singer` + optimizes a joint objective over all classes. + While `crammer_singer` is interesting from an theoretical perspective + as it is consistent it is seldom used in practice and rarely leads to + better accuracy and is more expensive to compute. + If `crammer_singer` is chosen, the options loss, penalty and dual will + be ignored. + + loss : {'logistic_regression', 'hinge', 'squared_hinge', \ + 'epsilon_insensitive', 'squared_epsilon_insensitive}, \ + default='logistic_regression' + The loss function used to fit the model. + + epsilon : float, default=0.1 + Epsilon parameter in the epsilon-insensitive loss function. Note + that the value of this parameter depends on the scale of the target + variable y. If unsure, set epsilon=0. + + sample_weight : array-like of shape (n_samples,), default=None + Weights assigned to each sample. + + Returns + ------- + coef_ : ndarray of shape (n_features, n_features + 1) + The coefficient vector got by minimizing the objective function. + + intercept_ : float + The intercept term added to the vector. + + n_iter_ : array of int + Number of iterations run across for each class. + """ + if loss not in ["epsilon_insensitive", "squared_epsilon_insensitive"]: + enc = LabelEncoder() + y_ind = enc.fit_transform(y) + classes_ = enc.classes_ + if len(classes_) < 2: + raise ValueError( + "This solver needs samples of at least 2 classes" + " in the data, but the data contains only one" + " class: %r" % classes_[0] + ) + + class_weight_ = compute_class_weight(class_weight, classes=classes_, y=y) + else: + class_weight_ = np.empty(0, dtype=np.float64) + y_ind = y + liblinear.set_verbosity_wrap(verbose) + rnd = check_random_state(random_state) + if verbose: + print("[LibLinear]", end="") + + # LinearSVC breaks when intercept_scaling is <= 0 + bias = -1.0 + if fit_intercept: + if intercept_scaling <= 0: + raise ValueError( + "Intercept scaling is %r but needs to be greater " + "than 0. To disable fitting an intercept," + " set fit_intercept=False." 
% intercept_scaling + ) + else: + bias = intercept_scaling + + libsvm.set_verbosity_wrap(verbose) + libsvm_sparse.set_verbosity_wrap(verbose) + liblinear.set_verbosity_wrap(verbose) + + # Liblinear doesn't support 64bit sparse matrix indices yet + if sp.issparse(X): + _check_large_sparse(X) + + # LibLinear wants targets as doubles, even for classification + y_ind = np.asarray(y_ind, dtype=np.float64).ravel() + y_ind = np.require(y_ind, requirements="W") + + sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float64) + + solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual) + raw_coef_, n_iter_ = liblinear.train_wrap( + X, + y_ind, + sp.issparse(X), + solver_type, + tol, + bias, + C, + class_weight_, + max_iter, + rnd.randint(np.iinfo("i").max), + epsilon, + sample_weight, + ) + # Regarding rnd.randint(..) in the above signature: + # seed for srand in range [0..INT_MAX); due to limitations in Numpy + # on 32-bit platforms, we can't get to the UINT_MAX limit that + # srand supports + n_iter_max = max(n_iter_) + if n_iter_max >= max_iter: + warnings.warn( + "Liblinear failed to converge, increase the number of iterations.", + ConvergenceWarning, + ) + + if fit_intercept: + coef_ = raw_coef_[:, :-1] + intercept_ = intercept_scaling * raw_coef_[:, -1] + else: + coef_ = raw_coef_ + intercept_ = 0.0 + + return coef_, intercept_, n_iter_ diff --git a/.venv/Lib/site-packages/sklearn/svm/_bounds.py b/.venv/Lib/site-packages/sklearn/svm/_bounds.py new file mode 100644 index 0000000000000000000000000000000000000000..704462ce23689dc76f1590e1c35cd8034f83dfdd --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/_bounds.py @@ -0,0 +1,95 @@ +"""Determination of parameter bounds""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Real + +import numpy as np + +from ..preprocessing import LabelBinarizer +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import safe_sparse_dot +from ..utils.validation import check_array, check_consistent_length + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like"], + "loss": [StrOptions({"squared_hinge", "log"})], + "fit_intercept": ["boolean"], + "intercept_scaling": [Interval(Real, 0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def l1_min_c(X, y, *, loss="squared_hinge", fit_intercept=True, intercept_scaling=1.0): + """Return the lowest bound for C. + + The lower bound for C is computed such that for C in (l1_min_C, infinity) + the model is guaranteed not to be empty. This applies to l1 penalized + classifiers, such as LinearSVC with penalty='l1' and + linear_model.LogisticRegression with penalty='l1'. + + This value is valid if class_weight parameter in fit() is not set. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X. + + loss : {'squared_hinge', 'log'}, default='squared_hinge' + Specifies the loss function. + With 'squared_hinge' it is the squared hinge loss (a.k.a. L2 loss). + With 'log' it is the loss of logistic regression models. + + fit_intercept : bool, default=True + Specifies if the intercept should be fitted by the model. + It must match the fit() method parameter. 
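A small sketch, not part of the library source, checking the `class_weight='balanced'` formula quoted above against `compute_class_weight` on an assumed imbalanced toy target.

    import numpy as np
    from sklearn.utils.class_weight import compute_class_weight

    y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 2, 2])
    classes = np.unique(y)
    # n_samples / (n_classes * np.bincount(y)), as stated above
    manual = len(y) / (len(classes) * np.bincount(y))
    auto = compute_class_weight("balanced", classes=classes, y=y)
    assert np.allclose(manual, auto)
    print(dict(zip(classes, manual)))  # rarer classes receive proportionally larger weights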
+ + intercept_scaling : float, default=1.0 + When fit_intercept is True, instance vector x becomes + [x, intercept_scaling], + i.e. a "synthetic" feature with constant value equals to + intercept_scaling is appended to the instance vector. + It must match the fit() method parameter. + + Returns + ------- + l1_min_c : float + Minimum value for C. + + Examples + -------- + >>> from sklearn.svm import l1_min_c + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_samples=100, n_features=20, random_state=42) + >>> print(f"{l1_min_c(X, y, loss='squared_hinge', fit_intercept=True):.4f}") + 0.0044 + """ + + X = check_array(X, accept_sparse="csc") + check_consistent_length(X, y) + + Y = LabelBinarizer(neg_label=-1).fit_transform(y).T + # maximum absolute value over classes and features + den = np.max(np.abs(safe_sparse_dot(Y, X))) + if fit_intercept: + bias = np.full( + (np.size(y), 1), intercept_scaling, dtype=np.array(intercept_scaling).dtype + ) + den = max(den, abs(np.dot(Y, bias)).max()) + + if den == 0.0: + raise ValueError( + "Ill-posed l1_min_c calculation: l1 will always " + "select zero coefficients for this data" + ) + if loss == "squared_hinge": + return 0.5 / den + else: # loss == 'log': + return 2.0 / den diff --git a/.venv/Lib/site-packages/sklearn/svm/_classes.py b/.venv/Lib/site-packages/sklearn/svm/_classes.py new file mode 100644 index 0000000000000000000000000000000000000000..8d4b360f1efa59b9dd3bf94fb68f82ffeda6f74d --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/_classes.py @@ -0,0 +1,1779 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np + +from ..base import BaseEstimator, OutlierMixin, RegressorMixin, _fit_context +from ..linear_model._base import LinearClassifierMixin, LinearModel, SparseCoefMixin +from ..utils._param_validation import Interval, StrOptions +from ..utils.multiclass import check_classification_targets +from ..utils.validation import _num_samples, validate_data +from ._base import BaseLibSVM, BaseSVC, _fit_liblinear, _get_liblinear_solver_type + + +def _validate_dual_parameter(dual, loss, penalty, multi_class, X): + """Helper function to assign the value of dual parameter.""" + if dual == "auto": + if X.shape[0] < X.shape[1]: + try: + _get_liblinear_solver_type(multi_class, penalty, loss, True) + return True + except ValueError: # dual not supported for the combination + return False + else: + try: + _get_liblinear_solver_type(multi_class, penalty, loss, False) + return False + except ValueError: # primal not supported by the combination + return True + else: + return dual + + +class LinearSVC(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): + """Linear Support Vector Classification. + + Similar to SVC with parameter kernel='linear', but implemented in terms of + liblinear rather than libsvm, so it has more flexibility in the choice of + penalties and loss functions and should scale better to large numbers of + samples. + + The main differences between :class:`~sklearn.svm.LinearSVC` and + :class:`~sklearn.svm.SVC` lie in the loss function used by default, and in + the handling of intercept regularization between those two implementations. + + This class supports both dense and sparse input and the multiclass support + is handled according to a one-vs-the-rest scheme. + + Read more in the :ref:`User Guide `. 
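A sketch, under assumed toy data, of the guarantee stated for `l1_min_c` above: for C below the bound an l1-penalized logistic model is expected to keep all coefficients at zero, while a C comfortably above it can select features. The 0.9 and 10 multipliers are arbitrary illustrative choices.

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import l1_min_c

    X, y = make_classification(n_samples=100, n_features=20, random_state=42)
    c_min = l1_min_c(X, y, loss="log")

    below = LogisticRegression(penalty="l1", solver="liblinear", C=0.9 * c_min).fit(X, y)
    above = LogisticRegression(penalty="l1", solver="liblinear", C=10 * c_min).fit(X, y)
    print(np.count_nonzero(below.coef_))  # expected: 0 (empty model below the bound)
    print(np.count_nonzero(above.coef_))  # expected: > 0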
+ + Parameters + ---------- + penalty : {'l1', 'l2'}, default='l2' + Specifies the norm used in the penalization. The 'l2' + penalty is the standard used in SVC. The 'l1' leads to ``coef_`` + vectors that are sparse. + + loss : {'hinge', 'squared_hinge'}, default='squared_hinge' + Specifies the loss function. 'hinge' is the standard SVM loss + (used e.g. by the SVC class) while 'squared_hinge' is the + square of the hinge loss. The combination of ``penalty='l1'`` + and ``loss='hinge'`` is not supported. + + dual : "auto" or bool, default="auto" + Select the algorithm to either solve the dual or primal + optimization problem. Prefer dual=False when n_samples > n_features. + `dual="auto"` will choose the value of the parameter automatically, + based on the values of `n_samples`, `n_features`, `loss`, `multi_class` + and `penalty`. If `n_samples` < `n_features` and optimizer supports + chosen `loss`, `multi_class` and `penalty`, then dual will be set to True, + otherwise it will be set to False. + + .. versionchanged:: 1.3 + The `"auto"` option is added in version 1.3 and will be the default + in version 1.5. + + tol : float, default=1e-4 + Tolerance for stopping criteria. + + C : float, default=1.0 + Regularization parameter. The strength of the regularization is + inversely proportional to C. Must be strictly positive. + For an intuitive visualization of the effects of scaling + the regularization parameter C, see + :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`. + + multi_class : {'ovr', 'crammer_singer'}, default='ovr' + Determines the multi-class strategy if `y` contains more than + two classes. + ``"ovr"`` trains n_classes one-vs-rest classifiers, while + ``"crammer_singer"`` optimizes a joint objective over all classes. + While `crammer_singer` is interesting from a theoretical perspective + as it is consistent, it is seldom used in practice as it rarely leads + to better accuracy and is more expensive to compute. + If ``"crammer_singer"`` is chosen, the options loss, penalty and dual + will be ignored. + + fit_intercept : bool, default=True + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). + + intercept_scaling : float, default=1.0 + When `fit_intercept` is True, the instance vector x becomes ``[x_1, + ..., x_n, intercept_scaling]``, i.e. a "synthetic" feature with a + constant value equal to `intercept_scaling` is appended to the instance + vector. The intercept becomes intercept_scaling * synthetic feature + weight. Note that liblinear internally penalizes the intercept, + treating it like any other term in the feature vector. To reduce the + impact of the regularization on the intercept, the `intercept_scaling` + parameter can be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. + + class_weight : dict or 'balanced', default=None + Set the parameter C of class i to ``class_weight[i]*C`` for + SVC. 
If not given, all classes are supposed to have + weight one. + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + verbose : int, default=0 + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in liblinear that, if enabled, may not work + properly in a multithreaded context. + + random_state : int, RandomState instance or None, default=None + Controls the pseudo random number generation for shuffling the data for + the dual coordinate descent (if ``dual=True``). When ``dual=False`` the + underlying implementation of :class:`LinearSVC` is not random and + ``random_state`` has no effect on the results. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + max_iter : int, default=1000 + The maximum number of iterations to be run. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) if n_classes == 2 \ + else (n_classes, n_features) + Weights assigned to the features (coefficients in the primal + problem). + + ``coef_`` is a readonly property derived from ``raw_coef_`` that + follows the internal memory layout of liblinear. + + intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,) + Constants in decision function. + + classes_ : ndarray of shape (n_classes,) + The unique classes labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Maximum number of iterations run across all classes. + + See Also + -------- + SVC : Implementation of Support Vector Machine classifier using libsvm: + the kernel can be non-linear but its SMO algorithm does not + scale to large number of samples as LinearSVC does. + + Furthermore SVC multi-class mode is implemented using one + vs one scheme while LinearSVC uses one vs the rest. It is + possible to implement one vs the rest with SVC by using the + :class:`~sklearn.multiclass.OneVsRestClassifier` wrapper. + + Finally SVC can fit dense data without memory copy if the input + is C-contiguous. Sparse data will still incur memory copy though. + + sklearn.linear_model.SGDClassifier : SGDClassifier can optimize the same + cost function as LinearSVC + by adjusting the penalty and loss parameters. In addition it requires + less memory, allows incremental (online) learning, and implements + various loss functions and regularization regimes. + + Notes + ----- + The underlying C implementation uses a random number generator to + select features when fitting the model. It is thus not uncommon + to have slightly different results for the same input data. If + that happens, try with a smaller ``tol`` parameter. + + The underlying implementation, liblinear, uses a sparse internal + representation for the data that will incur a memory copy. + + Predict output may not match that of standalone liblinear in certain + cases. See :ref:`differences from liblinear ` + in the narrative documentation. 
+ + References + ---------- + `LIBLINEAR: A Library for Large Linear Classification + `__ + + Examples + -------- + >>> from sklearn.svm import LinearSVC + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_features=4, random_state=0) + >>> clf = make_pipeline(StandardScaler(), + ... LinearSVC(random_state=0, tol=1e-5)) + >>> clf.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('linearsvc', LinearSVC(random_state=0, tol=1e-05))]) + + >>> print(clf.named_steps['linearsvc'].coef_) + [[0.141... 0.526... 0.679... 0.493...]] + + >>> print(clf.named_steps['linearsvc'].intercept_) + [0.1693...] + >>> print(clf.predict([[0, 0, 0, 0]])) + [1] + """ + + _parameter_constraints: dict = { + "penalty": [StrOptions({"l1", "l2"})], + "loss": [StrOptions({"hinge", "squared_hinge"})], + "dual": ["boolean", StrOptions({"auto"})], + "tol": [Interval(Real, 0.0, None, closed="neither")], + "C": [Interval(Real, 0.0, None, closed="neither")], + "multi_class": [StrOptions({"ovr", "crammer_singer"})], + "fit_intercept": ["boolean"], + "intercept_scaling": [Interval(Real, 0, None, closed="neither")], + "class_weight": [None, dict, StrOptions({"balanced"})], + "verbose": ["verbose"], + "random_state": ["random_state"], + "max_iter": [Interval(Integral, 0, None, closed="left")], + } + + def __init__( + self, + penalty="l2", + loss="squared_hinge", + *, + dual="auto", + tol=1e-4, + C=1.0, + multi_class="ovr", + fit_intercept=True, + intercept_scaling=1, + class_weight=None, + verbose=0, + random_state=None, + max_iter=1000, + ): + self.dual = dual + self.tol = tol + self.C = C + self.multi_class = multi_class + self.fit_intercept = fit_intercept + self.intercept_scaling = intercept_scaling + self.class_weight = class_weight + self.verbose = verbose + self.random_state = random_state + self.max_iter = max_iter + self.penalty = penalty + self.loss = loss + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit the model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X. + + sample_weight : array-like of shape (n_samples,), default=None + Array of weights that are assigned to individual + samples. If not provided, + then each sample is given unit weight. + + .. versionadded:: 0.18 + + Returns + ------- + self : object + An instance of the estimator. 
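A standalone sketch of the `dual="auto"` rule described above, for the common case where both primal and dual solvers exist (LinearSVC's default `loss='squared_hinge'`, `penalty='l2'`); `resolve_dual` is an illustrative helper, not the module's private `_validate_dual_parameter`.

    def resolve_dual(n_samples, n_features, dual="auto"):
        # dual='auto' prefers the dual formulation only for wide data
        # (n_samples < n_features); otherwise the primal problem is solved
        if dual == "auto":
            return n_samples < n_features
        return dual

    print(resolve_dual(1000, 20))  # tall data -> False (primal)
    print(resolve_dual(50, 5000))  # wide data -> True (dual)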
+ """ + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + dtype=np.float64, + order="C", + accept_large_sparse=False, + ) + check_classification_targets(y) + self.classes_ = np.unique(y) + + _dual = _validate_dual_parameter( + self.dual, self.loss, self.penalty, self.multi_class, X + ) + + self.coef_, self.intercept_, n_iter_ = _fit_liblinear( + X, + y, + self.C, + self.fit_intercept, + self.intercept_scaling, + self.class_weight, + self.penalty, + _dual, + self.verbose, + self.max_iter, + self.tol, + self.random_state, + self.multi_class, + self.loss, + sample_weight=sample_weight, + ) + # Backward compatibility: _fit_liblinear is used both by LinearSVC/R + # and LogisticRegression but LogisticRegression sets a structured + # `n_iter_` attribute with information about the underlying OvR fits + # while LinearSVC/R only reports the maximum value. + self.n_iter_ = n_iter_.max().item() + + if self.multi_class == "crammer_singer" and len(self.classes_) == 2: + self.coef_ = (self.coef_[1] - self.coef_[0]).reshape(1, -1) + if self.fit_intercept: + intercept = self.intercept_[1] - self.intercept_[0] + self.intercept_ = np.array([intercept]) + + return self + + +class LinearSVR(RegressorMixin, LinearModel): + """Linear Support Vector Regression. + + Similar to SVR with parameter kernel='linear', but implemented in terms of + liblinear rather than libsvm, so it has more flexibility in the choice of + penalties and loss functions and should scale better to large numbers of + samples. + + The main differences between :class:`~sklearn.svm.LinearSVR` and + :class:`~sklearn.svm.SVR` lie in the loss function used by default, and in + the handling of intercept regularization between those two implementations. + + This class supports both dense and sparse input. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.16 + + Parameters + ---------- + epsilon : float, default=0.0 + Epsilon parameter in the epsilon-insensitive loss function. Note + that the value of this parameter depends on the scale of the target + variable y. If unsure, set ``epsilon=0``. + + tol : float, default=1e-4 + Tolerance for stopping criteria. + + C : float, default=1.0 + Regularization parameter. The strength of the regularization is + inversely proportional to C. Must be strictly positive. + + loss : {'epsilon_insensitive', 'squared_epsilon_insensitive'}, \ + default='epsilon_insensitive' + Specifies the loss function. The epsilon-insensitive loss + (standard SVR) is the L1 loss, while the squared epsilon-insensitive + loss ('squared_epsilon_insensitive') is the L2 loss. + + fit_intercept : bool, default=True + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). + + intercept_scaling : float, default=1.0 + When `fit_intercept` is True, the instance vector x becomes `[x_1, ..., + x_n, intercept_scaling]`, i.e. a "synthetic" feature with a constant + value equal to `intercept_scaling` is appended to the instance vector. + The intercept becomes intercept_scaling * synthetic feature weight. + Note that liblinear internally penalizes the intercept, treating it + like any other term in the feature vector. 
To reduce the impact of the + regularization on the intercept, the `intercept_scaling` parameter can + be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. + + dual : "auto" or bool, default="auto" + Select the algorithm to either solve the dual or primal + optimization problem. Prefer dual=False when n_samples > n_features. + `dual="auto"` will choose the value of the parameter automatically, + based on the values of `n_samples`, `n_features` and `loss`. If + `n_samples` < `n_features` and optimizer supports chosen `loss`, + then dual will be set to True, otherwise it will be set to False. + + .. versionchanged:: 1.3 + The `"auto"` option is added in version 1.3 and will be the default + in version 1.5. + + verbose : int, default=0 + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in liblinear that, if enabled, may not work + properly in a multithreaded context. + + random_state : int, RandomState instance or None, default=None + Controls the pseudo random number generation for shuffling the data. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + max_iter : int, default=1000 + The maximum number of iterations to be run. + + Attributes + ---------- + coef_ : ndarray of shape (n_features) if n_classes == 2 \ + else (n_classes, n_features) + Weights assigned to the features (coefficients in the primal + problem). + + `coef_` is a readonly property derived from `raw_coef_` that + follows the internal memory layout of liblinear. + + intercept_ : ndarray of shape (1) if n_classes == 2 else (n_classes) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Maximum number of iterations run across all classes. + + See Also + -------- + LinearSVC : Implementation of Support Vector Machine classifier using the + same library as this class (liblinear). + + SVR : Implementation of Support Vector Machine regression using libsvm: + the kernel can be non-linear but its SMO algorithm does not scale to + large number of samples as :class:`~sklearn.svm.LinearSVR` does. + + sklearn.linear_model.SGDRegressor : SGDRegressor can optimize the same cost + function as LinearSVR + by adjusting the penalty and loss parameters. In addition it requires + less memory, allows incremental (online) learning, and implements + various loss functions and regularization regimes. + + Examples + -------- + >>> from sklearn.svm import LinearSVR + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_features=4, random_state=0) + >>> regr = make_pipeline(StandardScaler(), + ... 
LinearSVR(random_state=0, tol=1e-5)) + >>> regr.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('linearsvr', LinearSVR(random_state=0, tol=1e-05))]) + + >>> print(regr.named_steps['linearsvr'].coef_) + [18.582... 27.023... 44.357... 64.522...] + >>> print(regr.named_steps['linearsvr'].intercept_) + [-4...] + >>> print(regr.predict([[0, 0, 0, 0]])) + [-2.384...] + """ + + _parameter_constraints: dict = { + "epsilon": [Real], + "tol": [Interval(Real, 0.0, None, closed="neither")], + "C": [Interval(Real, 0.0, None, closed="neither")], + "loss": [StrOptions({"epsilon_insensitive", "squared_epsilon_insensitive"})], + "fit_intercept": ["boolean"], + "intercept_scaling": [Interval(Real, 0, None, closed="neither")], + "dual": ["boolean", StrOptions({"auto"})], + "verbose": ["verbose"], + "random_state": ["random_state"], + "max_iter": [Interval(Integral, 0, None, closed="left")], + } + + def __init__( + self, + *, + epsilon=0.0, + tol=1e-4, + C=1.0, + loss="epsilon_insensitive", + fit_intercept=True, + intercept_scaling=1.0, + dual="auto", + verbose=0, + random_state=None, + max_iter=1000, + ): + self.tol = tol + self.C = C + self.epsilon = epsilon + self.fit_intercept = fit_intercept + self.intercept_scaling = intercept_scaling + self.verbose = verbose + self.random_state = random_state + self.max_iter = max_iter + self.dual = dual + self.loss = loss + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit the model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X. + + sample_weight : array-like of shape (n_samples,), default=None + Array of weights that are assigned to individual + samples. If not provided, + then each sample is given unit weight. + + .. versionadded:: 0.18 + + Returns + ------- + self : object + An instance of the estimator. + """ + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + dtype=np.float64, + order="C", + accept_large_sparse=False, + ) + penalty = "l2" # SVR only accepts l2 penalty + + _dual = _validate_dual_parameter(self.dual, self.loss, penalty, "ovr", X) + + self.coef_, self.intercept_, n_iter_ = _fit_liblinear( + X, + y, + self.C, + self.fit_intercept, + self.intercept_scaling, + None, + penalty, + _dual, + self.verbose, + self.max_iter, + self.tol, + self.random_state, + loss=self.loss, + epsilon=self.epsilon, + sample_weight=sample_weight, + ) + self.coef_ = self.coef_.ravel() + # Backward compatibility: _fit_liblinear is used both by LinearSVC/R + # and LogisticRegression but LogisticRegression sets a structured + # `n_iter_` attribute with information about the underlying OvR fits + # while LinearSVC/R only reports the maximum value. + self.n_iter_ = n_iter_.max().item() + + return self + + +class SVC(BaseSVC): + """C-Support Vector Classification. + + The implementation is based on libsvm. The fit time scales at least + quadratically with the number of samples and may be impractical + beyond tens of thousands of samples. For large datasets + consider using :class:`~sklearn.svm.LinearSVC` or + :class:`~sklearn.linear_model.SGDClassifier` instead, possibly after a + :class:`~sklearn.kernel_approximation.Nystroem` transformer or + other :ref:`kernel_approximation`. 
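A sketch, with assumed toy data and an arbitrary `n_components`, of the alternative suggested above for large datasets: approximate an RBF kernel with Nystroem and train a linear model on the transformed features instead of fitting a kernelized SVC directly.

    from sklearn.datasets import make_classification
    from sklearn.kernel_approximation import Nystroem
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import LinearSVC

    X, y = make_classification(n_samples=5000, n_features=20, random_state=0)
    approx_rbf = make_pipeline(
        StandardScaler(),
        Nystroem(kernel="rbf", n_components=100, random_state=0),  # low-rank kernel map
        LinearSVC(dual="auto", random_state=0),
    )
    approx_rbf.fit(X, y)
    print(approx_rbf.score(X, y))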
+ + The multiclass support is handled according to a one-vs-one scheme. + + For details on the precise mathematical formulation of the provided + kernel functions and how `gamma`, `coef0` and `degree` affect each + other, see the corresponding section in the narrative documentation: + :ref:`svm_kernels`. + + To learn how to tune SVC's hyperparameters, see the following example: + :ref:`sphx_glr_auto_examples_model_selection_plot_nested_cross_validation_iris.py` + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + C : float, default=1.0 + Regularization parameter. The strength of the regularization is + inversely proportional to C. Must be strictly positive. The penalty + is a squared l2 penalty. For an intuitive visualization of the effects + of scaling the regularization parameter C, see + :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`. + + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \ + default='rbf' + Specifies the kernel type to be used in the algorithm. If + none is given, 'rbf' will be used. If a callable is given it is used to + pre-compute the kernel matrix from data matrices; that matrix should be + an array of shape ``(n_samples, n_samples)``. For an intuitive + visualization of different kernel types see + :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`. + + degree : int, default=3 + Degree of the polynomial kernel function ('poly'). + Must be non-negative. Ignored by all other kernels. + + gamma : {'scale', 'auto'} or float, default='scale' + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + + - if ``gamma='scale'`` (default) is passed then it uses + 1 / (n_features * X.var()) as value of gamma, + - if 'auto', uses 1 / n_features + - if float, must be non-negative. + + .. versionchanged:: 0.22 + The default value of ``gamma`` changed from 'auto' to 'scale'. + + coef0 : float, default=0.0 + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + shrinking : bool, default=True + Whether to use the shrinking heuristic. + See the :ref:`User Guide `. + + probability : bool, default=False + Whether to enable probability estimates. This must be enabled prior + to calling `fit`, will slow down that method as it internally uses + 5-fold cross-validation, and `predict_proba` may be inconsistent with + `predict`. Read more in the :ref:`User Guide `. + + tol : float, default=1e-3 + Tolerance for stopping criterion. + + cache_size : float, default=200 + Specify the size of the kernel cache (in MB). + + class_weight : dict or 'balanced', default=None + Set the parameter C of class i to class_weight[i]*C for + SVC. If not given, all classes are supposed to have + weight one. + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + verbose : bool, default=False + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, default=-1 + Hard limit on iterations within solver, or -1 for no limit. + + decision_function_shape : {'ovo', 'ovr'}, default='ovr' + Whether to return a one-vs-rest ('ovr') decision function of shape + (n_samples, n_classes) as all other classifiers, or the original + one-vs-one ('ovo') decision function of libsvm which has shape + (n_samples, n_classes * (n_classes - 1) / 2). 
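A sketch of the `gamma='scale'` and `gamma='auto'` formulas quoted above; it peeks at the private `_gamma` attribute set during fit, which is an implementation detail of this module and used here only for illustration.

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.svm import SVC

    X, y = load_iris(return_X_y=True)
    n_features = X.shape[1]

    scale = SVC(gamma="scale").fit(X, y)
    auto = SVC(gamma="auto").fit(X, y)
    assert np.isclose(scale._gamma, 1.0 / (n_features * X.var()))  # 1 / (n_features * X.var())
    assert np.isclose(auto._gamma, 1.0 / n_features)               # 1 / n_features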
However, note that + internally, one-vs-one ('ovo') is always used as a multi-class strategy + to train models; an ovr matrix is only constructed from the ovo matrix. + The parameter is ignored for binary classification. + + .. versionchanged:: 0.19 + decision_function_shape is 'ovr' by default. + + .. versionadded:: 0.17 + *decision_function_shape='ovr'* is recommended. + + .. versionchanged:: 0.17 + Deprecated *decision_function_shape='ovo' and None*. + + break_ties : bool, default=False + If true, ``decision_function_shape='ovr'``, and number of classes > 2, + :term:`predict` will break ties according to the confidence values of + :term:`decision_function`; otherwise the first class among the tied + classes is returned. Please note that breaking ties comes at a + relatively high computational cost compared to a simple predict. See + :ref:`sphx_glr_auto_examples_svm_plot_svm_tie_breaking.py` for an + example of its usage with ``decision_function_shape='ovr'``. + + .. versionadded:: 0.22 + + random_state : int, RandomState instance or None, default=None + Controls the pseudo random number generation for shuffling the data for + probability estimates. Ignored when `probability` is False. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + class_weight_ : ndarray of shape (n_classes,) + Multipliers of parameter C for each class. + Computed based on the ``class_weight`` parameter. + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + coef_ : ndarray of shape (n_classes * (n_classes - 1) / 2, n_features) + Weights assigned to the features (coefficients in the primal + problem). This is only available in the case of a linear kernel. + + `coef_` is a readonly property derived from `dual_coef_` and + `support_vectors_`. + + dual_coef_ : ndarray of shape (n_classes -1, n_SV) + Dual coefficients of the support vector in the decision + function (see :ref:`sgd_mathematical_formulation`), multiplied by + their targets. + For multiclass, coefficient for all 1-vs-1 classifiers. + The layout of the coefficients in the multiclass case is somewhat + non-trivial. See the :ref:`multi-class section of the User Guide + ` for details. + + fit_status_ : int + 0 if correctly fitted, 1 otherwise (will raise warning) + + intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : ndarray of shape (n_classes * (n_classes - 1) // 2,) + Number of iterations run by the optimization routine to fit the model. + The shape of this attribute depends on the number of models optimized + which in turn depends on the number of classes. + + .. versionadded:: 1.1 + + support_ : ndarray of shape (n_SV) + Indices of support vectors. + + support_vectors_ : ndarray of shape (n_SV, n_features) + Support vectors. An empty array if kernel is precomputed. + + n_support_ : ndarray of shape (n_classes,), dtype=int32 + Number of support vectors for each class. 
+ + probA_ : ndarray of shape (n_classes * (n_classes - 1) / 2) + probB_ : ndarray of shape (n_classes * (n_classes - 1) / 2) + If `probability=True`, it corresponds to the parameters learned in + Platt scaling to produce probability estimates from decision values. + If `probability=False`, it's an empty array. Platt scaling uses the + logistic function + ``1 / (1 + exp(decision_value * probA_ + probB_))`` + where ``probA_`` and ``probB_`` are learned from the dataset [2]_. For + more information on the multiclass case and training procedure see + section 8 of [1]_. + + shape_fit_ : tuple of int of shape (n_dimensions_of_X,) + Array dimensions of training vector ``X``. + + See Also + -------- + SVR : Support Vector Machine for Regression implemented using libsvm. + + LinearSVC : Scalable Linear Support Vector Machine for classification + implemented using liblinear. Check the See Also section of + LinearSVC for more comparison element. + + References + ---------- + .. [1] `LIBSVM: A Library for Support Vector Machines + `_ + + .. [2] `Platt, John (1999). "Probabilistic Outputs for Support Vector + Machines and Comparisons to Regularized Likelihood Methods" + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + >>> y = np.array([1, 1, 2, 2]) + >>> from sklearn.svm import SVC + >>> clf = make_pipeline(StandardScaler(), SVC(gamma='auto')) + >>> clf.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('svc', SVC(gamma='auto'))]) + + >>> print(clf.predict([[-0.8, -1]])) + [1] + + For a comaprison of the SVC with other classifiers see: + :ref:`sphx_glr_auto_examples_classification_plot_classification_probability.py`. + """ + + _impl = "c_svc" + + def __init__( + self, + *, + C=1.0, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + probability=False, + tol=1e-3, + cache_size=200, + class_weight=None, + verbose=False, + max_iter=-1, + decision_function_shape="ovr", + break_ties=False, + random_state=None, + ): + super().__init__( + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=0.0, + shrinking=shrinking, + probability=probability, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + decision_function_shape=decision_function_shape, + break_ties=break_ties, + random_state=random_state, + ) + + +class NuSVC(BaseSVC): + """Nu-Support Vector Classification. + + Similar to SVC but uses a parameter to control the number of support + vectors. + + The implementation is based on libsvm. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + nu : float, default=0.5 + An upper bound on the fraction of margin errors (see :ref:`User Guide + `) and a lower bound of the fraction of support vectors. + Should be in the interval (0, 1]. + + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \ + default='rbf' + Specifies the kernel type to be used in the algorithm. + If none is given, 'rbf' will be used. If a callable is given it is + used to precompute the kernel matrix. For an intuitive + visualization of different kernel types see + :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`. + + degree : int, default=3 + Degree of the polynomial kernel function ('poly'). + Must be non-negative. Ignored by all other kernels. 
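A sketch of the Platt-scaling formula quoted above, evaluated directly from `decision_function` values and the fitted `probA_` / `probB_`; which of the two binary classes the resulting probability refers to is an implementation detail, so the comparison is only qualitative and the toy data is an assumption.

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=200, n_features=5, random_state=0)
    clf = SVC(probability=True, random_state=0).fit(X, y)

    decision = clf.decision_function(X[:5])
    # 1 / (1 + exp(decision_value * probA_ + probB_)), as stated above
    platt = 1.0 / (1.0 + np.exp(decision * clf.probA_ + clf.probB_))
    print(np.round(platt, 3))
    print(np.round(clf.predict_proba(X[:5]), 3))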
+ + gamma : {'scale', 'auto'} or float, default='scale' + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + + - if ``gamma='scale'`` (default) is passed then it uses + 1 / (n_features * X.var()) as value of gamma, + - if 'auto', uses 1 / n_features + - if float, must be non-negative. + + .. versionchanged:: 0.22 + The default value of ``gamma`` changed from 'auto' to 'scale'. + + coef0 : float, default=0.0 + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + shrinking : bool, default=True + Whether to use the shrinking heuristic. + See the :ref:`User Guide `. + + probability : bool, default=False + Whether to enable probability estimates. This must be enabled prior + to calling `fit`, will slow down that method as it internally uses + 5-fold cross-validation, and `predict_proba` may be inconsistent with + `predict`. Read more in the :ref:`User Guide `. + + tol : float, default=1e-3 + Tolerance for stopping criterion. + + cache_size : float, default=200 + Specify the size of the kernel cache (in MB). + + class_weight : {dict, 'balanced'}, default=None + Set the parameter C of class i to class_weight[i]*C for + SVC. If not given, all classes are supposed to have + weight one. The "balanced" mode uses the values of y to automatically + adjust weights inversely proportional to class frequencies as + ``n_samples / (n_classes * np.bincount(y))``. + + verbose : bool, default=False + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, default=-1 + Hard limit on iterations within solver, or -1 for no limit. + + decision_function_shape : {'ovo', 'ovr'}, default='ovr' + Whether to return a one-vs-rest ('ovr') decision function of shape + (n_samples, n_classes) as all other classifiers, or the original + one-vs-one ('ovo') decision function of libsvm which has shape + (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one + ('ovo') is always used as multi-class strategy. The parameter is + ignored for binary classification. + + .. versionchanged:: 0.19 + decision_function_shape is 'ovr' by default. + + .. versionadded:: 0.17 + *decision_function_shape='ovr'* is recommended. + + .. versionchanged:: 0.17 + Deprecated *decision_function_shape='ovo' and None*. + + break_ties : bool, default=False + If true, ``decision_function_shape='ovr'``, and number of classes > 2, + :term:`predict` will break ties according to the confidence values of + :term:`decision_function`; otherwise the first class among the tied + classes is returned. Please note that breaking ties comes at a + relatively high computational cost compared to a simple predict. + See :ref:`sphx_glr_auto_examples_svm_plot_svm_tie_breaking.py` for an + example of its usage with ``decision_function_shape='ovr'``. + + .. versionadded:: 0.22 + + random_state : int, RandomState instance or None, default=None + Controls the pseudo random number generation for shuffling the data for + probability estimates. Ignored when `probability` is False. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + class_weight_ : ndarray of shape (n_classes,) + Multipliers of parameter C of each class. + Computed based on the ``class_weight`` parameter. + + classes_ : ndarray of shape (n_classes,) + The unique classes labels. 
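A sketch, on assumed toy data, of the `nu` bound described above: after fitting, the fraction of training points kept as support vectors is expected to be at least `nu`, up to small numerical slack.

    from sklearn.datasets import make_classification
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import NuSVC

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    nu = 0.5
    clf = make_pipeline(StandardScaler(), NuSVC(nu=nu)).fit(X, y)
    frac_sv = clf[-1].support_vectors_.shape[0] / X.shape[0]
    print(frac_sv, frac_sv >= nu)  # expected: True (nu is a lower bound on this fraction)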
+ + coef_ : ndarray of shape (n_classes * (n_classes -1) / 2, n_features) + Weights assigned to the features (coefficients in the primal + problem). This is only available in the case of a linear kernel. + + `coef_` is readonly property derived from `dual_coef_` and + `support_vectors_`. + + dual_coef_ : ndarray of shape (n_classes - 1, n_SV) + Dual coefficients of the support vector in the decision + function (see :ref:`sgd_mathematical_formulation`), multiplied by + their targets. + For multiclass, coefficient for all 1-vs-1 classifiers. + The layout of the coefficients in the multiclass case is somewhat + non-trivial. See the :ref:`multi-class section of the User Guide + ` for details. + + fit_status_ : int + 0 if correctly fitted, 1 if the algorithm did not converge. + + intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : ndarray of shape (n_classes * (n_classes - 1) // 2,) + Number of iterations run by the optimization routine to fit the model. + The shape of this attribute depends on the number of models optimized + which in turn depends on the number of classes. + + .. versionadded:: 1.1 + + support_ : ndarray of shape (n_SV,) + Indices of support vectors. + + support_vectors_ : ndarray of shape (n_SV, n_features) + Support vectors. + + n_support_ : ndarray of shape (n_classes,), dtype=int32 + Number of support vectors for each class. + + fit_status_ : int + 0 if correctly fitted, 1 if the algorithm did not converge. + + probA_ : ndarray of shape (n_classes * (n_classes - 1) / 2,) + + probB_ : ndarray of shape (n_classes * (n_classes - 1) / 2,) + If `probability=True`, it corresponds to the parameters learned in + Platt scaling to produce probability estimates from decision values. + If `probability=False`, it's an empty array. Platt scaling uses the + logistic function + ``1 / (1 + exp(decision_value * probA_ + probB_))`` + where ``probA_`` and ``probB_`` are learned from the dataset [2]_. For + more information on the multiclass case and training procedure see + section 8 of [1]_. + + shape_fit_ : tuple of int of shape (n_dimensions_of_X,) + Array dimensions of training vector ``X``. + + See Also + -------- + SVC : Support Vector Machine for classification using libsvm. + + LinearSVC : Scalable linear Support Vector Machine for classification using + liblinear. + + References + ---------- + .. [1] `LIBSVM: A Library for Support Vector Machines + `_ + + .. [2] `Platt, John (1999). 
"Probabilistic Outputs for Support Vector + Machines and Comparisons to Regularized Likelihood Methods" + `_ + + Examples + -------- + >>> import numpy as np + >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + >>> y = np.array([1, 1, 2, 2]) + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.svm import NuSVC + >>> clf = make_pipeline(StandardScaler(), NuSVC()) + >>> clf.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), ('nusvc', NuSVC())]) + >>> print(clf.predict([[-0.8, -1]])) + [1] + """ + + _impl = "nu_svc" + + _parameter_constraints: dict = { + **BaseSVC._parameter_constraints, + "nu": [Interval(Real, 0.0, 1.0, closed="right")], + } + _parameter_constraints.pop("C") + + def __init__( + self, + *, + nu=0.5, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + probability=False, + tol=1e-3, + cache_size=200, + class_weight=None, + verbose=False, + max_iter=-1, + decision_function_shape="ovr", + break_ties=False, + random_state=None, + ): + super().__init__( + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=0.0, + nu=nu, + shrinking=shrinking, + probability=probability, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + decision_function_shape=decision_function_shape, + break_ties=break_ties, + random_state=random_state, + ) + + +class SVR(RegressorMixin, BaseLibSVM): + """Epsilon-Support Vector Regression. + + The free parameters in the model are C and epsilon. + + The implementation is based on libsvm. The fit time complexity + is more than quadratic with the number of samples which makes it hard + to scale to datasets with more than a couple of 10000 samples. For large + datasets consider using :class:`~sklearn.svm.LinearSVR` or + :class:`~sklearn.linear_model.SGDRegressor` instead, possibly after a + :class:`~sklearn.kernel_approximation.Nystroem` transformer or + other :ref:`kernel_approximation`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \ + default='rbf' + Specifies the kernel type to be used in the algorithm. + If none is given, 'rbf' will be used. If a callable is given it is + used to precompute the kernel matrix. + For an intuitive visualization of different kernel types + see :ref:`sphx_glr_auto_examples_svm_plot_svm_regression.py` + + degree : int, default=3 + Degree of the polynomial kernel function ('poly'). + Must be non-negative. Ignored by all other kernels. + + gamma : {'scale', 'auto'} or float, default='scale' + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + + - if ``gamma='scale'`` (default) is passed then it uses + 1 / (n_features * X.var()) as value of gamma, + - if 'auto', uses 1 / n_features + - if float, must be non-negative. + + .. versionchanged:: 0.22 + The default value of ``gamma`` changed from 'auto' to 'scale'. + + coef0 : float, default=0.0 + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + tol : float, default=1e-3 + Tolerance for stopping criterion. + + C : float, default=1.0 + Regularization parameter. The strength of the regularization is + inversely proportional to C. Must be strictly positive. + The penalty is a squared l2. For an intuitive visualization of the + effects of scaling the regularization parameter C, see + :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`. 
+ + epsilon : float, default=0.1 + Epsilon in the epsilon-SVR model. It specifies the epsilon-tube + within which no penalty is associated in the training loss function + with points predicted within a distance epsilon from the actual + value. Must be non-negative. + + shrinking : bool, default=True + Whether to use the shrinking heuristic. + See the :ref:`User Guide `. + + cache_size : float, default=200 + Specify the size of the kernel cache (in MB). + + verbose : bool, default=False + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, default=-1 + Hard limit on iterations within solver, or -1 for no limit. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) + Weights assigned to the features (coefficients in the primal + problem). This is only available in the case of a linear kernel. + + `coef_` is readonly property derived from `dual_coef_` and + `support_vectors_`. + + dual_coef_ : ndarray of shape (1, n_SV) + Coefficients of the support vector in the decision function. + + fit_status_ : int + 0 if correctly fitted, 1 otherwise (will raise warning) + + intercept_ : ndarray of shape (1,) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run by the optimization routine to fit the model. + + .. versionadded:: 1.1 + + n_support_ : ndarray of shape (1,), dtype=int32 + Number of support vectors. + + shape_fit_ : tuple of int of shape (n_dimensions_of_X,) + Array dimensions of training vector ``X``. + + support_ : ndarray of shape (n_SV,) + Indices of support vectors. + + support_vectors_ : ndarray of shape (n_SV, n_features) + Support vectors. + + See Also + -------- + NuSVR : Support Vector Machine for regression implemented using libsvm + using a parameter to control the number of support vectors. + + LinearSVR : Scalable Linear Support Vector Machine for regression + implemented using liblinear. + + References + ---------- + .. [1] `LIBSVM: A Library for Support Vector Machines + `_ + + .. [2] `Platt, John (1999). 
"Probabilistic Outputs for Support Vector + Machines and Comparisons to Regularized Likelihood Methods" + `_ + + Examples + -------- + >>> from sklearn.svm import SVR + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> import numpy as np + >>> n_samples, n_features = 10, 5 + >>> rng = np.random.RandomState(0) + >>> y = rng.randn(n_samples) + >>> X = rng.randn(n_samples, n_features) + >>> regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2)) + >>> regr.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('svr', SVR(epsilon=0.2))]) + """ + + _impl = "epsilon_svr" + + _parameter_constraints: dict = {**BaseLibSVM._parameter_constraints} + for unused_param in ["class_weight", "nu", "probability", "random_state"]: + _parameter_constraints.pop(unused_param) + + def __init__( + self, + *, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + C=1.0, + epsilon=0.1, + shrinking=True, + cache_size=200, + verbose=False, + max_iter=-1, + ): + super().__init__( + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=0.0, + epsilon=epsilon, + verbose=verbose, + shrinking=shrinking, + probability=False, + cache_size=cache_size, + class_weight=None, + max_iter=max_iter, + random_state=None, + ) + + +class NuSVR(RegressorMixin, BaseLibSVM): + """Nu Support Vector Regression. + + Similar to NuSVC, for regression, uses a parameter nu to control + the number of support vectors. However, unlike NuSVC, where nu + replaces C, here nu replaces the parameter epsilon of epsilon-SVR. + + The implementation is based on libsvm. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + nu : float, default=0.5 + An upper bound on the fraction of training errors and a lower bound of + the fraction of support vectors. Should be in the interval (0, 1]. By + default 0.5 will be taken. + + C : float, default=1.0 + Penalty parameter C of the error term. For an intuitive visualization + of the effects of scaling the regularization parameter C, see + :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`. + + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \ + default='rbf' + Specifies the kernel type to be used in the algorithm. + If none is given, 'rbf' will be used. If a callable is given it is + used to precompute the kernel matrix. + For an intuitive visualization of different kernel types see + See :ref:`sphx_glr_auto_examples_svm_plot_svm_regression.py` + + degree : int, default=3 + Degree of the polynomial kernel function ('poly'). + Must be non-negative. Ignored by all other kernels. + + gamma : {'scale', 'auto'} or float, default='scale' + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + + - if ``gamma='scale'`` (default) is passed then it uses + 1 / (n_features * X.var()) as value of gamma, + - if 'auto', uses 1 / n_features + - if float, must be non-negative. + + .. versionchanged:: 0.22 + The default value of ``gamma`` changed from 'auto' to 'scale'. + + coef0 : float, default=0.0 + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + shrinking : bool, default=True + Whether to use the shrinking heuristic. + See the :ref:`User Guide `. + + tol : float, default=1e-3 + Tolerance for stopping criterion. + + cache_size : float, default=200 + Specify the size of the kernel cache (in MB). + + verbose : bool, default=False + Enable verbose output. 
Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, default=-1 + Hard limit on iterations within solver, or -1 for no limit. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) + Weights assigned to the features (coefficients in the primal + problem). This is only available in the case of a linear kernel. + + `coef_` is readonly property derived from `dual_coef_` and + `support_vectors_`. + + dual_coef_ : ndarray of shape (1, n_SV) + Coefficients of the support vector in the decision function. + + fit_status_ : int + 0 if correctly fitted, 1 otherwise (will raise warning) + + intercept_ : ndarray of shape (1,) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run by the optimization routine to fit the model. + + .. versionadded:: 1.1 + + n_support_ : ndarray of shape (1,), dtype=int32 + Number of support vectors. + + shape_fit_ : tuple of int of shape (n_dimensions_of_X,) + Array dimensions of training vector ``X``. + + support_ : ndarray of shape (n_SV,) + Indices of support vectors. + + support_vectors_ : ndarray of shape (n_SV, n_features) + Support vectors. + + See Also + -------- + NuSVC : Support Vector Machine for classification implemented with libsvm + with a parameter to control the number of support vectors. + + SVR : Epsilon Support Vector Machine for regression implemented with + libsvm. + + References + ---------- + .. [1] `LIBSVM: A Library for Support Vector Machines + `_ + + .. [2] `Platt, John (1999). "Probabilistic Outputs for Support Vector + Machines and Comparisons to Regularized Likelihood Methods" + `_ + + Examples + -------- + >>> from sklearn.svm import NuSVR + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> import numpy as np + >>> n_samples, n_features = 10, 5 + >>> np.random.seed(0) + >>> y = np.random.randn(n_samples) + >>> X = np.random.randn(n_samples, n_features) + >>> regr = make_pipeline(StandardScaler(), NuSVR(C=1.0, nu=0.1)) + >>> regr.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('nusvr', NuSVR(nu=0.1))]) + """ + + _impl = "nu_svr" + + _parameter_constraints: dict = {**BaseLibSVM._parameter_constraints} + for unused_param in ["class_weight", "epsilon", "probability", "random_state"]: + _parameter_constraints.pop(unused_param) + + def __init__( + self, + *, + nu=0.5, + C=1.0, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + tol=1e-3, + cache_size=200, + verbose=False, + max_iter=-1, + ): + super().__init__( + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=nu, + epsilon=0.0, + shrinking=shrinking, + probability=False, + cache_size=cache_size, + class_weight=None, + verbose=verbose, + max_iter=max_iter, + random_state=None, + ) + + +class OneClassSVM(OutlierMixin, BaseLibSVM): + """Unsupervised Outlier Detection. + + Estimate the support of a high-dimensional distribution. + + The implementation is based on libsvm. + + Read more in the :ref:`User Guide `. 
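A minimal novelty-detection sketch (illustrative only; the toy data and the chosen nu/gamma values are assumptions, not part of the tested examples above): train on inlier samples, then label new points as +1 (inlier) or -1 (outlier).

    >>> import numpy as np
    >>> from sklearn.svm import OneClassSVM
    >>> rng = np.random.RandomState(0)
    >>> X_train = 0.3 * rng.randn(100, 2)           # inlier cloud around the origin
    >>> clf = OneClassSVM(nu=0.1, gamma='auto').fit(X_train)
    >>> pred = clf.predict(np.array([[0.1, 0.1], [4.0, 4.0]]))   # expect roughly [1, -1]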
+ + Parameters + ---------- + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \ + default='rbf' + Specifies the kernel type to be used in the algorithm. + If none is given, 'rbf' will be used. If a callable is given it is + used to precompute the kernel matrix. + + degree : int, default=3 + Degree of the polynomial kernel function ('poly'). + Must be non-negative. Ignored by all other kernels. + + gamma : {'scale', 'auto'} or float, default='scale' + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + + - if ``gamma='scale'`` (default) is passed then it uses + 1 / (n_features * X.var()) as value of gamma, + - if 'auto', uses 1 / n_features + - if float, must be non-negative. + + .. versionchanged:: 0.22 + The default value of ``gamma`` changed from 'auto' to 'scale'. + + coef0 : float, default=0.0 + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + tol : float, default=1e-3 + Tolerance for stopping criterion. + + nu : float, default=0.5 + An upper bound on the fraction of training + errors and a lower bound of the fraction of support + vectors. Should be in the interval (0, 1]. By default 0.5 + will be taken. + + shrinking : bool, default=True + Whether to use the shrinking heuristic. + See the :ref:`User Guide `. + + cache_size : float, default=200 + Specify the size of the kernel cache (in MB). + + verbose : bool, default=False + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, default=-1 + Hard limit on iterations within solver, or -1 for no limit. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) + Weights assigned to the features (coefficients in the primal + problem). This is only available in the case of a linear kernel. + + `coef_` is readonly property derived from `dual_coef_` and + `support_vectors_`. + + dual_coef_ : ndarray of shape (1, n_SV) + Coefficients of the support vectors in the decision function. + + fit_status_ : int + 0 if correctly fitted, 1 otherwise (will raise warning) + + intercept_ : ndarray of shape (1,) + Constant in the decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run by the optimization routine to fit the model. + + .. versionadded:: 1.1 + + n_support_ : ndarray of shape (n_classes,), dtype=int32 + Number of support vectors for each class. + + offset_ : float + Offset used to define the decision function from the raw scores. + We have the relation: decision_function = score_samples - `offset_`. + The offset is the opposite of `intercept_` and is provided for + consistency with other outlier detection algorithms. + + .. versionadded:: 0.20 + + shape_fit_ : tuple of int of shape (n_dimensions_of_X,) + Array dimensions of training vector ``X``. + + support_ : ndarray of shape (n_SV,) + Indices of support vectors. + + support_vectors_ : ndarray of shape (n_SV, n_features) + Support vectors. + + See Also + -------- + sklearn.linear_model.SGDOneClassSVM : Solves linear One-Class SVM using + Stochastic Gradient Descent. + sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection using + Local Outlier Factor (LOF). 
+ sklearn.ensemble.IsolationForest : Isolation Forest Algorithm. + + Examples + -------- + >>> from sklearn.svm import OneClassSVM + >>> X = [[0], [0.44], [0.45], [0.46], [1]] + >>> clf = OneClassSVM(gamma='auto').fit(X) + >>> clf.predict(X) + array([-1, 1, 1, 1, -1]) + >>> clf.score_samples(X) + array([1.7798..., 2.0547..., 2.0556..., 2.0561..., 1.7332...]) + + For a more extended example, + see :ref:`sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py` + """ + + _impl = "one_class" + + _parameter_constraints: dict = {**BaseLibSVM._parameter_constraints} + for unused_param in ["C", "class_weight", "epsilon", "probability", "random_state"]: + _parameter_constraints.pop(unused_param) + + def __init__( + self, + *, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + nu=0.5, + shrinking=True, + cache_size=200, + verbose=False, + max_iter=-1, + ): + super().__init__( + kernel, + degree, + gamma, + coef0, + tol, + 0.0, + nu, + 0.0, + shrinking, + False, + cache_size, + None, + verbose, + max_iter, + random_state=None, + ) + + def fit(self, X, y=None, sample_weight=None): + """Detect the soft boundary of the set of samples X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Set of samples, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + Per-sample weights. Rescale C per sample. Higher weights + force the classifier to put more emphasis on these points. + + Returns + ------- + self : object + Fitted estimator. + + Notes + ----- + If X is not a C-ordered contiguous array it is copied. + """ + super().fit(X, np.ones(_num_samples(X)), sample_weight=sample_weight) + self.offset_ = -self._intercept_ + return self + + def decision_function(self, X): + """Signed distance to the separating hyperplane. + + Signed distance is positive for an inlier and negative for an outlier. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + dec : ndarray of shape (n_samples,) + Returns the decision function of the samples. + """ + dec = self._decision_function(X).ravel() + return dec + + def score_samples(self, X): + """Raw scoring function of the samples. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + score_samples : ndarray of shape (n_samples,) + Returns the (unshifted) scoring function of the samples. + """ + return self.decision_function(X) + self.offset_ + + def predict(self, X): + """Perform classification on samples in X. + + For a one-class model, +1 or -1 is returned. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples_test, n_samples_train) + For kernel="precomputed", the expected shape of X is + (n_samples_test, n_samples_train). + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + Class labels for samples in X. 
+ """ + y = super().predict(X) + return np.asarray(y, dtype=np.intp) diff --git a/.venv/Lib/site-packages/sklearn/svm/_liblinear.cp39-win_amd64.lib b/.venv/Lib/site-packages/sklearn/svm/_liblinear.cp39-win_amd64.lib new file mode 100644 index 0000000000000000000000000000000000000000..2cfbeaf2d4ca495e37b430e9b5f5acc2657d2b27 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/svm/_liblinear.cp39-win_amd64.lib differ diff --git a/.venv/Lib/site-packages/sklearn/svm/_liblinear.cp39-win_amd64.pyd b/.venv/Lib/site-packages/sklearn/svm/_liblinear.cp39-win_amd64.pyd new file mode 100644 index 0000000000000000000000000000000000000000..4b9ed10afedb70bcb17cdac0c6d2e921e83672de Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/svm/_liblinear.cp39-win_amd64.pyd differ diff --git a/.venv/Lib/site-packages/sklearn/svm/_liblinear.pxi b/.venv/Lib/site-packages/sklearn/svm/_liblinear.pxi new file mode 100644 index 0000000000000000000000000000000000000000..af0d0b38c2c662275eee069ba2f56dd8617a3c78 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/_liblinear.pxi @@ -0,0 +1,43 @@ +from ..utils._typedefs cimport intp_t + +cdef extern from "_cython_blas_helpers.h": + ctypedef double (*dot_func)(int, const double*, int, const double*, int) + ctypedef void (*axpy_func)(int, double, const double*, int, double*, int) + ctypedef void (*scal_func)(int, double, const double*, int) + ctypedef double (*nrm2_func)(int, const double*, int) + cdef struct BlasFunctions: + dot_func dot + axpy_func axpy + scal_func scal + nrm2_func nrm2 + + +cdef extern from "linear.h": + cdef struct feature_node + cdef struct problem + cdef struct model + cdef struct parameter + ctypedef problem* problem_const_ptr "problem const *" + ctypedef parameter* parameter_const_ptr "parameter const *" + ctypedef char* char_const_ptr "char const *" + char_const_ptr check_parameter(problem_const_ptr prob, parameter_const_ptr param) + model *train(problem_const_ptr prob, parameter_const_ptr param, BlasFunctions *blas_functions) nogil + int get_nr_feature (model *model) + int get_nr_class (model *model) + void get_n_iter (model *model, int *n_iter) + void free_and_destroy_model (model **) + void destroy_param (parameter *) + + +cdef extern from "liblinear_helper.c": + void copy_w(void *, model *, int) + parameter *set_parameter(int, double, double, int, char *, char *, int, int, double) + problem *set_problem (char *, int, int, int, int, double, char *, char *) + problem *csr_set_problem (char *, int, char *, char *, int, int, int, double, char *, char *) + + model *set_model(parameter *, char *, intp_t *, char *, double) + + double get_bias(model *) + void free_problem (problem *) + void free_parameter (parameter *) + void set_verbosity(int) diff --git a/.venv/Lib/site-packages/sklearn/svm/_liblinear.pyx b/.venv/Lib/site-packages/sklearn/svm/_liblinear.pyx new file mode 100644 index 0000000000000000000000000000000000000000..6360e705ba04ae3f4692bdcd6a2958bcc0478a66 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/_liblinear.pyx @@ -0,0 +1,147 @@ +""" +Wrapper for liblinear + +Author: fabian.pedregosa@inria.fr +""" + +import numpy as np + +from ..utils._cython_blas cimport _dot, _axpy, _scal, _nrm2 +from ..utils._typedefs cimport float32_t, float64_t, int32_t + +include "_liblinear.pxi" + + +def train_wrap( + object X, + const float64_t[::1] Y, + bint is_sparse, + int solver_type, + double eps, + double bias, + double C, + const float64_t[:] class_weight, + int max_iter, + unsigned random_seed, + double epsilon, + const 
float64_t[::1] sample_weight +): + cdef parameter *param + cdef problem *problem + cdef model *model + cdef char_const_ptr error_msg + cdef int len_w + cdef bint X_has_type_float64 = X.dtype == np.float64 + cdef char * X_data_bytes_ptr + cdef const float64_t[::1] X_data_64 + cdef const float32_t[::1] X_data_32 + cdef const int32_t[::1] X_indices + cdef const int32_t[::1] X_indptr + + if is_sparse: + X_indices = X.indices + X_indptr = X.indptr + if X_has_type_float64: + X_data_64 = X.data + X_data_bytes_ptr = &X_data_64[0] + else: + X_data_32 = X.data + X_data_bytes_ptr = &X_data_32[0] + + problem = csr_set_problem( + X_data_bytes_ptr, + X_has_type_float64, + &X_indices[0], + &X_indptr[0], + (X.shape[0]), + (X.shape[1]), + (X.nnz), + bias, + &sample_weight[0], + &Y[0] + ) + else: + X_as_1d_array = X.reshape(-1) + if X_has_type_float64: + X_data_64 = X_as_1d_array + X_data_bytes_ptr = &X_data_64[0] + else: + X_data_32 = X_as_1d_array + X_data_bytes_ptr = &X_data_32[0] + + problem = set_problem( + X_data_bytes_ptr, + X_has_type_float64, + (X.shape[0]), + (X.shape[1]), + (np.count_nonzero(X)), + bias, + &sample_weight[0], + &Y[0] + ) + + cdef int32_t[::1] class_weight_label = np.arange(class_weight.shape[0], dtype=np.intc) + param = set_parameter( + solver_type, + eps, + C, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + max_iter, + random_seed, + epsilon + ) + + error_msg = check_parameter(problem, param) + if error_msg: + free_problem(problem) + free_parameter(param) + raise ValueError(error_msg) + + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + blas_functions.axpy = _axpy[double] + blas_functions.scal = _scal[double] + blas_functions.nrm2 = _nrm2[double] + + # early return + with nogil: + model = train(problem, param, &blas_functions) + + # FREE + free_problem(problem) + free_parameter(param) + # destroy_param(param) don't call this or it will destroy class_weight_label and class_weight + + # coef matrix holder created as fortran since that's what's used in liblinear + cdef float64_t[::1, :] w + cdef int nr_class = get_nr_class(model) + + cdef int labels_ = nr_class + if nr_class == 2: + labels_ = 1 + cdef int32_t[::1] n_iter = np.zeros(labels_, dtype=np.intc) + get_n_iter(model, &n_iter[0]) + + cdef int nr_feature = get_nr_feature(model) + if bias > 0: + nr_feature = nr_feature + 1 + if nr_class == 2 and solver_type != 4: # solver is not Crammer-Singer + w = np.empty((1, nr_feature), order='F') + copy_w(&w[0, 0], model, nr_feature) + else: + len_w = (nr_class) * nr_feature + w = np.empty((nr_class, nr_feature), order='F') + copy_w(&w[0, 0], model, len_w) + + free_and_destroy_model(&model) + + return w.base, n_iter.base + + +def set_verbosity_wrap(int verbosity): + """ + Control verbosity of libsvm library + """ + set_verbosity(verbosity) diff --git a/.venv/Lib/site-packages/sklearn/svm/_libsvm.cp39-win_amd64.lib b/.venv/Lib/site-packages/sklearn/svm/_libsvm.cp39-win_amd64.lib new file mode 100644 index 0000000000000000000000000000000000000000..fee32a6a6507fe1a3388abe56979d2e3ad51d3bd Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/svm/_libsvm.cp39-win_amd64.lib differ diff --git a/.venv/Lib/site-packages/sklearn/svm/_libsvm.cp39-win_amd64.pyd b/.venv/Lib/site-packages/sklearn/svm/_libsvm.cp39-win_amd64.pyd new file mode 100644 index 0000000000000000000000000000000000000000..d8b6b575582bf79672d465f7c628722f05761d1e Binary files /dev/null and 
b/.venv/Lib/site-packages/sklearn/svm/_libsvm.cp39-win_amd64.pyd differ diff --git a/.venv/Lib/site-packages/sklearn/svm/_libsvm.pxi b/.venv/Lib/site-packages/sklearn/svm/_libsvm.pxi new file mode 100644 index 0000000000000000000000000000000000000000..6eab92b9d7b375928771759d2b4c2bb1d5ed2677 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/_libsvm.pxi @@ -0,0 +1,75 @@ +################################################################################ +# Includes +from ..utils._typedefs cimport intp_t + +cdef extern from "_svm_cython_blas_helpers.h": + ctypedef double (*dot_func)(int, const double*, int, const double*, int) + cdef struct BlasFunctions: + dot_func dot + + +cdef extern from "svm.h": + cdef struct svm_node + cdef struct svm_model + cdef struct svm_parameter: + int svm_type + int kernel_type + int degree # for poly + double gamma # for poly/rbf/sigmoid + double coef0 # for poly/sigmoid + + # these are for training only + double cache_size # in MB + double eps # stopping criteria + double C # for C_SVC, EPSILON_SVR and NU_SVR + int nr_weight # for C_SVC + int *weight_label # for C_SVC + double* weight # for C_SVC + double nu # for NU_SVC, ONE_CLASS, and NU_SVR + double p # for EPSILON_SVR + int shrinking # use the shrinking heuristics + int probability # do probability estimates + int max_iter # ceiling on Solver runtime + int random_seed # seed for random generator in probability estimation + + cdef struct svm_problem: + int l + double *y + svm_node *x + double *W # instance weights + + char *svm_check_parameter(svm_problem *, svm_parameter *) + svm_model *svm_train(svm_problem *, svm_parameter *, int *, BlasFunctions *) nogil + void svm_free_and_destroy_model(svm_model** model_ptr_ptr) + void svm_cross_validation(svm_problem *, svm_parameter *, int nr_fold, double *target, BlasFunctions *) nogil + + +cdef extern from "libsvm_helper.c": + # this file contains methods for accessing libsvm 'hidden' fields + svm_node **dense_to_sparse (char *, intp_t *) + void set_parameter (svm_parameter *, int , int , int , double, double , + double , double , double , double, + double, int, int, int, char *, char *, int, + int) + void set_problem (svm_problem *, char *, char *, char *, intp_t *, int) + + svm_model *set_model (svm_parameter *, int, char *, intp_t *, + char *, intp_t *, intp_t *, char *, + char *, char *, char *, char *) + + void copy_sv_coef (char *, svm_model *) + void copy_n_iter (char *, svm_model *) + void copy_intercept (char *, svm_model *, intp_t *) + void copy_SV (char *, svm_model *, intp_t *) + int copy_support (char *data, svm_model *model) + int copy_predict (char *, svm_model *, intp_t *, char *, BlasFunctions *) nogil + int copy_predict_proba (char *, svm_model *, intp_t *, char *, BlasFunctions *) nogil + int copy_predict_values(char *, svm_model *, intp_t *, char *, int, BlasFunctions *) nogil + void copy_nSV (char *, svm_model *) + void copy_probA (char *, svm_model *, intp_t *) + void copy_probB (char *, svm_model *, intp_t *) + intp_t get_l (svm_model *) + intp_t get_nr (svm_model *) + int free_problem (svm_problem *) + int free_model (svm_model *) + void set_verbosity(int) diff --git a/.venv/Lib/site-packages/sklearn/svm/_libsvm.pyx b/.venv/Lib/site-packages/sklearn/svm/_libsvm.pyx new file mode 100644 index 0000000000000000000000000000000000000000..6abe5045972bf49c30d649c4675f13185d91b9a4 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/_libsvm.pyx @@ -0,0 +1,917 @@ +""" +Binding for libsvm_skl +---------------------- + +These are the bindings 
for libsvm_skl, which is a fork of libsvm[1] +that adds to libsvm some capabilities, like index of support vectors +and efficient representation of dense matrices. + +These are low-level routines, but can be used for flexibility or +performance reasons. See sklearn.svm for a higher-level API. + +Low-level memory management is done in libsvm_helper.c. If we happen +to run out of memory a MemoryError will be raised. In practice this is +not very helpful since high chances are malloc fails inside svm.cpp, +where no sort of memory checks are done. + +[1] https://www.csie.ntu.edu.tw/~cjlin/libsvm/ + +Notes +----- +The signature mode='c' is somewhat superficial, since we already +check that arrays are C-contiguous in svm.py + +Authors +------- +2010: Fabian Pedregosa + Gael Varoquaux +""" + +import numpy as np +from libc.stdlib cimport free +from ..utils._cython_blas cimport _dot +from ..utils._typedefs cimport float64_t, int32_t, intp_t + +include "_libsvm.pxi" + +cdef extern from *: + ctypedef struct svm_parameter: + pass + + +################################################################################ +# Internal variables +LIBSVM_KERNEL_TYPES = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'] + + +################################################################################ +# Wrapper functions + +def fit( + const float64_t[:, ::1] X, + const float64_t[::1] Y, + int svm_type=0, + kernel='rbf', + int degree=3, + double gamma=0.1, + double coef0=0.0, + double tol=1e-3, + double C=1.0, + double nu=0.5, + double epsilon=0.1, + const float64_t[::1] class_weight=np.empty(0), + const float64_t[::1] sample_weight=np.empty(0), + int shrinking=1, + int probability=0, + double cache_size=100., + int max_iter=-1, + int random_seed=0, +): + """ + Train the model using libsvm (low-level method) + + Parameters + ---------- + X : array-like, dtype=float64 of shape (n_samples, n_features) + + Y : array, dtype=float64 of shape (n_samples,) + target vector + + svm_type : {0, 1, 2, 3, 4}, default=0 + Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR + respectively. + + kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, default="rbf" + Kernel to use in the model: linear, polynomial, RBF, sigmoid + or precomputed. + + degree : int32, default=3 + Degree of the polynomial kernel (only relevant if kernel is + set to polynomial). + + gamma : float64, default=0.1 + Gamma parameter in rbf, poly and sigmoid kernels. Ignored by other + kernels. + + coef0 : float64, default=0 + Independent parameter in poly/sigmoid kernel. + + tol : float64, default=1e-3 + Numeric stopping criterion (WRITEME). + + C : float64, default=1 + C parameter in C-Support Vector Classification. + + nu : float64, default=0.5 + An upper bound on the fraction of training errors and a lower bound of + the fraction of support vectors. Should be in the interval (0, 1]. + + epsilon : double, default=0.1 + Epsilon parameter in the epsilon-insensitive loss function. + + class_weight : array, dtype=float64, shape (n_classes,), \ + default=np.empty(0) + Set the parameter C of class i to class_weight[i]*C for + SVC. If not given, all classes are supposed to have + weight one. + + sample_weight : array, dtype=float64, shape (n_samples,), \ + default=np.empty(0) + Weights assigned to each sample. + + shrinking : int, default=1 + Whether to use the shrinking heuristic. + + probability : int, default=0 + Whether to enable probability estimates. 
+ + cache_size : float64, default=100 + Cache size for gram matrix columns (in megabytes). + + max_iter : int (-1 for no limit), default=-1 + Stop solver after this many iterations regardless of accuracy + (XXX Currently there is no API to know whether this kicked in.) + + random_seed : int, default=0 + Seed for the random number generator used for probability estimates. + + Returns + ------- + support : array of shape (n_support,) + Index of support vectors. + + support_vectors : array of shape (n_support, n_features) + Support vectors (equivalent to X[support]). Will return an + empty array in the case of precomputed kernel. + + n_class_SV : array of shape (n_class,) + Number of support vectors in each class. + + sv_coef : array of shape (n_class-1, n_support) + Coefficients of support vectors in decision function. + + intercept : array of shape (n_class*(n_class-1)/2,) + Intercept in decision function. + + probA, probB : array of shape (n_class*(n_class-1)/2,) + Probability estimates, empty array for probability=False. + + n_iter : ndarray of shape (max(1, (n_class * (n_class - 1) // 2)),) + Number of iterations run by the optimization routine to fit the model. + """ + + cdef svm_parameter param + cdef svm_problem problem + cdef svm_model *model + cdef const char *error_msg + cdef intp_t SV_len + + if len(sample_weight) == 0: + sample_weight = np.ones(X.shape[0], dtype=np.float64) + else: + assert sample_weight.shape[0] == X.shape[0], ( + f"sample_weight and X have incompatible shapes: sample_weight has " + f"{sample_weight.shape[0]} samples while X has {X.shape[0]}" + ) + + kernel_index = LIBSVM_KERNEL_TYPES.index(kernel) + set_problem( + &problem, + &X[0, 0], + &Y[0], + &sample_weight[0], + X.shape, + kernel_index, + ) + if problem.x == NULL: + raise MemoryError("Seems we've run out of memory") + cdef int32_t[::1] class_weight_label = np.arange( + class_weight.shape[0], dtype=np.int32 + ) + set_parameter( + ¶m, + svm_type, + kernel_index, + degree, + gamma, + coef0, + nu, + cache_size, + C, + tol, + epsilon, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + max_iter, + random_seed, + ) + + error_msg = svm_check_parameter(&problem, ¶m) + if error_msg: + # for SVR: epsilon is called p in libsvm + error_repl = error_msg.decode('utf-8').replace("p < 0", "epsilon < 0") + raise ValueError(error_repl) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + # this does the real work + cdef int fit_status = 0 + with nogil: + model = svm_train(&problem, ¶m, &fit_status, &blas_functions) + + # from here until the end, we just copy the data returned by + # svm_train + SV_len = get_l(model) + n_class = get_nr(model) + + cdef int[::1] n_iter = np.empty(max(1, n_class * (n_class - 1) // 2), dtype=np.intc) + copy_n_iter( &n_iter[0], model) + + cdef float64_t[:, ::1] sv_coef = np.empty((n_class-1, SV_len), dtype=np.float64) + copy_sv_coef( &sv_coef[0, 0] if sv_coef.size > 0 else NULL, model) + + # the intercept is just model.rho but with sign changed + cdef float64_t[::1] intercept = np.empty( + int((n_class*(n_class-1))/2), dtype=np.float64 + ) + copy_intercept( &intercept[0], model, intercept.shape) + + cdef int32_t[::1] support = np.empty(SV_len, dtype=np.int32) + copy_support( &support[0] if support.size > 0 else NULL, model) + + # copy model.SV + cdef float64_t[:, ::1] support_vectors + if kernel_index == 4: + # precomputed kernel + support_vectors = 
np.empty((0, 0), dtype=np.float64) + else: + support_vectors = np.empty((SV_len, X.shape[1]), dtype=np.float64) + copy_SV( + &support_vectors[0, 0] if support_vectors.size > 0 else NULL, + model, + support_vectors.shape, + ) + + cdef int32_t[::1] n_class_SV + if svm_type == 0 or svm_type == 1: + n_class_SV = np.empty(n_class, dtype=np.int32) + copy_nSV( &n_class_SV[0] if n_class_SV.size > 0 else NULL, model) + else: + # OneClass and SVR are considered to have 2 classes + n_class_SV = np.array([SV_len, SV_len], dtype=np.int32) + + cdef float64_t[::1] probA + cdef float64_t[::1] probB + if probability != 0: + if svm_type < 2: # SVC and NuSVC + probA = np.empty(int(n_class*(n_class-1)/2), dtype=np.float64) + probB = np.empty(int(n_class*(n_class-1)/2), dtype=np.float64) + copy_probB( &probB[0], model, probB.shape) + else: + probA = np.empty(1, dtype=np.float64) + probB = np.empty(0, dtype=np.float64) + copy_probA( &probA[0], model, probA.shape) + else: + probA = np.empty(0, dtype=np.float64) + probB = np.empty(0, dtype=np.float64) + + svm_free_and_destroy_model(&model) + free(problem.x) + + return ( + support.base, + support_vectors.base, + n_class_SV.base, + sv_coef.base, + intercept.base, + probA.base, + probB.base, + fit_status, + n_iter.base, + ) + + +cdef void set_predict_params( + svm_parameter *param, + int svm_type, + kernel, + int degree, + double gamma, + double coef0, + double cache_size, + int probability, + int nr_weight, + char *weight_label, + char *weight, +) except *: + """Fill param with prediction time-only parameters.""" + + # training-time only parameters + cdef double C = 0.0 + cdef double epsilon = 0.1 + cdef int max_iter = 0 + cdef double nu = 0.5 + cdef int shrinking = 0 + cdef double tol = 0.1 + cdef int random_seed = -1 + + kernel_index = LIBSVM_KERNEL_TYPES.index(kernel) + + set_parameter( + param, + svm_type, + kernel_index, + degree, + gamma, + coef0, + nu, + cache_size, + C, + tol, + epsilon, + shrinking, + probability, + nr_weight, + weight_label, + weight, + max_iter, + random_seed, + ) + + +def predict( + const float64_t[:, ::1] X, + const int32_t[::1] support, + const float64_t[:, ::1] SV, + const int32_t[::1] nSV, + const float64_t[:, ::1] sv_coef, + const float64_t[::1] intercept, + const float64_t[::1] probA=np.empty(0), + const float64_t[::1] probB=np.empty(0), + int svm_type=0, + kernel='rbf', + int degree=3, + double gamma=0.1, + double coef0=0.0, + const float64_t[::1] class_weight=np.empty(0), + const float64_t[::1] sample_weight=np.empty(0), + double cache_size=100.0, +): + """ + Predict target values of X given a model (low-level method) + + Parameters + ---------- + X : array-like, dtype=float of shape (n_samples, n_features) + + support : array of shape (n_support,) + Index of support vectors in training set. + + SV : array of shape (n_support, n_features) + Support vectors. + + nSV : array of shape (n_class,) + Number of support vectors in each class. + + sv_coef : array of shape (n_class-1, n_support) + Coefficients of support vectors in decision function. + + intercept : array of shape (n_class*(n_class-1)/2) + Intercept in decision function. + + probA, probB : array of shape (n_class*(n_class-1)/2,) + Probability estimates. + + svm_type : {0, 1, 2, 3, 4}, default=0 + Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR + respectively. + + kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, default="rbf" + Kernel to use in the model: linear, polynomial, RBF, sigmoid + or precomputed. 
+ + degree : int32, default=3 + Degree of the polynomial kernel (only relevant if kernel is + set to polynomial). + + gamma : float64, default=0.1 + Gamma parameter in rbf, poly and sigmoid kernels. Ignored by other + kernels. + + coef0 : float64, default=0.0 + Independent parameter in poly/sigmoid kernel. + + Returns + ------- + dec_values : array + Predicted values. + """ + cdef float64_t[::1] dec_values + cdef svm_parameter param + cdef svm_model *model + cdef int rv + + cdef int32_t[::1] class_weight_label = np.arange( + class_weight.shape[0], dtype=np.int32 + ) + + set_predict_params( + ¶m, + svm_type, + kernel, + degree, + gamma, + coef0, + cache_size, + 0, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + ) + model = set_model( + ¶m, + nSV.shape[0], + &SV[0, 0] if SV.size > 0 else NULL, + SV.shape, + &support[0] if support.size > 0 else NULL, + support.shape, + sv_coef.strides, + &sv_coef[0, 0] if sv_coef.size > 0 else NULL, + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + # TODO: use check_model + try: + dec_values = np.empty(X.shape[0]) + with nogil: + rv = copy_predict( + &X[0, 0], + model, + X.shape, + &dec_values[0], + &blas_functions, + ) + if rv < 0: + raise MemoryError("We've run out of memory") + finally: + free_model(model) + + return dec_values.base + + +def predict_proba( + const float64_t[:, ::1] X, + const int32_t[::1] support, + const float64_t[:, ::1] SV, + const int32_t[::1] nSV, + float64_t[:, ::1] sv_coef, + float64_t[::1] intercept, + float64_t[::1] probA=np.empty(0), + float64_t[::1] probB=np.empty(0), + int svm_type=0, + kernel='rbf', + int degree=3, + double gamma=0.1, + double coef0=0.0, + float64_t[::1] class_weight=np.empty(0), + float64_t[::1] sample_weight=np.empty(0), + double cache_size=100.0, +): + """ + Predict probabilities + + svm_model stores all parameters needed to predict a given value. + + For speed, all real work is done at the C level in function + copy_predict (libsvm_helper.c). + + We have to reconstruct model and parameters to make sure we stay + in sync with the python object. + + See sklearn.svm.predict for a complete list of parameters. + + Parameters + ---------- + X : array-like, dtype=float of shape (n_samples, n_features) + + support : array of shape (n_support,) + Index of support vectors in training set. + + SV : array of shape (n_support, n_features) + Support vectors. + + nSV : array of shape (n_class,) + Number of support vectors in each class. + + sv_coef : array of shape (n_class-1, n_support) + Coefficients of support vectors in decision function. + + intercept : array of shape (n_class*(n_class-1)/2,) + Intercept in decision function. + + probA, probB : array of shape (n_class*(n_class-1)/2,) + Probability estimates. + + svm_type : {0, 1, 2, 3, 4}, default=0 + Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR + respectively. + + kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, default="rbf" + Kernel to use in the model: linear, polynomial, RBF, sigmoid + or precomputed. + + degree : int32, default=3 + Degree of the polynomial kernel (only relevant if kernel is + set to polynomial). + + gamma : float64, default=0.1 + Gamma parameter in rbf, poly and sigmoid kernels. Ignored by other + kernels. 
+ + coef0 : float64, default=0.0 + Independent parameter in poly/sigmoid kernel. + + Returns + ------- + dec_values : array + Predicted values. + """ + cdef float64_t[:, ::1] dec_values + cdef svm_parameter param + cdef svm_model *model + cdef int32_t[::1] class_weight_label = np.arange( + class_weight.shape[0], dtype=np.int32 + ) + cdef int rv + + set_predict_params( + ¶m, + svm_type, + kernel, + degree, + gamma, + coef0, + cache_size, + 1, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + ) + model = set_model( + ¶m, + nSV.shape[0], + &SV[0, 0] if SV.size > 0 else NULL, + SV.shape, + &support[0], + support.shape, + sv_coef.strides, + &sv_coef[0, 0], + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + + cdef intp_t n_class = get_nr(model) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + try: + dec_values = np.empty((X.shape[0], n_class), dtype=np.float64) + with nogil: + rv = copy_predict_proba( + &X[0, 0], + model, + X.shape, + &dec_values[0, 0], + &blas_functions, + ) + if rv < 0: + raise MemoryError("We've run out of memory") + finally: + free_model(model) + + return dec_values.base + + +def decision_function( + const float64_t[:, ::1] X, + const int32_t[::1] support, + const float64_t[:, ::1] SV, + const int32_t[::1] nSV, + const float64_t[:, ::1] sv_coef, + const float64_t[::1] intercept, + const float64_t[::1] probA=np.empty(0), + const float64_t[::1] probB=np.empty(0), + int svm_type=0, + kernel='rbf', + int degree=3, + double gamma=0.1, + double coef0=0.0, + const float64_t[::1] class_weight=np.empty(0), + const float64_t[::1] sample_weight=np.empty(0), + double cache_size=100.0, +): + """ + Predict margin (libsvm name for this is predict_values) + + We have to reconstruct model and parameters to make sure we stay + in sync with the python object. + + Parameters + ---------- + X : array-like, dtype=float, size=[n_samples, n_features] + + support : array, shape=[n_support] + Index of support vectors in training set. + + SV : array, shape=[n_support, n_features] + Support vectors. + + nSV : array, shape=[n_class] + Number of support vectors in each class. + + sv_coef : array, shape=[n_class-1, n_support] + Coefficients of support vectors in decision function. + + intercept : array, shape=[n_class*(n_class-1)/2] + Intercept in decision function. + + probA, probB : array, shape=[n_class*(n_class-1)/2] + Probability estimates. + + svm_type : {0, 1, 2, 3, 4}, optional + Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR + respectively. 0 by default. + + kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, optional + Kernel to use in the model: linear, polynomial, RBF, sigmoid + or precomputed. 'rbf' by default. + + degree : int32, optional + Degree of the polynomial kernel (only relevant if kernel is + set to polynomial), 3 by default. + + gamma : float64, optional + Gamma parameter in rbf, poly and sigmoid kernels. Ignored by other + kernels. 0.1 by default. + + coef0 : float64, optional + Independent parameter in poly/sigmoid kernel. 0 by default. + + Returns + ------- + dec_values : array + Predicted values. 
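Note (added for clarity, not in the original docstring): for classification models (svm_type 0 or 1) the returned array has one column per unordered pair of classes, i.e. n_class * (n_class - 1) / 2 columns (one-vs-one), while regression and one-class models (svm_type > 1) return a single column, as the code below computes. For example, with four classes:

    >>> n_class = 4
    >>> n_class * (n_class - 1) // 2
    6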
+ """ + cdef float64_t[:, ::1] dec_values + cdef svm_parameter param + cdef svm_model *model + cdef intp_t n_class + + cdef int32_t[::1] class_weight_label = np.arange( + class_weight.shape[0], dtype=np.int32 + ) + + cdef int rv + + set_predict_params( + ¶m, + svm_type, + kernel, + degree, + gamma, + coef0, + cache_size, + 0, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + ) + + model = set_model( + ¶m, + nSV.shape[0], + &SV[0, 0] if SV.size > 0 else NULL, + SV.shape, + &support[0], + support.shape, + sv_coef.strides, + &sv_coef[0, 0], + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + + if svm_type > 1: + n_class = 1 + else: + n_class = get_nr(model) + n_class = n_class * (n_class - 1) // 2 + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + try: + dec_values = np.empty((X.shape[0], n_class), dtype=np.float64) + with nogil: + rv = copy_predict_values( + &X[0, 0], + model, + X.shape, + &dec_values[0, 0], + n_class, + &blas_functions, + ) + if rv < 0: + raise MemoryError("We've run out of memory") + finally: + free_model(model) + + return dec_values.base + + +def cross_validation( + const float64_t[:, ::1] X, + const float64_t[::1] Y, + int n_fold, + int svm_type=0, + kernel='rbf', + int degree=3, + double gamma=0.1, + double coef0=0.0, + double tol=1e-3, + double C=1.0, + double nu=0.5, + double epsilon=0.1, + float64_t[::1] class_weight=np.empty(0), + float64_t[::1] sample_weight=np.empty(0), + int shrinking=0, + int probability=0, + double cache_size=100.0, + int max_iter=-1, + int random_seed=0, +): + """ + Binding of the cross-validation routine (low-level routine) + + Parameters + ---------- + + X : array-like, dtype=float of shape (n_samples, n_features) + + Y : array, dtype=float of shape (n_samples,) + target vector + + n_fold : int32 + Number of folds for cross validation. + + svm_type : {0, 1, 2, 3, 4}, default=0 + Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR + respectively. + + kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, default='rbf' + Kernel to use in the model: linear, polynomial, RBF, sigmoid + or precomputed. + + degree : int32, default=3 + Degree of the polynomial kernel (only relevant if kernel is + set to polynomial). + + gamma : float64, default=0.1 + Gamma parameter in rbf, poly and sigmoid kernels. Ignored by other + kernels. + + coef0 : float64, default=0.0 + Independent parameter in poly/sigmoid kernel. + + tol : float64, default=1e-3 + Numeric stopping criterion (WRITEME). + + C : float64, default=1 + C parameter in C-Support Vector Classification. + + nu : float64, default=0.5 + An upper bound on the fraction of training errors and a lower bound of + the fraction of support vectors. Should be in the interval (0, 1]. + + epsilon : double, default=0.1 + Epsilon parameter in the epsilon-insensitive loss function. + + class_weight : array, dtype=float64, shape (n_classes,), \ + default=np.empty(0) + Set the parameter C of class i to class_weight[i]*C for + SVC. If not given, all classes are supposed to have + weight one. + + sample_weight : array, dtype=float64, shape (n_samples,), \ + default=np.empty(0) + Weights assigned to each sample. + + shrinking : int, default=1 + Whether to use the shrinking heuristic. + + probability : int, default=0 + Whether to enable probability estimates. 
+ + cache_size : float64, default=100 + Cache size for gram matrix columns (in megabytes). + + max_iter : int (-1 for no limit), default=-1 + Stop solver after this many iterations regardless of accuracy + (XXX Currently there is no API to know whether this kicked in.) + + random_seed : int, default=0 + Seed for the random number generator used for probability estimates. + + Returns + ------- + target : array, float + + """ + + cdef svm_parameter param + cdef svm_problem problem + cdef const char *error_msg + + if len(sample_weight) == 0: + sample_weight = np.ones(X.shape[0], dtype=np.float64) + else: + assert sample_weight.shape[0] == X.shape[0], ( + f"sample_weight and X have incompatible shapes: sample_weight has " + f"{sample_weight.shape[0]} samples while X has {X.shape[0]}" + ) + + if X.shape[0] < n_fold: + raise ValueError("Number of samples is less than number of folds") + + # set problem + kernel_index = LIBSVM_KERNEL_TYPES.index(kernel) + set_problem( + &problem, + &X[0, 0], + &Y[0], + &sample_weight[0] if sample_weight.size > 0 else NULL, + X.shape, + kernel_index, + ) + if problem.x == NULL: + raise MemoryError("Seems we've run out of memory") + cdef int32_t[::1] class_weight_label = np.arange( + class_weight.shape[0], dtype=np.int32 + ) + + # set parameters + set_parameter( + ¶m, + svm_type, + kernel_index, + degree, + gamma, + coef0, + nu, + cache_size, + C, + tol, + tol, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + max_iter, + random_seed, + ) + + error_msg = svm_check_parameter(&problem, ¶m) + if error_msg: + raise ValueError(error_msg) + + cdef float64_t[::1] target + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + try: + target = np.empty((X.shape[0]), dtype=np.float64) + with nogil: + svm_cross_validation( + &problem, + ¶m, + n_fold, + &target[0], + &blas_functions, + ) + finally: + free(problem.x) + + return target.base + + +def set_verbosity_wrap(int verbosity): + """ + Control verbosity of libsvm library + """ + set_verbosity(verbosity) diff --git a/.venv/Lib/site-packages/sklearn/svm/_libsvm_sparse.cp39-win_amd64.lib b/.venv/Lib/site-packages/sklearn/svm/_libsvm_sparse.cp39-win_amd64.lib new file mode 100644 index 0000000000000000000000000000000000000000..41d212390667b6780c4a569477067426392223aa Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/svm/_libsvm_sparse.cp39-win_amd64.lib differ diff --git a/.venv/Lib/site-packages/sklearn/svm/_libsvm_sparse.cp39-win_amd64.pyd b/.venv/Lib/site-packages/sklearn/svm/_libsvm_sparse.cp39-win_amd64.pyd new file mode 100644 index 0000000000000000000000000000000000000000..12402160f3240a3b7990a976fe349e367e153dd2 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/svm/_libsvm_sparse.cp39-win_amd64.pyd differ diff --git a/.venv/Lib/site-packages/sklearn/svm/_libsvm_sparse.pyx b/.venv/Lib/site-packages/sklearn/svm/_libsvm_sparse.pyx new file mode 100644 index 0000000000000000000000000000000000000000..f1b5e8edf167ea86cb5f1b893bb61e371b822666 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/_libsvm_sparse.pyx @@ -0,0 +1,550 @@ +import numpy as np +from scipy import sparse +from ..utils._cython_blas cimport _dot +from ..utils._typedefs cimport float64_t, int32_t, intp_t + +cdef extern from *: + ctypedef char* const_char_p "const char*" + +################################################################################ +# Includes + +cdef extern from 
"_svm_cython_blas_helpers.h": + ctypedef double (*dot_func)(int, const double*, int, const double*, int) + cdef struct BlasFunctions: + dot_func dot + +cdef extern from "svm.h": + cdef struct svm_csr_node + cdef struct svm_csr_model + cdef struct svm_parameter + cdef struct svm_csr_problem + char *svm_csr_check_parameter(svm_csr_problem *, svm_parameter *) + svm_csr_model *svm_csr_train(svm_csr_problem *, svm_parameter *, int *, BlasFunctions *) nogil + void svm_csr_free_and_destroy_model(svm_csr_model** model_ptr_ptr) + +cdef extern from "libsvm_sparse_helper.c": + # this file contains methods for accessing libsvm 'hidden' fields + svm_csr_problem * csr_set_problem ( + char *, intp_t *, char *, intp_t *, char *, char *, char *, int) + svm_csr_model *csr_set_model(svm_parameter *param, int nr_class, + char *SV_data, intp_t *SV_indices_dims, + char *SV_indices, intp_t *SV_intptr_dims, + char *SV_intptr, + char *sv_coef, char *rho, char *nSV, + char *probA, char *probB) + svm_parameter *set_parameter (int , int , int , double, double , + double , double , double , double, + double, int, int, int, char *, char *, int, + int) + void copy_sv_coef (char *, svm_csr_model *) + void copy_n_iter (char *, svm_csr_model *) + void copy_support (char *, svm_csr_model *) + void copy_intercept (char *, svm_csr_model *, intp_t *) + int copy_predict (char *, svm_csr_model *, intp_t *, char *, BlasFunctions *) + int csr_copy_predict_values (intp_t *data_size, char *data, intp_t *index_size, + char *index, intp_t *intptr_size, char *size, + svm_csr_model *model, char *dec_values, int nr_class, BlasFunctions *) + int csr_copy_predict (intp_t *data_size, char *data, intp_t *index_size, + char *index, intp_t *intptr_size, char *size, + svm_csr_model *model, char *dec_values, BlasFunctions *) nogil + int csr_copy_predict_proba (intp_t *data_size, char *data, intp_t *index_size, + char *index, intp_t *intptr_size, char *size, + svm_csr_model *model, char *dec_values, BlasFunctions *) nogil + + int copy_predict_values(char *, svm_csr_model *, intp_t *, char *, int, BlasFunctions *) + int csr_copy_SV (char *values, intp_t *n_indices, + char *indices, intp_t *n_indptr, char *indptr, + svm_csr_model *model, int n_features) + intp_t get_nonzero_SV (svm_csr_model *) + void copy_nSV (char *, svm_csr_model *) + void copy_probA (char *, svm_csr_model *, intp_t *) + void copy_probB (char *, svm_csr_model *, intp_t *) + intp_t get_l (svm_csr_model *) + intp_t get_nr (svm_csr_model *) + int free_problem (svm_csr_problem *) + int free_model (svm_csr_model *) + int free_param (svm_parameter *) + int free_model_SV(svm_csr_model *model) + void set_verbosity(int) + + +def libsvm_sparse_train (int n_features, + const float64_t[::1] values, + const int32_t[::1] indices, + const int32_t[::1] indptr, + const float64_t[::1] Y, + int svm_type, int kernel_type, int degree, double gamma, + double coef0, double eps, double C, + const float64_t[::1] class_weight, + const float64_t[::1] sample_weight, + double nu, double cache_size, double p, int + shrinking, int probability, int max_iter, + int random_seed): + """ + Wrap svm_train from libsvm using a scipy.sparse.csr matrix + + Work in progress. + + Parameters + ---------- + n_features : number of features. + XXX: can we retrieve this from any other parameter ? + + X : array-like, dtype=float, size=[N, D] + + Y : array, dtype=float, size=[N] + target vector + + ... + + Notes + ------------------- + See sklearn.svm.predict for a complete list of parameters. 
+ + """ + + cdef svm_parameter *param + cdef svm_csr_problem *problem + cdef svm_csr_model *model + cdef const_char_p error_msg + + if len(sample_weight) == 0: + sample_weight = np.ones(Y.shape[0], dtype=np.float64) + else: + assert sample_weight.shape[0] == indptr.shape[0] - 1, \ + "sample_weight and X have incompatible shapes: " + \ + "sample_weight has %s samples while X has %s" % \ + (sample_weight.shape[0], indptr.shape[0] - 1) + + # we should never end up here with a precomputed kernel matrix, + # as this is always dense. + assert(kernel_type != 4) + + # set libsvm problem + problem = csr_set_problem( + &values[0], + indices.shape, + &indices[0], + indptr.shape, + &indptr[0], + &Y[0], + &sample_weight[0], + kernel_type, + ) + + cdef int32_t[::1] \ + class_weight_label = np.arange(class_weight.shape[0], dtype=np.int32) + + # set parameters + param = set_parameter( + svm_type, + kernel_type, + degree, + gamma, + coef0, + nu, + cache_size, + C, + eps, + p, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, max_iter, + random_seed, + ) + + # check parameters + if (param == NULL or problem == NULL): + raise MemoryError("Seems we've run out of memory") + error_msg = svm_csr_check_parameter(problem, param) + if error_msg: + free_problem(problem) + free_param(param) + raise ValueError(error_msg) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + # call svm_train, this does the real work + cdef int fit_status = 0 + with nogil: + model = svm_csr_train(problem, param, &fit_status, &blas_functions) + + cdef intp_t SV_len = get_l(model) + cdef intp_t n_class = get_nr(model) + + cdef int[::1] n_iter + n_iter = np.empty(max(1, n_class * (n_class - 1) // 2), dtype=np.intc) + copy_n_iter( &n_iter[0], model) + + # copy model.sv_coef + # we create a new array instead of resizing, otherwise + # it would not erase previous information + cdef float64_t[::1] sv_coef_data + sv_coef_data = np.empty((n_class-1)*SV_len, dtype=np.float64) + copy_sv_coef ( &sv_coef_data[0] if sv_coef_data.size > 0 else NULL, model) + + cdef int32_t[::1] support + support = np.empty(SV_len, dtype=np.int32) + copy_support( &support[0] if support.size > 0 else NULL, model) + + # copy model.rho into the intercept + # the intercept is just model.rho but with sign changed + cdef float64_t[::1]intercept + intercept = np.empty(n_class*(n_class-1)//2, dtype=np.float64) + copy_intercept ( &intercept[0], model, intercept.shape) + + # copy model.SV + # we erase any previous information in SV + # TODO: custom kernel + cdef intp_t nonzero_SV + nonzero_SV = get_nonzero_SV (model) + + cdef float64_t[::1] SV_data + cdef int32_t[::1] SV_indices, SV_indptr + SV_data = np.empty(nonzero_SV, dtype=np.float64) + SV_indices = np.empty(nonzero_SV, dtype=np.int32) + SV_indptr = np.empty(SV_len + 1, dtype=np.int32) + csr_copy_SV( + &SV_data[0] if SV_data.size > 0 else NULL, + SV_indices.shape, + &SV_indices[0] if SV_indices.size > 0 else NULL, + SV_indptr.shape, + &SV_indptr[0] if SV_indptr.size > 0 else NULL, + model, + n_features, + ) + support_vectors_ = sparse.csr_matrix( + (SV_data, SV_indices, SV_indptr), (SV_len, n_features) + ) + + # copy model.nSV + # TODO: do only in classification + cdef int32_t[::1]n_class_SV + n_class_SV = np.empty(n_class, dtype=np.int32) + copy_nSV( &n_class_SV[0], model) + + # # copy probabilities + cdef float64_t[::1] probA, probB + if probability != 0: + if svm_type < 2: # SVC 
and NuSVC + probA = np.empty(n_class*(n_class-1)//2, dtype=np.float64) + probB = np.empty(n_class*(n_class-1)//2, dtype=np.float64) + copy_probB( &probB[0], model, probB.shape) + else: + probA = np.empty(1, dtype=np.float64) + probB = np.empty(0, dtype=np.float64) + copy_probA( &probA[0], model, probA.shape) + else: + probA = np.empty(0, dtype=np.float64) + probB = np.empty(0, dtype=np.float64) + + svm_csr_free_and_destroy_model (&model) + free_problem(problem) + free_param(param) + + return ( + support.base, + support_vectors_, + sv_coef_data.base, + intercept.base, + n_class_SV.base, + probA.base, + probB.base, + fit_status, + n_iter.base, + ) + + +def libsvm_sparse_predict (const float64_t[::1] T_data, + const int32_t[::1] T_indices, + const int32_t[::1] T_indptr, + const float64_t[::1] SV_data, + const int32_t[::1] SV_indices, + const int32_t[::1] SV_indptr, + const float64_t[::1] sv_coef, + const float64_t[::1] + intercept, int svm_type, int kernel_type, int + degree, double gamma, double coef0, double + eps, double C, + const float64_t[:] class_weight, + double nu, double p, int + shrinking, int probability, + const int32_t[::1] nSV, + const float64_t[::1] probA, + const float64_t[::1] probB): + """ + Predict values T given a model. + + For speed, all real work is done at the C level in function + copy_predict (libsvm_helper.c). + + We have to reconstruct model and parameters to make sure we stay + in sync with the python object. + + See sklearn.svm.predict for a complete list of parameters. + + Parameters + ---------- + X : array-like, dtype=float + Y : array + target vector + + Returns + ------- + dec_values : array + predicted values. + """ + cdef float64_t[::1] dec_values + cdef svm_parameter *param + cdef svm_csr_model *model + cdef int32_t[::1] \ + class_weight_label = np.arange(class_weight.shape[0], dtype=np.int32) + cdef int rv + param = set_parameter( + svm_type, + kernel_type, + degree, + gamma, + coef0, + nu, + 100.0, # cache size has no effect on predict + C, + eps, + p, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + -1, + -1, # random seed has no effect on predict either + ) + + model = csr_set_model( + param, nSV.shape[0], + &SV_data[0] if SV_data.size > 0 else NULL, + SV_indices.shape, + &SV_indices[0] if SV_indices.size > 0 else NULL, + SV_indptr.shape, + &SV_indptr[0] if SV_indptr.size > 0 else NULL, + &sv_coef[0] if sv_coef.size > 0 else NULL, + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + # TODO: use check_model + dec_values = np.empty(T_indptr.shape[0]-1) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + with nogil: + rv = csr_copy_predict( + T_data.shape, + &T_data[0], + T_indices.shape, + &T_indices[0], + T_indptr.shape, + &T_indptr[0], + model, + &dec_values[0], + &blas_functions, + ) + if rv < 0: + raise MemoryError("We've run out of memory") + # free model and param + free_model_SV(model) + free_model(model) + free_param(param) + return dec_values.base + + +def libsvm_sparse_predict_proba( + const float64_t[::1] T_data, + const int32_t[::1] T_indices, + const int32_t[::1] T_indptr, + const float64_t[::1] SV_data, + const int32_t[::1] SV_indices, + const int32_t[::1] SV_indptr, + const float64_t[::1] sv_coef, + const float64_t[::1] + intercept, int svm_type, int kernel_type, int + degree, double gamma, double coef0, double + eps, double C, 
+ const float64_t[:] class_weight, + double nu, double p, int shrinking, int probability, + const int32_t[::1] nSV, + const float64_t[::1] probA, + const float64_t[::1] probB, +): + """ + Predict values T given a model. + """ + cdef float64_t[:, ::1] dec_values + cdef svm_parameter *param + cdef svm_csr_model *model + cdef int32_t[::1] \ + class_weight_label = np.arange(class_weight.shape[0], dtype=np.int32) + param = set_parameter( + svm_type, + kernel_type, + degree, + gamma, + coef0, + nu, + 100.0, # cache size has no effect on predict + C, + eps, + p, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + -1, + -1, # random seed has no effect on predict either + ) + + model = csr_set_model( + param, + nSV.shape[0], + &SV_data[0] if SV_data.size > 0 else NULL, + SV_indices.shape, + &SV_indices[0] if SV_indices.size > 0 else NULL, + SV_indptr.shape, + &SV_indptr[0] if SV_indptr.size > 0 else NULL, + &sv_coef[0] if sv_coef.size > 0 else NULL, + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + # TODO: use check_model + cdef intp_t n_class = get_nr(model) + cdef int rv + dec_values = np.empty((T_indptr.shape[0]-1, n_class), dtype=np.float64) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + with nogil: + rv = csr_copy_predict_proba( + T_data.shape, + &T_data[0], + T_indices.shape, + &T_indices[0], + T_indptr.shape, + &T_indptr[0], + model, + &dec_values[0, 0], + &blas_functions, + ) + if rv < 0: + raise MemoryError("We've run out of memory") + # free model and param + free_model_SV(model) + free_model(model) + free_param(param) + return dec_values.base + + +def libsvm_sparse_decision_function( + const float64_t[::1] T_data, + const int32_t[::1] T_indices, + const int32_t[::1] T_indptr, + const float64_t[::1] SV_data, + const int32_t[::1] SV_indices, + const int32_t[::1] SV_indptr, + const float64_t[::1] sv_coef, + const float64_t[::1] + intercept, int svm_type, int kernel_type, int + degree, double gamma, double coef0, double + eps, double C, + const float64_t[:] class_weight, + double nu, double p, int shrinking, int probability, + const int32_t[::1] nSV, + const float64_t[::1] probA, + const float64_t[::1] probB, +): + """ + Predict margin (libsvm name for this is predict_values) + + We have to reconstruct model and parameters to make sure we stay + in sync with the python object. 
+ """ + cdef float64_t[:, ::1] dec_values + cdef svm_parameter *param + cdef intp_t n_class + + cdef svm_csr_model *model + cdef int32_t[::1] \ + class_weight_label = np.arange(class_weight.shape[0], dtype=np.int32) + param = set_parameter( + svm_type, + kernel_type, + degree, + gamma, + coef0, + nu, + 100.0, # cache size has no effect on predict + C, + eps, + p, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + -1, + -1, + ) + + model = csr_set_model( + param, + nSV.shape[0], + &SV_data[0] if SV_data.size > 0 else NULL, + SV_indices.shape, + &SV_indices[0] if SV_indices.size > 0 else NULL, + SV_indptr.shape, + &SV_indptr[0] if SV_indptr.size > 0 else NULL, + &sv_coef[0] if sv_coef.size > 0 else NULL, + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + + if svm_type > 1: + n_class = 1 + else: + n_class = get_nr(model) + n_class = n_class * (n_class - 1) // 2 + + dec_values = np.empty((T_indptr.shape[0] - 1, n_class), dtype=np.float64) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + if csr_copy_predict_values( + T_data.shape, + &T_data[0], + T_indices.shape, + &T_indices[0], + T_indptr.shape, + &T_indptr[0], + model, + &dec_values[0, 0], + n_class, + &blas_functions, + ) < 0: + raise MemoryError("We've run out of memory") + # free model and param + free_model_SV(model) + free_model(model) + free_param(param) + + return dec_values.base + + +def set_verbosity_wrap(int verbosity): + """ + Control verbosity of libsvm library + """ + set_verbosity(verbosity) diff --git a/.venv/Lib/site-packages/sklearn/svm/_newrand.cp39-win_amd64.lib b/.venv/Lib/site-packages/sklearn/svm/_newrand.cp39-win_amd64.lib new file mode 100644 index 0000000000000000000000000000000000000000..222817860b0a597634521664a21617f937f695f0 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/svm/_newrand.cp39-win_amd64.lib differ diff --git a/.venv/Lib/site-packages/sklearn/svm/_newrand.cp39-win_amd64.pyd b/.venv/Lib/site-packages/sklearn/svm/_newrand.cp39-win_amd64.pyd new file mode 100644 index 0000000000000000000000000000000000000000..cf1ba27dff1ebaf46cd1cc34f6e4724a30979973 Binary files /dev/null and b/.venv/Lib/site-packages/sklearn/svm/_newrand.cp39-win_amd64.pyd differ diff --git a/.venv/Lib/site-packages/sklearn/svm/_newrand.pyx b/.venv/Lib/site-packages/sklearn/svm/_newrand.pyx new file mode 100644 index 0000000000000000000000000000000000000000..585a01fc88c12074f7b4b52d8db05d7a630fcdb9 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/_newrand.pyx @@ -0,0 +1,13 @@ +"""Wrapper for newrand.h""" + +cdef extern from "newrand.h": + void set_seed(unsigned int) + unsigned int bounded_rand_int(unsigned int) + + +def set_seed_wrap(unsigned int custom_seed): + set_seed(custom_seed) + + +def bounded_rand_int_wrap(unsigned int range_): + return bounded_rand_int(range_) diff --git a/.venv/Lib/site-packages/sklearn/svm/meson.build b/.venv/Lib/site-packages/sklearn/svm/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..859b6ae5d8331ebb3e8526a5fb5f454df115eb5a --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/meson.build @@ -0,0 +1,53 @@ +newrand_include = include_directories('src/newrand') +libsvm_include = include_directories('src/libsvm') +liblinear_include = include_directories('src/liblinear') + +_newrand = py.extension_module( + '_newrand', + '_newrand.pyx', + 
override_options: ['cython_language=cpp'], + include_directories: [newrand_include], + cython_args: cython_args, + subdir: 'sklearn/svm', + install: true +) + +libsvm_skl = static_library( + 'libsvm-skl', + ['src/libsvm/libsvm_template.cpp'], +) + +py.extension_module( + '_libsvm', + ['_libsvm.pyx', utils_cython_tree], + include_directories: [newrand_include, libsvm_include], + link_with: libsvm_skl, + cython_args: cython_args, + subdir: 'sklearn/svm', + install: true +) + +py.extension_module( + '_libsvm_sparse', + ['_libsvm_sparse.pyx', utils_cython_tree], + include_directories: [newrand_include, libsvm_include], + link_with: libsvm_skl, + cython_args: cython_args, + subdir: 'sklearn/svm', + install: true +) + +liblinear_skl = static_library( + 'liblinear-skl', + ['src/liblinear/linear.cpp', 'src/liblinear/tron.cpp'], +) + +py.extension_module( + '_liblinear', + ['_liblinear.pyx', utils_cython_tree], + include_directories: [newrand_include, liblinear_include], + link_with: [liblinear_skl], + cython_args: cython_args, + subdir: 'sklearn/svm', + install: true +) diff --git a/.venv/Lib/site-packages/sklearn/svm/src/liblinear/COPYRIGHT b/.venv/Lib/site-packages/sklearn/svm/src/liblinear/COPYRIGHT new file mode 100644 index 0000000000000000000000000000000000000000..9f4fdcf69d75e59d7ad9cd15a47742768c1c9032 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/src/liblinear/COPYRIGHT @@ -0,0 +1,31 @@ + +Copyright (c) 2007-2014 The LIBLINEAR Project. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither name of copyright holders nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
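The _newrand extension module built by the meson.build above is a thin wrapper over set_seed and bounded_rand_int from newrand.h (see _newrand.pyx earlier in this diff). As a minimal sketch, assuming scikit-learn has been built so that sklearn.svm._newrand is importable from the active environment, the wrappers can be exercised like this:

    # Sanity-check sketch for the _newrand wrappers shown above
    # (assumes a built scikit-learn; not part of the diffed sources).
    from sklearn.svm._newrand import bounded_rand_int_wrap, set_seed_wrap

    set_seed_wrap(0)  # seed the random generator shared by the SVM solvers
    draws = [bounded_rand_int_wrap(10) for _ in range(5)]
    assert all(0 <= d < 10 for d in draws)  # bounded_rand_int stays in [0, range_)
    print(draws)

bounded_rand_int applies the Lemire-style bounded-integer post-processing on top of the Mersenne twister, which is what the 2020 modification note in linear.cpp below refers to.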
diff --git a/.venv/Lib/site-packages/sklearn/svm/src/liblinear/_cython_blas_helpers.h b/.venv/Lib/site-packages/sklearn/svm/src/liblinear/_cython_blas_helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..a28468112399c0e4a52a5ee87f65b990b9b9a276 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/src/liblinear/_cython_blas_helpers.h @@ -0,0 +1,16 @@ +#ifndef _CYTHON_BLAS_HELPERS_H +#define _CYTHON_BLAS_HELPERS_H + +typedef double (*dot_func)(int, const double*, int, const double*, int); +typedef void (*axpy_func)(int, double, const double*, int, double*, int); +typedef void (*scal_func)(int, double, const double*, int); +typedef double (*nrm2_func)(int, const double*, int); + +typedef struct BlasFunctions{ + dot_func dot; + axpy_func axpy; + scal_func scal; + nrm2_func nrm2; +} BlasFunctions; + +#endif diff --git a/.venv/Lib/site-packages/sklearn/svm/src/liblinear/liblinear_helper.c b/.venv/Lib/site-packages/sklearn/svm/src/liblinear/liblinear_helper.c new file mode 100644 index 0000000000000000000000000000000000000000..c7a05315f87ded23005059a581dce5ed369a1d7c --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/src/liblinear/liblinear_helper.c @@ -0,0 +1,236 @@ +#include +#define PY_SSIZE_T_CLEAN +#include +#include "linear.h" + + +/* + * Convert matrix to sparse representation suitable for liblinear. x is + * expected to be an array of length n_samples*n_features. + * + * Whether the matrix is densely or sparsely populated, the fastest way to + * convert it to liblinear's sparse format is to calculate the amount of memory + * needed and allocate a single big block. + * + * Special care must be taken with indices, since liblinear indices start at 1 + * and not at 0. + * + * If bias is > 0, we append an item at the end. + */ +static struct feature_node **dense_to_sparse(char *x, int double_precision, + int n_samples, int n_features, int n_nonzero, double bias) +{ + float *x32 = (float *)x; + double *x64 = (double *)x; + struct feature_node **sparse; + int i, j; /* number of nonzero elements in row i */ + struct feature_node *T; /* pointer to the top of the stack */ + int have_bias = (bias > 0); + + sparse = malloc (n_samples * sizeof(struct feature_node *)); + if (sparse == NULL) + return NULL; + + n_nonzero += (have_bias+1) * n_samples; + T = malloc (n_nonzero * sizeof(struct feature_node)); + if (T == NULL) { + free(sparse); + return NULL; + } + + for (i=0; ivalue = *x64; + T->index = j; + ++ T; + } + ++ x64; /* go to next element */ + } else { + if (*x32 != 0) { + T->value = *x32; + T->index = j; + ++ T; + } + ++ x32; /* go to next element */ + } + } + + /* set bias element */ + if (have_bias) { + T->value = bias; + T->index = j; + ++ T; + } + + /* set sentinel */ + T->index = -1; + ++ T; + } + + return sparse; +} + + +/* + * Convert scipy.sparse.csr to liblinear's sparse data structure + */ +static struct feature_node **csr_to_sparse(char *x, int double_precision, + int *indices, int *indptr, int n_samples, int n_features, int n_nonzero, + double bias) +{ + float *x32 = (float *)x; + double *x64 = (double *)x; + struct feature_node **sparse; + int i, j=0, k=0, n; + struct feature_node *T; + int have_bias = (bias > 0); + + sparse = malloc (n_samples * sizeof(struct feature_node *)); + if (sparse == NULL) + return NULL; + + n_nonzero += (have_bias+1) * n_samples; + T = malloc (n_nonzero * sizeof(struct feature_node)); + if (T == NULL) { + free(sparse); + return NULL; + } + + for (i=0; ivalue = double_precision ? 
x64[k] : x32[k]; + T->index = indices[k] + 1; /* liblinear uses 1-based indexing */ + ++T; + ++k; + } + + if (have_bias) { + T->value = bias; + T->index = n_features + 1; + ++T; + ++j; + } + + /* set sentinel */ + T->index = -1; + ++T; + } + + return sparse; +} + +struct problem * set_problem(char *X, int double_precision_X, int n_samples, + int n_features, int n_nonzero, double bias, char* sample_weight, + char *Y) +{ + struct problem *problem; + /* not performant but simple */ + problem = malloc(sizeof(struct problem)); + if (problem == NULL) return NULL; + problem->l = n_samples; + problem->n = n_features + (bias > 0); + problem->y = (double *) Y; + problem->W = (double *) sample_weight; + problem->x = dense_to_sparse(X, double_precision_X, n_samples, n_features, + n_nonzero, bias); + problem->bias = bias; + + if (problem->x == NULL) { + free(problem); + return NULL; + } + + return problem; +} + +struct problem * csr_set_problem (char *X, int double_precision_X, + char *indices, char *indptr, int n_samples, int n_features, + int n_nonzero, double bias, char *sample_weight, char *Y) +{ + struct problem *problem; + problem = malloc (sizeof (struct problem)); + if (problem == NULL) return NULL; + problem->l = n_samples; + problem->n = n_features + (bias > 0); + problem->y = (double *) Y; + problem->W = (double *) sample_weight; + problem->x = csr_to_sparse(X, double_precision_X, (int *) indices, + (int *) indptr, n_samples, n_features, n_nonzero, bias); + problem->bias = bias; + + if (problem->x == NULL) { + free(problem); + return NULL; + } + + return problem; +} + + +/* Create a parameter struct with and return it */ +struct parameter *set_parameter(int solver_type, double eps, double C, + Py_ssize_t nr_weight, char *weight_label, + char *weight, int max_iter, unsigned seed, + double epsilon) +{ + struct parameter *param = malloc(sizeof(struct parameter)); + if (param == NULL) + return NULL; + + set_seed(seed); + param->solver_type = solver_type; + param->eps = eps; + param->C = C; + param->p = epsilon; // epsilon for epsilon-SVR + param->nr_weight = (int) nr_weight; + param->weight_label = (int *) weight_label; + param->weight = (double *) weight; + param->max_iter = max_iter; + return param; +} + +void copy_w(void *data, struct model *model, int len) +{ + memcpy(data, model->w, len * sizeof(double)); +} + +double get_bias(struct model *model) +{ + return model->bias; +} + +void free_problem(struct problem *problem) +{ + free(problem->x[0]); + free(problem->x); + free(problem); +} + +void free_parameter(struct parameter *param) +{ + free(param); +} + +/* rely on built-in facility to control verbose output */ +static void print_null(const char *s) {} + +static void print_string_stdout(const char *s) +{ + fputs(s ,stdout); + fflush(stdout); +} + +/* provide convenience wrapper */ +void set_verbosity(int verbosity_flag){ + if (verbosity_flag) + set_print_string_function(&print_string_stdout); + else + set_print_string_function(&print_null); +} diff --git a/.venv/Lib/site-packages/sklearn/svm/src/liblinear/linear.cpp b/.venv/Lib/site-packages/sklearn/svm/src/liblinear/linear.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0286d9f1c53fe6c360190892a0cd60d43d441626 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/src/liblinear/linear.cpp @@ -0,0 +1,3075 @@ +/* + Modified 2011: + + - Make labels sorted in group_classes, Dan Yamins. 
+ + Modified 2012: + + - Changes roles of +1 and -1 to match scikit API, Andreas Mueller + See issue 546: https://github.com/scikit-learn/scikit-learn/pull/546 + - Also changed roles for pairwise class weights, Andreas Mueller + See issue 1491: https://github.com/scikit-learn/scikit-learn/pull/1491 + + Modified 2014: + + - Remove the hard-coded value of max_iter (1000), that allows max_iter + to be passed as a parameter from the classes LogisticRegression and + LinearSVC, Manoj Kumar + - Added function get_n_iter that exposes the number of iterations. + See issue 3499: https://github.com/scikit-learn/scikit-learn/issues/3499 + See pull 3501: https://github.com/scikit-learn/scikit-learn/pull/3501 + + Modified 2015: + - Patched liblinear for sample_weights - Manoj Kumar + See https://github.com/scikit-learn/scikit-learn/pull/5274 + + Modified 2020: + - Improved random number generator by using a mersenne twister + tweaked + lemire postprocessor. This fixed a convergence issue on windows targets. + Sylvain Marie, Schneider Electric + See + + */ + +#include +#include +#include +#include +#include +#include +#include "linear.h" +#include "tron.h" +#include +#include +#include "../newrand/newrand.h" + +typedef signed char schar; +template static inline void swap(T& x, T& y) { T t=x; x=y; y=t; } +#ifndef min +template static inline T min(T x,T y) { return (x static inline T max(T x,T y) { return (x>y)?x:y; } +#endif +template static inline void clone(T*& dst, S* src, int n) +{ + dst = new T[n]; + memcpy((void *)dst,(void *)src,sizeof(T)*n); +} +#define Malloc(type,n) (type *)malloc((n)*sizeof(type)) +#define INF HUGE_VAL + +static void print_string_stdout(const char *s) +{ + fputs(s,stdout); + fflush(stdout); +} + +static void (*liblinear_print_string) (const char *) = &print_string_stdout; + +#if 1 +static void info(const char *fmt,...) +{ + char buf[BUFSIZ]; + va_list ap; + va_start(ap,fmt); + vsprintf(buf,fmt,ap); + va_end(ap); + (*liblinear_print_string)(buf); +} +#else +static void info(const char *fmt,...) 
{} +#endif + +class l2r_lr_fun: public function +{ +public: + l2r_lr_fun(const problem *prob, double *C); + ~l2r_lr_fun(); + + double fun(double *w); + void grad(double *w, double *g); + void Hv(double *s, double *Hs); + + int get_nr_variable(void); + +private: + void Xv(double *v, double *Xv); + void XTv(double *v, double *XTv); + + double *C; + double *z; + double *D; + const problem *prob; +}; + +l2r_lr_fun::l2r_lr_fun(const problem *prob, double *C) +{ + int l=prob->l; + + this->prob = prob; + + z = new double[l]; + D = new double[l]; + this->C = C; +} + +l2r_lr_fun::~l2r_lr_fun() +{ + delete[] z; + delete[] D; +} + + +double l2r_lr_fun::fun(double *w) +{ + int i; + double f=0; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + + Xv(w, z); + + for(i=0;i= 0) + f += C[i]*log(1 + exp(-yz)); + else + f += C[i]*(-yz+log(1 + exp(yz))); + } + + return(f); +} + +void l2r_lr_fun::grad(double *w, double *g) +{ + int i; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + + for(i=0;in; +} + +void l2r_lr_fun::Hv(double *s, double *Hs) +{ + int i; + int l=prob->l; + int w_size=get_nr_variable(); + double *wa = new double[l]; + + Xv(s, wa); + for(i=0;il; + feature_node **x=prob->x; + + for(i=0;iindex!=-1) + { + Xv[i]+=v[s->index-1]*s->value; + s++; + } + } +} + +void l2r_lr_fun::XTv(double *v, double *XTv) +{ + int i; + int l=prob->l; + int w_size=get_nr_variable(); + feature_node **x=prob->x; + + for(i=0;iindex!=-1) + { + XTv[s->index-1]+=v[i]*s->value; + s++; + } + } +} + +class l2r_l2_svc_fun: public function +{ +public: + l2r_l2_svc_fun(const problem *prob, double *C); + ~l2r_l2_svc_fun(); + + double fun(double *w); + void grad(double *w, double *g); + void Hv(double *s, double *Hs); + + int get_nr_variable(void); + +protected: + void Xv(double *v, double *Xv); + void subXv(double *v, double *Xv); + void subXTv(double *v, double *XTv); + + double *C; + double *z; + double *D; + int *I; + int sizeI; + const problem *prob; +}; + +l2r_l2_svc_fun::l2r_l2_svc_fun(const problem *prob, double *C) +{ + int l=prob->l; + + this->prob = prob; + + z = new double[l]; + D = new double[l]; + I = new int[l]; + this->C = C; +} + +l2r_l2_svc_fun::~l2r_l2_svc_fun() +{ + delete[] z; + delete[] D; + delete[] I; +} + +double l2r_l2_svc_fun::fun(double *w) +{ + int i; + double f=0; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + + Xv(w, z); + + for(i=0;i 0) + f += C[i]*d*d; + } + + return(f); +} + +void l2r_l2_svc_fun::grad(double *w, double *g) +{ + int i; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + + sizeI = 0; + for (i=0;in; +} + +void l2r_l2_svc_fun::Hv(double *s, double *Hs) +{ + int i; + int w_size=get_nr_variable(); + double *wa = new double[sizeI]; + + subXv(s, wa); + for(i=0;il; + feature_node **x=prob->x; + + for(i=0;iindex!=-1) + { + Xv[i]+=v[s->index-1]*s->value; + s++; + } + } +} + +void l2r_l2_svc_fun::subXv(double *v, double *Xv) +{ + int i; + feature_node **x=prob->x; + + for(i=0;iindex!=-1) + { + Xv[i]+=v[s->index-1]*s->value; + s++; + } + } +} + +void l2r_l2_svc_fun::subXTv(double *v, double *XTv) +{ + int i; + int w_size=get_nr_variable(); + feature_node **x=prob->x; + + for(i=0;iindex!=-1) + { + XTv[s->index-1]+=v[i]*s->value; + s++; + } + } +} + +class l2r_l2_svr_fun: public l2r_l2_svc_fun +{ +public: + l2r_l2_svr_fun(const problem *prob, double *C, double p); + + double fun(double *w); + void grad(double *w, double *g); + +private: + double p; +}; + +l2r_l2_svr_fun::l2r_l2_svr_fun(const problem *prob, 
double *C, double p): + l2r_l2_svc_fun(prob, C) +{ + this->p = p; +} + +double l2r_l2_svr_fun::fun(double *w) +{ + int i; + double f=0; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + double d; + + Xv(w, z); + + for(i=0;i p) + f += C[i]*(d-p)*(d-p); + } + + return(f); +} + +void l2r_l2_svr_fun::grad(double *w, double *g) +{ + int i; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + double d; + + sizeI = 0; + for(i=0;i p) + { + z[sizeI] = C[i]*(d-p); + I[sizeI] = i; + sizeI++; + } + + } + subXTv(z, g); + + for(i=0;iw_size = prob->n; + this->l = prob->l; + this->nr_class = nr_class; + this->eps = eps; + this->max_iter = max_iter; + this->prob = prob; + this->B = new double[nr_class]; + this->G = new double[nr_class]; + this->C = new double[prob->l]; + for(int i = 0; i < prob->l; i++) + this->C[i] = prob->W[i] * weighted_C[(int)prob->y[i]]; +} + +Solver_MCSVM_CS::~Solver_MCSVM_CS() +{ + delete[] B; + delete[] G; + delete[] C; +} + +int compare_double(const void *a, const void *b) +{ + if(*(double *)a > *(double *)b) + return -1; + if(*(double *)a < *(double *)b) + return 1; + return 0; +} + +void Solver_MCSVM_CS::solve_sub_problem(double A_i, int yi, double C_yi, int active_i, double *alpha_new) +{ + int r; + double *D; + + clone(D, B, active_i); + if(yi < active_i) + D[yi] += A_i*C_yi; + qsort(D, active_i, sizeof(double), compare_double); + + double beta = D[0] - A_i*C_yi; + for(r=1;ry[i] == m + // alpha[i*nr_class+m] <= 0 if prob->y[i] != m + // If initial alpha isn't zero, uncomment the for loop below to initialize w + for(i=0;ix[i]; + QD[i] = 0; + while(xi->index != -1) + { + double val = xi->value; + QD[i] += val*val; + + // Uncomment the for loop if initial alpha isn't zero + // for(m=0; mindex-1)*nr_class+m] += alpha[i*nr_class+m]*val; + xi++; + } + active_size_i[i] = nr_class; + y_index[i] = (int)prob->y[i]; + index[i] = i; + } + + while(iter < max_iter) + { + double stopping = -INF; + for(i=0;i 0) + { + for(m=0;mx[i]; + while(xi->index!= -1) + { + double *w_i = &w[(xi->index-1)*nr_class]; + for(m=0;mvalue); + xi++; + } + + double minG = INF; + double maxG = -INF; + for(m=0;m maxG) + maxG = G[m]; + } + if(y_index[i] < active_size_i[i]) + if(alpha_i[(int) prob->y[i]] < C[GETI(i)] && G[y_index[i]] < minG) + minG = G[y_index[i]]; + + for(m=0;mm) + { + if(!be_shrunk(i, active_size_i[i], y_index[i], + alpha_i[alpha_index_i[active_size_i[i]]], minG)) + { + swap(alpha_index_i[m], alpha_index_i[active_size_i[i]]); + swap(G[m], G[active_size_i[i]]); + if(y_index[i] == active_size_i[i]) + y_index[i] = m; + else if(y_index[i] == m) + y_index[i] = active_size_i[i]; + break; + } + active_size_i[i]--; + } + } + } + + if(active_size_i[i] <= 1) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + + if(maxG-minG <= 1e-12) + continue; + else + stopping = max(maxG - minG, stopping); + + for(m=0;m= 1e-12) + { + d_ind[nz_d] = alpha_index_i[m]; + d_val[nz_d] = d; + nz_d++; + } + } + + xi = prob->x[i]; + while(xi->index != -1) + { + double *w_i = &w[(xi->index-1)*nr_class]; + for(m=0;mvalue; + xi++; + } + } + } + + iter++; + if(iter % 10 == 0) + { + info("."); + } + + if(stopping < eps_shrink) + { + if(stopping < eps && start_from_all == true) + break; + else + { + active_size = l; + for(i=0;i= max_iter) + info("\nWARNING: reaching max number of iterations\n"); + + // calculate objective value + double v = 0; + int nSV = 0; + for(i=0;i 0) + nSV++; + } + for(i=0;iy[i]]; + info("Objective value = %lf\n",v); + info("nSV = %d\n",nSV); 
+ + delete [] alpha; + delete [] alpha_new; + delete [] index; + delete [] QD; + delete [] d_ind; + delete [] d_val; + delete [] alpha_index; + delete [] y_index; + delete [] active_size_i; + return iter; +} + +// A coordinate descent algorithm for +// L1-loss and L2-loss SVM dual problems +// +// min_\alpha 0.5(\alpha^T (Q + D)\alpha) - e^T \alpha, +// s.t. 0 <= \alpha_i <= upper_bound_i, +// +// where Qij = yi yj xi^T xj and +// D is a diagonal matrix +// +// In L1-SVM case: +// upper_bound_i = Cp if y_i = 1 +// upper_bound_i = Cn if y_i = -1 +// D_ii = 0 +// In L2-SVM case: +// upper_bound_i = INF +// D_ii = 1/(2*Cp) if y_i = 1 +// D_ii = 1/(2*Cn) if y_i = -1 +// +// Given: +// x, y, Cp, Cn +// eps is the stopping tolerance +// +// solution will be put in w +// +// See Algorithm 3 of Hsieh et al., ICML 2008 + +#undef GETI +#define GETI(i) (i) +// To support weights for instances, use GETI(i) (i) + +static int solve_l2r_l1l2_svc( + const problem *prob, double *w, double eps, + double Cp, double Cn, int solver_type, int max_iter) +{ + int l = prob->l; + int w_size = prob->n; + int i, s, iter = 0; + double C, d, G; + double *QD = new double[l]; + int *index = new int[l]; + double *alpha = new double[l]; + schar *y = new schar[l]; + int active_size = l; + + // PG: projected gradient, for shrinking and stopping + double PG; + double PGmax_old = INF; + double PGmin_old = -INF; + double PGmax_new, PGmin_new; + + // default solver_type: L2R_L2LOSS_SVC_DUAL + double *diag = new double[l]; + double *upper_bound = new double[l]; + double *C_ = new double[l]; + for(i=0; iy[i]>0) + C_[i] = prob->W[i] * Cp; + else + C_[i] = prob->W[i] * Cn; + diag[i] = 0.5/C_[i]; + upper_bound[i] = INF; + } + if(solver_type == L2R_L1LOSS_SVC_DUAL) + { + for(i=0; iy[i] > 0) + { + y[i] = +1; + } + else + { + y[i] = -1; + } + } + + // Initial alpha can be set here. 
Note that + // 0 <= alpha[i] <= upper_bound[GETI(i)] + for(i=0; ix[i]; + while (xi->index != -1) + { + double val = xi->value; + QD[i] += val*val; + w[xi->index-1] += y[i]*alpha[i]*val; + xi++; + } + index[i] = i; + } + + while (iter < max_iter) + { + PGmax_new = -INF; + PGmin_new = INF; + + for (i=0; ix[i]; + while(xi->index!= -1) + { + G += w[xi->index-1]*(xi->value); + xi++; + } + G = G*yi-1; + + C = upper_bound[GETI(i)]; + G += alpha[i]*diag[GETI(i)]; + + PG = 0; + if (alpha[i] == 0) + { + if (G > PGmax_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + else if (G < 0) + PG = G; + } + else if (alpha[i] == C) + { + if (G < PGmin_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + else if (G > 0) + PG = G; + } + else + PG = G; + + PGmax_new = max(PGmax_new, PG); + PGmin_new = min(PGmin_new, PG); + + if(fabs(PG) > 1.0e-12) + { + double alpha_old = alpha[i]; + alpha[i] = min(max(alpha[i] - G/QD[i], 0.0), C); + d = (alpha[i] - alpha_old)*yi; + xi = prob->x[i]; + while (xi->index != -1) + { + w[xi->index-1] += d*xi->value; + xi++; + } + } + } + + iter++; + if(iter % 10 == 0) + info("."); + + if(PGmax_new - PGmin_new <= eps) + { + if(active_size == l) + break; + else + { + active_size = l; + info("*"); + PGmax_old = INF; + PGmin_old = -INF; + continue; + } + } + PGmax_old = PGmax_new; + PGmin_old = PGmin_new; + if (PGmax_old <= 0) + PGmax_old = INF; + if (PGmin_old >= 0) + PGmin_old = -INF; + } + + info("\noptimization finished, #iter = %d\n",iter); + if (iter >= max_iter) + info("\nWARNING: reaching max number of iterations\nUsing -s 2 may be faster (also see FAQ)\n\n"); + + // calculate objective value + + double v = 0; + int nSV = 0; + for(i=0; i 0) + ++nSV; + } + info("Objective value = %lf\n",v/2); + info("nSV = %d\n",nSV); + + delete [] QD; + delete [] alpha; + delete [] y; + delete [] index; + delete [] diag; + delete [] upper_bound; + delete [] C_; + return iter; +} + + +// A coordinate descent algorithm for +// L1-loss and L2-loss epsilon-SVR dual problem +// +// min_\beta 0.5\beta^T (Q + diag(lambda)) \beta - p \sum_{i=1}^l|\beta_i| + \sum_{i=1}^l yi\beta_i, +// s.t. 
-upper_bound_i <= \beta_i <= upper_bound_i, +// +// where Qij = xi^T xj and +// D is a diagonal matrix +// +// In L1-SVM case: +// upper_bound_i = C +// lambda_i = 0 +// In L2-SVM case: +// upper_bound_i = INF +// lambda_i = 1/(2*C) +// +// Given: +// x, y, p, C +// eps is the stopping tolerance +// +// solution will be put in w +// +// See Algorithm 4 of Ho and Lin, 2012 + +#undef GETI +#define GETI(i) (i) +// To support weights for instances, use GETI(i) (i) + +static int solve_l2r_l1l2_svr( + const problem *prob, double *w, const parameter *param, + int solver_type, int max_iter) +{ + int l = prob->l; + double C = param->C; + double p = param->p; + int w_size = prob->n; + double eps = param->eps; + int i, s, iter = 0; + int active_size = l; + int *index = new int[l]; + + double d, G, H; + double Gmax_old = INF; + double Gmax_new, Gnorm1_new; + double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration + double *beta = new double[l]; + double *QD = new double[l]; + double *y = prob->y; + + // L2R_L2LOSS_SVR_DUAL + double *lambda = new double[l]; + double *upper_bound = new double[l]; + double *C_ = new double[l]; + for (i=0; iW[i] * C; + lambda[i] = 0.5/C_[i]; + upper_bound[i] = INF; + } + if(solver_type == L2R_L1LOSS_SVR_DUAL) + { + for (i=0; ix[i]; + while(xi->index != -1) + { + double val = xi->value; + QD[i] += val*val; + w[xi->index-1] += beta[i]*val; + xi++; + } + + index[i] = i; + } + + + while(iter < max_iter) + { + Gmax_new = 0; + Gnorm1_new = 0; + + for(i=0; ix[i]; + while(xi->index != -1) + { + int ind = xi->index-1; + double val = xi->value; + G += val*w[ind]; + xi++; + } + + double Gp = G+p; + double Gn = G-p; + double violation = 0; + if(beta[i] == 0) + { + if(Gp < 0) + violation = -Gp; + else if(Gn > 0) + violation = Gn; + else if(Gp>Gmax_old && Gn<-Gmax_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(beta[i] >= upper_bound[GETI(i)]) + { + if(Gp > 0) + violation = Gp; + else if(Gp < -Gmax_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(beta[i] <= -upper_bound[GETI(i)]) + { + if(Gn < 0) + violation = -Gn; + else if(Gn > Gmax_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(beta[i] > 0) + violation = fabs(Gp); + else + violation = fabs(Gn); + + Gmax_new = max(Gmax_new, violation); + Gnorm1_new += violation; + + // obtain Newton direction d + if(Gp < H*beta[i]) + d = -Gp/H; + else if(Gn > H*beta[i]) + d = -Gn/H; + else + d = -beta[i]; + + if(fabs(d) < 1.0e-12) + continue; + + double beta_old = beta[i]; + beta[i] = min(max(beta[i]+d, -upper_bound[GETI(i)]), upper_bound[GETI(i)]); + d = beta[i]-beta_old; + + if(d != 0) + { + xi = prob->x[i]; + while(xi->index != -1) + { + w[xi->index-1] += d*xi->value; + xi++; + } + } + } + + if(iter == 0) + Gnorm1_init = Gnorm1_new; + iter++; + if(iter % 10 == 0) + info("."); + + if(Gnorm1_new <= eps*Gnorm1_init) + { + if(active_size == l) + break; + else + { + active_size = l; + info("*"); + Gmax_old = INF; + continue; + } + } + + Gmax_old = Gmax_new; + } + + info("\noptimization finished, #iter = %d\n", iter); + if(iter >= max_iter) + info("\nWARNING: reaching max number of iterations\nUsing -s 11 may be faster\n\n"); + + // calculate objective value + double v = 0; + int nSV = 0; + for(i=0; il; + int w_size = prob->n; + int i, s, iter = 0; + double *xTx = new double[l]; + int *index = new int[l]; + double *alpha = new double[2*l]; // store alpha and C - alpha + 
schar *y = new schar[l]; + int max_inner_iter = 100; // for inner Newton + double innereps = 1e-2; + double innereps_min = min(1e-8, eps); + double *upper_bound = new double [l]; + + for(i=0; iy[i] > 0) + { + upper_bound[i] = prob->W[i] * Cp; + y[i] = +1; + } + else + { + upper_bound[i] = prob->W[i] * Cn; + y[i] = -1; + } + } + + // Initial alpha can be set here. Note that + // 0 < alpha[i] < upper_bound[GETI(i)] + // alpha[2*i] + alpha[2*i+1] = upper_bound[GETI(i)] + for(i=0; ix[i]; + while (xi->index != -1) + { + double val = xi->value; + xTx[i] += val*val; + w[xi->index-1] += y[i]*alpha[2*i]*val; + xi++; + } + index[i] = i; + } + + while (iter < max_iter) + { + for (i=0; ix[i]; + while (xi->index != -1) + { + ywTx += w[xi->index-1]*xi->value; + xi++; + } + ywTx *= y[i]; + double a = xisq, b = ywTx; + + // Decide to minimize g_1(z) or g_2(z) + int ind1 = 2*i, ind2 = 2*i+1, sign = 1; + if(0.5*a*(alpha[ind2]-alpha[ind1])+b < 0) + { + ind1 = 2*i+1; + ind2 = 2*i; + sign = -1; + } + + // g_t(z) = z*log(z) + (C-z)*log(C-z) + 0.5a(z-alpha_old)^2 + sign*b(z-alpha_old) + double alpha_old = alpha[ind1]; + double z = alpha_old; + if(C - z < 0.5 * C) + z = 0.1*z; + double gp = a*(z-alpha_old)+sign*b+log(z/(C-z)); + Gmax = max(Gmax, fabs(gp)); + + // Newton method on the sub-problem + const double eta = 0.1; // xi in the paper + int inner_iter = 0; + while (inner_iter <= max_inner_iter) + { + if(fabs(gp) < innereps) + break; + double gpp = a + C/(C-z)/z; + double tmpz = z - gp/gpp; + if(tmpz <= 0) + z *= eta; + else // tmpz in (0, C) + z = tmpz; + gp = a*(z-alpha_old)+sign*b+log(z/(C-z)); + newton_iter++; + inner_iter++; + } + + if(inner_iter > 0) // update w + { + alpha[ind1] = z; + alpha[ind2] = C-z; + xi = prob->x[i]; + while (xi->index != -1) + { + w[xi->index-1] += sign*(z-alpha_old)*yi*xi->value; + xi++; + } + } + } + + iter++; + if(iter % 10 == 0) + info("."); + + if(Gmax < eps) + break; + + if(newton_iter <= l/10) + innereps = max(innereps_min, 0.1*innereps); + + } + + info("\noptimization finished, #iter = %d\n",iter); + if (iter >= max_iter) + info("\nWARNING: reaching max number of iterations\nUsing -s 0 may be faster (also see FAQ)\n\n"); + + // calculate objective value + + double v = 0; + for(i=0; il; + int w_size = prob_col->n; + int j, s, iter = 0; + int active_size = w_size; + int max_num_linesearch = 20; + + double sigma = 0.01; + double d, G_loss, G, H; + double Gmax_old = INF; + double Gmax_new, Gnorm1_new; + double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration + double d_old, d_diff; + double loss_old, loss_new; + double appxcond, cond; + + int *index = new int[w_size]; + schar *y = new schar[l]; + double *b = new double[l]; // b = 1-ywTx + double *xj_sq = new double[w_size]; + feature_node *x; + + double *C = new double[l]; + + // Initial w can be set here. 
+ for(j=0; jy[j] > 0) + { + y[j] = 1; + C[j] = prob_col->W[j] * Cp; + } + else + { + y[j] = -1; + C[j] = prob_col->W[j] * Cn; + } + } + for(j=0; jx[j]; + while(x->index != -1) + { + int ind = x->index-1; + x->value *= y[ind]; // x->value stores yi*xij + double val = x->value; + b[ind] -= w[j]*val; + xj_sq[j] += C[GETI(ind)]*val*val; + x++; + } + } + + while(iter < max_iter) + { + Gmax_new = 0; + Gnorm1_new = 0; + + for(j=0; jx[j]; + while(x->index != -1) + { + int ind = x->index-1; + if(b[ind] > 0) + { + double val = x->value; + double tmp = C[GETI(ind)]*val; + G_loss -= tmp*b[ind]; + H += tmp*val; + } + x++; + } + G_loss *= 2; + + G = G_loss; + H *= 2; + H = max(H, 1e-12); + + double Gp = G+1; + double Gn = G-1; + double violation = 0; + if(w[j] == 0) + { + if(Gp < 0) + violation = -Gp; + else if(Gn > 0) + violation = Gn; + else if(Gp>Gmax_old/l && Gn<-Gmax_old/l) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(w[j] > 0) + violation = fabs(Gp); + else + violation = fabs(Gn); + + Gmax_new = max(Gmax_new, violation); + Gnorm1_new += violation; + + // obtain Newton direction d + if(Gp < H*w[j]) + d = -Gp/H; + else if(Gn > H*w[j]) + d = -Gn/H; + else + d = -w[j]; + + if(fabs(d) < 1.0e-12) + continue; + + double delta = fabs(w[j]+d)-fabs(w[j]) + G*d; + d_old = 0; + int num_linesearch; + for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++) + { + d_diff = d_old - d; + cond = fabs(w[j]+d)-fabs(w[j]) - sigma*delta; + + appxcond = xj_sq[j]*d*d + G_loss*d + cond; + if(appxcond <= 0) + { + x = prob_col->x[j]; + while(x->index != -1) + { + b[x->index-1] += d_diff*x->value; + x++; + } + break; + } + + if(num_linesearch == 0) + { + loss_old = 0; + loss_new = 0; + x = prob_col->x[j]; + while(x->index != -1) + { + int ind = x->index-1; + if(b[ind] > 0) + loss_old += C[GETI(ind)]*b[ind]*b[ind]; + double b_new = b[ind] + d_diff*x->value; + b[ind] = b_new; + if(b_new > 0) + loss_new += C[GETI(ind)]*b_new*b_new; + x++; + } + } + else + { + loss_new = 0; + x = prob_col->x[j]; + while(x->index != -1) + { + int ind = x->index-1; + double b_new = b[ind] + d_diff*x->value; + b[ind] = b_new; + if(b_new > 0) + loss_new += C[GETI(ind)]*b_new*b_new; + x++; + } + } + + cond = cond + loss_new - loss_old; + if(cond <= 0) + break; + else + { + d_old = d; + d *= 0.5; + delta *= 0.5; + } + } + + w[j] += d; + + // recompute b[] if line search takes too many steps + if(num_linesearch >= max_num_linesearch) + { + info("#"); + for(int i=0; ix[i]; + while(x->index != -1) + { + b[x->index-1] -= w[i]*x->value; + x++; + } + } + } + } + + if(iter == 0) + Gnorm1_init = Gnorm1_new; + iter++; + if(iter % 10 == 0) + info("."); + + if(Gnorm1_new <= eps*Gnorm1_init) + { + if(active_size == w_size) + break; + else + { + active_size = w_size; + info("*"); + Gmax_old = INF; + continue; + } + } + + Gmax_old = Gmax_new; + } + + info("\noptimization finished, #iter = %d\n", iter); + if(iter >= max_iter) + info("\nWARNING: reaching max number of iterations\n"); + + // calculate objective value + + double v = 0; + int nnz = 0; + for(j=0; jx[j]; + while(x->index != -1) + { + x->value *= prob_col->y[x->index-1]; // restore x->value + x++; + } + if(w[j] != 0) + { + v += fabs(w[j]); + nnz++; + } + } + for(j=0; j 0) + v += C[GETI(j)]*b[j]*b[j]; + + info("Objective value = %lf\n", v); + info("#nonzeros/#features = %d/%d\n", nnz, w_size); + + delete [] index; + delete [] y; + delete [] b; + delete [] xj_sq; + delete [] C; + return iter; +} + +// A coordinate descent algorithm for 
+// L1-regularized logistic regression problems +// +// min_w \sum |wj| + C \sum log(1+exp(-yi w^T xi)), +// +// Given: +// x, y, Cp, Cn +// eps is the stopping tolerance +// +// solution will be put in w +// +// See Yuan et al. (2011) and appendix of LIBLINEAR paper, Fan et al. (2008) + +#undef GETI +#define GETI(i) (i) +// To support weights for instances, use GETI(i) (i) + +static int solve_l1r_lr( + const problem *prob_col, double *w, double eps, + double Cp, double Cn, int max_newton_iter) +{ + int l = prob_col->l; + int w_size = prob_col->n; + int j, s, newton_iter=0, iter=0; + int max_iter = 1000; + int max_num_linesearch = 20; + int active_size; + int QP_active_size; + int QP_no_change = 0; + + double nu = 1e-12; + double inner_eps = 1; + double sigma = 0.01; + double w_norm, w_norm_new; + double z, G, H; + double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration + double Gmax_old = INF; + double Gmax_new, Gnorm1_new; + double QP_Gmax_old = INF; + double QP_Gmax_new, QP_Gnorm1_new; + double delta, negsum_xTd, cond; + + int *index = new int[w_size]; + schar *y = new schar[l]; + double *Hdiag = new double[w_size]; + double *Grad = new double[w_size]; + double *wpd = new double[w_size]; + double *xjneg_sum = new double[w_size]; + double *xTd = new double[l]; + double *exp_wTx = new double[l]; + double *exp_wTx_new = new double[l]; + double *tau = new double[l]; + double *D = new double[l]; + feature_node *x; + + double *C = new double[l]; + + // Initial w can be set here. + for(j=0; jy[j] > 0) + { + y[j] = 1; + C[j] = prob_col->W[j] * Cp; + } + else + { + y[j] = -1; + C[j] = prob_col->W[j] * Cn; + } + + exp_wTx[j] = 0; + } + + w_norm = 0; + for(j=0; jx[j]; + while(x->index != -1) + { + int ind = x->index-1; + double val = x->value; + exp_wTx[ind] += w[j]*val; + if(y[ind] == -1) + xjneg_sum[j] += C[GETI(ind)]*val; + x++; + } + } + for(j=0; jx[j]; + while(x->index != -1) + { + int ind = x->index-1; + Hdiag[j] += x->value*x->value*D[ind]; + tmp += x->value*tau[ind]; + x++; + } + Grad[j] = -tmp + xjneg_sum[j]; + + double Gp = Grad[j]+1; + double Gn = Grad[j]-1; + double violation = 0; + if(w[j] == 0) + { + if(Gp < 0) + violation = -Gp; + else if(Gn > 0) + violation = Gn; + //outer-level shrinking + else if(Gp>Gmax_old/l && Gn<-Gmax_old/l) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(w[j] > 0) + violation = fabs(Gp); + else + violation = fabs(Gn); + + Gmax_new = max(Gmax_new, violation); + Gnorm1_new += violation; + } + + if(newton_iter == 0) + Gnorm1_init = Gnorm1_new; + + // Break outer-loop if the accumulated violation is small. + // Also break if no update in QP inner-loop ten times in a row. 
+ if(Gnorm1_new <= eps*Gnorm1_init || QP_no_change >= 10) + break; + + QP_no_change++; + + iter = 0; + QP_Gmax_old = INF; + QP_active_size = active_size; + + for(int i=0; ix[j]; + G = Grad[j] + (wpd[j]-w[j])*nu; + while(x->index != -1) + { + int ind = x->index-1; + G += x->value*D[ind]*xTd[ind]; + x++; + } + + double Gp = G+1; + double Gn = G-1; + double violation = 0; + if(wpd[j] == 0) + { + if(Gp < 0) + violation = -Gp; + else if(Gn > 0) + violation = Gn; + //inner-level shrinking + else if(Gp>QP_Gmax_old/l && Gn<-QP_Gmax_old/l) + { + QP_active_size--; + swap(index[s], index[QP_active_size]); + s--; + continue; + } + } + else if(wpd[j] > 0) + violation = fabs(Gp); + else + violation = fabs(Gn); + + // obtain solution of one-variable problem + if(Gp < H*wpd[j]) + z = -Gp/H; + else if(Gn > H*wpd[j]) + z = -Gn/H; + else + z = -wpd[j]; + + if(fabs(z) < 1.0e-12) + continue; + z = min(max(z,-10.0),10.0); + + QP_no_change = 0; + QP_Gmax_new = max(QP_Gmax_new, violation); + QP_Gnorm1_new += violation; + + wpd[j] += z; + + x = prob_col->x[j]; + while(x->index != -1) + { + int ind = x->index-1; + xTd[ind] += x->value*z; + x++; + } + } + + iter++; + + if(QP_Gnorm1_new <= inner_eps*Gnorm1_init) + { + //inner stopping + if(QP_active_size == active_size) + break; + //active set reactivation + else + { + QP_active_size = active_size; + QP_Gmax_old = INF; + continue; + } + } + + QP_Gmax_old = QP_Gmax_new; + } + + if(iter >= max_iter) + info("WARNING: reaching max number of inner iterations\n"); + + delta = 0; + w_norm_new = 0; + for(j=0; j= max_num_linesearch) + { + for(int i=0; ix[i]; + while(x->index != -1) + { + exp_wTx[x->index-1] += w[i]*x->value; + x++; + } + } + + for(int i=0; i= max_newton_iter) + info("WARNING: reaching max number of iterations\n"); + + // calculate objective value + + double v = 0; + int nnz = 0; + for(j=0; jl; + int n = prob->n; + size_t nnz = 0; + size_t *col_ptr = new size_t [n+1]; + feature_node *x_space; + prob_col->l = l; + prob_col->n = n; + prob_col->y = new double[l]; + prob_col->x = new feature_node*[n]; + prob_col->W = new double[l]; + + for(i=0; iy[i] = prob->y[i]; + prob_col->W[i] = prob->W[i]; + } + + for(i=0; ix[i]; + while(x->index != -1) + { + nnz++; + col_ptr[x->index]++; + x++; + } + } + for(i=1; ix[i] = &x_space[col_ptr[i]]; + + for(i=0; ix[i]; + while(x->index != -1) + { + int ind = x->index-1; + x_space[col_ptr[ind]].index = i+1; // starts from 1 + x_space[col_ptr[ind]].value = x->value; + col_ptr[ind]++; + x++; + } + } + for(i=0; il; + int max_nr_class = 16; + int nr_class = 0; + int *label = Malloc(int,max_nr_class); + int *count = Malloc(int,max_nr_class); + int *data_label = Malloc(int,l); + int i; + + for(i=0;iy[i]; + int j; + for(j=0;j=0 && label[i] > this_label) + { + label[i+1] = label[i]; + count[i+1] = count[i]; + i--; + } + label[i+1] = this_label; + count[i+1] = this_count; + } + + for (i=0; i y[i]; + while(this_label != label[j]) + { + j++; + } + data_label[i] = j; + + } + + /* END MOD */ + +#if 0 + // + // Labels are ordered by their first occurrence in the training set. + // However, for two-class sets with -1/+1 labels and -1 appears first, + // we swap labels to ensure that internally the binary SVM has positive data corresponding to the +1 instances. 
+ // + if (nr_class == 2 && label[0] == -1 && label[1] == 1) + { + swap(label[0],label[1]); + swap(count[0],count[1]); + for(i=0;ieps; + int max_iter=param->max_iter; + int pos = 0; + int neg = 0; + int n_iter = -1; + for(int i=0;il;i++) + if(prob->y[i] > 0) + pos++; + neg = prob->l - pos; + + double primal_solver_tol = eps*max(min(pos,neg), 1)/prob->l; + + function *fun_obj=NULL; + switch(param->solver_type) + { + case L2R_LR: + { + double *C = new double[prob->l]; + for(int i = 0; i < prob->l; i++) + { + if(prob->y[i] > 0) + C[i] = prob->W[i] * Cp; + else + C[i] = prob->W[i] * Cn; + } + + fun_obj=new l2r_lr_fun(prob, C); + TRON tron_obj(fun_obj, primal_solver_tol, max_iter, blas_functions); + tron_obj.set_print_string(liblinear_print_string); + n_iter=tron_obj.tron(w); + delete fun_obj; + delete[] C; + break; + } + case L2R_L2LOSS_SVC: + { + double *C = new double[prob->l]; + for(int i = 0; i < prob->l; i++) + { + if(prob->y[i] > 0) + C[i] = prob->W[i] * Cp; + else + C[i] = prob->W[i] * Cn; + } + fun_obj=new l2r_l2_svc_fun(prob, C); + TRON tron_obj(fun_obj, primal_solver_tol, max_iter, blas_functions); + tron_obj.set_print_string(liblinear_print_string); + n_iter=tron_obj.tron(w); + delete fun_obj; + delete[] C; + break; + } + case L2R_L2LOSS_SVC_DUAL: + n_iter=solve_l2r_l1l2_svc(prob, w, eps, Cp, Cn, L2R_L2LOSS_SVC_DUAL, max_iter); + break; + case L2R_L1LOSS_SVC_DUAL: + n_iter=solve_l2r_l1l2_svc(prob, w, eps, Cp, Cn, L2R_L1LOSS_SVC_DUAL, max_iter); + break; + case L1R_L2LOSS_SVC: + { + problem prob_col; + feature_node *x_space = NULL; + transpose(prob, &x_space ,&prob_col); + n_iter=solve_l1r_l2_svc(&prob_col, w, primal_solver_tol, Cp, Cn, max_iter); + delete [] prob_col.y; + delete [] prob_col.x; + delete [] prob_col.W; + delete [] x_space; + break; + } + case L1R_LR: + { + problem prob_col; + feature_node *x_space = NULL; + transpose(prob, &x_space ,&prob_col); + n_iter=solve_l1r_lr(&prob_col, w, primal_solver_tol, Cp, Cn, max_iter); + delete [] prob_col.y; + delete [] prob_col.x; + delete [] prob_col.W; + delete [] x_space; + break; + } + case L2R_LR_DUAL: + n_iter=solve_l2r_lr_dual(prob, w, eps, Cp, Cn, max_iter); + break; + case L2R_L2LOSS_SVR: + { + double *C = new double[prob->l]; + for(int i = 0; i < prob->l; i++) + C[i] = prob->W[i] * param->C; + + fun_obj=new l2r_l2_svr_fun(prob, C, param->p); + TRON tron_obj(fun_obj, param->eps, max_iter, blas_functions); + tron_obj.set_print_string(liblinear_print_string); + n_iter=tron_obj.tron(w); + delete fun_obj; + delete[] C; + break; + + } + case L2R_L1LOSS_SVR_DUAL: + n_iter=solve_l2r_l1l2_svr(prob, w, param, L2R_L1LOSS_SVR_DUAL, max_iter); + break; + case L2R_L2LOSS_SVR_DUAL: + n_iter=solve_l2r_l1l2_svr(prob, w, param, L2R_L2LOSS_SVR_DUAL, max_iter); + break; + default: + fprintf(stderr, "ERROR: unknown solver_type\n"); + break; + } + return n_iter; +} + +// +// Remove zero weighed data as libsvm and some liblinear solvers require C > 0. 
+// +static void remove_zero_weight(problem *newprob, const problem *prob) +{ + int i; + int l = 0; + for(i=0;il;i++) + if(prob->W[i] > 0) l++; + *newprob = *prob; + newprob->l = l; + newprob->x = Malloc(feature_node*,l); + newprob->y = Malloc(double,l); + newprob->W = Malloc(double,l); + + int j = 0; + for(i=0;il;i++) + if(prob->W[i] > 0) + { + newprob->x[j] = prob->x[i]; + newprob->y[j] = prob->y[i]; + newprob->W[j] = prob->W[i]; + j++; + } +} + +// +// Interface functions +// +model* train(const problem *prob, const parameter *param, BlasFunctions *blas_functions) +{ + problem newprob; + remove_zero_weight(&newprob, prob); + prob = &newprob; + int i,j; + int l = prob->l; + int n = prob->n; + int w_size = prob->n; + model *model_ = Malloc(model,1); + + if(prob->bias>=0) + model_->nr_feature=n-1; + else + model_->nr_feature=n; + model_->param = *param; + model_->bias = prob->bias; + + if(check_regression_model(model_)) + { + model_->w = Malloc(double, w_size); + model_->n_iter = Malloc(int, 1); + model_->nr_class = 2; + model_->label = NULL; + model_->n_iter[0] =train_one(prob, param, &model_->w[0], 0, 0, blas_functions); + } + else + { + int nr_class; + int *label = NULL; + int *start = NULL; + int *count = NULL; + int *perm = Malloc(int,l); + + // group training data of the same class + group_classes(prob,&nr_class,&label,&start,&count,perm); + + model_->nr_class=nr_class; + model_->label = Malloc(int,nr_class); + for(i=0;ilabel[i] = label[i]; + + // calculate weighted C + double *weighted_C = Malloc(double, nr_class); + for(i=0;iC; + for(i=0;inr_weight;i++) + { + for(j=0;jweight_label[i] == label[j]) + break; + if(j == nr_class) + fprintf(stderr,"WARNING: class label %d specified in weight is not found\n", param->weight_label[i]); + else + weighted_C[j] *= param->weight[i]; + } + + // constructing the subproblem + feature_node **x = Malloc(feature_node *,l); + for(i=0;ix[perm[i]]; + + int k; + problem sub_prob; + sub_prob.l = l; + sub_prob.n = n; + sub_prob.x = Malloc(feature_node *,sub_prob.l); + sub_prob.y = Malloc(double,sub_prob.l); + sub_prob.W = Malloc(double,sub_prob.l); + for(k=0; kW[perm[k]]; + } + + // multi-class svm by Crammer and Singer + if(param->solver_type == MCSVM_CS) + { + model_->w=Malloc(double, n*nr_class); + model_->n_iter=Malloc(int, 1); + for(i=0;ieps); + model_->n_iter[0]=Solver.Solve(model_->w); + } + else + { + if(nr_class == 2) + { + model_->w=Malloc(double, w_size); + model_->n_iter=Malloc(int, 1); + int e0 = start[0]+count[0]; + k=0; + for(; kn_iter[0]=train_one(&sub_prob, param, &model_->w[0], weighted_C[1], weighted_C[0], blas_functions); + } + else + { + model_->w=Malloc(double, w_size*nr_class); + double *w=Malloc(double, w_size); + model_->n_iter=Malloc(int, nr_class); + for(i=0;in_iter[i]=train_one(&sub_prob, param, w, weighted_C[i], param->C, blas_functions); + + for(int j=0;jw[j*nr_class+i] = w[j]; + } + free(w); + } + + } + + free(x); + free(label); + free(start); + free(count); + free(perm); + free(sub_prob.x); + free(sub_prob.y); + free(sub_prob.W); + free(weighted_C); + free(newprob.x); + free(newprob.y); + free(newprob.W); + } + return model_; +} + +#if 0 +void cross_validation(const problem *prob, const parameter *param, int nr_fold, double *target) +{ + int i; + int *fold_start; + int l = prob->l; + int *perm = Malloc(int,l); + if (nr_fold > l) + { + nr_fold = l; + fprintf(stderr,"WARNING: # folds > # data. 
Will use # folds = # data instead (i.e., leave-one-out cross validation)\n"); + } + fold_start = Malloc(int,nr_fold+1); + for(i=0;ibias; + subprob.n = prob->n; + subprob.l = l-(end-begin); + subprob.x = Malloc(struct feature_node*,subprob.l); + subprob.y = Malloc(double,subprob.l); + + k=0; + for(j=0;jx[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + ++k; + } + for(j=end;jx[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + ++k; + } + struct model *submodel = train(&subprob,param); + for(j=begin;jx[perm[j]]); + free_and_destroy_model(&submodel); + free(subprob.x); + free(subprob.y); + } + free(fold_start); + free(perm); +} + +double predict_values(const struct model *model_, const struct feature_node *x, double *dec_values) +{ + int idx; + int n; + if(model_->bias>=0) + n=model_->nr_feature+1; + else + n=model_->nr_feature; + double *w=model_->w; + int nr_class=model_->nr_class; + int i; + int nr_w; + if(nr_class==2 && model_->param.solver_type != MCSVM_CS) + nr_w = 1; + else + nr_w = nr_class; + + const feature_node *lx=x; + for(i=0;iindex)!=-1; lx++) + { + // the dimension of testing data may exceed that of training + if(idx<=n) + for(i=0;ivalue; + } + + if(nr_class==2) + { + if(check_regression_model(model_)) + return dec_values[0]; + else + return (dec_values[0]>0)?model_->label[0]:model_->label[1]; + } + else + { + int dec_max_idx = 0; + for(i=1;i dec_values[dec_max_idx]) + dec_max_idx = i; + } + return model_->label[dec_max_idx]; + } +} + +double predict(const model *model_, const feature_node *x) +{ + double *dec_values = Malloc(double, model_->nr_class); + double label=predict_values(model_, x, dec_values); + free(dec_values); + return label; +} + +double predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates) +{ + if(check_probability_model(model_)) + { + int i; + int nr_class=model_->nr_class; + int nr_w; + if(nr_class==2) + nr_w = 1; + else + nr_w = nr_class; + + double label=predict_values(model_, x, prob_estimates); + for(i=0;inr_feature; + int n; + const parameter& param = model_->param; + + if(model_->bias>=0) + n=nr_feature+1; + else + n=nr_feature; + int w_size = n; + FILE *fp = fopen(model_file_name,"w"); + if(fp==NULL) return -1; + + char *old_locale = strdup(setlocale(LC_ALL, NULL)); + setlocale(LC_ALL, "C"); + + int nr_w; + if(model_->nr_class==2 && model_->param.solver_type != MCSVM_CS) + nr_w=1; + else + nr_w=model_->nr_class; + + fprintf(fp, "solver_type %s\n", solver_type_table[param.solver_type]); + fprintf(fp, "nr_class %d\n", model_->nr_class); + + if(model_->label) + { + fprintf(fp, "label"); + for(i=0; inr_class; i++) + fprintf(fp, " %d", model_->label[i]); + fprintf(fp, "\n"); + } + + fprintf(fp, "nr_feature %d\n", nr_feature); + + fprintf(fp, "bias %.16g\n", model_->bias); + + fprintf(fp, "w\n"); + for(i=0; iw[i*nr_w+j]); + fprintf(fp, "\n"); + } + + setlocale(LC_ALL, old_locale); + free(old_locale); + + if (ferror(fp) != 0 || fclose(fp) != 0) return -1; + else return 0; +} + +struct model *load_model(const char *model_file_name) +{ + FILE *fp = fopen(model_file_name,"r"); + if(fp==NULL) return NULL; + + int i; + int nr_feature; + int n; + int nr_class; + double bias; + model *model_ = Malloc(model,1); + parameter& param = model_->param; + + model_->label = NULL; + + char *old_locale = strdup(setlocale(LC_ALL, NULL)); + setlocale(LC_ALL, "C"); + + char cmd[81]; + while(1) + { + fscanf(fp,"%80s",cmd); + if(strcmp(cmd,"solver_type")==0) + { + fscanf(fp,"%80s",cmd); + int i; + for(i=0;solver_type_table[i];i++) + { + 
if(strcmp(solver_type_table[i],cmd)==0) + { + param.solver_type=i; + break; + } + } + if(solver_type_table[i] == NULL) + { + fprintf(stderr,"unknown solver type.\n"); + + setlocale(LC_ALL, old_locale); + free(model_->label); + free(model_); + free(old_locale); + return NULL; + } + } + else if(strcmp(cmd,"nr_class")==0) + { + fscanf(fp,"%d",&nr_class); + model_->nr_class=nr_class; + } + else if(strcmp(cmd,"nr_feature")==0) + { + fscanf(fp,"%d",&nr_feature); + model_->nr_feature=nr_feature; + } + else if(strcmp(cmd,"bias")==0) + { + fscanf(fp,"%lf",&bias); + model_->bias=bias; + } + else if(strcmp(cmd,"w")==0) + { + break; + } + else if(strcmp(cmd,"label")==0) + { + int nr_class = model_->nr_class; + model_->label = Malloc(int,nr_class); + for(int i=0;ilabel[i]); + } + else + { + fprintf(stderr,"unknown text in model file: [%s]\n",cmd); + setlocale(LC_ALL, old_locale); + free(model_->label); + free(model_); + free(old_locale); + return NULL; + } + } + + nr_feature=model_->nr_feature; + if(model_->bias>=0) + n=nr_feature+1; + else + n=nr_feature; + int w_size = n; + int nr_w; + if(nr_class==2 && param.solver_type != MCSVM_CS) + nr_w = 1; + else + nr_w = nr_class; + + model_->w=Malloc(double, w_size*nr_w); + for(i=0; iw[i*nr_w+j]); + fscanf(fp, "\n"); + } + + setlocale(LC_ALL, old_locale); + free(old_locale); + + if (ferror(fp) != 0 || fclose(fp) != 0) return NULL; + + return model_; +} +#endif + +int get_nr_feature(const model *model_) +{ + return model_->nr_feature; +} + +int get_nr_class(const model *model_) +{ + return model_->nr_class; +} + +void get_labels(const model *model_, int* label) +{ + if (model_->label != NULL) + for(int i=0;inr_class;i++) + label[i] = model_->label[i]; +} + +void get_n_iter(const model *model_, int* n_iter) +{ + int labels; + labels = model_->nr_class; + if (labels == 2) + labels = 1; + + if (model_->n_iter != NULL) + for(int i=0;in_iter[i]; +} + +#if 0 +// use inline here for better performance (around 20% faster than the non-inline one) +static inline double get_w_value(const struct model *model_, int idx, int label_idx) +{ + int nr_class = model_->nr_class; + int solver_type = model_->param.solver_type; + const double *w = model_->w; + + if(idx < 0 || idx > model_->nr_feature) + return 0; + if(check_regression_model(model_)) + return w[idx]; + else + { + if(label_idx < 0 || label_idx >= nr_class) + return 0; + if(nr_class == 2 && solver_type != MCSVM_CS) + { + if(label_idx == 0) + return w[idx]; + else + return -w[idx]; + } + else + return w[idx*nr_class+label_idx]; + } +} + +// feat_idx: starting from 1 to nr_feature +// label_idx: starting from 0 to nr_class-1 for classification models; +// for regression models, label_idx is ignored. 
+double get_decfun_coef(const struct model *model_, int feat_idx, int label_idx) +{ + if(feat_idx > model_->nr_feature) + return 0; + return get_w_value(model_, feat_idx-1, label_idx); +} + +double get_decfun_bias(const struct model *model_, int label_idx) +{ + int bias_idx = model_->nr_feature; + double bias = model_->bias; + if(bias <= 0) + return 0; + else + return bias*get_w_value(model_, bias_idx, label_idx); +} +#endif + +void free_model_content(struct model *model_ptr) +{ + if(model_ptr->w != NULL) + free(model_ptr->w); + if(model_ptr->label != NULL) + free(model_ptr->label); + if(model_ptr->n_iter != NULL) + free(model_ptr->n_iter); +} + +void free_and_destroy_model(struct model **model_ptr_ptr) +{ + struct model *model_ptr = *model_ptr_ptr; + if(model_ptr != NULL) + { + free_model_content(model_ptr); + free(model_ptr); + } +} + +void destroy_param(parameter* param) +{ + if(param->weight_label != NULL) + free(param->weight_label); + if(param->weight != NULL) + free(param->weight); +} + +const char *check_parameter(const problem *prob, const parameter *param) +{ + if(param->eps <= 0) + return "eps <= 0"; + + if(param->C <= 0) + return "C <= 0"; + + if(param->p < 0) + return "p < 0"; + + if(param->solver_type != L2R_LR + && param->solver_type != L2R_L2LOSS_SVC_DUAL + && param->solver_type != L2R_L2LOSS_SVC + && param->solver_type != L2R_L1LOSS_SVC_DUAL + && param->solver_type != MCSVM_CS + && param->solver_type != L1R_L2LOSS_SVC + && param->solver_type != L1R_LR + && param->solver_type != L2R_LR_DUAL + && param->solver_type != L2R_L2LOSS_SVR + && param->solver_type != L2R_L2LOSS_SVR_DUAL + && param->solver_type != L2R_L1LOSS_SVR_DUAL) + return "unknown solver type"; + + return NULL; +} + +#if 0 +int check_probability_model(const struct model *model_) +{ + return (model_->param.solver_type==L2R_LR || + model_->param.solver_type==L2R_LR_DUAL || + model_->param.solver_type==L1R_LR); +} +#endif + +int check_regression_model(const struct model *model_) +{ + return (model_->param.solver_type==L2R_L2LOSS_SVR || + model_->param.solver_type==L2R_L1LOSS_SVR_DUAL || + model_->param.solver_type==L2R_L2LOSS_SVR_DUAL); +} + +void set_print_string_function(void (*print_func)(const char*)) +{ + if (print_func == NULL) + liblinear_print_string = &print_string_stdout; + else + liblinear_print_string = print_func; +} diff --git a/.venv/Lib/site-packages/sklearn/svm/src/liblinear/linear.h b/.venv/Lib/site-packages/sklearn/svm/src/liblinear/linear.h new file mode 100644 index 0000000000000000000000000000000000000000..d85f2cb8941840a11f63bf6237a6f584940f1429 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/src/liblinear/linear.h @@ -0,0 +1,87 @@ +#ifndef _LIBLINEAR_H +#define _LIBLINEAR_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "_cython_blas_helpers.h" + +struct feature_node +{ + int index; + double value; +}; + +struct problem +{ + int l, n; + double *y; + struct feature_node **x; + double bias; /* < 0 if no bias term */ + double *W; +}; + +enum { L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL, L2R_L2LOSS_SVR = 11, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL }; /* solver_type */ + +struct parameter +{ + int solver_type; + + /* these are for training only */ + double eps; /* stopping criteria */ + double C; + int nr_weight; + int *weight_label; + double* weight; + int max_iter; + double p; +}; + +struct model +{ + struct parameter param; + int nr_class; /* number of classes */ + int nr_feature; + double *w; + int *label; 
/* label of each class */ + double bias; + int *n_iter; /* no. of iterations of each class */ +}; + +void set_seed(unsigned seed); + +struct model* train(const struct problem *prob, const struct parameter *param, BlasFunctions *blas_functions); +void cross_validation(const struct problem *prob, const struct parameter *param, int nr_fold, double *target); + +double predict_values(const struct model *model_, const struct feature_node *x, double* dec_values); +double predict(const struct model *model_, const struct feature_node *x); +double predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates); + +int save_model(const char *model_file_name, const struct model *model_); +struct model *load_model(const char *model_file_name); + +int get_nr_feature(const struct model *model_); +int get_nr_class(const struct model *model_); +void get_labels(const struct model *model_, int* label); +void get_n_iter(const struct model *model_, int* n_iter); +#if 0 +double get_decfun_coef(const struct model *model_, int feat_idx, int label_idx); +double get_decfun_bias(const struct model *model_, int label_idx); +#endif + +void free_model_content(struct model *model_ptr); +void free_and_destroy_model(struct model **model_ptr_ptr); +void destroy_param(struct parameter *param); + +const char *check_parameter(const struct problem *prob, const struct parameter *param); +int check_probability_model(const struct model *model); +int check_regression_model(const struct model *model); +void set_print_string_function(void (*print_func) (const char*)); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBLINEAR_H */ + diff --git a/.venv/Lib/site-packages/sklearn/svm/src/liblinear/tron.cpp b/.venv/Lib/site-packages/sklearn/svm/src/liblinear/tron.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0a59cb292c4a4017e002f3784e3a2bee03b39039 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/src/liblinear/tron.cpp @@ -0,0 +1,223 @@ +#include +#include +#include +#include +#include "tron.h" + +#ifndef min +template static inline T min(T x,T y) { return (x static inline T max(T x,T y) { return (x>y)?x:y; } +#endif + +static void default_print(const char *buf) +{ + fputs(buf,stdout); + fflush(stdout); +} + +void TRON::info(const char *fmt,...) +{ + char buf[BUFSIZ]; + va_list ap; + va_start(ap,fmt); + vsprintf(buf,fmt,ap); + va_end(ap); + (*tron_print_string)(buf); +} + +TRON::TRON(const function *fun_obj, double eps, int max_iter, BlasFunctions *blas) +{ + this->fun_obj=const_cast(fun_obj); + this->eps=eps; + this->max_iter=max_iter; + this->blas=blas; + tron_print_string = default_print; +} + +TRON::~TRON() +{ +} + +int TRON::tron(double *w) +{ + // Parameters for updating the iterates. + double eta0 = 1e-4, eta1 = 0.25, eta2 = 0.75; + + // Parameters for updating the trust region size delta. 
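+	// (descriptive note: delta is reduced when the actual reduction actred is
+	//  small relative to the predicted reduction prered -- the eta0/eta1/eta2
+	//  ratio tests further below -- and for a successful step it may grow up
+	//  to min(alpha*snorm, sigma3*delta).)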
+ double sigma1 = 0.25, sigma2 = 0.5, sigma3 = 4; + + int n = fun_obj->get_nr_variable(); + int i, cg_iter; + double delta, snorm; + double alpha, f, fnew, prered, actred, gs; + int search = 1, iter = 1, inc = 1; + double *s = new double[n]; + double *r = new double[n]; + double *w_new = new double[n]; + double *g = new double[n]; + + for (i=0; ifun(w); + fun_obj->grad(w, g); + delta = blas->nrm2(n, g, inc); + double gnorm1 = delta; + double gnorm = gnorm1; + + if (gnorm <= eps*gnorm1) + search = 0; + + iter = 1; + + while (iter <= max_iter && search) + { + cg_iter = trcg(delta, g, s, r); + + memcpy(w_new, w, sizeof(double)*n); + blas->axpy(n, 1.0, s, inc, w_new, inc); + + gs = blas->dot(n, g, inc, s, inc); + prered = -0.5*(gs - blas->dot(n, s, inc, r, inc)); + fnew = fun_obj->fun(w_new); + + // Compute the actual reduction. + actred = f - fnew; + + // On the first iteration, adjust the initial step bound. + snorm = blas->nrm2(n, s, inc); + if (iter == 1) + delta = min(delta, snorm); + + // Compute prediction alpha*snorm of the step. + if (fnew - f - gs <= 0) + alpha = sigma3; + else + alpha = max(sigma1, -0.5*(gs/(fnew - f - gs))); + + // Update the trust region bound according to the ratio of actual to predicted reduction. + if (actred < eta0*prered) + delta = min(max(alpha, sigma1)*snorm, sigma2*delta); + else if (actred < eta1*prered) + delta = max(sigma1*delta, min(alpha*snorm, sigma2*delta)); + else if (actred < eta2*prered) + delta = max(sigma1*delta, min(alpha*snorm, sigma3*delta)); + else + delta = max(delta, min(alpha*snorm, sigma3*delta)); + + info("iter %2d act %5.3e pre %5.3e delta %5.3e f %5.3e |g| %5.3e CG %3d\n", iter, actred, prered, delta, f, gnorm, cg_iter); + + if (actred > eta0*prered) + { + iter++; + memcpy(w, w_new, sizeof(double)*n); + f = fnew; + fun_obj->grad(w, g); + + gnorm = blas->nrm2(n, g, inc); + if (gnorm <= eps*gnorm1) + break; + } + if (f < -1.0e+32) + { + info("WARNING: f < -1.0e+32\n"); + break; + } + if (fabs(actred) <= 0 && prered <= 0) + { + info("WARNING: actred and prered <= 0\n"); + break; + } + if (fabs(actred) <= 1.0e-12*fabs(f) && + fabs(prered) <= 1.0e-12*fabs(f)) + { + info("WARNING: actred and prered too small\n"); + break; + } + } + + delete[] g; + delete[] r; + delete[] w_new; + delete[] s; + return --iter; +} + +int TRON::trcg(double delta, double *g, double *s, double *r) +{ + int i, inc = 1; + int n = fun_obj->get_nr_variable(); + double *d = new double[n]; + double *Hd = new double[n]; + double rTr, rnewTrnew, alpha, beta, cgtol; + + for (i=0; inrm2(n, g, inc); + + int cg_iter = 0; + rTr = blas->dot(n, r, inc, r, inc); + while (1) + { + if (blas->nrm2(n, r, inc) <= cgtol) + break; + cg_iter++; + fun_obj->Hv(d, Hd); + + alpha = rTr / blas->dot(n, d, inc, Hd, inc); + blas->axpy(n, alpha, d, inc, s, inc); + if (blas->nrm2(n, s, inc) > delta) + { + info("cg reaches trust region boundary\n"); + alpha = -alpha; + blas->axpy(n, alpha, d, inc, s, inc); + + double std = blas->dot(n, s, inc, d, inc); + double sts = blas->dot(n, s, inc, s, inc); + double dtd = blas->dot(n, d, inc, d, inc); + double dsq = delta*delta; + double rad = sqrt(std*std + dtd*(dsq-sts)); + if (std >= 0) + alpha = (dsq - sts)/(std + rad); + else + alpha = (rad - std)/dtd; + blas->axpy(n, alpha, d, inc, s, inc); + alpha = -alpha; + blas->axpy(n, alpha, Hd, inc, r, inc); + break; + } + alpha = -alpha; + blas->axpy(n, alpha, Hd, inc, r, inc); + rnewTrnew = blas->dot(n, r, inc, r, inc); + beta = rnewTrnew/rTr; + blas->scal(n, beta, d, inc); + blas->axpy(n, 1.0, r, inc, d, 
inc); + rTr = rnewTrnew; + } + + delete[] d; + delete[] Hd; + + return(cg_iter); +} + +double TRON::norm_inf(int n, double *x) +{ + double dmax = fabs(x[0]); + for (int i=1; i= dmax) + dmax = fabs(x[i]); + return(dmax); +} + +void TRON::set_print_string(void (*print_string) (const char *buf)) +{ + tron_print_string = print_string; +} diff --git a/.venv/Lib/site-packages/sklearn/svm/src/liblinear/tron.h b/.venv/Lib/site-packages/sklearn/svm/src/liblinear/tron.h new file mode 100644 index 0000000000000000000000000000000000000000..91aca703f9830267c5686d302cf1271cc1786453 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/src/liblinear/tron.h @@ -0,0 +1,37 @@ +#ifndef _TRON_H +#define _TRON_H + +#include "_cython_blas_helpers.h" + +class function +{ +public: + virtual double fun(double *w) = 0 ; + virtual void grad(double *w, double *g) = 0 ; + virtual void Hv(double *s, double *Hs) = 0 ; + + virtual int get_nr_variable(void) = 0 ; + virtual ~function(void){} +}; + +class TRON +{ +public: + TRON(const function *fun_obj, double eps = 0.1, int max_iter = 1000, BlasFunctions *blas = 0); + ~TRON(); + + int tron(double *w); + void set_print_string(void (*i_print) (const char *buf)); + +private: + int trcg(double delta, double *g, double *s, double *r); + double norm_inf(int n, double *x); + + double eps; + int max_iter; + function *fun_obj; + BlasFunctions *blas; + void info(const char *fmt,...); + void (*tron_print_string)(const char *buf); +}; +#endif diff --git a/.venv/Lib/site-packages/sklearn/svm/src/libsvm/LIBSVM_CHANGES b/.venv/Lib/site-packages/sklearn/svm/src/libsvm/LIBSVM_CHANGES new file mode 100644 index 0000000000000000000000000000000000000000..082fa1de598876acbcc2ace038cfc4c8cc0ba76d --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/src/libsvm/LIBSVM_CHANGES @@ -0,0 +1,11 @@ +Changes to Libsvm + +This is here mainly as checklist for incorporation of new versions of libsvm. + + * Add copyright to files svm.cpp and svm.h + * Add random_seed support and call to srand in fit function + * Improved random number generator (fix on windows, enhancement on other + platforms). See + * invoke scipy blas api for svm kernel function to improve performance with speedup rate of 1.5X to 2X for dense data only. See + * Expose the number of iterations run in optimization. See +The changes made with respect to upstream are detailed in the heading of svm.cpp diff --git a/.venv/Lib/site-packages/sklearn/svm/src/libsvm/_svm_cython_blas_helpers.h b/.venv/Lib/site-packages/sklearn/svm/src/libsvm/_svm_cython_blas_helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..cd6270e693f7cfe5a18153e5b76ffb25ae650c5f --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/src/libsvm/_svm_cython_blas_helpers.h @@ -0,0 +1,9 @@ +#ifndef _SVM_CYTHON_BLAS_HELPERS_H +#define _SVM_CYTHON_BLAS_HELPERS_H + +typedef double (*dot_func)(int, const double*, int, const double*, int); +typedef struct BlasFunctions{ + dot_func dot; +} BlasFunctions; + +#endif diff --git a/.venv/Lib/site-packages/sklearn/svm/src/libsvm/libsvm_helper.c b/.venv/Lib/site-packages/sklearn/svm/src/libsvm/libsvm_helper.c new file mode 100644 index 0000000000000000000000000000000000000000..b75415c5929da90bd3d34da8bd4036ec07ef6867 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/src/libsvm/libsvm_helper.c @@ -0,0 +1,425 @@ +#include +#define PY_SSIZE_T_CLEAN +#include +#include "svm.h" +#include "_svm_cython_blas_helpers.h" + + +#ifndef MAX + #define MAX(x, y) (((x) > (y)) ? 
(x) : (y)) +#endif + + +/* + * Some helper methods for libsvm bindings. + * + * We need to access from python some parameters stored in svm_model + * but libsvm does not expose this structure, so we define it here + * along some utilities to convert from numpy arrays. + * + * License: BSD 3 clause + * + * Author: 2010 Fabian Pedregosa + */ + + +/* + * Convert matrix to sparse representation suitable for libsvm. x is + * expected to be an array of length nrow*ncol. + * + * Typically the matrix will be dense, so we speed up the routine for + * this case. We create a temporary array temp that collects non-zero + * elements and after we just memcpy that to the proper array. + * + * Special care must be taken with indinces, since libsvm indices start + * at 1 and not at 0. + * + * Strictly speaking, the C standard does not require that structs are + * contiguous, but in practice its a reasonable assumption. + * + */ +struct svm_node *dense_to_libsvm (double *x, Py_ssize_t *dims) +{ + struct svm_node *node; + Py_ssize_t len_row = dims[1]; + double *tx = x; + int i; + + node = malloc (dims[0] * sizeof(struct svm_node)); + + if (node == NULL) return NULL; + for (i=0; isvm_type = svm_type; + param->kernel_type = kernel_type; + param->degree = degree; + param->coef0 = coef0; + param->nu = nu; + param->cache_size = cache_size; + param->C = C; + param->eps = eps; + param->p = p; + param->shrinking = shrinking; + param->probability = probability; + param->nr_weight = nr_weight; + param->weight_label = (int *) weight_label; + param->weight = (double *) weight; + param->gamma = gamma; + param->max_iter = max_iter; + param->random_seed = random_seed; +} + +/* + * Fill an svm_problem struct. problem->x will be malloc'd. + */ +void set_problem(struct svm_problem *problem, char *X, char *Y, char *sample_weight, Py_ssize_t *dims, int kernel_type) +{ + if (problem == NULL) return; + problem->l = (int) dims[0]; /* number of samples */ + problem->y = (double *) Y; + problem->x = dense_to_libsvm((double *) X, dims); /* implicit call to malloc */ + problem->W = (double *) sample_weight; +} + +/* + * Create and return an instance of svm_model. + * + * The copy of model->sv_coef should be straightforward, but + * unfortunately to represent a matrix numpy and libsvm use different + * approaches, so it requires some iteration. + * + * Possible issue: on 64 bits, the number of columns that numpy can + * store is a long, but libsvm enforces this number (model->l) to be + * an int, so we might have numpy matrices that do not fit into libsvm's + * data structure. + * + */ +struct svm_model *set_model(struct svm_parameter *param, int nr_class, + char *SV, Py_ssize_t *SV_dims, + char *support, Py_ssize_t *support_dims, + Py_ssize_t *sv_coef_strides, + char *sv_coef, char *rho, char *nSV, + char *probA, char *probB) +{ + struct svm_model *model; + double *dsv_coef = (double *) sv_coef; + int i, m; + + m = nr_class * (nr_class-1)/2; + + if ((model = malloc(sizeof(struct svm_model))) == NULL) + goto model_error; + if ((model->nSV = malloc(nr_class * sizeof(int))) == NULL) + goto nsv_error; + if ((model->label = malloc(nr_class * sizeof(int))) == NULL) + goto label_error; + if ((model->sv_coef = malloc((nr_class-1)*sizeof(double *))) == NULL) + goto sv_coef_error; + if ((model->rho = malloc( m * sizeof(double))) == NULL) + goto rho_error; + + // This is only allocated in dynamic memory while training. 
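+    // (note: set_model rebuilds a model from arrays already computed on the
+    //  Python side, so n_iter stays NULL here; copy_n_iter further below is
+    //  only meaningful for models returned by svm_train itself.)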
+ model->n_iter = NULL; + + model->nr_class = nr_class; + model->param = *param; + model->l = (int) support_dims[0]; + + if (param->kernel_type == PRECOMPUTED) { + if ((model->SV = malloc ((model->l) * sizeof(struct svm_node))) == NULL) + goto SV_error; + for (i=0; il; ++i) { + model->SV[i].ind = ((int *) support)[i]; + model->SV[i].values = NULL; + } + } else { + model->SV = dense_to_libsvm((double *) SV, SV_dims); + } + /* + * regression and one-class does not use nSV, label. + * TODO: does this provoke memory leaks (we just malloc'ed them)? + */ + if (param->svm_type < 2) { + memcpy(model->nSV, nSV, model->nr_class * sizeof(int)); + for(i=0; i < model->nr_class; i++) + model->label[i] = i; + } + + for (i=0; i < model->nr_class-1; i++) { + model->sv_coef[i] = dsv_coef + i*(model->l); + } + + for (i=0; irho)[i] = -((double *) rho)[i]; + } + + /* + * just to avoid segfaults, these features are not wrapped but + * svm_destroy_model will try to free them. + */ + + if (param->probability) { + if ((model->probA = malloc(m * sizeof(double))) == NULL) + goto probA_error; + memcpy(model->probA, probA, m * sizeof(double)); + if ((model->probB = malloc(m * sizeof(double))) == NULL) + goto probB_error; + memcpy(model->probB, probB, m * sizeof(double)); + } else { + model->probA = NULL; + model->probB = NULL; + } + + /* We'll free SV ourselves */ + model->free_sv = 0; + return model; + +probB_error: + free(model->probA); +probA_error: + free(model->SV); +SV_error: + free(model->rho); +rho_error: + free(model->sv_coef); +sv_coef_error: + free(model->label); +label_error: + free(model->nSV); +nsv_error: + free(model); +model_error: + return NULL; +} + + + +/* + * Get the number of support vectors in a model. + */ +Py_ssize_t get_l(struct svm_model *model) +{ + return (Py_ssize_t) model->l; +} + +/* + * Get the number of classes in a model, = 2 in regression/one class + * svm. + */ +Py_ssize_t get_nr(struct svm_model *model) +{ + return (Py_ssize_t) model->nr_class; +} + +/* + * Get the number of iterations run in optimization + */ +void copy_n_iter(char *data, struct svm_model *model) +{ + const int n_models = MAX(1, model->nr_class * (model->nr_class-1) / 2); + memcpy(data, model->n_iter, n_models * sizeof(int)); +} + +/* + * Some helpers to convert from libsvm sparse data structures + * model->sv_coef is a double **, whereas data is just a double *, + * so we have to do some stupid copying. + */ +void copy_sv_coef(char *data, struct svm_model *model) +{ + int i, len = model->nr_class-1; + double *temp = (double *) data; + for(i=0; isv_coef[i], sizeof(double) * model->l); + temp += model->l; + } +} + +void copy_intercept(char *data, struct svm_model *model, Py_ssize_t *dims) +{ + /* intercept = -rho */ + Py_ssize_t i, n = dims[0]; + double t, *ddata = (double *) data; + for (i=0; irho[i]; + /* we do this to avoid ugly -0.0 */ + *ddata = (t != 0) ? -t : 0; + ++ddata; + } +} + +/* + * This is a bit more complex since SV are stored as sparse + * structures, so we have to do the conversion on the fly and also + * iterate fast over data. 
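+ * (in the dense representation each svm_node carries a dim/values pair, so
+ *  the copy below is a straight memcpy of dim doubles per support vector)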
+ */ +void copy_SV(char *data, struct svm_model *model, Py_ssize_t *dims) +{ + int i, n = model->l; + double *tdata = (double *) data; + int dim = model->SV[0].dim; + for (i=0; iSV[i].values, dim * sizeof(double)); + tdata += dim; + } +} + +void copy_support (char *data, struct svm_model *model) +{ + memcpy (data, model->sv_ind, (model->l) * sizeof(int)); +} + +/* + * copy svm_model.nSV, an array with the number of SV for each class + * will be NULL in the case of SVR, OneClass + */ +void copy_nSV(char *data, struct svm_model *model) +{ + if (model->label == NULL) return; + memcpy(data, model->nSV, model->nr_class * sizeof(int)); +} + +void copy_probA(char *data, struct svm_model *model, Py_ssize_t * dims) +{ + memcpy(data, model->probA, dims[0] * sizeof(double)); +} + +void copy_probB(char *data, struct svm_model *model, Py_ssize_t * dims) +{ + memcpy(data, model->probB, dims[0] * sizeof(double)); +} + +/* + * Predict using model. + * + * It will return -1 if we run out of memory. + */ +int copy_predict(char *predict, struct svm_model *model, Py_ssize_t *predict_dims, + char *dec_values, BlasFunctions *blas_functions) +{ + double *t = (double *) dec_values; + struct svm_node *predict_nodes; + Py_ssize_t i; + + predict_nodes = dense_to_libsvm((double *) predict, predict_dims); + + if (predict_nodes == NULL) + return -1; + for(i=0; inr_class; + predict_nodes = dense_to_libsvm((double *) predict, predict_dims); + if (predict_nodes == NULL) + return -1; + for(i=0; iSV); + + /* We don't free sv_ind and n_iter, since we did not create them in + set_model */ + /* free(model->sv_ind); + * free(model->n_iter); + */ + free(model->sv_coef); + free(model->rho); + free(model->label); + free(model->probA); + free(model->probB); + free(model->nSV); + free(model); + + return 0; +} + +int free_param(struct svm_parameter *param) +{ + if (param == NULL) return -1; + free(param); + return 0; +} + + +/* borrowed from original libsvm code */ +static void print_null(const char *s) {} + +static void print_string_stdout(const char *s) +{ + fputs(s,stdout); + fflush(stdout); +} + +/* provide convenience wrapper */ +void set_verbosity(int verbosity_flag){ + if (verbosity_flag) + svm_set_print_string_function(&print_string_stdout); + else + svm_set_print_string_function(&print_null); +} diff --git a/.venv/Lib/site-packages/sklearn/svm/src/libsvm/libsvm_sparse_helper.c b/.venv/Lib/site-packages/sklearn/svm/src/libsvm/libsvm_sparse_helper.c new file mode 100644 index 0000000000000000000000000000000000000000..2e6bed578866b8f1572f8c74e05b983bceafd5c7 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/src/libsvm/libsvm_sparse_helper.c @@ -0,0 +1,472 @@ +#include +#define PY_SSIZE_T_CLEAN +#include +#include "svm.h" +#include "_svm_cython_blas_helpers.h" + + +#ifndef MAX + #define MAX(x, y) (((x) > (y)) ? 
(x) : (y)) +#endif + + +/* + * Convert scipy.sparse.csr to libsvm's sparse data structure + */ +struct svm_csr_node **csr_to_libsvm (double *values, int* indices, int* indptr, int n_samples) +{ + struct svm_csr_node **sparse, *temp; + int i, j=0, k=0, n; + sparse = malloc (n_samples * sizeof(struct svm_csr_node *)); + + if (sparse == NULL) + return NULL; + + for (i=0; isvm_type = svm_type; + param->kernel_type = kernel_type; + param->degree = degree; + param->coef0 = coef0; + param->nu = nu; + param->cache_size = cache_size; + param->C = C; + param->eps = eps; + param->p = p; + param->shrinking = shrinking; + param->probability = probability; + param->nr_weight = nr_weight; + param->weight_label = (int *) weight_label; + param->weight = (double *) weight; + param->gamma = gamma; + param->max_iter = max_iter; + param->random_seed = random_seed; + return param; +} + + +/* + * Create and return a svm_csr_problem struct from a scipy.sparse.csr matrix. It is + * up to the user to free resulting structure. + * + * TODO: precomputed kernel. + */ +struct svm_csr_problem * csr_set_problem (char *values, Py_ssize_t *n_indices, + char *indices, Py_ssize_t *n_indptr, char *indptr, char *Y, + char *sample_weight, int kernel_type) { + + struct svm_csr_problem *problem; + problem = malloc (sizeof (struct svm_csr_problem)); + if (problem == NULL) return NULL; + problem->l = (int) n_indptr[0] - 1; + problem->y = (double *) Y; + problem->x = csr_to_libsvm((double *) values, (int *) indices, + (int *) indptr, problem->l); + /* should be removed once we implement weighted samples */ + problem->W = (double *) sample_weight; + + if (problem->x == NULL) { + free(problem); + return NULL; + } + return problem; +} + + +struct svm_csr_model *csr_set_model(struct svm_parameter *param, int nr_class, + char *SV_data, Py_ssize_t *SV_indices_dims, + char *SV_indices, Py_ssize_t *SV_indptr_dims, + char *SV_intptr, + char *sv_coef, char *rho, char *nSV, + char *probA, char *probB) +{ + struct svm_csr_model *model; + double *dsv_coef = (double *) sv_coef; + int i, m; + + m = nr_class * (nr_class-1)/2; + + if ((model = malloc(sizeof(struct svm_csr_model))) == NULL) + goto model_error; + if ((model->nSV = malloc(nr_class * sizeof(int))) == NULL) + goto nsv_error; + if ((model->label = malloc(nr_class * sizeof(int))) == NULL) + goto label_error; + if ((model->sv_coef = malloc((nr_class-1)*sizeof(double *))) == NULL) + goto sv_coef_error; + if ((model->rho = malloc( m * sizeof(double))) == NULL) + goto rho_error; + + // This is only allocated in dynamic memory while training. + model->n_iter = NULL; + + /* in the case of precomputed kernels we do not use + dense_to_precomputed because we don't want the leading 0. As + indices start at 1 (not at 0) this will work */ + model->l = (int) SV_indptr_dims[0] - 1; + model->SV = csr_to_libsvm((double *) SV_data, (int *) SV_indices, + (int *) SV_intptr, model->l); + model->nr_class = nr_class; + model->param = *param; + + /* + * regression and one-class does not use nSV, label. + */ + if (param->svm_type < 2) { + memcpy(model->nSV, nSV, model->nr_class * sizeof(int)); + for(i=0; i < model->nr_class; i++) + model->label[i] = i; + } + + for (i=0; i < model->nr_class-1; i++) { + /* + * We cannot squash all this mallocs in a single call since + * svm_destroy_model will free each element of the array. 
+ */ + if ((model->sv_coef[i] = malloc((model->l) * sizeof(double))) == NULL) { + int j; + for (j=0; jsv_coef[j]); + goto sv_coef_i_error; + } + memcpy(model->sv_coef[i], dsv_coef, (model->l) * sizeof(double)); + dsv_coef += model->l; + } + + for (i=0; irho)[i] = -((double *) rho)[i]; + } + + /* + * just to avoid segfaults, these features are not wrapped but + * svm_destroy_model will try to free them. + */ + + if (param->probability) { + if ((model->probA = malloc(m * sizeof(double))) == NULL) + goto probA_error; + memcpy(model->probA, probA, m * sizeof(double)); + if ((model->probB = malloc(m * sizeof(double))) == NULL) + goto probB_error; + memcpy(model->probB, probB, m * sizeof(double)); + } else { + model->probA = NULL; + model->probB = NULL; + } + + /* We'll free SV ourselves */ + model->free_sv = 0; + return model; + +probB_error: + free(model->probA); +probA_error: + for (i=0; i < model->nr_class-1; i++) + free(model->sv_coef[i]); +sv_coef_i_error: + free(model->rho); +rho_error: + free(model->sv_coef); +sv_coef_error: + free(model->label); +label_error: + free(model->nSV); +nsv_error: + free(model); +model_error: + return NULL; +} + + +/* + * Copy support vectors into a scipy.sparse.csr matrix + */ +int csr_copy_SV (char *data, Py_ssize_t *n_indices, + char *indices, Py_ssize_t *n_indptr, char *indptr, + struct svm_csr_model *model, int n_features) +{ + int i, j, k=0, index; + double *dvalues = (double *) data; + int *iindices = (int *) indices; + int *iindptr = (int *) indptr; + iindptr[0] = 0; + for (i=0; il; ++i) { /* iterate over support vectors */ + index = model->SV[i][0].index; + for(j=0; index >=0 ; ++j) { + iindices[k] = index - 1; + dvalues[k] = model->SV[i][j].value; + index = model->SV[i][j+1].index; + ++k; + } + iindptr[i+1] = k; + } + + return 0; +} + +/* get number of nonzero coefficients in support vectors */ +Py_ssize_t get_nonzero_SV (struct svm_csr_model *model) { + int i, j; + Py_ssize_t count=0; + for (i=0; il; ++i) { + j = 0; + while (model->SV[i][j].index != -1) { + ++j; + ++count; + } + } + return count; +} + + +/* + * Predict using a model, where data is expected to be encoded into a csr matrix. 
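+ * (each row is converted to a temporary svm_csr_node array, passed to
+ *  svm_csr_predict, and freed right away; -1 is returned if the CSR
+ *  conversion runs out of memory)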
+ */ +int csr_copy_predict (Py_ssize_t *data_size, char *data, Py_ssize_t *index_size, + char *index, Py_ssize_t *intptr_size, char *intptr, struct svm_csr_model *model, + char *dec_values, BlasFunctions *blas_functions) { + double *t = (double *) dec_values; + struct svm_csr_node **predict_nodes; + Py_ssize_t i; + + predict_nodes = csr_to_libsvm((double *) data, (int *) index, + (int *) intptr, intptr_size[0]-1); + + if (predict_nodes == NULL) + return -1; + for(i=0; i < intptr_size[0] - 1; ++i) { + *t = svm_csr_predict(model, predict_nodes[i], blas_functions); + free(predict_nodes[i]); + ++t; + } + free(predict_nodes); + return 0; +} + +int csr_copy_predict_values (Py_ssize_t *data_size, char *data, Py_ssize_t *index_size, + char *index, Py_ssize_t *intptr_size, char *intptr, struct svm_csr_model *model, + char *dec_values, int nr_class, BlasFunctions *blas_functions) { + struct svm_csr_node **predict_nodes; + Py_ssize_t i; + + predict_nodes = csr_to_libsvm((double *) data, (int *) index, + (int *) intptr, intptr_size[0]-1); + + if (predict_nodes == NULL) + return -1; + for(i=0; i < intptr_size[0] - 1; ++i) { + svm_csr_predict_values(model, predict_nodes[i], + ((double *) dec_values) + i*nr_class, + blas_functions); + free(predict_nodes[i]); + } + free(predict_nodes); + + return 0; +} + +int csr_copy_predict_proba (Py_ssize_t *data_size, char *data, Py_ssize_t *index_size, + char *index, Py_ssize_t *intptr_size, char *intptr, struct svm_csr_model *model, + char *dec_values, BlasFunctions *blas_functions) { + + struct svm_csr_node **predict_nodes; + Py_ssize_t i; + int m = model->nr_class; + + predict_nodes = csr_to_libsvm((double *) data, (int *) index, + (int *) intptr, intptr_size[0]-1); + + if (predict_nodes == NULL) + return -1; + for(i=0; i < intptr_size[0] - 1; ++i) { + svm_csr_predict_probability( + model, predict_nodes[i], ((double *) dec_values) + i*m, blas_functions); + free(predict_nodes[i]); + } + free(predict_nodes); + return 0; +} + + +Py_ssize_t get_nr(struct svm_csr_model *model) +{ + return (Py_ssize_t) model->nr_class; +} + +void copy_intercept(char *data, struct svm_csr_model *model, Py_ssize_t *dims) +{ + /* intercept = -rho */ + Py_ssize_t i, n = dims[0]; + double t, *ddata = (double *) data; + for (i=0; irho[i]; + /* we do this to avoid ugly -0.0 */ + *ddata = (t != 0) ? -t : 0; + ++ddata; + } +} + +void copy_support (char *data, struct svm_csr_model *model) +{ + memcpy (data, model->sv_ind, (model->l) * sizeof(int)); +} + +/* + * Some helpers to convert from libsvm sparse data structures + * model->sv_coef is a double **, whereas data is just a double *, + * so we have to do some stupid copying. + */ +void copy_sv_coef(char *data, struct svm_csr_model *model) +{ + int i, len = model->nr_class-1; + double *temp = (double *) data; + for(i=0; isv_coef[i], sizeof(double) * model->l); + temp += model->l; + } +} + +/* + * Get the number of iterations run in optimization + */ +void copy_n_iter(char *data, struct svm_csr_model *model) +{ + const int n_models = MAX(1, model->nr_class * (model->nr_class-1) / 2); + memcpy(data, model->n_iter, n_models * sizeof(int)); +} + +/* + * Get the number of support vectors in a model. 
+ */ +Py_ssize_t get_l(struct svm_csr_model *model) +{ + return (Py_ssize_t) model->l; +} + +void copy_nSV(char *data, struct svm_csr_model *model) +{ + if (model->label == NULL) return; + memcpy(data, model->nSV, model->nr_class * sizeof(int)); +} + +/* + * same as above with model->label + * TODO: merge in the cython layer + */ +void copy_label(char *data, struct svm_csr_model *model) +{ + if (model->label == NULL) return; + memcpy(data, model->label, model->nr_class * sizeof(int)); +} + +void copy_probA(char *data, struct svm_csr_model *model, Py_ssize_t * dims) +{ + memcpy(data, model->probA, dims[0] * sizeof(double)); +} + +void copy_probB(char *data, struct svm_csr_model *model, Py_ssize_t * dims) +{ + memcpy(data, model->probB, dims[0] * sizeof(double)); +} + + +/* + * Some free routines. Some of them are nontrivial since a lot of + * sharing happens across objects (they *must* be called in the + * correct order) + */ +int free_problem(struct svm_csr_problem *problem) +{ + int i; + if (problem == NULL) return -1; + for (i=0; il; ++i) + free (problem->x[i]); + free (problem->x); + free (problem); + return 0; +} + +int free_model(struct svm_csr_model *model) +{ + /* like svm_free_and_destroy_model, but does not free sv_coef[i] */ + /* We don't free n_iter, since we did not create them in set_model. */ + if (model == NULL) return -1; + free(model->SV); + free(model->sv_coef); + free(model->rho); + free(model->label); + free(model->probA); + free(model->probB); + free(model->nSV); + free(model); + + return 0; +} + +int free_param(struct svm_parameter *param) +{ + if (param == NULL) return -1; + free(param); + return 0; +} + + +int free_model_SV(struct svm_csr_model *model) +{ + int i; + for (i=model->l-1; i>=0; --i) free(model->SV[i]); + /* svn_destroy_model frees model->SV */ + for (i=0; i < model->nr_class-1 ; ++i) free(model->sv_coef[i]); + /* svn_destroy_model frees model->sv_coef */ + return 0; +} + + +/* borrowed from original libsvm code */ +static void print_null(const char *s) {} + +static void print_string_stdout(const char *s) +{ + fputs(s,stdout); + fflush(stdout); +} + +/* provide convenience wrapper */ +void set_verbosity(int verbosity_flag){ + if (verbosity_flag) + svm_set_print_string_function(&print_string_stdout); + else + svm_set_print_string_function(&print_null); +} diff --git a/.venv/Lib/site-packages/sklearn/svm/src/libsvm/libsvm_template.cpp b/.venv/Lib/site-packages/sklearn/svm/src/libsvm/libsvm_template.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b4a485cf2b7703b474b2097c72eb7bd6c56bf61b --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/src/libsvm/libsvm_template.cpp @@ -0,0 +1,8 @@ + +/* this is a hack to generate libsvm with both sparse and dense + methods in the same binary*/ + +#define _DENSE_REP +#include "svm.cpp" +#undef _DENSE_REP +#include "svm.cpp" diff --git a/.venv/Lib/site-packages/sklearn/svm/src/libsvm/svm.cpp b/.venv/Lib/site-packages/sklearn/svm/src/libsvm/svm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5c266e1d9a0a3f3c26983838d7d6089674816b70 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/src/libsvm/svm.cpp @@ -0,0 +1,3187 @@ +/* +Copyright (c) 2000-2009 Chih-Chung Chang and Chih-Jen Lin +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither name of copyright holders nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + Modified 2010: + + - Support for dense data by Ming-Fang Weng + + - Return indices for support vectors, Fabian Pedregosa + + + - Fixes to avoid name collision, Fabian Pedregosa + + - Add support for instance weights, Fabian Pedregosa based on work + by Ming-Wei Chang, Hsuan-Tien Lin, Ming-Hen Tsai, Chia-Hua Ho and + Hsiang-Fu Yu, + . + + - Make labels sorted in svm_group_classes, Fabian Pedregosa. + + Modified 2020: + + - Improved random number generator by using a mersenne twister + tweaked + lemire postprocessor. This fixed a convergence issue on windows targets. + Sylvain Marie, Schneider Electric + see + + Modified 2021: + + - Exposed number of iterations run in optimization, Juan Martín Loyola. + See + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "svm.h" +#include "_svm_cython_blas_helpers.h" +#include "../newrand/newrand.h" + + +#ifndef _LIBSVM_CPP +typedef float Qfloat; +typedef signed char schar; +#ifndef min +template static inline T min(T x,T y) { return (x static inline T max(T x,T y) { return (x>y)?x:y; } +#endif +template static inline void swap(T& x, T& y) { T t=x; x=y; y=t; } +template static inline void clone(T*& dst, S* src, int n) +{ + dst = new T[n]; + memcpy((void *)dst,(void *)src,sizeof(T)*n); +} +static inline double powi(double base, int times) +{ + double tmp = base, ret = 1.0; + + for(int t=times; t>0; t/=2) + { + if(t%2==1) ret*=tmp; + tmp = tmp * tmp; + } + return ret; +} +#define INF HUGE_VAL +#define TAU 1e-12 +#define Malloc(type,n) (type *)malloc((n)*sizeof(type)) + +static void print_string_stdout(const char *s) +{ + fputs(s,stdout); + fflush(stdout); +} +static void (*svm_print_string) (const char *) = &print_string_stdout; + +static void info(const char *fmt,...) +{ + char buf[BUFSIZ]; + va_list ap; + va_start(ap,fmt); + vsprintf(buf,fmt,ap); + va_end(ap); + (*svm_print_string)(buf); +} +#endif +#define _LIBSVM_CPP + + +/* yeah, this is ugly. 
It helps us to have unique names for both sparse +and dense versions of this library */ +#ifdef _DENSE_REP + #ifdef PREFIX + #undef PREFIX + #endif + #ifdef NAMESPACE + #undef NAMESPACE + #endif + #define PREFIX(name) svm_##name + #define NAMESPACE svm + namespace svm { +#else + /* sparse representation */ + #ifdef PREFIX + #undef PREFIX + #endif + #ifdef NAMESPACE + #undef NAMESPACE + #endif + #define PREFIX(name) svm_csr_##name + #define NAMESPACE svm_csr + namespace svm_csr { +#endif + + +// +// Kernel Cache +// +// l is the number of total data items +// size is the cache size limit in bytes +// +class Cache +{ +public: + Cache(int l,long int size); + ~Cache(); + + // request data [0,len) + // return some position p where [p,len) need to be filled + // (p >= len if nothing needs to be filled) + int get_data(const int index, Qfloat **data, int len); + void swap_index(int i, int j); +private: + int l; + long int size; + struct head_t + { + head_t *prev, *next; // a circular list + Qfloat *data; + int len; // data[0,len) is cached in this entry + }; + + head_t *head; + head_t lru_head; + void lru_delete(head_t *h); + void lru_insert(head_t *h); +}; + +Cache::Cache(int l_,long int size_):l(l_),size(size_) +{ + head = (head_t *)calloc(l,sizeof(head_t)); // initialized to 0 + size /= sizeof(Qfloat); + size -= l * sizeof(head_t) / sizeof(Qfloat); + size = max(size, 2 * (long int) l); // cache must be large enough for two columns + lru_head.next = lru_head.prev = &lru_head; +} + +Cache::~Cache() +{ + for(head_t *h = lru_head.next; h != &lru_head; h=h->next) + free(h->data); + free(head); +} + +void Cache::lru_delete(head_t *h) +{ + // delete from current location + h->prev->next = h->next; + h->next->prev = h->prev; +} + +void Cache::lru_insert(head_t *h) +{ + // insert to last position + h->next = &lru_head; + h->prev = lru_head.prev; + h->prev->next = h; + h->next->prev = h; +} + +int Cache::get_data(const int index, Qfloat **data, int len) +{ + head_t *h = &head[index]; + if(h->len) lru_delete(h); + int more = len - h->len; + + if(more > 0) + { + // free old space + while(size < more) + { + head_t *old = lru_head.next; + lru_delete(old); + free(old->data); + size += old->len; + old->data = 0; + old->len = 0; + } + + // allocate new space + h->data = (Qfloat *)realloc(h->data,sizeof(Qfloat)*len); + size -= more; + swap(h->len,len); + } + + lru_insert(h); + *data = h->data; + return len; +} + +void Cache::swap_index(int i, int j) +{ + if(i==j) return; + + if(head[i].len) lru_delete(&head[i]); + if(head[j].len) lru_delete(&head[j]); + swap(head[i].data,head[j].data); + swap(head[i].len,head[j].len); + if(head[i].len) lru_insert(&head[i]); + if(head[j].len) lru_insert(&head[j]); + + if(i>j) swap(i,j); + for(head_t *h = lru_head.next; h!=&lru_head; h=h->next) + { + if(h->len > i) + { + if(h->len > j) + swap(h->data[i],h->data[j]); + else + { + // give up + lru_delete(h); + free(h->data); + size += h->len; + h->data = 0; + h->len = 0; + } + } + } +} + +// +// Kernel evaluation +// +// the static method k_function is for doing single kernel evaluation +// the constructor of Kernel prepares to calculate the l*l kernel matrix +// the member function get_Q is for getting one column from the Q Matrix +// +class QMatrix { +public: + virtual Qfloat *get_Q(int column, int len) const = 0; + virtual double *get_QD() const = 0; + virtual void swap_index(int i, int j) const = 0; + virtual ~QMatrix() {} +}; + +class Kernel: public QMatrix { +public: +#ifdef _DENSE_REP + Kernel(int l, PREFIX(node) * x, const 
svm_parameter& param, BlasFunctions *blas_functions); +#else + Kernel(int l, PREFIX(node) * const * x, const svm_parameter& param, BlasFunctions *blas_functions); +#endif + virtual ~Kernel(); + + static double k_function(const PREFIX(node) *x, const PREFIX(node) *y, + const svm_parameter& param, BlasFunctions *blas_functions); + virtual Qfloat *get_Q(int column, int len) const = 0; + virtual double *get_QD() const = 0; + virtual void swap_index(int i, int j) const // no so const... + { + swap(x[i],x[j]); + if(x_square) swap(x_square[i],x_square[j]); + } +protected: + + double (Kernel::*kernel_function)(int i, int j) const; + +private: +#ifdef _DENSE_REP + PREFIX(node) *x; +#else + const PREFIX(node) **x; +#endif + double *x_square; + // scipy blas pointer + BlasFunctions *m_blas; + + // svm_parameter + const int kernel_type; + const int degree; + const double gamma; + const double coef0; + + static double dot(const PREFIX(node) *px, const PREFIX(node) *py, BlasFunctions *blas_functions); +#ifdef _DENSE_REP + static double dot(const PREFIX(node) &px, const PREFIX(node) &py, BlasFunctions *blas_functions); +#endif + + double kernel_linear(int i, int j) const + { + return dot(x[i],x[j],m_blas); + } + double kernel_poly(int i, int j) const + { + return powi(gamma*dot(x[i],x[j],m_blas)+coef0,degree); + } + double kernel_rbf(int i, int j) const + { + return exp(-gamma*(x_square[i]+x_square[j]-2*dot(x[i],x[j],m_blas))); + } + double kernel_sigmoid(int i, int j) const + { + return tanh(gamma*dot(x[i],x[j],m_blas)+coef0); + } + double kernel_precomputed(int i, int j) const + { +#ifdef _DENSE_REP + return (x+i)->values[x[j].ind]; +#else + return x[i][(int)(x[j][0].value)].value; +#endif + } +}; + +#ifdef _DENSE_REP +Kernel::Kernel(int l, PREFIX(node) * x_, const svm_parameter& param, BlasFunctions *blas_functions) +#else +Kernel::Kernel(int l, PREFIX(node) * const * x_, const svm_parameter& param, BlasFunctions *blas_functions) +#endif +:kernel_type(param.kernel_type), degree(param.degree), + gamma(param.gamma), coef0(param.coef0) +{ + m_blas = blas_functions; + switch(kernel_type) + { + case LINEAR: + kernel_function = &Kernel::kernel_linear; + break; + case POLY: + kernel_function = &Kernel::kernel_poly; + break; + case RBF: + kernel_function = &Kernel::kernel_rbf; + break; + case SIGMOID: + kernel_function = &Kernel::kernel_sigmoid; + break; + case PRECOMPUTED: + kernel_function = &Kernel::kernel_precomputed; + break; + } + + clone(x,x_,l); + + if(kernel_type == RBF) + { + x_square = new double[l]; + for(int i=0;idim, py->dim); + sum = blas_functions->dot(dim, px->values, 1, py->values, 1); + return sum; +} + +double Kernel::dot(const PREFIX(node) &px, const PREFIX(node) &py, BlasFunctions *blas_functions) +{ + double sum = 0; + + int dim = min(px.dim, py.dim); + sum = blas_functions->dot(dim, px.values, 1, py.values, 1); + return sum; +} +#else +double Kernel::dot(const PREFIX(node) *px, const PREFIX(node) *py, BlasFunctions *blas_functions) +{ + double sum = 0; + while(px->index != -1 && py->index != -1) + { + if(px->index == py->index) + { + sum += px->value * py->value; + ++px; + ++py; + } + else + { + if(px->index > py->index) + ++py; + else + ++px; + } + } + return sum; +} +#endif + +double Kernel::k_function(const PREFIX(node) *x, const PREFIX(node) *y, + const svm_parameter& param, BlasFunctions *blas_functions) +{ + switch(param.kernel_type) + { + case LINEAR: + return dot(x,y,blas_functions); + case POLY: + return powi(param.gamma*dot(x,y,blas_functions)+param.coef0,param.degree); + 
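+	// (descriptive note: the RBF case below accumulates ||x-y||^2; in the
+	//  dense build the overlapping dimensions use a BLAS dot product on the
+	//  difference vector and the leftover tail of the longer vector is added
+	//  term by term, while the sparse build walks both index lists.)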
case RBF: + { + double sum = 0; +#ifdef _DENSE_REP + int dim = min(x->dim, y->dim), i; + double* m_array = (double*)malloc(sizeof(double)*dim); + for (i = 0; i < dim; i++) + { + m_array[i] = x->values[i] - y->values[i]; + } + sum = blas_functions->dot(dim, m_array, 1, m_array, 1); + free(m_array); + for (; i < x->dim; i++) + sum += x->values[i] * x->values[i]; + for (; i < y->dim; i++) + sum += y->values[i] * y->values[i]; +#else + while(x->index != -1 && y->index !=-1) + { + if(x->index == y->index) + { + double d = x->value - y->value; + sum += d*d; + ++x; + ++y; + } + else + { + if(x->index > y->index) + { + sum += y->value * y->value; + ++y; + } + else + { + sum += x->value * x->value; + ++x; + } + } + } + + while(x->index != -1) + { + sum += x->value * x->value; + ++x; + } + + while(y->index != -1) + { + sum += y->value * y->value; + ++y; + } +#endif + return exp(-param.gamma*sum); + } + case SIGMOID: + return tanh(param.gamma*dot(x,y,blas_functions)+param.coef0); + case PRECOMPUTED: //x: test (validation), y: SV + { +#ifdef _DENSE_REP + return x->values[y->ind]; +#else + return x[(int)(y->value)].value; +#endif + } + default: + return 0; // Unreachable + } +} +// An SMO algorithm in Fan et al., JMLR 6(2005), p. 1889--1918 +// Solves: +// +// min 0.5(\alpha^T Q \alpha) + p^T \alpha +// +// y^T \alpha = \delta +// y_i = +1 or -1 +// 0 <= alpha_i <= Cp for y_i = 1 +// 0 <= alpha_i <= Cn for y_i = -1 +// +// Given: +// +// Q, p, y, Cp, Cn, and an initial feasible point \alpha +// l is the size of vectors and matrices +// eps is the stopping tolerance +// +// solution will be put in \alpha, objective value will be put in obj +// + +class Solver { +public: + Solver() {}; + virtual ~Solver() {}; + + struct SolutionInfo { + double obj; + double rho; + double *upper_bound; + double r; // for Solver_NU + bool solve_timed_out; + int n_iter; + }; + + void Solve(int l, const QMatrix& Q, const double *p_, const schar *y_, + double *alpha_, const double *C_, double eps, + SolutionInfo* si, int shrinking, int max_iter); +protected: + int active_size; + schar *y; + double *G; // gradient of objective function + enum { LOWER_BOUND, UPPER_BOUND, FREE }; + char *alpha_status; // LOWER_BOUND, UPPER_BOUND, FREE + double *alpha; + const QMatrix *Q; + const double *QD; + double eps; + double Cp,Cn; + double *C; + double *p; + int *active_set; + double *G_bar; // gradient, if we treat free variables as 0 + int l; + bool unshrink; // XXX + + double get_C(int i) + { + return C[i]; + } + void update_alpha_status(int i) + { + if(alpha[i] >= get_C(i)) + alpha_status[i] = UPPER_BOUND; + else if(alpha[i] <= 0) + alpha_status[i] = LOWER_BOUND; + else alpha_status[i] = FREE; + } + bool is_upper_bound(int i) { return alpha_status[i] == UPPER_BOUND; } + bool is_lower_bound(int i) { return alpha_status[i] == LOWER_BOUND; } + bool is_free(int i) { return alpha_status[i] == FREE; } + void swap_index(int i, int j); + void reconstruct_gradient(); + virtual int select_working_set(int &i, int &j); + virtual double calculate_rho(); + virtual void do_shrinking(); +private: + bool be_shrunk(int i, double Gmax1, double Gmax2); +}; + +void Solver::swap_index(int i, int j) +{ + Q->swap_index(i,j); + swap(y[i],y[j]); + swap(G[i],G[j]); + swap(alpha_status[i],alpha_status[j]); + swap(alpha[i],alpha[j]); + swap(p[i],p[j]); + swap(active_set[i],active_set[j]); + swap(G_bar[i],G_bar[j]); + swap(C[i], C[j]); +} + +void Solver::reconstruct_gradient() +{ + // reconstruct inactive elements of G from G_bar and free variables + + 
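+	// (descriptive note: for an inactive variable j the full gradient is
+	//  recovered as G[j] = G_bar[j] + p[j] + sum over free i of alpha[i]*Q_ij,
+	//  which is what the two loops below compute; the branch only picks
+	//  whichever iteration order touches fewer kernel entries.)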
if(active_size == l) return; + + int i,j; + int nr_free = 0; + + for(j=active_size;j 2*active_size*(l-active_size)) + { + for(i=active_size;iget_Q(i,active_size); + for(j=0;jget_Q(i,l); + double alpha_i = alpha[i]; + for(j=active_size;jl = l; + this->Q = &Q; + QD=Q.get_QD(); + clone(p, p_,l); + clone(y, y_,l); + clone(alpha,alpha_,l); + clone(C, C_, l); + this->eps = eps; + unshrink = false; + si->solve_timed_out = false; + + // initialize alpha_status + { + alpha_status = new char[l]; + for(int i=0;i= max_iter)) { + info("WARN: libsvm Solver reached max_iter"); + si->solve_timed_out = true; + break; + } + + // show progress and do shrinking + + if(--counter == 0) + { + counter = min(l,1000); + if(shrinking) do_shrinking(); + info("."); + } + + int i,j; + if(select_working_set(i,j)!=0) + { + // reconstruct the whole gradient + reconstruct_gradient(); + // reset active set size and check + active_size = l; + info("*"); + if(select_working_set(i,j)!=0) + break; + else + counter = 1; // do shrinking next iteration + } + + ++iter; + + // update alpha[i] and alpha[j], handle bounds carefully + + const Qfloat *Q_i = Q.get_Q(i,active_size); + const Qfloat *Q_j = Q.get_Q(j,active_size); + + double C_i = get_C(i); + double C_j = get_C(j); + + double old_alpha_i = alpha[i]; + double old_alpha_j = alpha[j]; + + if(y[i]!=y[j]) + { + double quad_coef = QD[i]+QD[j]+2*Q_i[j]; + if (quad_coef <= 0) + quad_coef = TAU; + double delta = (-G[i]-G[j])/quad_coef; + double diff = alpha[i] - alpha[j]; + alpha[i] += delta; + alpha[j] += delta; + + if(diff > 0) + { + if(alpha[j] < 0) + { + alpha[j] = 0; + alpha[i] = diff; + } + } + else + { + if(alpha[i] < 0) + { + alpha[i] = 0; + alpha[j] = -diff; + } + } + if(diff > C_i - C_j) + { + if(alpha[i] > C_i) + { + alpha[i] = C_i; + alpha[j] = C_i - diff; + } + } + else + { + if(alpha[j] > C_j) + { + alpha[j] = C_j; + alpha[i] = C_j + diff; + } + } + } + else + { + double quad_coef = QD[i]+QD[j]-2*Q_i[j]; + if (quad_coef <= 0) + quad_coef = TAU; + double delta = (G[i]-G[j])/quad_coef; + double sum = alpha[i] + alpha[j]; + alpha[i] -= delta; + alpha[j] += delta; + + if(sum > C_i) + { + if(alpha[i] > C_i) + { + alpha[i] = C_i; + alpha[j] = sum - C_i; + } + } + else + { + if(alpha[j] < 0) + { + alpha[j] = 0; + alpha[i] = sum; + } + } + if(sum > C_j) + { + if(alpha[j] > C_j) + { + alpha[j] = C_j; + alpha[i] = sum - C_j; + } + } + else + { + if(alpha[i] < 0) + { + alpha[i] = 0; + alpha[j] = sum; + } + } + } + + // update G + + double delta_alpha_i = alpha[i] - old_alpha_i; + double delta_alpha_j = alpha[j] - old_alpha_j; + + for(int k=0;krho = calculate_rho(); + + // calculate objective value + { + double v = 0; + int i; + for(i=0;iobj = v/2; + } + + // put back the solution + { + for(int i=0;iupper_bound[i] = C[i]; + + // store number of iterations + si->n_iter = iter; + + info("\noptimization finished, #iter = %d\n",iter); + + delete[] p; + delete[] y; + delete[] alpha; + delete[] alpha_status; + delete[] active_set; + delete[] G; + delete[] G_bar; + delete[] C; +} + +// return 1 if already optimal, return 0 otherwise +int Solver::select_working_set(int &out_i, int &out_j) +{ + // return i,j such that + // i: maximizes -y_i * grad(f)_i, i in I_up(\alpha) + // j: minimizes the decrease of obj value + // (if quadratic coefficient <= 0, replace it with tau) + // -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha) + + double Gmax = -INF; + double Gmax2 = -INF; + int Gmax_idx = -1; + int Gmin_idx = -1; + double obj_diff_min = INF; + + for(int t=0;t= Gmax) + { + Gmax = -G[t]; + 
Gmax_idx = t; + } + } + else + { + if(!is_lower_bound(t)) + if(G[t] >= Gmax) + { + Gmax = G[t]; + Gmax_idx = t; + } + } + + int i = Gmax_idx; + const Qfloat *Q_i = NULL; + if(i != -1) // NULL Q_i not accessed: Gmax=-INF if i=-1 + Q_i = Q->get_Q(i,active_size); + + for(int j=0;j= Gmax2) + Gmax2 = G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[i]+QD[j]-2.0*y[i]*Q_i[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/TAU; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + else + { + if (!is_upper_bound(j)) + { + double grad_diff= Gmax-G[j]; + if (-G[j] >= Gmax2) + Gmax2 = -G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[i]+QD[j]+2.0*y[i]*Q_i[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/TAU; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + } + + if(Gmax+Gmax2 < eps || Gmin_idx == -1) + return 1; + + out_i = Gmax_idx; + out_j = Gmin_idx; + return 0; +} + +bool Solver::be_shrunk(int i, double Gmax1, double Gmax2) +{ + if(is_upper_bound(i)) + { + if(y[i]==+1) + return(-G[i] > Gmax1); + else + return(-G[i] > Gmax2); + } + else if(is_lower_bound(i)) + { + if(y[i]==+1) + return(G[i] > Gmax2); + else + return(G[i] > Gmax1); + } + else + return(false); +} + +void Solver::do_shrinking() +{ + int i; + double Gmax1 = -INF; // max { -y_i * grad(f)_i | i in I_up(\alpha) } + double Gmax2 = -INF; // max { y_i * grad(f)_i | i in I_low(\alpha) } + + // find maximal violating pair first + for(i=0;i= Gmax1) + Gmax1 = -G[i]; + } + if(!is_lower_bound(i)) + { + if(G[i] >= Gmax2) + Gmax2 = G[i]; + } + } + else + { + if(!is_upper_bound(i)) + { + if(-G[i] >= Gmax2) + Gmax2 = -G[i]; + } + if(!is_lower_bound(i)) + { + if(G[i] >= Gmax1) + Gmax1 = G[i]; + } + } + } + + if(unshrink == false && Gmax1 + Gmax2 <= eps*10) + { + unshrink = true; + reconstruct_gradient(); + active_size = l; + info("*"); + } + + for(i=0;i i) + { + if (!be_shrunk(active_size, Gmax1, Gmax2)) + { + swap_index(i,active_size); + break; + } + active_size--; + } + } +} + +double Solver::calculate_rho() +{ + double r; + int nr_free = 0; + double ub = INF, lb = -INF, sum_free = 0; + for(int i=0;i0) + r = sum_free/nr_free; + else + r = (ub+lb)/2; + + return r; +} + +// +// Solver for nu-svm classification and regression +// +// additional constraint: e^T \alpha = constant +// +class Solver_NU : public Solver +{ +public: + Solver_NU() {} + void Solve(int l, const QMatrix& Q, const double *p, const schar *y, + double *alpha, const double *C_, double eps, + SolutionInfo* si, int shrinking, int max_iter) + { + this->si = si; + Solver::Solve(l,Q,p,y,alpha,C_,eps,si,shrinking,max_iter); + } +private: + SolutionInfo *si; + int select_working_set(int &i, int &j); + double calculate_rho(); + bool be_shrunk(int i, double Gmax1, double Gmax2, double Gmax3, double Gmax4); + void do_shrinking(); +}; + +// return 1 if already optimal, return 0 otherwise +int Solver_NU::select_working_set(int &out_i, int &out_j) +{ + // return i,j such that y_i = y_j and + // i: maximizes -y_i * grad(f)_i, i in I_up(\alpha) + // j: minimizes the decrease of obj value + // (if quadratic coefficient <= 0, replace it with tau) + // -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha) + + double Gmaxp = -INF; + double Gmaxp2 = -INF; + int Gmaxp_idx = -1; + + double Gmaxn = -INF; + double Gmaxn2 = -INF; + int Gmaxn_idx 
= -1; + + int Gmin_idx = -1; + double obj_diff_min = INF; + + for(int t=0;t= Gmaxp) + { + Gmaxp = -G[t]; + Gmaxp_idx = t; + } + } + else + { + if(!is_lower_bound(t)) + if(G[t] >= Gmaxn) + { + Gmaxn = G[t]; + Gmaxn_idx = t; + } + } + + int ip = Gmaxp_idx; + int in = Gmaxn_idx; + const Qfloat *Q_ip = NULL; + const Qfloat *Q_in = NULL; + if(ip != -1) // NULL Q_ip not accessed: Gmaxp=-INF if ip=-1 + Q_ip = Q->get_Q(ip,active_size); + if(in != -1) + Q_in = Q->get_Q(in,active_size); + + for(int j=0;j= Gmaxp2) + Gmaxp2 = G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[ip]+QD[j]-2*Q_ip[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/TAU; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + else + { + if (!is_upper_bound(j)) + { + double grad_diff=Gmaxn-G[j]; + if (-G[j] >= Gmaxn2) + Gmaxn2 = -G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[in]+QD[j]-2*Q_in[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/TAU; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + } + + if(max(Gmaxp+Gmaxp2,Gmaxn+Gmaxn2) < eps || Gmin_idx == -1) + return 1; + + if (y[Gmin_idx] == +1) + out_i = Gmaxp_idx; + else + out_i = Gmaxn_idx; + out_j = Gmin_idx; + + return 0; +} + +bool Solver_NU::be_shrunk(int i, double Gmax1, double Gmax2, double Gmax3, double Gmax4) +{ + if(is_upper_bound(i)) + { + if(y[i]==+1) + return(-G[i] > Gmax1); + else + return(-G[i] > Gmax4); + } + else if(is_lower_bound(i)) + { + if(y[i]==+1) + return(G[i] > Gmax2); + else + return(G[i] > Gmax3); + } + else + return(false); +} + +void Solver_NU::do_shrinking() +{ + double Gmax1 = -INF; // max { -y_i * grad(f)_i | y_i = +1, i in I_up(\alpha) } + double Gmax2 = -INF; // max { y_i * grad(f)_i | y_i = +1, i in I_low(\alpha) } + double Gmax3 = -INF; // max { -y_i * grad(f)_i | y_i = -1, i in I_up(\alpha) } + double Gmax4 = -INF; // max { y_i * grad(f)_i | y_i = -1, i in I_low(\alpha) } + + // find maximal violating pair first + int i; + for(i=0;i Gmax1) Gmax1 = -G[i]; + } + else if(-G[i] > Gmax4) Gmax4 = -G[i]; + } + if(!is_lower_bound(i)) + { + if(y[i]==+1) + { + if(G[i] > Gmax2) Gmax2 = G[i]; + } + else if(G[i] > Gmax3) Gmax3 = G[i]; + } + } + + if(unshrink == false && max(Gmax1+Gmax2,Gmax3+Gmax4) <= eps*10) + { + unshrink = true; + reconstruct_gradient(); + active_size = l; + } + + for(i=0;i i) + { + if (!be_shrunk(active_size, Gmax1, Gmax2, Gmax3, Gmax4)) + { + swap_index(i,active_size); + break; + } + active_size--; + } + } +} + +double Solver_NU::calculate_rho() +{ + int nr_free1 = 0,nr_free2 = 0; + double ub1 = INF, ub2 = INF; + double lb1 = -INF, lb2 = -INF; + double sum_free1 = 0, sum_free2 = 0; + + for(int i=0;i 0) + r1 = sum_free1/nr_free1; + else + r1 = (ub1+lb1)/2; + + if(nr_free2 > 0) + r2 = sum_free2/nr_free2; + else + r2 = (ub2+lb2)/2; + + si->r = (r1+r2)/2; + return (r1-r2)/2; +} + +// +// Q matrices for various formulations +// +class SVC_Q: public Kernel +{ +public: + SVC_Q(const PREFIX(problem)& prob, const svm_parameter& param, const schar *y_, BlasFunctions *blas_functions) + :Kernel(prob.l, prob.x, param, blas_functions) + { + clone(y,y_,prob.l); + cache = new Cache(prob.l,(long int)(param.cache_size*(1<<20))); + QD = new double[prob.l]; + for(int i=0;i*kernel_function)(i,i); + } + + Qfloat *get_Q(int i, int len) const + { + Qfloat *data; + int start, j; + 
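+		// (descriptive note: Cache::get_data returns the first row index that
+		//  still has to be filled, so only entries [start,len) are recomputed
+		//  through kernel_function; earlier entries come from the LRU cache.)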
if((start = cache->get_data(i,&data,len)) < len) + { + for(j=start;j*kernel_function)(i,j)); + } + return data; + } + + double *get_QD() const + { + return QD; + } + + void swap_index(int i, int j) const + { + cache->swap_index(i,j); + Kernel::swap_index(i,j); + swap(y[i],y[j]); + swap(QD[i],QD[j]); + } + + ~SVC_Q() + { + delete[] y; + delete cache; + delete[] QD; + } +private: + schar *y; + Cache *cache; + double *QD; +}; + +class ONE_CLASS_Q: public Kernel +{ +public: + ONE_CLASS_Q(const PREFIX(problem)& prob, const svm_parameter& param, BlasFunctions *blas_functions) + :Kernel(prob.l, prob.x, param, blas_functions) + { + cache = new Cache(prob.l,(long int)(param.cache_size*(1<<20))); + QD = new double[prob.l]; + for(int i=0;i*kernel_function)(i,i); + } + + Qfloat *get_Q(int i, int len) const + { + Qfloat *data; + int start, j; + if((start = cache->get_data(i,&data,len)) < len) + { + for(j=start;j*kernel_function)(i,j); + } + return data; + } + + double *get_QD() const + { + return QD; + } + + void swap_index(int i, int j) const + { + cache->swap_index(i,j); + Kernel::swap_index(i,j); + swap(QD[i],QD[j]); + } + + ~ONE_CLASS_Q() + { + delete cache; + delete[] QD; + } +private: + Cache *cache; + double *QD; +}; + +class SVR_Q: public Kernel +{ +public: + SVR_Q(const PREFIX(problem)& prob, const svm_parameter& param, BlasFunctions *blas_functions) + :Kernel(prob.l, prob.x, param, blas_functions) + { + l = prob.l; + cache = new Cache(l,(long int)(param.cache_size*(1<<20))); + QD = new double[2*l]; + sign = new schar[2*l]; + index = new int[2*l]; + for(int k=0;k*kernel_function)(k,k); + QD[k+l] = QD[k]; + } + buffer[0] = new Qfloat[2*l]; + buffer[1] = new Qfloat[2*l]; + next_buffer = 0; + } + + void swap_index(int i, int j) const + { + swap(sign[i],sign[j]); + swap(index[i],index[j]); + swap(QD[i],QD[j]); + } + + Qfloat *get_Q(int i, int len) const + { + Qfloat *data; + int j, real_i = index[i]; + if(cache->get_data(real_i,&data,l) < l) + { + for(j=0;j*kernel_function)(real_i,j); + } + + // reorder and copy + Qfloat *buf = buffer[next_buffer]; + next_buffer = 1 - next_buffer; + schar si = sign[i]; + for(j=0;jl; + double *minus_ones = new double[l]; + schar *y = new schar[l]; + double *C = new double[l]; + + int i; + + for(i=0;iy[i] > 0) + { + y[i] = +1; + C[i] = prob->W[i]*Cp; + } + else + { + y[i] = -1; + C[i] = prob->W[i]*Cn; + } + } + + Solver s; + s.Solve(l, SVC_Q(*prob,*param,y, blas_functions), minus_ones, y, + alpha, C, param->eps, si, param->shrinking, + param->max_iter); + + /* + double sum_alpha=0; + for(i=0;il)); + */ + + for(i=0;il; + double nu = param->nu; + + schar *y = new schar[l]; + double *C = new double[l]; + + for(i=0;iy[i]>0) + y[i] = +1; + else + y[i] = -1; + + C[i] = prob->W[i]; + } + + double nu_l = 0; + for(i=0;ieps, si, param->shrinking, param->max_iter); + double r = si->r; + + info("C = %f\n",1/r); + + for(i=0;iupper_bound[i] /= r; + } + + si->rho /= r; + si->obj /= (r*r); + + delete[] C; + delete[] y; + delete[] zeros; +} + +static void solve_one_class( + const PREFIX(problem) *prob, const svm_parameter *param, + double *alpha, Solver::SolutionInfo* si, BlasFunctions *blas_functions) +{ + int l = prob->l; + double *zeros = new double[l]; + schar *ones = new schar[l]; + double *C = new double[l]; + int i; + + double nu_l = 0; + + for(i=0;iW[i]; + nu_l += C[i] * param->nu; + } + + i = 0; + while(nu_l > 0) + { + alpha[i] = min(C[i],nu_l); + nu_l -= alpha[i]; + ++i; + } + for(;ieps, si, param->shrinking, param->max_iter); + + delete[] C; + delete[] zeros; + delete[] 
ones; +} + +static void solve_epsilon_svr( + const PREFIX(problem) *prob, const svm_parameter *param, + double *alpha, Solver::SolutionInfo* si, BlasFunctions *blas_functions) +{ + int l = prob->l; + double *alpha2 = new double[2*l]; + double *linear_term = new double[2*l]; + schar *y = new schar[2*l]; + double *C = new double[2*l]; + int i; + + for(i=0;ip - prob->y[i]; + y[i] = 1; + C[i] = prob->W[i]*param->C; + + alpha2[i+l] = 0; + linear_term[i+l] = param->p + prob->y[i]; + y[i+l] = -1; + C[i+l] = prob->W[i]*param->C; + } + + Solver s; + s.Solve(2*l, SVR_Q(*prob,*param,blas_functions), linear_term, y, + alpha2, C, param->eps, si, param->shrinking, param->max_iter); + + double sum_alpha = 0; + for(i=0;il; + double *C = new double[2*l]; + double *alpha2 = new double[2*l]; + double *linear_term = new double[2*l]; + schar *y = new schar[2*l]; + int i; + + double sum = 0; + for(i=0;iW[i]*param->C; + sum += C[i] * param->nu; + } + sum /= 2; + + for(i=0;iy[i]; + y[i] = 1; + + linear_term[i+l] = prob->y[i]; + y[i+l] = -1; + } + + Solver_NU s; + s.Solve(2*l, SVR_Q(*prob,*param,blas_functions), linear_term, y, + alpha2, C, param->eps, si, param->shrinking, param->max_iter); + + info("epsilon = %f\n",-si->r); + + for(i=0;il); + Solver::SolutionInfo si; + switch(param->svm_type) + { + case C_SVC: + si.upper_bound = Malloc(double,prob->l); + solve_c_svc(prob,param,alpha,&si,Cp,Cn,blas_functions); + break; + case NU_SVC: + si.upper_bound = Malloc(double,prob->l); + solve_nu_svc(prob,param,alpha,&si,blas_functions); + break; + case ONE_CLASS: + si.upper_bound = Malloc(double,prob->l); + solve_one_class(prob,param,alpha,&si,blas_functions); + break; + case EPSILON_SVR: + si.upper_bound = Malloc(double,2*prob->l); + solve_epsilon_svr(prob,param,alpha,&si,blas_functions); + break; + case NU_SVR: + si.upper_bound = Malloc(double,2*prob->l); + solve_nu_svr(prob,param,alpha,&si,blas_functions); + break; + } + + *status |= si.solve_timed_out; + + info("obj = %f, rho = %f\n",si.obj,si.rho); + + // output SVs + + int nSV = 0; + int nBSV = 0; + for(int i=0;il;i++) + { + if(fabs(alpha[i]) > 0) + { + ++nSV; + if(prob->y[i] > 0) + { + if(fabs(alpha[i]) >= si.upper_bound[i]) + ++nBSV; + } + else + { + if(fabs(alpha[i]) >= si.upper_bound[i]) + ++nBSV; + } + } + } + + free(si.upper_bound); + + info("nSV = %d, nBSV = %d\n",nSV,nBSV); + + decision_function f; + f.alpha = alpha; + f.rho = si.rho; + f.n_iter = si.n_iter; + return f; +} + +// Platt's binary SVM Probabilistic Output: an improvement from Lin et al. 
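+// Fits P(y=1|f) = 1/(1+exp(A*f+B)) by maximum likelihood: Newton steps on (A,B)
+// with a backtracking line search, using the smoothed targets
+// hiTarget = (prior1+1)/(prior1+2) and loTarget = 1/(prior0+2) instead of hard 0/1 labels.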
+static void sigmoid_train( + int l, const double *dec_values, const double *labels, + double& A, double& B) +{ + double prior1=0, prior0 = 0; + int i; + + for (i=0;i 0) prior1+=1; + else prior0+=1; + + int max_iter=100; // Maximal number of iterations + double min_step=1e-10; // Minimal step taken in line search + double sigma=1e-12; // For numerically strict PD of Hessian + double eps=1e-5; + double hiTarget=(prior1+1.0)/(prior1+2.0); + double loTarget=1/(prior0+2.0); + double *t=Malloc(double,l); + double fApB,p,q,h11,h22,h21,g1,g2,det,dA,dB,gd,stepsize; + double newA,newB,newf,d1,d2; + int iter; + + // Initial Point and Initial Fun Value + A=0.0; B=log((prior0+1.0)/(prior1+1.0)); + double fval = 0.0; + + for (i=0;i0) t[i]=hiTarget; + else t[i]=loTarget; + fApB = dec_values[i]*A+B; + if (fApB>=0) + fval += t[i]*fApB + log(1+exp(-fApB)); + else + fval += (t[i] - 1)*fApB +log(1+exp(fApB)); + } + for (iter=0;iter= 0) + { + p=exp(-fApB)/(1.0+exp(-fApB)); + q=1.0/(1.0+exp(-fApB)); + } + else + { + p=1.0/(1.0+exp(fApB)); + q=exp(fApB)/(1.0+exp(fApB)); + } + d2=p*q; + h11+=dec_values[i]*dec_values[i]*d2; + h22+=d2; + h21+=dec_values[i]*d2; + d1=t[i]-p; + g1+=dec_values[i]*d1; + g2+=d1; + } + + // Stopping Criteria + if (fabs(g1)= min_step) + { + newA = A + stepsize * dA; + newB = B + stepsize * dB; + + // New function value + newf = 0.0; + for (i=0;i= 0) + newf += t[i]*fApB + log(1+exp(-fApB)); + else + newf += (t[i] - 1)*fApB +log(1+exp(fApB)); + } + // Check sufficient decrease + if (newf=max_iter) + info("Reaching maximal iterations in two-class probability estimates\n"); + free(t); +} + +static double sigmoid_predict(double decision_value, double A, double B) +{ + double fApB = decision_value*A+B; + // 1-p used later; avoid catastrophic cancellation + if (fApB >= 0) + return exp(-fApB)/(1.0+exp(-fApB)); + else + return 1.0/(1+exp(fApB)) ; +} + +// Method 2 from the multiclass_prob paper by Wu, Lin, and Weng +static void multiclass_probability(int k, double **r, double *p) +{ + int t,j; + int iter = 0, max_iter=max(100,k); + double **Q=Malloc(double *,k); + double *Qp=Malloc(double,k); + double pQp, eps=0.005/k; + + for (t=0;tmax_error) + max_error=error; + } + if (max_error=max_iter) + info("Exceeds max_iter in multiclass_prob\n"); + for(t=0;tl); + double *dec_values = Malloc(double,prob->l); + + // random shuffle + for(i=0;il;i++) perm[i]=i; + for(i=0;il;i++) + { + int j = i+bounded_rand_int(prob->l-i); + swap(perm[i],perm[j]); + } + for(i=0;il/nr_fold; + int end = (i+1)*prob->l/nr_fold; + int j,k; + struct PREFIX(problem) subprob; + + subprob.l = prob->l-(end-begin); +#ifdef _DENSE_REP + subprob.x = Malloc(struct PREFIX(node),subprob.l); +#else + subprob.x = Malloc(struct PREFIX(node)*,subprob.l); +#endif + subprob.y = Malloc(double,subprob.l); + subprob.W = Malloc(double,subprob.l); + + k=0; + for(j=0;jx[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + subprob.W[k] = prob->W[perm[j]]; + ++k; + } + for(j=end;jl;j++) + { + subprob.x[k] = prob->x[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + subprob.W[k] = prob->W[perm[j]]; + ++k; + } + int p_count=0,n_count=0; + for(j=0;j0) + p_count++; + else + n_count++; + + if(p_count==0 && n_count==0) + for(j=begin;j 0 && n_count == 0) + for(j=begin;j 0) + for(j=begin;jx+perm[j]),&(dec_values[perm[j]]), blas_functions); +#else + PREFIX(predict_values)(submodel,prob->x[perm[j]],&(dec_values[perm[j]]), blas_functions); +#endif + // ensure +1 -1 order; reason not using CV subroutine + dec_values[perm[j]] *= submodel->label[0]; + } + 
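// Done with this fold: release the trained submodel and its parameter copy. +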
PREFIX(free_and_destroy_model)(&submodel); + PREFIX(destroy_param)(&subparam); + } + free(subprob.x); + free(subprob.y); + free(subprob.W); + } + sigmoid_train(prob->l,dec_values,prob->y,probA,probB); + free(dec_values); + free(perm); +} + +// Return parameter of a Laplace distribution +static double svm_svr_probability( + const PREFIX(problem) *prob, const svm_parameter *param, BlasFunctions *blas_functions) +{ + int i; + int nr_fold = 5; + double *ymv = Malloc(double,prob->l); + double mae = 0; + + svm_parameter newparam = *param; + newparam.probability = 0; + newparam.random_seed = -1; // This is called from train, which already sets + // the seed. + PREFIX(cross_validation)(prob,&newparam,nr_fold,ymv, blas_functions); + for(i=0;il;i++) + { + ymv[i]=prob->y[i]-ymv[i]; + mae += fabs(ymv[i]); + } + mae /= prob->l; + double std=sqrt(2*mae*mae); + int count=0; + mae=0; + for(i=0;il;i++) + if (fabs(ymv[i]) > 5*std) + count=count+1; + else + mae+=fabs(ymv[i]); + mae /= (prob->l-count); + info("Prob. model for test data: target value = predicted value + z,\nz: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma= %g\n",mae); + free(ymv); + return mae; +} + + + +// label: label name, start: begin of each class, count: #data of classes, perm: indices to the original data +// perm, length l, must be allocated before calling this subroutine +static void svm_group_classes(const PREFIX(problem) *prob, int *nr_class_ret, int **label_ret, int **start_ret, int **count_ret, int *perm) +{ + int l = prob->l; + int max_nr_class = 16; + int nr_class = 0; + int *label = Malloc(int,max_nr_class); + int *count = Malloc(int,max_nr_class); + int *data_label = Malloc(int,l); + int i, j, this_label, this_count; + + for(i=0;iy[i]; + for(j=0;j=0 && label[i] > this_label) + { + label[i+1] = label[i]; + count[i+1] = count[i]; + i--; + } + label[i+1] = this_label; + count[i+1] = this_count; + } + + for (i=0; iy[i]; + while(this_label != label[j]){ + j ++; + } + data_label[i] = j; + } + + int *start = Malloc(int,nr_class); + start[0] = 0; + for(i=1;i 0. 
+// +static void remove_zero_weight(PREFIX(problem) *newprob, const PREFIX(problem) *prob) +{ + int i; + int l = 0; + for(i=0;il;i++) + if(prob->W[i] > 0) l++; + *newprob = *prob; + newprob->l = l; +#ifdef _DENSE_REP + newprob->x = Malloc(PREFIX(node),l); +#else + newprob->x = Malloc(PREFIX(node) *,l); +#endif + newprob->y = Malloc(double,l); + newprob->W = Malloc(double,l); + + int j = 0; + for(i=0;il;i++) + if(prob->W[i] > 0) + { + newprob->x[j] = prob->x[i]; + newprob->y[j] = prob->y[i]; + newprob->W[j] = prob->W[i]; + j++; + } +} + +// +// Interface functions +// +PREFIX(model) *PREFIX(train)(const PREFIX(problem) *prob, const svm_parameter *param, + int *status, BlasFunctions *blas_functions) +{ + PREFIX(problem) newprob; + remove_zero_weight(&newprob, prob); + prob = &newprob; + + PREFIX(model) *model = Malloc(PREFIX(model),1); + model->param = *param; + model->free_sv = 0; // XXX + + if(param->random_seed >= 0) + { + set_seed(param->random_seed); + } + + if(param->svm_type == ONE_CLASS || + param->svm_type == EPSILON_SVR || + param->svm_type == NU_SVR) + { + // regression or one-class-svm + model->nr_class = 2; + model->label = NULL; + model->nSV = NULL; + model->probA = NULL; model->probB = NULL; + model->sv_coef = Malloc(double *,1); + + if(param->probability && + (param->svm_type == EPSILON_SVR || + param->svm_type == NU_SVR)) + { + model->probA = Malloc(double,1); + model->probA[0] = NAMESPACE::svm_svr_probability(prob,param,blas_functions); + } + + NAMESPACE::decision_function f = NAMESPACE::svm_train_one(prob,param,0,0, status,blas_functions); + model->rho = Malloc(double,1); + model->rho[0] = f.rho; + model->n_iter = Malloc(int,1); + model->n_iter[0] = f.n_iter; + + int nSV = 0; + int i; + for(i=0;il;i++) + if(fabs(f.alpha[i]) > 0) ++nSV; + model->l = nSV; +#ifdef _DENSE_REP + model->SV = Malloc(PREFIX(node),nSV); +#else + model->SV = Malloc(PREFIX(node) *,nSV); +#endif + model->sv_ind = Malloc(int, nSV); + model->sv_coef[0] = Malloc(double, nSV); + int j = 0; + for(i=0;il;i++) + if(fabs(f.alpha[i]) > 0) + { + model->SV[j] = prob->x[i]; + model->sv_ind[j] = i; + model->sv_coef[0][j] = f.alpha[i]; + ++j; + } + + free(f.alpha); + } + else + { + // classification + int l = prob->l; + int nr_class; + int *label = NULL; + int *start = NULL; + int *count = NULL; + int *perm = Malloc(int,l); + + // group training data of the same class + NAMESPACE::svm_group_classes(prob,&nr_class,&label,&start,&count,perm); +#ifdef _DENSE_REP + PREFIX(node) *x = Malloc(PREFIX(node),l); +#else + PREFIX(node) **x = Malloc(PREFIX(node) *,l); +#endif + double *W = Malloc(double, l); + + int i; + for(i=0;ix[perm[i]]; + W[i] = prob->W[perm[i]]; + } + + // calculate weighted C + + double *weighted_C = Malloc(double, nr_class); + for(i=0;iC; + for(i=0;inr_weight;i++) + { + int j; + for(j=0;jweight_label[i] == label[j]) + break; + if(j == nr_class) + fprintf(stderr,"warning: class label %d specified in weight is not found\n", param->weight_label[i]); + else + weighted_C[j] *= param->weight[i]; + } + + // train k*(k-1)/2 models + + bool *nonzero = Malloc(bool,l); + for(i=0;iprobability) + { + probA=Malloc(double,nr_class*(nr_class-1)/2); + probB=Malloc(double,nr_class*(nr_class-1)/2); + } + + int p = 0; + for(i=0;iprobability) + NAMESPACE::svm_binary_svc_probability(&sub_prob,param,weighted_C[i],weighted_C[j],probA[p],probB[p], status, blas_functions); + + f[p] = NAMESPACE::svm_train_one(&sub_prob,param,weighted_C[i],weighted_C[j], status, blas_functions); + for(k=0;k 0) + nonzero[si+k] = true; + for(k=0;k 
0) + nonzero[sj+k] = true; + free(sub_prob.x); + free(sub_prob.y); + free(sub_prob.W); + ++p; + } + + // build output + + model->nr_class = nr_class; + + model->label = Malloc(int,nr_class); + for(i=0;ilabel[i] = label[i]; + + model->rho = Malloc(double,nr_class*(nr_class-1)/2); + model->n_iter = Malloc(int,nr_class*(nr_class-1)/2); + for(i=0;irho[i] = f[i].rho; + model->n_iter[i] = f[i].n_iter; + } + + if(param->probability) + { + model->probA = Malloc(double,nr_class*(nr_class-1)/2); + model->probB = Malloc(double,nr_class*(nr_class-1)/2); + for(i=0;iprobA[i] = probA[i]; + model->probB[i] = probB[i]; + } + } + else + { + model->probA=NULL; + model->probB=NULL; + } + + int total_sv = 0; + int *nz_count = Malloc(int,nr_class); + model->nSV = Malloc(int,nr_class); + for(i=0;inSV[i] = nSV; + nz_count[i] = nSV; + } + + info("Total nSV = %d\n",total_sv); + + model->l = total_sv; + model->sv_ind = Malloc(int, total_sv); +#ifdef _DENSE_REP + model->SV = Malloc(PREFIX(node),total_sv); +#else + model->SV = Malloc(PREFIX(node) *,total_sv); +#endif + p = 0; + for(i=0;iSV[p] = x[i]; + model->sv_ind[p] = perm[i]; + ++p; + } + } + + int *nz_start = Malloc(int,nr_class); + nz_start[0] = 0; + for(i=1;isv_coef = Malloc(double *,nr_class-1); + for(i=0;isv_coef[i] = Malloc(double,total_sv); + + p = 0; + for(i=0;isv_coef[j-1][q++] = f[p].alpha[k]; + q = nz_start[j]; + for(k=0;ksv_coef[i][q++] = f[p].alpha[ci+k]; + ++p; + } + + free(label); + free(probA); + free(probB); + free(count); + free(perm); + free(start); + free(W); + free(x); + free(weighted_C); + free(nonzero); + for(i=0;il; + int *perm = Malloc(int,l); + int nr_class; + if(param->random_seed >= 0) + { + set_seed(param->random_seed); + } + + // stratified cv may not give leave-one-out rate + // Each class to l folds -> some folds may have zero elements + if((param->svm_type == C_SVC || + param->svm_type == NU_SVC) && nr_fold < l) + { + int *start = NULL; + int *label = NULL; + int *count = NULL; + NAMESPACE::svm_group_classes(prob,&nr_class,&label,&start,&count,perm); + + // random shuffle and then data grouped by fold using the array perm + int *fold_count = Malloc(int,nr_fold); + int c; + int *index = Malloc(int,l); + for(i=0;ix[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + subprob.W[k] = prob->W[perm[j]]; + ++k; + } + for(j=end;jx[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + subprob.W[k] = prob->W[perm[j]]; + ++k; + } + int dummy_status = 0; // IGNORES TIMEOUT ERRORS + struct PREFIX(model) *submodel = PREFIX(train)(&subprob,param, &dummy_status, blas_functions); + if(param->probability && + (param->svm_type == C_SVC || param->svm_type == NU_SVC)) + { + double *prob_estimates=Malloc(double, PREFIX(get_nr_class)(submodel)); + for(j=begin;jx + perm[j]),prob_estimates, blas_functions); +#else + target[perm[j]] = PREFIX(predict_probability)(submodel,prob->x[perm[j]],prob_estimates, blas_functions); +#endif + free(prob_estimates); + } + else + for(j=begin;jx+perm[j],blas_functions); +#else + target[perm[j]] = PREFIX(predict)(submodel,prob->x[perm[j]],blas_functions); +#endif + PREFIX(free_and_destroy_model)(&submodel); + free(subprob.x); + free(subprob.y); + free(subprob.W); + } + free(fold_start); + free(perm); +} + + +int PREFIX(get_svm_type)(const PREFIX(model) *model) +{ + return model->param.svm_type; +} + +int PREFIX(get_nr_class)(const PREFIX(model) *model) +{ + return model->nr_class; +} + +void PREFIX(get_labels)(const PREFIX(model) *model, int* label) +{ + if (model->label != NULL) + for(int i=0;inr_class;i++) + label[i] = model->label[i]; 
+} + +double PREFIX(get_svr_probability)(const PREFIX(model) *model) +{ + if ((model->param.svm_type == EPSILON_SVR || model->param.svm_type == NU_SVR) && + model->probA!=NULL) + return model->probA[0]; + else + { + fprintf(stderr,"Model doesn't contain information for SVR probability inference\n"); + return 0; + } +} + +double PREFIX(predict_values)(const PREFIX(model) *model, const PREFIX(node) *x, double* dec_values, BlasFunctions *blas_functions) +{ + int i; + if(model->param.svm_type == ONE_CLASS || + model->param.svm_type == EPSILON_SVR || + model->param.svm_type == NU_SVR) + { + double *sv_coef = model->sv_coef[0]; + double sum = 0; + + for(i=0;il;i++) +#ifdef _DENSE_REP + sum += sv_coef[i] * NAMESPACE::Kernel::k_function(x,model->SV+i,model->param,blas_functions); +#else + sum += sv_coef[i] * NAMESPACE::Kernel::k_function(x,model->SV[i],model->param,blas_functions); +#endif + sum -= model->rho[0]; + *dec_values = sum; + + if(model->param.svm_type == ONE_CLASS) + return (sum>0)?1:-1; + else + return sum; + } + else + { + int nr_class = model->nr_class; + int l = model->l; + + double *kvalue = Malloc(double,l); + for(i=0;iSV+i,model->param,blas_functions); +#else + kvalue[i] = NAMESPACE::Kernel::k_function(x,model->SV[i],model->param,blas_functions); +#endif + + int *start = Malloc(int,nr_class); + start[0] = 0; + for(i=1;inSV[i-1]; + + int *vote = Malloc(int,nr_class); + for(i=0;inSV[i]; + int cj = model->nSV[j]; + + int k; + double *coef1 = model->sv_coef[j-1]; + double *coef2 = model->sv_coef[i]; + for(k=0;krho[p]; + dec_values[p] = sum; + + if(dec_values[p] > 0) + ++vote[i]; + else + ++vote[j]; + p++; + } + + int vote_max_idx = 0; + for(i=1;i vote[vote_max_idx]) + vote_max_idx = i; + + free(kvalue); + free(start); + free(vote); + return model->label[vote_max_idx]; + } +} + +double PREFIX(predict)(const PREFIX(model) *model, const PREFIX(node) *x, BlasFunctions *blas_functions) +{ + int nr_class = model->nr_class; + double *dec_values; + if(model->param.svm_type == ONE_CLASS || + model->param.svm_type == EPSILON_SVR || + model->param.svm_type == NU_SVR) + dec_values = Malloc(double, 1); + else + dec_values = Malloc(double, nr_class*(nr_class-1)/2); + double pred_result = PREFIX(predict_values)(model, x, dec_values, blas_functions); + free(dec_values); + return pred_result; +} + +double PREFIX(predict_probability)( + const PREFIX(model) *model, const PREFIX(node) *x, double *prob_estimates, BlasFunctions *blas_functions) +{ + if ((model->param.svm_type == C_SVC || model->param.svm_type == NU_SVC) && + model->probA!=NULL && model->probB!=NULL) + { + int i; + int nr_class = model->nr_class; + double *dec_values = Malloc(double, nr_class*(nr_class-1)/2); + PREFIX(predict_values)(model, x, dec_values, blas_functions); + + double min_prob=1e-7; + double **pairwise_prob=Malloc(double *,nr_class); + for(i=0;iprobA[k],model->probB[k]),min_prob),1-min_prob); + pairwise_prob[j][i]=1-pairwise_prob[i][j]; + k++; + } + NAMESPACE::multiclass_probability(nr_class,pairwise_prob,prob_estimates); + + int prob_max_idx = 0; + for(i=1;i prob_estimates[prob_max_idx]) + prob_max_idx = i; + for(i=0;ilabel[prob_max_idx]; + } + else + return PREFIX(predict)(model, x, blas_functions); +} + + +void PREFIX(free_model_content)(PREFIX(model)* model_ptr) +{ + if(model_ptr->free_sv && model_ptr->l > 0 && model_ptr->SV != NULL) +#ifdef _DENSE_REP + for (int i = 0; i < model_ptr->l; i++) + free(model_ptr->SV[i].values); +#else + free((void *)(model_ptr->SV[0])); +#endif + + if(model_ptr->sv_coef) + { + for(int 
i=0;inr_class-1;i++) + free(model_ptr->sv_coef[i]); + } + + free(model_ptr->SV); + model_ptr->SV = NULL; + + free(model_ptr->sv_coef); + model_ptr->sv_coef = NULL; + + free(model_ptr->sv_ind); + model_ptr->sv_ind = NULL; + + free(model_ptr->rho); + model_ptr->rho = NULL; + + free(model_ptr->label); + model_ptr->label= NULL; + + free(model_ptr->probA); + model_ptr->probA = NULL; + + free(model_ptr->probB); + model_ptr->probB= NULL; + + free(model_ptr->nSV); + model_ptr->nSV = NULL; + + free(model_ptr->n_iter); + model_ptr->n_iter = NULL; +} + +void PREFIX(free_and_destroy_model)(PREFIX(model)** model_ptr_ptr) +{ + if(model_ptr_ptr != NULL && *model_ptr_ptr != NULL) + { + PREFIX(free_model_content)(*model_ptr_ptr); + free(*model_ptr_ptr); + *model_ptr_ptr = NULL; + } +} + +void PREFIX(destroy_param)(svm_parameter* param) +{ + free(param->weight_label); + free(param->weight); +} + +const char *PREFIX(check_parameter)(const PREFIX(problem) *prob, const svm_parameter *param) +{ + // svm_type + + int svm_type = param->svm_type; + if(svm_type != C_SVC && + svm_type != NU_SVC && + svm_type != ONE_CLASS && + svm_type != EPSILON_SVR && + svm_type != NU_SVR) + return "unknown svm type"; + + // kernel_type, degree + + int kernel_type = param->kernel_type; + if(kernel_type != LINEAR && + kernel_type != POLY && + kernel_type != RBF && + kernel_type != SIGMOID && + kernel_type != PRECOMPUTED) + return "unknown kernel type"; + + if(param->gamma < 0) + return "gamma < 0"; + + if(param->degree < 0) + return "degree of polynomial kernel < 0"; + + // cache_size,eps,C,nu,p,shrinking + + if(param->cache_size <= 0) + return "cache_size <= 0"; + + if(param->eps <= 0) + return "eps <= 0"; + + if(svm_type == C_SVC || + svm_type == EPSILON_SVR || + svm_type == NU_SVR) + if(param->C <= 0) + return "C <= 0"; + + if(svm_type == NU_SVC || + svm_type == ONE_CLASS || + svm_type == NU_SVR) + if(param->nu <= 0 || param->nu > 1) + return "nu <= 0 or nu > 1"; + + if(svm_type == EPSILON_SVR) + if(param->p < 0) + return "p < 0"; + + if(param->shrinking != 0 && + param->shrinking != 1) + return "shrinking != 0 and shrinking != 1"; + + if(param->probability != 0 && + param->probability != 1) + return "probability != 0 and probability != 1"; + + if(param->probability == 1 && + svm_type == ONE_CLASS) + return "one-class SVM probability output not supported yet"; + + + // check whether nu-svc is feasible + + if(svm_type == NU_SVC) + { + int l = prob->l; + int max_nr_class = 16; + int nr_class = 0; + int *label = Malloc(int,max_nr_class); + double *count = Malloc(double,max_nr_class); + + int i; + for(i=0;iy[i]; + int j; + for(j=0;jW[i]; + break; + } + if(j == nr_class) + { + if(nr_class == max_nr_class) + { + max_nr_class *= 2; + label = (int *)realloc(label,max_nr_class*sizeof(int)); + count = (double *)realloc(count,max_nr_class*sizeof(double)); + + } + label[nr_class] = this_label; + count[nr_class] = prob->W[i]; + ++nr_class; + } + } + + for(i=0;inu*(n1+n2)/2 > min(n1,n2)) + { + free(label); + free(count); + return "specified nu is infeasible"; + } + } + } + free(label); + free(count); + } + + if(svm_type == C_SVC || + svm_type == EPSILON_SVR || + svm_type == NU_SVR || + svm_type == ONE_CLASS) + { + PREFIX(problem) newprob; + // filter samples with negative and null weights + remove_zero_weight(&newprob, prob); + + // all samples were removed + if(newprob.l == 0) { + free(newprob.x); + free(newprob.y); + free(newprob.W); + return "Invalid input - all samples have zero or negative weights."; + } + else if(prob->l != newprob.l && 
+ svm_type == C_SVC) + { + bool only_one_label = true; + int first_label = newprob.y[0]; + for(int i=1;i + */ +#ifndef _NEWRAND_H +#define _NEWRAND_H + +#ifdef __cplusplus +#include // needed for cython to generate a .cpp file from newrand.h +extern "C" { +#endif + +// Scikit-Learn-specific random number generator replacing `rand()` originally +// used in LibSVM / LibLinear, to ensure the same behaviour on windows-linux, +// with increased speed +// - (1) Init a `mt_rand` object +std::mt19937 mt_rand(std::mt19937::default_seed); + +// - (2) public `set_seed()` function that should be used instead of `srand()` to set a new seed. +void set_seed(unsigned custom_seed) { + mt_rand.seed(custom_seed); +} + +// - (3) New internal `bounded_rand_int` function, used instead of rand() everywhere. +inline uint32_t bounded_rand_int(uint32_t range) { + // "LibSVM / LibLinear Original way" - make a 31bit positive + // random number and use modulo to make it fit in the range + // return abs( (int)mt_rand()) % range; + + // "Better way": tweaked Lemire post-processor + // from http://www.pcg-random.org/posts/bounded-rands.html + uint32_t x = mt_rand(); + uint64_t m = uint64_t(x) * uint64_t(range); + uint32_t l = uint32_t(m); + if (l < range) { + uint32_t t = -range; + if (t >= range) { + t -= range; + if (t >= range) + t %= range; + } + while (l < t) { + x = mt_rand(); + m = uint64_t(x) * uint64_t(range); + l = uint32_t(m); + } + } + return m >> 32; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _NEWRAND_H */ diff --git a/.venv/Lib/site-packages/sklearn/svm/tests/__init__.py b/.venv/Lib/site-packages/sklearn/svm/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/Lib/site-packages/sklearn/svm/tests/test_bounds.py b/.venv/Lib/site-packages/sklearn/svm/tests/test_bounds.py new file mode 100644 index 0000000000000000000000000000000000000000..01c33b1319b42323e9a112d4d54c4ed673473633 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/tests/test_bounds.py @@ -0,0 +1,142 @@ +import numpy as np +import pytest +from scipy import stats + +from sklearn.linear_model import LogisticRegression +from sklearn.svm import LinearSVC +from sklearn.svm._bounds import l1_min_c +from sklearn.svm._newrand import bounded_rand_int_wrap, set_seed_wrap +from sklearn.utils.fixes import CSR_CONTAINERS + +dense_X = [[-1, 0], [0, 1], [1, 1], [1, 1]] + +Y1 = [0, 1, 1, 1] +Y2 = [2, 1, 0, 0] + + +@pytest.mark.parametrize("X_container", CSR_CONTAINERS + [np.array]) +@pytest.mark.parametrize("loss", ["squared_hinge", "log"]) +@pytest.mark.parametrize("Y_label", ["two-classes", "multi-class"]) +@pytest.mark.parametrize("intercept_label", ["no-intercept", "fit-intercept"]) +def test_l1_min_c(X_container, loss, Y_label, intercept_label): + Ys = {"two-classes": Y1, "multi-class": Y2} + intercepts = { + "no-intercept": {"fit_intercept": False}, + "fit-intercept": {"fit_intercept": True, "intercept_scaling": 10}, + } + + X = X_container(dense_X) + Y = Ys[Y_label] + intercept_params = intercepts[intercept_label] + check_l1_min_c(X, Y, loss, **intercept_params) + + +def check_l1_min_c(X, y, loss, fit_intercept=True, intercept_scaling=1.0): + min_c = l1_min_c( + X, + y, + loss=loss, + fit_intercept=fit_intercept, + intercept_scaling=intercept_scaling, + ) + + clf = { + "log": LogisticRegression(penalty="l1", solver="liblinear"), + "squared_hinge": LinearSVC(loss="squared_hinge", penalty="l1", dual=False), + }[loss] + + clf.fit_intercept = fit_intercept 
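+    # The checks below verify that C == min_c drives every coefficient (and the intercept)
+    # to zero, while C slightly above min_c yields at least one non-zero weight.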
+ clf.intercept_scaling = intercept_scaling + + clf.C = min_c + clf.fit(X, y) + assert (np.asarray(clf.coef_) == 0).all() + assert (np.asarray(clf.intercept_) == 0).all() + + clf.C = min_c * 1.01 + clf.fit(X, y) + assert (np.asarray(clf.coef_) != 0).any() or (np.asarray(clf.intercept_) != 0).any() + + +def test_ill_posed_min_c(): + X = [[0, 0], [0, 0]] + y = [0, 1] + with pytest.raises(ValueError): + l1_min_c(X, y) + + +_MAX_UNSIGNED_INT = 4294967295 + + +def test_newrand_default(): + """Test that bounded_rand_int_wrap without seeding respects the range + + Note this test should pass either if executed alone, or in conjunctions + with other tests that call set_seed explicit in any order: it checks + invariants on the RNG instead of specific values. + """ + generated = [bounded_rand_int_wrap(100) for _ in range(10)] + assert all(0 <= x < 100 for x in generated) + assert not all(x == generated[0] for x in generated) + + +@pytest.mark.parametrize("seed, expected", [(0, 54), (_MAX_UNSIGNED_INT, 9)]) +def test_newrand_set_seed(seed, expected): + """Test that `set_seed` produces deterministic results""" + set_seed_wrap(seed) + generated = bounded_rand_int_wrap(100) + assert generated == expected + + +@pytest.mark.parametrize("seed", [-1, _MAX_UNSIGNED_INT + 1]) +def test_newrand_set_seed_overflow(seed): + """Test that `set_seed_wrap` is defined for unsigned 32bits ints""" + with pytest.raises(OverflowError): + set_seed_wrap(seed) + + +@pytest.mark.parametrize("range_, n_pts", [(_MAX_UNSIGNED_INT, 10000), (100, 25)]) +def test_newrand_bounded_rand_int(range_, n_pts): + """Test that `bounded_rand_int` follows a uniform distribution""" + # XXX: this test is very seed sensitive: either it is wrong (too strict?) + # or the wrapped RNG is not uniform enough, at least on some platforms. + set_seed_wrap(42) + n_iter = 100 + ks_pvals = [] + uniform_dist = stats.uniform(loc=0, scale=range_) + # perform multiple samplings to make chance of outlier sampling negligible + for _ in range(n_iter): + # Deterministic random sampling + sample = [bounded_rand_int_wrap(range_) for _ in range(n_pts)] + res = stats.kstest(sample, uniform_dist.cdf) + ks_pvals.append(res.pvalue) + # Null hypothesis = samples come from an uniform distribution. + # Under the null hypothesis, p-values should be uniformly distributed + # and not concentrated on low values + # (this may seem counter-intuitive but is backed by multiple refs) + # So we can do two checks: + + # (1) check uniformity of p-values + uniform_p_vals_dist = stats.uniform(loc=0, scale=1) + res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf) + assert res_pvals.pvalue > 0.05, ( + "Null hypothesis rejected: generated random numbers are not uniform." + " Details: the (meta) p-value of the test of uniform distribution" + f" of p-values is {res_pvals.pvalue} which is not > 0.05" + ) + + # (2) (safety belt) check that 90% of p-values are above 0.05 + min_10pct_pval = np.percentile(ks_pvals, q=10) + # lower 10th quantile pvalue <= 0.05 means that the test rejects the + # null hypothesis that the sample came from the uniform distribution + assert min_10pct_pval > 0.05, ( + "Null hypothesis rejected: generated random numbers are not uniform. " + f"Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05." 
+ ) + + +@pytest.mark.parametrize("range_", [-1, _MAX_UNSIGNED_INT + 1]) +def test_newrand_bounded_rand_int_limits(range_): + """Test that `bounded_rand_int_wrap` is defined for unsigned 32bits ints""" + with pytest.raises(OverflowError): + bounded_rand_int_wrap(range_) diff --git a/.venv/Lib/site-packages/sklearn/svm/tests/test_sparse.py b/.venv/Lib/site-packages/sklearn/svm/tests/test_sparse.py new file mode 100644 index 0000000000000000000000000000000000000000..42774f6866bbd337c1695a89a4842f1dcbf756ae --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/tests/test_sparse.py @@ -0,0 +1,493 @@ +import numpy as np +import pytest +from scipy import sparse + +from sklearn import base, datasets, linear_model, svm +from sklearn.datasets import load_digits, make_blobs, make_classification +from sklearn.exceptions import ConvergenceWarning +from sklearn.svm.tests import test_svm +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, + skip_if_32bit, +) +from sklearn.utils.extmath import safe_sparse_dot +from sklearn.utils.fixes import ( + CSR_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) + +# test sample 1 +X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]) +Y = [1, 1, 1, 2, 2, 2] +T = np.array([[-1, -1], [2, 2], [3, 2]]) +true_result = [1, 2, 2] + +# test sample 2 +X2 = np.array( + [ + [0, 0, 0], + [1, 1, 1], + [2, 0, 0], + [0, 0, 2], + [3, 3, 3], + ] +) +Y2 = [1, 2, 2, 2, 3] +T2 = np.array([[-1, -1, -1], [1, 1, 1], [2, 2, 2]]) +true_result2 = [1, 2, 3] + +iris = datasets.load_iris() +rng = np.random.RandomState(0) +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + +X_blobs, y_blobs = make_blobs(n_samples=100, centers=10, random_state=0) + + +def check_svm_model_equal(dense_svm, X_train, y_train, X_test): + # Use the original svm model for dense fit and clone an exactly same + # svm model for sparse fit + sparse_svm = base.clone(dense_svm) + + dense_svm.fit(X_train.toarray(), y_train) + if sparse.issparse(X_test): + X_test_dense = X_test.toarray() + else: + X_test_dense = X_test + sparse_svm.fit(X_train, y_train) + assert sparse.issparse(sparse_svm.support_vectors_) + assert sparse.issparse(sparse_svm.dual_coef_) + assert_allclose(dense_svm.support_vectors_, sparse_svm.support_vectors_.toarray()) + assert_allclose(dense_svm.dual_coef_, sparse_svm.dual_coef_.toarray()) + if dense_svm.kernel == "linear": + assert sparse.issparse(sparse_svm.coef_) + assert_array_almost_equal(dense_svm.coef_, sparse_svm.coef_.toarray()) + assert_allclose(dense_svm.support_, sparse_svm.support_) + assert_allclose(dense_svm.predict(X_test_dense), sparse_svm.predict(X_test)) + + assert_array_almost_equal( + dense_svm.decision_function(X_test_dense), sparse_svm.decision_function(X_test) + ) + assert_array_almost_equal( + dense_svm.decision_function(X_test_dense), + sparse_svm.decision_function(X_test_dense), + ) + if isinstance(dense_svm, svm.OneClassSVM): + msg = "cannot use sparse input in 'OneClassSVM' trained on dense data" + else: + assert_array_almost_equal( + dense_svm.predict_proba(X_test_dense), + sparse_svm.predict_proba(X_test), + decimal=4, + ) + msg = "cannot use sparse input in 'SVC' trained on dense data" + if sparse.issparse(X_test): + with pytest.raises(ValueError, match=msg): + dense_svm.predict(X_test) + + +@skip_if_32bit +@pytest.mark.parametrize( + "X_train, y_train, X_test", + [ + [X, Y, T], + [X2, Y2, T2], + [X_blobs[:80], y_blobs[:80], 
X_blobs[80:]], + [iris.data, iris.target, iris.data], + ], +) +@pytest.mark.parametrize("kernel", ["linear", "poly", "rbf", "sigmoid"]) +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + LIL_CONTAINERS) +def test_svc(X_train, y_train, X_test, kernel, sparse_container): + """Check that sparse SVC gives the same result as SVC.""" + X_train = sparse_container(X_train) + + clf = svm.SVC( + gamma=1, + kernel=kernel, + probability=True, + random_state=0, + decision_function_shape="ovo", + ) + check_svm_model_equal(clf, X_train, y_train, X_test) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_unsorted_indices(csr_container): + # test that the result with sorted and unsorted indices in csr is the same + # we use a subset of digits as iris, blobs or make_classification didn't + # show the problem + X, y = load_digits(return_X_y=True) + X_test = csr_container(X[50:100]) + X, y = X[:50], y[:50] + + X_sparse = csr_container(X) + coef_dense = ( + svm.SVC(kernel="linear", probability=True, random_state=0).fit(X, y).coef_ + ) + sparse_svc = svm.SVC(kernel="linear", probability=True, random_state=0).fit( + X_sparse, y + ) + coef_sorted = sparse_svc.coef_ + # make sure dense and sparse SVM give the same result + assert_allclose(coef_dense, coef_sorted.toarray()) + + # reverse each row's indices + def scramble_indices(X): + new_data = [] + new_indices = [] + for i in range(1, len(X.indptr)): + row_slice = slice(*X.indptr[i - 1 : i + 1]) + new_data.extend(X.data[row_slice][::-1]) + new_indices.extend(X.indices[row_slice][::-1]) + return csr_container((new_data, new_indices, X.indptr), shape=X.shape) + + X_sparse_unsorted = scramble_indices(X_sparse) + X_test_unsorted = scramble_indices(X_test) + + assert not X_sparse_unsorted.has_sorted_indices + assert not X_test_unsorted.has_sorted_indices + + unsorted_svc = svm.SVC(kernel="linear", probability=True, random_state=0).fit( + X_sparse_unsorted, y + ) + coef_unsorted = unsorted_svc.coef_ + # make sure unsorted indices give same result + assert_allclose(coef_unsorted.toarray(), coef_sorted.toarray()) + assert_allclose( + sparse_svc.predict_proba(X_test_unsorted), sparse_svc.predict_proba(X_test) + ) + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_svc_with_custom_kernel(lil_container): + def kfunc(x, y): + return safe_sparse_dot(x, y.T) + + X_sp = lil_container(X) + clf_lin = svm.SVC(kernel="linear").fit(X_sp, Y) + clf_mylin = svm.SVC(kernel=kfunc).fit(X_sp, Y) + assert_array_equal(clf_lin.predict(X_sp), clf_mylin.predict(X_sp)) + + +@skip_if_32bit +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +@pytest.mark.parametrize("kernel", ["linear", "poly", "rbf"]) +def test_svc_iris(csr_container, kernel): + # Test the sparse SVC with the iris dataset + iris_data_sp = csr_container(iris.data) + + sp_clf = svm.SVC(kernel=kernel).fit(iris_data_sp, iris.target) + clf = svm.SVC(kernel=kernel).fit(iris.data, iris.target) + + assert_allclose(clf.support_vectors_, sp_clf.support_vectors_.toarray()) + assert_allclose(clf.dual_coef_, sp_clf.dual_coef_.toarray()) + assert_allclose(clf.predict(iris.data), sp_clf.predict(iris_data_sp)) + if kernel == "linear": + assert_allclose(clf.coef_, sp_clf.coef_.toarray()) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_decision_function(csr_container): + # Test decision_function + + # Sanity check, test that decision_function implemented in python + # returns the same as the one in libsvm + + # multi class: + iris_data_sp = 
csr_container(iris.data) + svc = svm.SVC(kernel="linear", C=0.1, decision_function_shape="ovo") + clf = svc.fit(iris_data_sp, iris.target) + + dec = safe_sparse_dot(iris_data_sp, clf.coef_.T) + clf.intercept_ + + assert_allclose(dec, clf.decision_function(iris_data_sp)) + + # binary: + clf.fit(X, Y) + dec = np.dot(X, clf.coef_.T) + clf.intercept_ + prediction = clf.predict(X) + assert_allclose(dec.ravel(), clf.decision_function(X)) + assert_allclose( + prediction, clf.classes_[(clf.decision_function(X) > 0).astype(int).ravel()] + ) + expected = np.array([-1.0, -0.66, -1.0, 0.66, 1.0, 1.0]) + assert_array_almost_equal(clf.decision_function(X), expected, decimal=2) + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_error(lil_container): + # Test that it gives proper exception on deficient input + clf = svm.SVC() + X_sp = lil_container(X) + + Y2 = Y[:-1] # wrong dimensions for labels + with pytest.raises(ValueError): + clf.fit(X_sp, Y2) + + clf.fit(X_sp, Y) + assert_array_equal(clf.predict(T), true_result) + + +@pytest.mark.parametrize( + "lil_container, dok_container", zip(LIL_CONTAINERS, DOK_CONTAINERS) +) +def test_linearsvc(lil_container, dok_container): + # Similar to test_SVC + X_sp = lil_container(X) + X2_sp = dok_container(X2) + + clf = svm.LinearSVC(random_state=0).fit(X, Y) + sp_clf = svm.LinearSVC(random_state=0).fit(X_sp, Y) + + assert sp_clf.fit_intercept + + assert_array_almost_equal(clf.coef_, sp_clf.coef_, decimal=4) + assert_array_almost_equal(clf.intercept_, sp_clf.intercept_, decimal=4) + + assert_allclose(clf.predict(X), sp_clf.predict(X_sp)) + + clf.fit(X2, Y2) + sp_clf.fit(X2_sp, Y2) + + assert_array_almost_equal(clf.coef_, sp_clf.coef_, decimal=4) + assert_array_almost_equal(clf.intercept_, sp_clf.intercept_, decimal=4) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_linearsvc_iris(csr_container): + # Test the sparse LinearSVC with the iris dataset + iris_data_sp = csr_container(iris.data) + + sp_clf = svm.LinearSVC(random_state=0).fit(iris_data_sp, iris.target) + clf = svm.LinearSVC(random_state=0).fit(iris.data, iris.target) + + assert clf.fit_intercept == sp_clf.fit_intercept + + assert_array_almost_equal(clf.coef_, sp_clf.coef_, decimal=1) + assert_array_almost_equal(clf.intercept_, sp_clf.intercept_, decimal=1) + assert_allclose(clf.predict(iris.data), sp_clf.predict(iris_data_sp)) + + # check decision_function + pred = np.argmax(sp_clf.decision_function(iris_data_sp), axis=1) + assert_allclose(pred, clf.predict(iris.data)) + + # sparsify the coefficients on both models and check that they still + # produce the same results + clf.sparsify() + assert_array_equal(pred, clf.predict(iris_data_sp)) + sp_clf.sparsify() + assert_array_equal(pred, sp_clf.predict(iris_data_sp)) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_weight(csr_container): + # Test class weights + X_, y_ = make_classification( + n_samples=200, n_features=100, weights=[0.833, 0.167], random_state=0 + ) + + X_ = csr_container(X_) + for clf in ( + linear_model.LogisticRegression(), + svm.LinearSVC(random_state=0), + svm.SVC(), + ): + clf.set_params(class_weight={0: 5}) + clf.fit(X_[:180], y_[:180]) + y_pred = clf.predict(X_[180:]) + assert np.sum(y_pred == y_[180:]) >= 11 + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_sample_weights(lil_container): + # Test weights on individual samples + X_sp = lil_container(X) + + clf = svm.SVC() + clf.fit(X_sp, Y) + assert_array_equal(clf.predict([X[2]]), [1.0]) + + 
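# Giving roughly 100x more weight to the class-2 samples should flip the prediction for X[2] from class 1 to class 2. +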
sample_weight = [0.1] * 3 + [10] * 3 + clf.fit(X_sp, Y, sample_weight=sample_weight) + assert_array_equal(clf.predict([X[2]]), [2.0]) + + +def test_sparse_liblinear_intercept_handling(): + # Test that sparse liblinear honours intercept_scaling param + test_svm.test_dense_liblinear_intercept_handling(svm.LinearSVC) + + +@pytest.mark.parametrize( + "X_train, y_train, X_test", + [ + [X, None, T], + [X2, None, T2], + [X_blobs[:80], None, X_blobs[80:]], + [iris.data, None, iris.data], + ], +) +@pytest.mark.parametrize("kernel", ["linear", "poly", "rbf", "sigmoid"]) +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + LIL_CONTAINERS) +@skip_if_32bit +def test_sparse_oneclasssvm(X_train, y_train, X_test, kernel, sparse_container): + # Check that sparse OneClassSVM gives the same result as dense OneClassSVM + X_train = sparse_container(X_train) + + clf = svm.OneClassSVM(gamma=1, kernel=kernel) + check_svm_model_equal(clf, X_train, y_train, X_test) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_realdata(csr_container): + # Test on a subset from the 20newsgroups dataset. + # This catches some bugs if input is not correctly converted into + # sparse format or weights are not correctly initialized. + data = np.array([0.03771744, 0.1003567, 0.01174647, 0.027069]) + + # SVC does not support large sparse, so we specify int32 indices + # In this case, `csr_matrix` automatically uses int32 regardless of the dtypes of + # `indices` and `indptr` but `csr_array` may or may not use the same dtype as + # `indices` and `indptr`, which would be int64 if not specified + indices = np.array([6, 5, 35, 31], dtype=np.int32) + indptr = np.array([0] * 8 + [1] * 32 + [2] * 38 + [4] * 3, dtype=np.int32) + + X = csr_container((data, indices, indptr)) + y = np.array( + [ + 1.0, + 0.0, + 2.0, + 2.0, + 1.0, + 1.0, + 1.0, + 2.0, + 2.0, + 0.0, + 1.0, + 2.0, + 2.0, + 0.0, + 2.0, + 0.0, + 3.0, + 0.0, + 3.0, + 0.0, + 1.0, + 1.0, + 3.0, + 2.0, + 3.0, + 2.0, + 0.0, + 3.0, + 1.0, + 0.0, + 2.0, + 1.0, + 2.0, + 0.0, + 1.0, + 0.0, + 2.0, + 3.0, + 1.0, + 3.0, + 0.0, + 1.0, + 0.0, + 0.0, + 2.0, + 0.0, + 1.0, + 2.0, + 2.0, + 2.0, + 3.0, + 2.0, + 0.0, + 3.0, + 2.0, + 1.0, + 2.0, + 3.0, + 2.0, + 2.0, + 0.0, + 1.0, + 0.0, + 1.0, + 2.0, + 3.0, + 0.0, + 0.0, + 2.0, + 2.0, + 1.0, + 3.0, + 1.0, + 1.0, + 0.0, + 1.0, + 2.0, + 1.0, + 1.0, + 3.0, + ] + ) + + clf = svm.SVC(kernel="linear").fit(X.toarray(), y) + sp_clf = svm.SVC(kernel="linear").fit(X.tocoo(), y) + + assert_array_equal(clf.support_vectors_, sp_clf.support_vectors_.toarray()) + assert_array_equal(clf.dual_coef_, sp_clf.dual_coef_.toarray()) + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_sparse_svc_clone_with_callable_kernel(lil_container): + # Test that the "dense_fit" is called even though we use sparse input + # meaning that everything works fine. 
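+    # Fitting a clone on sparse input must give the same predictions as the dense fit performed below.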
+ a = svm.SVC(C=1, kernel=lambda x, y: x @ y.T, probability=True, random_state=0) + b = base.clone(a) + + X_sp = lil_container(X) + b.fit(X_sp, Y) + pred = b.predict(X_sp) + b.predict_proba(X_sp) + + dense_svm = svm.SVC( + C=1, kernel=lambda x, y: np.dot(x, y.T), probability=True, random_state=0 + ) + pred_dense = dense_svm.fit(X, Y).predict(X) + assert_array_equal(pred_dense, pred) + # b.decision_function(X_sp) # XXX : should be supported + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_timeout(lil_container): + sp = svm.SVC( + C=1, kernel=lambda x, y: x @ y.T, probability=True, random_state=0, max_iter=1 + ) + warning_msg = ( + r"Solver terminated early \(max_iter=1\). Consider pre-processing " + r"your data with StandardScaler or MinMaxScaler." + ) + with pytest.warns(ConvergenceWarning, match=warning_msg): + sp.fit(lil_container(X), Y) + + +def test_consistent_proba(): + a = svm.SVC(probability=True, max_iter=1, random_state=0) + with ignore_warnings(category=ConvergenceWarning): + proba_1 = a.fit(X, Y).predict_proba(X) + a = svm.SVC(probability=True, max_iter=1, random_state=0) + with ignore_warnings(category=ConvergenceWarning): + proba_2 = a.fit(X, Y).predict_proba(X) + assert_allclose(proba_1, proba_2) diff --git a/.venv/Lib/site-packages/sklearn/svm/tests/test_svm.py b/.venv/Lib/site-packages/sklearn/svm/tests/test_svm.py new file mode 100644 index 0000000000000000000000000000000000000000..82c37d14f77b2520585433ae786c21f0dd87ba8c --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/svm/tests/test_svm.py @@ -0,0 +1,1440 @@ +""" +Testing for Support Vector Machine module (sklearn.svm) + +TODO: remove hard coded numerical results when possible +""" + +import numpy as np +import pytest +from numpy.testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + +from sklearn import base, datasets, linear_model, metrics, svm +from sklearn.datasets import make_blobs, make_classification, make_regression +from sklearn.exceptions import ( + ConvergenceWarning, + NotFittedError, +) +from sklearn.metrics import f1_score +from sklearn.metrics.pairwise import rbf_kernel +from sklearn.model_selection import train_test_split +from sklearn.multiclass import OneVsRestClassifier + +# mypy error: Module 'sklearn.svm' has no attribute '_libsvm' +from sklearn.svm import ( # type: ignore + SVR, + LinearSVC, + LinearSVR, + NuSVR, + OneClassSVM, + _libsvm, +) +from sklearn.svm._classes import _validate_dual_parameter +from sklearn.utils import check_random_state, shuffle +from sklearn.utils.fixes import _IS_32BIT, CSR_CONTAINERS, LIL_CONTAINERS +from sklearn.utils.validation import _num_samples + +# toy sample +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +Y = [1, 1, 1, 2, 2, 2] +T = [[-1, -1], [2, 2], [3, 2]] +true_result = [1, 2, 2] + +# also load the iris dataset +iris = datasets.load_iris() +rng = check_random_state(42) +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + + +def test_libsvm_parameters(): + # Test parameters on classes that make use of libsvm. + clf = svm.SVC(kernel="linear").fit(X, Y) + assert_array_equal(clf.dual_coef_, [[-0.25, 0.25]]) + assert_array_equal(clf.support_, [1, 3]) + assert_array_equal(clf.support_vectors_, (X[1], X[3])) + assert_array_equal(clf.intercept_, [0.0]) + assert_array_equal(clf.predict(X), Y) + + +def test_libsvm_iris(): + # Check consistency on dataset iris. 
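+    # Both the estimator API (svm.SVC) and the low-level _libsvm bindings are exercised here.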
+ + # shuffle the dataset so that labels are not ordered + for k in ("linear", "rbf"): + clf = svm.SVC(kernel=k).fit(iris.data, iris.target) + assert np.mean(clf.predict(iris.data) == iris.target) > 0.9 + assert hasattr(clf, "coef_") == (k == "linear") + + assert_array_equal(clf.classes_, np.sort(clf.classes_)) + + # check also the low-level API + # We unpack the values to create a dictionary with some of the return values + # from Libsvm's fit. + ( + libsvm_support, + libsvm_support_vectors, + libsvm_n_class_SV, + libsvm_sv_coef, + libsvm_intercept, + libsvm_probA, + libsvm_probB, + # libsvm_fit_status and libsvm_n_iter won't be used below. + libsvm_fit_status, + libsvm_n_iter, + ) = _libsvm.fit(iris.data, iris.target.astype(np.float64)) + + model_params = { + "support": libsvm_support, + "SV": libsvm_support_vectors, + "nSV": libsvm_n_class_SV, + "sv_coef": libsvm_sv_coef, + "intercept": libsvm_intercept, + "probA": libsvm_probA, + "probB": libsvm_probB, + } + pred = _libsvm.predict(iris.data, **model_params) + assert np.mean(pred == iris.target) > 0.95 + + # We unpack the values to create a dictionary with some of the return values + # from Libsvm's fit. + ( + libsvm_support, + libsvm_support_vectors, + libsvm_n_class_SV, + libsvm_sv_coef, + libsvm_intercept, + libsvm_probA, + libsvm_probB, + # libsvm_fit_status and libsvm_n_iter won't be used below. + libsvm_fit_status, + libsvm_n_iter, + ) = _libsvm.fit(iris.data, iris.target.astype(np.float64), kernel="linear") + + model_params = { + "support": libsvm_support, + "SV": libsvm_support_vectors, + "nSV": libsvm_n_class_SV, + "sv_coef": libsvm_sv_coef, + "intercept": libsvm_intercept, + "probA": libsvm_probA, + "probB": libsvm_probB, + } + pred = _libsvm.predict(iris.data, **model_params, kernel="linear") + assert np.mean(pred == iris.target) > 0.95 + + pred = _libsvm.cross_validation( + iris.data, iris.target.astype(np.float64), 5, kernel="linear", random_seed=0 + ) + assert np.mean(pred == iris.target) > 0.95 + + # If random_seed >= 0, the libsvm rng is seeded (by calling `srand`), hence + # we should get deterministic results (assuming that there is no other + # thread calling this wrapper calling `srand` concurrently). + pred2 = _libsvm.cross_validation( + iris.data, iris.target.astype(np.float64), 5, kernel="linear", random_seed=0 + ) + assert_array_equal(pred, pred2) + + +def test_precomputed(): + # SVC with a precomputed kernel. + # We test it with a toy dataset and with iris. + clf = svm.SVC(kernel="precomputed") + # Gram matrix for train data (square matrix) + # (we use just a linear kernel) + K = np.dot(X, np.array(X).T) + clf.fit(K, Y) + # Gram matrix for test data (rectangular matrix) + KT = np.dot(T, np.array(X).T) + pred = clf.predict(KT) + with pytest.raises(ValueError): + clf.predict(KT.T) + + assert_array_equal(clf.dual_coef_, [[-0.25, 0.25]]) + assert_array_equal(clf.support_, [1, 3]) + assert_array_equal(clf.intercept_, [0]) + assert_array_almost_equal(clf.support_, [1, 3]) + assert_array_equal(pred, true_result) + + # Gram matrix for test data but compute KT[i,j] + # for support vectors j only. + KT = np.zeros_like(KT) + for i in range(len(T)): + for j in clf.support_: + KT[i, j] = np.dot(T[i], X[j]) + + pred = clf.predict(KT) + assert_array_equal(pred, true_result) + + # same as before, but using a callable function instead of the kernel + # matrix. 
kernel is just a linear kernel + + def kfunc(x, y): + return np.dot(x, y.T) + + clf = svm.SVC(kernel=kfunc) + clf.fit(np.array(X), Y) + pred = clf.predict(T) + + assert_array_equal(clf.dual_coef_, [[-0.25, 0.25]]) + assert_array_equal(clf.intercept_, [0]) + assert_array_almost_equal(clf.support_, [1, 3]) + assert_array_equal(pred, true_result) + + # test a precomputed kernel with the iris dataset + # and check parameters against a linear SVC + clf = svm.SVC(kernel="precomputed") + clf2 = svm.SVC(kernel="linear") + K = np.dot(iris.data, iris.data.T) + clf.fit(K, iris.target) + clf2.fit(iris.data, iris.target) + pred = clf.predict(K) + assert_array_almost_equal(clf.support_, clf2.support_) + assert_array_almost_equal(clf.dual_coef_, clf2.dual_coef_) + assert_array_almost_equal(clf.intercept_, clf2.intercept_) + assert_almost_equal(np.mean(pred == iris.target), 0.99, decimal=2) + + # Gram matrix for test data but compute KT[i,j] + # for support vectors j only. + K = np.zeros_like(K) + for i in range(len(iris.data)): + for j in clf.support_: + K[i, j] = np.dot(iris.data[i], iris.data[j]) + + pred = clf.predict(K) + assert_almost_equal(np.mean(pred == iris.target), 0.99, decimal=2) + + clf = svm.SVC(kernel=kfunc) + clf.fit(iris.data, iris.target) + assert_almost_equal(np.mean(pred == iris.target), 0.99, decimal=2) + + +def test_svr(): + # Test Support Vector Regression + + diabetes = datasets.load_diabetes() + for clf in ( + svm.NuSVR(kernel="linear", nu=0.4, C=1.0), + svm.NuSVR(kernel="linear", nu=0.4, C=10.0), + svm.SVR(kernel="linear", C=10.0), + svm.LinearSVR(C=10.0), + svm.LinearSVR(C=10.0), + ): + clf.fit(diabetes.data, diabetes.target) + assert clf.score(diabetes.data, diabetes.target) > 0.02 + + # non-regression test; previously, BaseLibSVM would check that + # len(np.unique(y)) < 2, which must only be done for SVC + svm.SVR().fit(diabetes.data, np.ones(len(diabetes.data))) + svm.LinearSVR().fit(diabetes.data, np.ones(len(diabetes.data))) + + +def test_linearsvr(): + # check that SVR(kernel='linear') and LinearSVC() give + # comparable results + diabetes = datasets.load_diabetes() + lsvr = svm.LinearSVR(C=1e3).fit(diabetes.data, diabetes.target) + score1 = lsvr.score(diabetes.data, diabetes.target) + + svr = svm.SVR(kernel="linear", C=1e3).fit(diabetes.data, diabetes.target) + score2 = svr.score(diabetes.data, diabetes.target) + + assert_allclose(np.linalg.norm(lsvr.coef_), np.linalg.norm(svr.coef_), 1, 0.0001) + assert_almost_equal(score1, score2, 2) + + +def test_linearsvr_fit_sampleweight(): + # check correct result when sample_weight is 1 + # check that SVR(kernel='linear') and LinearSVC() give + # comparable results + diabetes = datasets.load_diabetes() + n_samples = len(diabetes.target) + unit_weight = np.ones(n_samples) + lsvr = svm.LinearSVR(C=1e3, tol=1e-12, max_iter=10000).fit( + diabetes.data, diabetes.target, sample_weight=unit_weight + ) + score1 = lsvr.score(diabetes.data, diabetes.target) + + lsvr_no_weight = svm.LinearSVR(C=1e3, tol=1e-12, max_iter=10000).fit( + diabetes.data, diabetes.target + ) + score2 = lsvr_no_weight.score(diabetes.data, diabetes.target) + + assert_allclose( + np.linalg.norm(lsvr.coef_), np.linalg.norm(lsvr_no_weight.coef_), 1, 0.0001 + ) + assert_almost_equal(score1, score2, 2) + + # check that fit(X) = fit([X1, X2, X3], sample_weight = [n1, n2, n3]) where + # X = X1 repeated n1 times, X2 repeated n2 times and so forth + random_state = check_random_state(0) + random_weight = random_state.randint(0, 10, n_samples) + lsvr_unflat = 
svm.LinearSVR(C=1e3, tol=1e-12, max_iter=10000).fit( + diabetes.data, diabetes.target, sample_weight=random_weight + ) + score3 = lsvr_unflat.score( + diabetes.data, diabetes.target, sample_weight=random_weight + ) + + X_flat = np.repeat(diabetes.data, random_weight, axis=0) + y_flat = np.repeat(diabetes.target, random_weight, axis=0) + lsvr_flat = svm.LinearSVR(C=1e3, tol=1e-12, max_iter=10000).fit(X_flat, y_flat) + score4 = lsvr_flat.score(X_flat, y_flat) + + assert_almost_equal(score3, score4, 2) + + +def test_svr_errors(): + X = [[0.0], [1.0]] + y = [0.0, 0.5] + + # Bad kernel + clf = svm.SVR(kernel=lambda x, y: np.array([[1.0]])) + clf.fit(X, y) + with pytest.raises(ValueError): + clf.predict(X) + + +def test_oneclass(): + # Test OneClassSVM + clf = svm.OneClassSVM() + clf.fit(X) + pred = clf.predict(T) + + assert_array_equal(pred, [1, -1, -1]) + assert pred.dtype == np.dtype("intp") + assert_array_almost_equal(clf.intercept_, [-1.218], decimal=3) + assert_array_almost_equal(clf.dual_coef_, [[0.750, 0.750, 0.750, 0.750]], decimal=3) + with pytest.raises(AttributeError): + (lambda: clf.coef_)() + + +def test_oneclass_decision_function(): + # Test OneClassSVM decision function + clf = svm.OneClassSVM() + rnd = check_random_state(2) + + # Generate train data + X = 0.3 * rnd.randn(100, 2) + X_train = np.r_[X + 2, X - 2] + + # Generate some regular novel observations + X = 0.3 * rnd.randn(20, 2) + X_test = np.r_[X + 2, X - 2] + # Generate some abnormal novel observations + X_outliers = rnd.uniform(low=-4, high=4, size=(20, 2)) + + # fit the model + clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1) + clf.fit(X_train) + + # predict things + y_pred_test = clf.predict(X_test) + assert np.mean(y_pred_test == 1) > 0.9 + y_pred_outliers = clf.predict(X_outliers) + assert np.mean(y_pred_outliers == -1) > 0.9 + dec_func_test = clf.decision_function(X_test) + assert_array_equal((dec_func_test > 0).ravel(), y_pred_test == 1) + dec_func_outliers = clf.decision_function(X_outliers) + assert_array_equal((dec_func_outliers > 0).ravel(), y_pred_outliers == 1) + + +def test_oneclass_score_samples(): + X_train = [[1, 1], [1, 2], [2, 1]] + clf = svm.OneClassSVM(gamma=1).fit(X_train) + assert_array_equal( + clf.score_samples([[2.0, 2.0]]), + clf.decision_function([[2.0, 2.0]]) + clf.offset_, + ) + + +def test_tweak_params(): + # Make sure some tweaking of parameters works. + # We change clf.dual_coef_ at run time and expect .predict() to change + # accordingly. Notice that this is not trivial since it involves a lot + # of C/Python copying in the libsvm bindings. + # The success of this test ensures that the mapping between libsvm and + # the python classifier is complete. + clf = svm.SVC(kernel="linear", C=1.0) + clf.fit(X, Y) + assert_array_equal(clf.dual_coef_, [[-0.25, 0.25]]) + assert_array_equal(clf.predict([[-0.1, -0.1]]), [1]) + clf._dual_coef_ = np.array([[0.0, 1.0]]) + assert_array_equal(clf.predict([[-0.1, -0.1]]), [2]) + + +def test_probability(): + # Predict probabilities using SVC + # This uses cross validation, so we use a slightly bigger testing set. 
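+    # For each classifier: per-sample probabilities must sum to one, their argmax must agree
+    # with predict() for most samples, and predict_log_proba must match log(predict_proba).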
+ + for clf in ( + svm.SVC(probability=True, random_state=0, C=1.0), + svm.NuSVC(probability=True, random_state=0), + ): + clf.fit(iris.data, iris.target) + + prob_predict = clf.predict_proba(iris.data) + assert_array_almost_equal(np.sum(prob_predict, 1), np.ones(iris.data.shape[0])) + assert np.mean(np.argmax(prob_predict, 1) == clf.predict(iris.data)) > 0.9 + + assert_almost_equal( + clf.predict_proba(iris.data), np.exp(clf.predict_log_proba(iris.data)), 8 + ) + + +def test_decision_function(): + # Test decision_function + # Sanity check, test that decision_function implemented in python + # returns the same as the one in libsvm + # multi class: + clf = svm.SVC(kernel="linear", C=0.1, decision_function_shape="ovo").fit( + iris.data, iris.target + ) + + dec = np.dot(iris.data, clf.coef_.T) + clf.intercept_ + + assert_array_almost_equal(dec, clf.decision_function(iris.data)) + + # binary: + clf.fit(X, Y) + dec = np.dot(X, clf.coef_.T) + clf.intercept_ + prediction = clf.predict(X) + assert_array_almost_equal(dec.ravel(), clf.decision_function(X)) + assert_array_almost_equal( + prediction, clf.classes_[(clf.decision_function(X) > 0).astype(int)] + ) + expected = np.array([-1.0, -0.66, -1.0, 0.66, 1.0, 1.0]) + assert_array_almost_equal(clf.decision_function(X), expected, 2) + + # kernel binary: + clf = svm.SVC(kernel="rbf", gamma=1, decision_function_shape="ovo") + clf.fit(X, Y) + + rbfs = rbf_kernel(X, clf.support_vectors_, gamma=clf.gamma) + dec = np.dot(rbfs, clf.dual_coef_.T) + clf.intercept_ + assert_array_almost_equal(dec.ravel(), clf.decision_function(X)) + + +@pytest.mark.parametrize("SVM", (svm.SVC, svm.NuSVC)) +def test_decision_function_shape(SVM): + # check that decision_function_shape='ovr' or 'ovo' gives + # correct shape and is consistent with predict + + clf = SVM(kernel="linear", decision_function_shape="ovr").fit( + iris.data, iris.target + ) + dec = clf.decision_function(iris.data) + assert dec.shape == (len(iris.data), 3) + assert_array_equal(clf.predict(iris.data), np.argmax(dec, axis=1)) + + # with five classes: + X, y = make_blobs(n_samples=80, centers=5, random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + clf = SVM(kernel="linear", decision_function_shape="ovr").fit(X_train, y_train) + dec = clf.decision_function(X_test) + assert dec.shape == (len(X_test), 5) + assert_array_equal(clf.predict(X_test), np.argmax(dec, axis=1)) + + # check the shape of the decision function with decision_function_shape='ovo' + clf = SVM(kernel="linear", decision_function_shape="ovo").fit(X_train, y_train) + dec = clf.decision_function(X_train) + assert dec.shape == (len(X_train), 10) + + +def test_svr_predict(): + # Test SVR's decision_function + # Sanity check, test that predict implemented in python + # returns the same as the one in libsvm + + X = iris.data + y = iris.target + + # linear kernel + reg = svm.SVR(kernel="linear", C=0.1).fit(X, y) + + dec = np.dot(X, reg.coef_.T) + reg.intercept_ + assert_array_almost_equal(dec.ravel(), reg.predict(X).ravel()) + + # rbf kernel + reg = svm.SVR(kernel="rbf", gamma=1).fit(X, y) + + rbfs = rbf_kernel(X, reg.support_vectors_, gamma=reg.gamma) + dec = np.dot(rbfs, reg.dual_coef_.T) + reg.intercept_ + assert_array_almost_equal(dec.ravel(), reg.predict(X).ravel()) + + +def test_weight(): + # Test class weights + clf = svm.SVC(class_weight={1: 0.1}) + # we give a small weight to class 1 + clf.fit(X, Y) + # so all predicted values belong to class 2 + assert_array_almost_equal(clf.predict(X), [2] * 6) + + X_, y_ = make_classification( + 
n_samples=200, n_features=10, weights=[0.833, 0.167], random_state=2 + ) + + for clf in ( + linear_model.LogisticRegression(), + svm.LinearSVC(random_state=0), + svm.SVC(), + ): + clf.set_params(class_weight={0: 0.1, 1: 10}) + clf.fit(X_[:100], y_[:100]) + y_pred = clf.predict(X_[100:]) + assert f1_score(y_[100:], y_pred) > 0.3 + + +@pytest.mark.parametrize("estimator", [svm.SVC(C=1e-2), svm.NuSVC()]) +def test_svm_classifier_sided_sample_weight(estimator): + # fit a linear SVM and check that giving more weight to opposed samples + # in the space will flip the decision toward these samples. + X = [[-2, 0], [-1, -1], [0, -2], [0, 2], [1, 1], [2, 0]] + estimator.set_params(kernel="linear") + + # check that with unit weights, a sample is supposed to be predicted on + # the boundary + sample_weight = [1] * 6 + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.decision_function([[-1.0, 1.0]]) + assert y_pred == pytest.approx(0) + + # give more weights to opposed samples + sample_weight = [10.0, 0.1, 0.1, 0.1, 0.1, 10] + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.decision_function([[-1.0, 1.0]]) + assert y_pred < 0 + + sample_weight = [1.0, 0.1, 10.0, 10.0, 0.1, 0.1] + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.decision_function([[-1.0, 1.0]]) + assert y_pred > 0 + + +@pytest.mark.parametrize("estimator", [svm.SVR(C=1e-2), svm.NuSVR(C=1e-2)]) +def test_svm_regressor_sided_sample_weight(estimator): + # similar test to test_svm_classifier_sided_sample_weight but for + # SVM regressors + X = [[-2, 0], [-1, -1], [0, -2], [0, 2], [1, 1], [2, 0]] + estimator.set_params(kernel="linear") + + # check that with unit weights, a sample is supposed to be predicted on + # the boundary + sample_weight = [1] * 6 + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.predict([[-1.0, 1.0]]) + assert y_pred == pytest.approx(1.5) + + # give more weights to opposed samples + sample_weight = [10.0, 0.1, 0.1, 0.1, 0.1, 10] + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.predict([[-1.0, 1.0]]) + assert y_pred < 1.5 + + sample_weight = [1.0, 0.1, 10.0, 10.0, 0.1, 0.1] + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.predict([[-1.0, 1.0]]) + assert y_pred > 1.5 + + +def test_svm_equivalence_sample_weight_C(): + # test that rescaling all samples is the same as changing C + clf = svm.SVC() + clf.fit(X, Y) + dual_coef_no_weight = clf.dual_coef_ + clf.set_params(C=100) + clf.fit(X, Y, sample_weight=np.repeat(0.01, len(X))) + assert_allclose(dual_coef_no_weight, clf.dual_coef_) + + +@pytest.mark.parametrize( + "Estimator, err_msg", + [ + (svm.SVC, "Invalid input - all samples have zero or negative weights."), + (svm.NuSVC, "(negative dimensions are not allowed|nu is infeasible)"), + (svm.SVR, "Invalid input - all samples have zero or negative weights."), + (svm.NuSVR, "Invalid input - all samples have zero or negative weights."), + (svm.OneClassSVM, "Invalid input - all samples have zero or negative weights."), + ], + ids=["SVC", "NuSVC", "SVR", "NuSVR", "OneClassSVM"], +) +@pytest.mark.parametrize( + "sample_weight", + [[0] * len(Y), [-0.3] * len(Y)], + ids=["weights-are-zero", "weights-are-negative"], +) +def test_negative_sample_weights_mask_all_samples(Estimator, err_msg, sample_weight): + est = Estimator(kernel="linear") + with pytest.raises(ValueError, match=err_msg): + est.fit(X, Y, sample_weight=sample_weight) + + +@pytest.mark.parametrize( + "Classifier, err_msg", + [ + ( + 
svm.SVC, + ( + "Invalid input - all samples with positive weights belong to the same" + " class" + ), + ), + (svm.NuSVC, "specified nu is infeasible"), + ], + ids=["SVC", "NuSVC"], +) +@pytest.mark.parametrize( + "sample_weight", + [[0, -0.5, 0, 1, 1, 1], [1, 1, 1, 0, -0.1, -0.3]], + ids=["mask-label-1", "mask-label-2"], +) +def test_negative_weights_svc_leave_just_one_label(Classifier, err_msg, sample_weight): + clf = Classifier(kernel="linear") + with pytest.raises(ValueError, match=err_msg): + clf.fit(X, Y, sample_weight=sample_weight) + + +@pytest.mark.parametrize( + "Classifier, model", + [ + (svm.SVC, {"when-left": [0.3998, 0.4], "when-right": [0.4, 0.3999]}), + (svm.NuSVC, {"when-left": [0.3333, 0.3333], "when-right": [0.3333, 0.3333]}), + ], + ids=["SVC", "NuSVC"], +) +@pytest.mark.parametrize( + "sample_weight, mask_side", + [([1, -0.5, 1, 1, 1, 1], "when-left"), ([1, 1, 1, 0, 1, 1], "when-right")], + ids=["partial-mask-label-1", "partial-mask-label-2"], +) +def test_negative_weights_svc_leave_two_labels( + Classifier, model, sample_weight, mask_side +): + clf = Classifier(kernel="linear") + clf.fit(X, Y, sample_weight=sample_weight) + assert_allclose(clf.coef_, [model[mask_side]], rtol=1e-3) + + +@pytest.mark.parametrize( + "Estimator", [svm.SVC, svm.NuSVC, svm.NuSVR], ids=["SVC", "NuSVC", "NuSVR"] +) +@pytest.mark.parametrize( + "sample_weight", + [[1, -0.5, 1, 1, 1, 1], [1, 1, 1, 0, 1, 1]], + ids=["partial-mask-label-1", "partial-mask-label-2"], +) +def test_negative_weight_equal_coeffs(Estimator, sample_weight): + # model generates equal coefficients + est = Estimator(kernel="linear") + est.fit(X, Y, sample_weight=sample_weight) + coef = np.abs(est.coef_).ravel() + assert coef[0] == pytest.approx(coef[1], rel=1e-3) + + +def test_auto_weight(): + # Test class weights for imbalanced data + from sklearn.linear_model import LogisticRegression + + # We take as dataset the two-dimensional projection of iris so + # that it is not separable and remove half of predictors from + # class 1. + # We add one to the targets as a non-regression test: + # class_weight="balanced" + # used to work only when the labels where a range [0..K). + from sklearn.utils import compute_class_weight + + X, y = iris.data[:, :2], iris.target + 1 + unbalanced = np.delete(np.arange(y.size), np.where(y > 2)[0][::2]) + + classes = np.unique(y[unbalanced]) + class_weights = compute_class_weight("balanced", classes=classes, y=y[unbalanced]) + assert np.argmax(class_weights) == 2 + + for clf in ( + svm.SVC(kernel="linear"), + svm.LinearSVC(random_state=0), + LogisticRegression(), + ): + # check that score is better when class='balanced' is set. + y_pred = clf.fit(X[unbalanced], y[unbalanced]).predict(X) + clf.set_params(class_weight="balanced") + y_pred_balanced = clf.fit( + X[unbalanced], + y[unbalanced], + ).predict(X) + assert metrics.f1_score(y, y_pred, average="macro") <= metrics.f1_score( + y, y_pred_balanced, average="macro" + ) + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_bad_input(lil_container): + # Test dimensions for labels + Y2 = Y[:-1] # wrong dimensions for labels + with pytest.raises(ValueError): + svm.SVC().fit(X, Y2) + + # Test with arrays that are non-contiguous. 
+ for clf in (svm.SVC(), svm.LinearSVC(random_state=0)): + Xf = np.asfortranarray(X) + assert not Xf.flags["C_CONTIGUOUS"] + yf = np.ascontiguousarray(np.tile(Y, (2, 1)).T) + yf = yf[:, -1] + assert not yf.flags["F_CONTIGUOUS"] + assert not yf.flags["C_CONTIGUOUS"] + clf.fit(Xf, yf) + assert_array_equal(clf.predict(T), true_result) + + # error for precomputed kernelsx + clf = svm.SVC(kernel="precomputed") + with pytest.raises(ValueError): + clf.fit(X, Y) + + # predict with sparse input when trained with dense + clf = svm.SVC().fit(X, Y) + with pytest.raises(ValueError): + clf.predict(lil_container(X)) + + Xt = np.array(X).T + clf.fit(np.dot(X, Xt), Y) + with pytest.raises(ValueError): + clf.predict(X) + + clf = svm.SVC() + clf.fit(X, Y) + with pytest.raises(ValueError): + clf.predict(Xt) + + +def test_svc_nonfinite_params(): + # Check SVC throws ValueError when dealing with non-finite parameter values + rng = np.random.RandomState(0) + n_samples = 10 + fmax = np.finfo(np.float64).max + X = fmax * rng.uniform(size=(n_samples, 2)) + y = rng.randint(0, 2, size=n_samples) + + clf = svm.SVC() + msg = "The dual coefficients or intercepts are not finite" + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) + + +def test_unicode_kernel(): + # Test that a unicode kernel name does not cause a TypeError + clf = svm.SVC(kernel="linear", probability=True) + clf.fit(X, Y) + clf.predict_proba(T) + _libsvm.cross_validation( + iris.data, iris.target.astype(np.float64), 5, kernel="linear", random_seed=0 + ) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_precomputed(csr_container): + clf = svm.SVC(kernel="precomputed") + sparse_gram = csr_container([[1, 0], [0, 1]]) + with pytest.raises(TypeError, match="Sparse precomputed"): + clf.fit(sparse_gram, [0, 1]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_fit_support_vectors_empty(csr_container): + # Regression test for #14893 + X_train = csr_container([[0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0, 0, 0, 1]]) + y_train = np.array([0.04, 0.04, 0.10, 0.16]) + model = svm.SVR(kernel="linear") + model.fit(X_train, y_train) + assert not model.support_vectors_.data.size + assert not model.dual_coef_.data.size + + +@pytest.mark.parametrize("loss", ["hinge", "squared_hinge"]) +@pytest.mark.parametrize("penalty", ["l1", "l2"]) +@pytest.mark.parametrize("dual", [True, False]) +def test_linearsvc_parameters(loss, penalty, dual): + # Test possible parameter combinations in LinearSVC + # Generate list of possible parameter combinations + X, y = make_classification(n_samples=5, n_features=5, random_state=0) + + clf = svm.LinearSVC(penalty=penalty, loss=loss, dual=dual, random_state=0) + if ( + (loss, penalty) == ("hinge", "l1") + or (loss, penalty, dual) == ("hinge", "l2", False) + or (penalty, dual) == ("l1", True) + ): + with pytest.raises( + ValueError, + match="Unsupported set of arguments.*penalty='%s.*loss='%s.*dual=%s" + % (penalty, loss, dual), + ): + clf.fit(X, y) + else: + clf.fit(X, y) + + +def test_linearsvc(): + # Test basic routines using LinearSVC + clf = svm.LinearSVC(random_state=0).fit(X, Y) + + # by default should have intercept + assert clf.fit_intercept + + assert_array_equal(clf.predict(T), true_result) + assert_array_almost_equal(clf.intercept_, [0], decimal=3) + + # the same with l1 penalty + clf = svm.LinearSVC( + penalty="l1", loss="squared_hinge", dual=False, random_state=0 + ).fit(X, Y) + assert_array_equal(clf.predict(T), true_result) + + # l2 penalty with dual formulation 
+ clf = svm.LinearSVC(penalty="l2", dual=True, random_state=0).fit(X, Y) + assert_array_equal(clf.predict(T), true_result) + + # l2 penalty, l1 loss + clf = svm.LinearSVC(penalty="l2", loss="hinge", dual=True, random_state=0) + clf.fit(X, Y) + assert_array_equal(clf.predict(T), true_result) + + # test also decision function + dec = clf.decision_function(T) + res = (dec > 0).astype(int) + 1 + assert_array_equal(res, true_result) + + +def test_linearsvc_crammer_singer(): + # Test LinearSVC with crammer_singer multi-class svm + ovr_clf = svm.LinearSVC(random_state=0).fit(iris.data, iris.target) + cs_clf = svm.LinearSVC(multi_class="crammer_singer", random_state=0) + cs_clf.fit(iris.data, iris.target) + + # similar prediction for ovr and crammer-singer: + assert (ovr_clf.predict(iris.data) == cs_clf.predict(iris.data)).mean() > 0.9 + + # classifiers shouldn't be the same + assert (ovr_clf.coef_ != cs_clf.coef_).all() + + # test decision function + assert_array_equal( + cs_clf.predict(iris.data), + np.argmax(cs_clf.decision_function(iris.data), axis=1), + ) + dec_func = np.dot(iris.data, cs_clf.coef_.T) + cs_clf.intercept_ + assert_array_almost_equal(dec_func, cs_clf.decision_function(iris.data)) + + +def test_linearsvc_fit_sampleweight(): + # check correct result when sample_weight is 1 + n_samples = len(X) + unit_weight = np.ones(n_samples) + clf = svm.LinearSVC(random_state=0).fit(X, Y) + clf_unitweight = svm.LinearSVC(random_state=0, tol=1e-12, max_iter=1000).fit( + X, Y, sample_weight=unit_weight + ) + + # check if same as sample_weight=None + assert_array_equal(clf_unitweight.predict(T), clf.predict(T)) + assert_allclose(clf.coef_, clf_unitweight.coef_, 1, 0.0001) + + # check that fit(X) = fit([X1, X2, X3],sample_weight = [n1, n2, n3]) where + # X = X1 repeated n1 times, X2 repeated n2 times and so forth + + random_state = check_random_state(0) + random_weight = random_state.randint(0, 10, n_samples) + lsvc_unflat = svm.LinearSVC(random_state=0, tol=1e-12, max_iter=1000).fit( + X, Y, sample_weight=random_weight + ) + + pred1 = lsvc_unflat.predict(T) + + X_flat = np.repeat(X, random_weight, axis=0) + y_flat = np.repeat(Y, random_weight, axis=0) + lsvc_flat = svm.LinearSVC(random_state=0, tol=1e-12, max_iter=1000).fit( + X_flat, y_flat + ) + pred2 = lsvc_flat.predict(T) + + assert_array_equal(pred1, pred2) + assert_allclose(lsvc_unflat.coef_, lsvc_flat.coef_, 1, 0.0001) + + +def test_crammer_singer_binary(): + # Test Crammer-Singer formulation in the binary case + X, y = make_classification(n_classes=2, random_state=0) + + for fit_intercept in (True, False): + acc = ( + svm.LinearSVC( + fit_intercept=fit_intercept, + multi_class="crammer_singer", + random_state=0, + ) + .fit(X, y) + .score(X, y) + ) + assert acc > 0.9 + + +def test_linearsvc_iris(): + # Test that LinearSVC gives plausible predictions on the iris dataset + # Also, test symbolic class names (classes_). 
+ target = iris.target_names[iris.target] + clf = svm.LinearSVC(random_state=0).fit(iris.data, target) + assert set(clf.classes_) == set(iris.target_names) + assert np.mean(clf.predict(iris.data) == target) > 0.8 + + dec = clf.decision_function(iris.data) + pred = iris.target_names[np.argmax(dec, 1)] + assert_array_equal(pred, clf.predict(iris.data)) + + +def test_dense_liblinear_intercept_handling(classifier=svm.LinearSVC): + # Test that dense liblinear honours intercept_scaling param + X = [[2, 1], [3, 1], [1, 3], [2, 3]] + y = [0, 0, 1, 1] + clf = classifier( + fit_intercept=True, + penalty="l1", + loss="squared_hinge", + dual=False, + C=4, + tol=1e-7, + random_state=0, + ) + assert clf.intercept_scaling == 1, clf.intercept_scaling + assert clf.fit_intercept + + # when intercept_scaling is low the intercept value is highly "penalized" + # by regularization + clf.intercept_scaling = 1 + clf.fit(X, y) + assert_almost_equal(clf.intercept_, 0, decimal=5) + + # when intercept_scaling is sufficiently high, the intercept value + # is not affected by regularization + clf.intercept_scaling = 100 + clf.fit(X, y) + intercept1 = clf.intercept_ + assert intercept1 < -1 + + # when intercept_scaling is sufficiently high, the intercept value + # doesn't depend on intercept_scaling value + clf.intercept_scaling = 1000 + clf.fit(X, y) + intercept2 = clf.intercept_ + assert_array_almost_equal(intercept1, intercept2, decimal=2) + + +def test_liblinear_set_coef(): + # multi-class case + clf = svm.LinearSVC().fit(iris.data, iris.target) + values = clf.decision_function(iris.data) + clf.coef_ = clf.coef_.copy() + clf.intercept_ = clf.intercept_.copy() + values2 = clf.decision_function(iris.data) + assert_array_almost_equal(values, values2) + + # binary-class case + X = [[2, 1], [3, 1], [1, 3], [2, 3]] + y = [0, 0, 1, 1] + + clf = svm.LinearSVC().fit(X, y) + values = clf.decision_function(X) + clf.coef_ = clf.coef_.copy() + clf.intercept_ = clf.intercept_.copy() + values2 = clf.decision_function(X) + assert_array_equal(values, values2) + + +def test_immutable_coef_property(): + # Check that primal coef modification are not silently ignored + svms = [ + svm.SVC(kernel="linear").fit(iris.data, iris.target), + svm.NuSVC(kernel="linear").fit(iris.data, iris.target), + svm.SVR(kernel="linear").fit(iris.data, iris.target), + svm.NuSVR(kernel="linear").fit(iris.data, iris.target), + svm.OneClassSVM(kernel="linear").fit(iris.data), + ] + for clf in svms: + with pytest.raises(AttributeError): + clf.__setattr__("coef_", np.arange(3)) + with pytest.raises((RuntimeError, ValueError)): + clf.coef_.__setitem__((0, 0), 0) + + +def test_linearsvc_verbose(): + # stdout: redirect + import os + + stdout = os.dup(1) # save original stdout + os.dup2(os.pipe()[1], 1) # replace it + + # actual call + clf = svm.LinearSVC(verbose=1) + clf.fit(X, Y) + + # stdout: restore + os.dup2(stdout, 1) # restore original stdout + + +def test_svc_clone_with_callable_kernel(): + # create SVM with callable linear kernel, check that results are the same + # as with built-in linear kernel + svm_callable = svm.SVC( + kernel=lambda x, y: np.dot(x, y.T), + probability=True, + random_state=0, + decision_function_shape="ovr", + ) + # clone for checking clonability with lambda functions.. 
+ svm_cloned = base.clone(svm_callable) + svm_cloned.fit(iris.data, iris.target) + + svm_builtin = svm.SVC( + kernel="linear", probability=True, random_state=0, decision_function_shape="ovr" + ) + svm_builtin.fit(iris.data, iris.target) + + assert_array_almost_equal(svm_cloned.dual_coef_, svm_builtin.dual_coef_) + assert_array_almost_equal(svm_cloned.intercept_, svm_builtin.intercept_) + assert_array_equal(svm_cloned.predict(iris.data), svm_builtin.predict(iris.data)) + + assert_array_almost_equal( + svm_cloned.predict_proba(iris.data), + svm_builtin.predict_proba(iris.data), + decimal=4, + ) + assert_array_almost_equal( + svm_cloned.decision_function(iris.data), + svm_builtin.decision_function(iris.data), + ) + + +def test_svc_bad_kernel(): + svc = svm.SVC(kernel=lambda x, y: x) + with pytest.raises(ValueError): + svc.fit(X, Y) + + +def test_libsvm_convergence_warnings(): + a = svm.SVC( + kernel=lambda x, y: np.dot(x, y.T), probability=True, random_state=0, max_iter=2 + ) + warning_msg = ( + r"Solver terminated early \(max_iter=2\). Consider pre-processing " + r"your data with StandardScaler or MinMaxScaler." + ) + with pytest.warns(ConvergenceWarning, match=warning_msg): + a.fit(np.array(X), Y) + assert np.all(a.n_iter_ == 2) + + +def test_unfitted(): + X = "foo!" # input validation not required when SVM not fitted + + clf = svm.SVC() + with pytest.raises(Exception, match=r".*\bSVC\b.*\bnot\b.*\bfitted\b"): + clf.predict(X) + + clf = svm.NuSVR() + with pytest.raises(Exception, match=r".*\bNuSVR\b.*\bnot\b.*\bfitted\b"): + clf.predict(X) + + +# ignore convergence warnings from max_iter=1 +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_consistent_proba(): + a = svm.SVC(probability=True, max_iter=1, random_state=0) + proba_1 = a.fit(X, Y).predict_proba(X) + a = svm.SVC(probability=True, max_iter=1, random_state=0) + proba_2 = a.fit(X, Y).predict_proba(X) + assert_array_almost_equal(proba_1, proba_2) + + +def test_linear_svm_convergence_warnings(): + # Test that warnings are raised if model does not converge + + lsvc = svm.LinearSVC(random_state=0, max_iter=2) + warning_msg = "Liblinear failed to converge, increase the number of iterations." + with pytest.warns(ConvergenceWarning, match=warning_msg): + lsvc.fit(X, Y) + # Check that we have an n_iter_ attribute with int type as opposed to a + # numpy array or an np.int32 so as to match the docstring. + assert isinstance(lsvc.n_iter_, int) + assert lsvc.n_iter_ == 2 + + lsvr = svm.LinearSVR(random_state=0, max_iter=2) + with pytest.warns(ConvergenceWarning, match=warning_msg): + lsvr.fit(iris.data, iris.target) + assert isinstance(lsvr.n_iter_, int) + assert lsvr.n_iter_ == 2 + + +def test_svr_coef_sign(): + # Test that SVR(kernel="linear") has coef_ with the right sign. + # Non-regression test for #2933. 
+ X = np.random.RandomState(21).randn(10, 3) + y = np.random.RandomState(12).randn(10) + + for svr in [ + svm.SVR(kernel="linear"), + svm.NuSVR(kernel="linear"), + svm.LinearSVR(), + ]: + svr.fit(X, y) + assert_array_almost_equal( + svr.predict(X), np.dot(X, svr.coef_.ravel()) + svr.intercept_ + ) + + +def test_lsvc_intercept_scaling_zero(): + # Test that intercept_scaling is ignored when fit_intercept is False + + lsvc = svm.LinearSVC(fit_intercept=False) + lsvc.fit(X, Y) + assert lsvc.intercept_ == 0.0 + + +def test_hasattr_predict_proba(): + # Method must be (un)available before or after fit, switched by + # `probability` param + + G = svm.SVC(probability=True) + assert hasattr(G, "predict_proba") + G.fit(iris.data, iris.target) + assert hasattr(G, "predict_proba") + + G = svm.SVC(probability=False) + assert not hasattr(G, "predict_proba") + G.fit(iris.data, iris.target) + assert not hasattr(G, "predict_proba") + + # Switching to `probability=True` after fitting should make + # predict_proba available, but calling it must not work: + G.probability = True + assert hasattr(G, "predict_proba") + msg = "predict_proba is not available when fitted with probability=False" + + with pytest.raises(NotFittedError, match=msg): + G.predict_proba(iris.data) + + +def test_decision_function_shape_two_class(): + for n_classes in [2, 3]: + X, y = make_blobs(centers=n_classes, random_state=0) + for estimator in [svm.SVC, svm.NuSVC]: + clf = OneVsRestClassifier(estimator(decision_function_shape="ovr")).fit( + X, y + ) + assert len(clf.predict(X)) == len(y) + + +def test_ovr_decision_function(): + # One point from each quadrant represents one class + X_train = np.array([[1, 1], [-1, 1], [-1, -1], [1, -1]]) + y_train = [0, 1, 2, 3] + + # First point is closer to the decision boundaries than the second point + base_points = np.array([[5, 5], [10, 10]]) + + # For all the quadrants (classes) + X_test = np.vstack( + ( + base_points * [1, 1], # Q1 + base_points * [-1, 1], # Q2 + base_points * [-1, -1], # Q3 + base_points * [1, -1], # Q4 + ) + ) + + y_test = [0] * 2 + [1] * 2 + [2] * 2 + [3] * 2 + + clf = svm.SVC(kernel="linear", decision_function_shape="ovr") + clf.fit(X_train, y_train) + + y_pred = clf.predict(X_test) + + # Test if the prediction is the same as y + assert_array_equal(y_pred, y_test) + + deci_val = clf.decision_function(X_test) + + # Assert that the predicted class has the maximum value + assert_array_equal(np.argmax(deci_val, axis=1), y_pred) + + # Get decision value at test points for the predicted class + pred_class_deci_val = deci_val[range(8), y_pred].reshape((4, 2)) + + # Assert pred_class_deci_val > 0 here + assert np.min(pred_class_deci_val) > 0.0 + + # Test if the first point has lower decision value on every quadrant + # compared to the second point + assert np.all(pred_class_deci_val[:, 0] < pred_class_deci_val[:, 1]) + + +@pytest.mark.parametrize("SVCClass", [svm.SVC, svm.NuSVC]) +def test_svc_invalid_break_ties_param(SVCClass): + X, y = make_blobs(random_state=42) + + svm = SVCClass( + kernel="linear", decision_function_shape="ovo", break_ties=True, random_state=42 + ).fit(X, y) + + with pytest.raises(ValueError, match="break_ties must be False"): + svm.predict(y) + + +@pytest.mark.parametrize("SVCClass", [svm.SVC, svm.NuSVC]) +def test_svc_ovr_tie_breaking(SVCClass): + """Test if predict breaks ties in OVR mode. + Related issue: https://github.com/scikit-learn/scikit-learn/issues/8277 + """ + if SVCClass.__name__ == "NuSVC" and _IS_32BIT: + # XXX: known failure to be investigated. 
Either the code needs to be + # fixed or the test itself might need to be made less sensitive to + # random changes in test data and rounding errors more generally. + # https://github.com/scikit-learn/scikit-learn/issues/29633 + pytest.xfail("Failing test on 32bit OS") + + X, y = make_blobs(random_state=0, n_samples=20, n_features=2) + + xs = np.linspace(X[:, 0].min(), X[:, 0].max(), 100) + ys = np.linspace(X[:, 1].min(), X[:, 1].max(), 100) + xx, yy = np.meshgrid(xs, ys) + + common_params = dict( + kernel="rbf", gamma=1e6, random_state=42, decision_function_shape="ovr" + ) + svm = SVCClass( + break_ties=False, + **common_params, + ).fit(X, y) + pred = svm.predict(np.c_[xx.ravel(), yy.ravel()]) + dv = svm.decision_function(np.c_[xx.ravel(), yy.ravel()]) + assert not np.all(pred == np.argmax(dv, axis=1)) + + svm = SVCClass( + break_ties=True, + **common_params, + ).fit(X, y) + pred = svm.predict(np.c_[xx.ravel(), yy.ravel()]) + dv = svm.decision_function(np.c_[xx.ravel(), yy.ravel()]) + assert np.all(pred == np.argmax(dv, axis=1)) + + +def test_gamma_scale(): + X, y = [[0.0], [1.0]], [0, 1] + + clf = svm.SVC() + clf.fit(X, y) + assert_almost_equal(clf._gamma, 4) + + +@pytest.mark.parametrize( + "SVM, params", + [ + (LinearSVC, {"penalty": "l1", "loss": "squared_hinge", "dual": False}), + (LinearSVC, {"penalty": "l2", "loss": "squared_hinge", "dual": True}), + (LinearSVC, {"penalty": "l2", "loss": "squared_hinge", "dual": False}), + (LinearSVC, {"penalty": "l2", "loss": "hinge", "dual": True}), + (LinearSVR, {"loss": "epsilon_insensitive", "dual": True}), + (LinearSVR, {"loss": "squared_epsilon_insensitive", "dual": True}), + (LinearSVR, {"loss": "squared_epsilon_insensitive", "dual": True}), + ], +) +def test_linearsvm_liblinear_sample_weight(SVM, params): + X = np.array( + [ + [1, 3], + [1, 3], + [1, 3], + [1, 3], + [2, 1], + [2, 1], + [2, 1], + [2, 1], + [3, 3], + [3, 3], + [3, 3], + [3, 3], + [4, 1], + [4, 1], + [4, 1], + [4, 1], + ], + dtype=np.dtype("float"), + ) + y = np.array( + [1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype("int") + ) + + X2 = np.vstack([X, X]) + y2 = np.hstack([y, 3 - y]) + sample_weight = np.ones(shape=len(y) * 2) + sample_weight[len(y) :] = 0 + X2, y2, sample_weight = shuffle(X2, y2, sample_weight, random_state=0) + + base_estimator = SVM(random_state=42) + base_estimator.set_params(**params) + base_estimator.set_params(tol=1e-12, max_iter=1000) + est_no_weight = base.clone(base_estimator).fit(X, y) + est_with_weight = base.clone(base_estimator).fit( + X2, y2, sample_weight=sample_weight + ) + + for method in ("predict", "decision_function"): + if hasattr(base_estimator, method): + X_est_no_weight = getattr(est_no_weight, method)(X) + X_est_with_weight = getattr(est_with_weight, method)(X) + assert_allclose(X_est_no_weight, X_est_with_weight) + + +@pytest.mark.parametrize("Klass", (OneClassSVM, SVR, NuSVR)) +def test_n_support(Klass): + # Make n_support is correct for oneclass and SVR (used to be + # non-initialized) + # this is a non regression test for issue #14774 + X = np.array([[0], [0.44], [0.45], [0.46], [1]]) + y = np.arange(X.shape[0]) + est = Klass() + assert not hasattr(est, "n_support_") + est.fit(X, y) + assert est.n_support_[0] == est.support_vectors_.shape[0] + assert est.n_support_.size == 1 + + +@pytest.mark.parametrize("Estimator", [svm.SVC, svm.SVR]) +def test_custom_kernel_not_array_input(Estimator): + """Test using a custom kernel that is not fed with array-like for floats""" + data = ["A A", "A", "B", "B B", "A B"] + X = 
np.array([[2, 0], [1, 0], [0, 1], [0, 2], [1, 1]]) # count encoding + y = np.array([1, 1, 2, 2, 1]) + + def string_kernel(X1, X2): + assert isinstance(X1[0], str) + n_samples1 = _num_samples(X1) + n_samples2 = _num_samples(X2) + K = np.zeros((n_samples1, n_samples2)) + for ii in range(n_samples1): + for jj in range(ii, n_samples2): + K[ii, jj] = X1[ii].count("A") * X2[jj].count("A") + K[ii, jj] += X1[ii].count("B") * X2[jj].count("B") + K[jj, ii] = K[ii, jj] + return K + + K = string_kernel(data, data) + assert_array_equal(np.dot(X, X.T), K) + + svc1 = Estimator(kernel=string_kernel).fit(data, y) + svc2 = Estimator(kernel="linear").fit(X, y) + svc3 = Estimator(kernel="precomputed").fit(K, y) + + assert svc1.score(data, y) == svc3.score(K, y) + assert svc1.score(data, y) == svc2.score(X, y) + if hasattr(svc1, "decision_function"): # classifier + assert_allclose(svc1.decision_function(data), svc2.decision_function(X)) + assert_allclose(svc1.decision_function(data), svc3.decision_function(K)) + assert_array_equal(svc1.predict(data), svc2.predict(X)) + assert_array_equal(svc1.predict(data), svc3.predict(K)) + else: # regressor + assert_allclose(svc1.predict(data), svc2.predict(X)) + assert_allclose(svc1.predict(data), svc3.predict(K)) + + +def test_svc_raises_error_internal_representation(): + """Check that SVC raises error when internal representation is altered. + + Non-regression test for #18891 and https://nvd.nist.gov/vuln/detail/CVE-2020-28975 + """ + clf = svm.SVC(kernel="linear").fit(X, Y) + clf._n_support[0] = 1000000 + + msg = "The internal representation of SVC was altered" + with pytest.raises(ValueError, match=msg): + clf.predict(X) + + +@pytest.mark.parametrize( + "estimator, expected_n_iter_type", + [ + (svm.SVC, np.ndarray), + (svm.NuSVC, np.ndarray), + (svm.SVR, int), + (svm.NuSVR, int), + (svm.OneClassSVM, int), + ], +) +@pytest.mark.parametrize( + "dataset", + [ + make_classification(n_classes=2, n_informative=2, random_state=0), + make_classification(n_classes=3, n_informative=3, random_state=0), + make_classification(n_classes=4, n_informative=4, random_state=0), + ], +) +def test_n_iter_libsvm(estimator, expected_n_iter_type, dataset): + # Check that the type of n_iter_ is correct for the classes that inherit + # from BaseSVC. + # Note that for SVC, and NuSVC this is an ndarray; while for SVR, NuSVR, and + # OneClassSVM, it is an int. + # For SVC and NuSVC also check the shape of n_iter_. 
+ X, y = dataset + n_iter = estimator(kernel="linear").fit(X, y).n_iter_ + assert type(n_iter) == expected_n_iter_type + if estimator in [svm.SVC, svm.NuSVC]: + n_classes = len(np.unique(y)) + assert n_iter.shape == (n_classes * (n_classes - 1) // 2,) + + +@pytest.mark.parametrize("loss", ["squared_hinge", "squared_epsilon_insensitive"]) +def test_dual_auto(loss): + # OvR, L2, N > M (6,2) + dual = _validate_dual_parameter("auto", loss, "l2", "ovr", np.asarray(X)) + assert dual is False + # OvR, L2, N < M (2,6) + dual = _validate_dual_parameter("auto", loss, "l2", "ovr", np.asarray(X).T) + assert dual is True + + +def test_dual_auto_edge_cases(): + # Hinge, OvR, L2, N > M (6,2) + dual = _validate_dual_parameter("auto", "hinge", "l2", "ovr", np.asarray(X)) + assert dual is True # only supports True + dual = _validate_dual_parameter( + "auto", "epsilon_insensitive", "l2", "ovr", np.asarray(X) + ) + assert dual is True # only supports True + # SqHinge, OvR, L1, N < M (2,6) + dual = _validate_dual_parameter( + "auto", "squared_hinge", "l1", "ovr", np.asarray(X).T + ) + assert dual is False # only supports False + + +@pytest.mark.parametrize( + "Estimator, make_dataset", + [(svm.SVC, make_classification), (svm.SVR, make_regression)], +) +@pytest.mark.parametrize("C_inf", [np.inf, float("inf")]) +def test_svm_with_infinite_C(Estimator, make_dataset, C_inf, global_random_seed): + """Check that we can pass `C=inf` that is equivalent to a very large C value. + + Non-regression test for + https://github.com/scikit-learn/scikit-learn/issues/29772 + """ + X, y = make_dataset(random_state=global_random_seed) + estimator_C_inf = Estimator(C=C_inf).fit(X, y) + estimator_C_large = Estimator(C=1e10).fit(X, y) + + assert_allclose(estimator_C_large.predict(X), estimator_C_inf.predict(X)) diff --git a/.venv/Lib/site-packages/sklearn/tests/metadata_routing_common.py b/.venv/Lib/site-packages/sklearn/tests/metadata_routing_common.py new file mode 100644 index 0000000000000000000000000000000000000000..5daccf49f2bc5c740b585df877ba07cd8b03d7f0 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/tests/metadata_routing_common.py @@ -0,0 +1,544 @@ +import inspect +from collections import defaultdict +from functools import partial + +import numpy as np +from numpy.testing import assert_array_equal + +from sklearn.base import ( + BaseEstimator, + ClassifierMixin, + MetaEstimatorMixin, + RegressorMixin, + TransformerMixin, + clone, +) +from sklearn.metrics._scorer import _Scorer, mean_squared_error +from sklearn.model_selection import BaseCrossValidator +from sklearn.model_selection._split import GroupsConsumerMixin +from sklearn.utils._metadata_requests import ( + SIMPLE_METHODS, +) +from sklearn.utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + process_routing, +) +from sklearn.utils.multiclass import _check_partial_fit_first_call + + +def record_metadata(obj, record_default=True, **kwargs): + """Utility function to store passed metadata to a method of obj. + + If record_default is False, kwargs whose values are "default" are skipped. + This is so that checks on keyword arguments whose default was not changed + are skipped. 
+ + """ + stack = inspect.stack() + callee = stack[1].function + caller = stack[2].function + if not hasattr(obj, "_records"): + obj._records = defaultdict(lambda: defaultdict(list)) + if not record_default: + kwargs = { + key: val + for key, val in kwargs.items() + if not isinstance(val, str) or (val != "default") + } + obj._records[callee][caller].append(kwargs) + + +def check_recorded_metadata(obj, method, parent, split_params=tuple(), **kwargs): + """Check whether the expected metadata is passed to the object's method. + + Parameters + ---------- + obj : estimator object + sub-estimator to check routed params for + method : str + sub-estimator's method where metadata is routed to, or otherwise in + the context of metadata routing referred to as 'callee' + parent : str + the parent method which should have called `method`, or otherwise in + the context of metadata routing referred to as 'caller' + split_params : tuple, default=empty + specifies any parameters which are to be checked as being a subset + of the original values + **kwargs : dict + passed metadata + """ + all_records = ( + getattr(obj, "_records", dict()).get(method, dict()).get(parent, list()) + ) + for record in all_records: + # first check that the names of the metadata passed are the same as + # expected. The names are stored as keys in `record`. + assert set(kwargs.keys()) == set( + record.keys() + ), f"Expected {kwargs.keys()} vs {record.keys()}" + for key, value in kwargs.items(): + recorded_value = record[key] + # The following condition is used to check for any specified parameters + # being a subset of the original values + if key in split_params and recorded_value is not None: + assert np.isin(recorded_value, value).all() + else: + if isinstance(recorded_value, np.ndarray): + assert_array_equal(recorded_value, value) + else: + assert ( + recorded_value is value + ), f"Expected {recorded_value} vs {value}. Method: {method}" + + +record_metadata_not_default = partial(record_metadata, record_default=False) + + +def assert_request_is_empty(metadata_request, exclude=None): + """Check if a metadata request dict is empty. + + One can exclude a method or a list of methods from the check using the + ``exclude`` parameter. If metadata_request is a MetadataRouter, then + ``exclude`` can be of the form ``{"object" : [method, ...]}``. + """ + if isinstance(metadata_request, MetadataRouter): + for name, route_mapping in metadata_request: + if exclude is not None and name in exclude: + _exclude = exclude[name] + else: + _exclude = None + assert_request_is_empty(route_mapping.router, exclude=_exclude) + return + + exclude = [] if exclude is None else exclude + for method in SIMPLE_METHODS: + if method in exclude: + continue + mmr = getattr(metadata_request, method) + props = [ + prop + for prop, alias in mmr.requests.items() + if isinstance(alias, str) or alias is not None + ] + assert not props + + +def assert_request_equal(request, dictionary): + for method, requests in dictionary.items(): + mmr = getattr(request, method) + assert mmr.requests == requests + + empty_methods = [method for method in SIMPLE_METHODS if method not in dictionary] + for method in empty_methods: + assert not len(getattr(request, method).requests) + + +class _Registry(list): + # This list is used to get a reference to the sub-estimators, which are not + # necessarily stored on the metaestimator. 
We need to override __deepcopy__ + because the sub-estimators are probably cloned, which would result in a + new copy of the list, but we need copy and deep copy both to return the + same instance. + def __deepcopy__(self, memo): + return self + + def __copy__(self): + return self + + +class ConsumingRegressor(RegressorMixin, BaseEstimator): + """A regressor consuming metadata. + + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. + """ + + def __init__(self, registry=None): + self.registry = registry + + def partial_fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return self + + def fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return self + + def predict(self, X, y=None, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return np.zeros(shape=(len(X),)) + + def score(self, X, y, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return 1 + + +class NonConsumingClassifier(ClassifierMixin, BaseEstimator): + """A classifier which accepts no metadata on any method.""" + + def __init__(self, alpha=0.0): + self.alpha = alpha + + def fit(self, X, y): + self.classes_ = np.unique(y) + self.coef_ = np.ones_like(X) + return self + + def partial_fit(self, X, y, classes=None): + return self + + def decision_function(self, X): + return self.predict(X) + + def predict(self, X): + y_pred = np.empty(shape=(len(X),)) + y_pred[: len(X) // 2] = 0 + y_pred[len(X) // 2 :] = 1 + return y_pred + + def predict_proba(self, X): + # dummy probabilities to support predict_proba + y_proba = np.empty(shape=(len(X), 2)) + y_proba[: len(X) // 2, :] = np.asarray([1.0, 0.0]) + y_proba[len(X) // 2 :, :] = np.asarray([0.0, 1.0]) + return y_proba + + def predict_log_proba(self, X): + # dummy probabilities to support predict_log_proba + return self.predict_proba(X) + + +class NonConsumingRegressor(RegressorMixin, BaseEstimator): + """A regressor which accepts no metadata on any method.""" + + def fit(self, X, y): + return self + + def partial_fit(self, X, y): + return self + + def predict(self, X): + return np.ones(len(X)) # pragma: no cover + + +class ConsumingClassifier(ClassifierMixin, BaseEstimator): + """A classifier consuming metadata. + + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. + + alpha : float, default=0 + This parameter is only used to test the ``*SearchCV`` objects, and + doesn't do anything. 
+ """ + + def __init__(self, registry=None, alpha=0.0): + self.alpha = alpha + self.registry = registry + + def partial_fit( + self, X, y, classes=None, sample_weight="default", metadata="default" + ): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + _check_partial_fit_first_call(self, classes) + return self + + def fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + + self.classes_ = np.unique(y) + self.coef_ = np.ones_like(X) + return self + + def predict(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + y_score = np.empty(shape=(len(X),), dtype="int8") + y_score[len(X) // 2 :] = 0 + y_score[: len(X) // 2] = 1 + return y_score + + def predict_proba(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + y_proba = np.empty(shape=(len(X), 2)) + y_proba[: len(X) // 2, :] = np.asarray([1.0, 0.0]) + y_proba[len(X) // 2 :, :] = np.asarray([0.0, 1.0]) + return y_proba + + def predict_log_proba(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return np.zeros(shape=(len(X), 2)) + + def decision_function(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + y_score = np.empty(shape=(len(X),)) + y_score[len(X) // 2 :] = 0 + y_score[: len(X) // 2] = 1 + return y_score + + def score(self, X, y, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return 1 + + +class ConsumingTransformer(TransformerMixin, BaseEstimator): + """A transformer which accepts metadata on fit and transform. + + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. + """ + + def __init__(self, registry=None): + self.registry = registry + + def fit(self, X, y=None, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + self.fitted_ = True + return self + + def transform(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return X + 1 + + def fit_transform(self, X, y, sample_weight="default", metadata="default"): + # implementing ``fit_transform`` is necessary since + # ``TransformerMixin.fit_transform`` doesn't route any metadata to + # ``transform``, while here we want ``transform`` to receive + # ``sample_weight`` and ``metadata``. 
+ record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return self.fit(X, y, sample_weight=sample_weight, metadata=metadata).transform( + X, sample_weight=sample_weight, metadata=metadata + ) + + def inverse_transform(self, X, sample_weight=None, metadata=None): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return X - 1 + + +class ConsumingNoFitTransformTransformer(BaseEstimator): + """A metadata consuming transformer that doesn't inherit from + TransformerMixin, and thus doesn't implement `fit_transform`. Note that + TransformerMixin's `fit_transform` doesn't route metadata to `transform`.""" + + def __init__(self, registry=None): + self.registry = registry + + def fit(self, X, y=None, sample_weight=None, metadata=None): + if self.registry is not None: + self.registry.append(self) + + record_metadata(self, sample_weight=sample_weight, metadata=metadata) + + return self + + def transform(self, X, sample_weight=None, metadata=None): + record_metadata(self, sample_weight=sample_weight, metadata=metadata) + return X + + +class ConsumingScorer(_Scorer): + def __init__(self, registry=None): + super().__init__( + score_func=mean_squared_error, sign=1, kwargs={}, response_method="predict" + ) + self.registry = registry + + def _score(self, method_caller, clf, X, y, **kwargs): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default(self, **kwargs) + + sample_weight = kwargs.get("sample_weight", None) + return super()._score(method_caller, clf, X, y, sample_weight=sample_weight) + + +class ConsumingSplitter(GroupsConsumerMixin, BaseCrossValidator): + def __init__(self, registry=None): + self.registry = registry + + def split(self, X, y=None, groups="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default(self, groups=groups, metadata=metadata) + + split_index = len(X) // 2 + train_indices = list(range(0, split_index)) + test_indices = list(range(split_index, len(X))) + yield test_indices, train_indices + yield train_indices, test_indices + + def get_n_splits(self, X=None, y=None, groups=None, metadata=None): + return 2 + + def _iter_test_indices(self, X=None, y=None, groups=None): + split_index = len(X) // 2 + train_indices = list(range(0, split_index)) + test_indices = list(range(split_index, len(X))) + yield test_indices + yield train_indices + + +class MetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): + """A meta-regressor which is only a router.""" + + def __init__(self, estimator): + self.estimator = estimator + + def fit(self, X, y, **fit_params): + params = process_routing(self, "fit", **fit_params) + self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) + + def get_metadata_routing(self): + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + return router + + +class WeightedMetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): + """A meta-regressor which is also a consumer.""" + + def __init__(self, estimator, registry=None): + self.estimator = estimator + self.registry = registry + + def fit(self, X, y, sample_weight=None, **fit_params): + if self.registry is not None: + self.registry.append(self) + + record_metadata(self, sample_weight=sample_weight) + params = process_routing(self, "fit", sample_weight=sample_weight, **fit_params) + self.estimator_ = 
clone(self.estimator).fit(X, y, **params.estimator.fit) + return self + + def predict(self, X, **predict_params): + params = process_routing(self, "predict", **predict_params) + return self.estimator_.predict(X, **params.estimator.predict) + + def get_metadata_routing(self): + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add( + estimator=self.estimator, + method_mapping=MethodMapping() + .add(caller="fit", callee="fit") + .add(caller="predict", callee="predict"), + ) + ) + return router + + +class WeightedMetaClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): + """A meta-estimator which also consumes sample_weight itself in ``fit``.""" + + def __init__(self, estimator, registry=None): + self.estimator = estimator + self.registry = registry + + def fit(self, X, y, sample_weight=None, **kwargs): + if self.registry is not None: + self.registry.append(self) + + record_metadata(self, sample_weight=sample_weight) + params = process_routing(self, "fit", sample_weight=sample_weight, **kwargs) + self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) + return self + + def get_metadata_routing(self): + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add( + estimator=self.estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + ) + return router + + +class MetaTransformer(MetaEstimatorMixin, TransformerMixin, BaseEstimator): + """A simple meta-transformer.""" + + def __init__(self, transformer): + self.transformer = transformer + + def fit(self, X, y=None, **fit_params): + params = process_routing(self, "fit", **fit_params) + self.transformer_ = clone(self.transformer).fit(X, y, **params.transformer.fit) + return self + + def transform(self, X, y=None, **transform_params): + params = process_routing(self, "transform", **transform_params) + return self.transformer_.transform(X, **params.transformer.transform) + + def get_metadata_routing(self): + return MetadataRouter(owner=self.__class__.__name__).add( + transformer=self.transformer, + method_mapping=MethodMapping() + .add(caller="fit", callee="fit") + .add(caller="transform", callee="transform"), + ) diff --git a/.venv/Lib/site-packages/sklearn/tests/test_base.py b/.venv/Lib/site-packages/sklearn/tests/test_base.py new file mode 100644 index 0000000000000000000000000000000000000000..c5b141ec199b5b4af146ab529c1cc84e40b93e09 --- /dev/null +++ b/.venv/Lib/site-packages/sklearn/tests/test_base.py @@ -0,0 +1,994 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import pickle +import re +import warnings + +import numpy as np +import pytest +import scipy.sparse as sp +from numpy.testing import assert_allclose + +import sklearn +from sklearn import config_context, datasets +from sklearn.base import ( + BaseEstimator, + OutlierMixin, + TransformerMixin, + clone, + is_classifier, + is_clusterer, + is_outlier_detector, + is_regressor, +) +from sklearn.cluster import KMeans +from sklearn.decomposition import PCA +from sklearn.ensemble import IsolationForest +from sklearn.exceptions import InconsistentVersionWarning +from sklearn.model_selection import GridSearchCV +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC, SVR +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils._mocking import MockDataFrame +from sklearn.utils._set_output import _get_output_config +from 
sklearn.utils._testing import ( + _convert_container, + assert_array_equal, +) +from sklearn.utils.validation import _check_n_features, validate_data + + +############################################################################# +# A few test classes +class MyEstimator(BaseEstimator): + def __init__(self, l1=0, empty=None): + self.l1 = l1 + self.empty = empty + + +class K(BaseEstimator): + def __init__(self, c=None, d=None): + self.c = c + self.d = d + + +class T(BaseEstimator): + def __init__(self, a=None, b=None): + self.a = a + self.b = b + + +class NaNTag(BaseEstimator): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +class NoNaNTag(BaseEstimator): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = False + return tags + + +class OverrideTag(NaNTag): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = False + return tags + + +class DiamondOverwriteTag(NaNTag, NoNaNTag): + pass + + +class InheritDiamondOverwriteTag(DiamondOverwriteTag): + pass + + +class ModifyInitParams(BaseEstimator): + """Deprecated behavior. + Equal parameters but with a type cast. + Doesn't fulfill a is a + """ + + def __init__(self, a=np.array([0])): + self.a = a.copy() + + +class Buggy(BaseEstimator): + "A buggy estimator that does not set its parameters right." + + def __init__(self, a=None): + self.a = 1 + + +class NoEstimator: + def __init__(self): + pass + + def fit(self, X=None, y=None): + return self + + def predict(self, X=None): + return None + + +class VargEstimator(BaseEstimator): + """scikit-learn estimators shouldn't have vargs.""" + + def __init__(self, *vargs): + pass + + +############################################################################# +# The tests + + +def test_clone(): + # Tests that clone creates a correct deep copy. + # We create an estimator, make a copy of its original state + # (which, in this case, is the current state of the estimator), + # and check that the obtained copy is a correct deep copy. + + from sklearn.feature_selection import SelectFpr, f_classif + + selector = SelectFpr(f_classif, alpha=0.1) + new_selector = clone(selector) + assert selector is not new_selector + assert selector.get_params() == new_selector.get_params() + + selector = SelectFpr(f_classif, alpha=np.zeros((10, 2))) + new_selector = clone(selector) + assert selector is not new_selector + + +def test_clone_2(): + # Tests that clone doesn't copy everything. + # We first create an estimator, give it an own attribute, and + # make a copy of its original state. Then we check that the copy doesn't + # have the specific attribute we manually added to the initial estimator. + + from sklearn.feature_selection import SelectFpr, f_classif + + selector = SelectFpr(f_classif, alpha=0.1) + selector.own_attribute = "test" + new_selector = clone(selector) + assert not hasattr(new_selector, "own_attribute") + + +def test_clone_buggy(): + # Check that clone raises an error on buggy estimators. 
+ buggy = Buggy() + buggy.a = 2 + with pytest.raises(RuntimeError): + clone(buggy) + + no_estimator = NoEstimator() + with pytest.raises(TypeError): + clone(no_estimator) + + varg_est = VargEstimator() + with pytest.raises(RuntimeError): + clone(varg_est) + + est = ModifyInitParams() + with pytest.raises(RuntimeError): + clone(est) + + +def test_clone_empty_array(): + # Regression test for cloning estimators with empty arrays + clf = MyEstimator(empty=np.array([])) + clf2 = clone(clf) + assert_array_equal(clf.empty, clf2.empty) + + clf = MyEstimator(empty=sp.csr_matrix(np.array([[0]]))) + clf2 = clone(clf) + assert_array_equal(clf.empty.data, clf2.empty.data) + + +def test_clone_nan(): + # Regression test for cloning estimators with default parameter as np.nan + clf = MyEstimator(empty=np.nan) + clf2 = clone(clf) + + assert clf.empty is clf2.empty + + +def test_clone_dict(): + # test that clone creates a clone of a dict + orig = {"a": MyEstimator()} + cloned = clone(orig) + assert orig["a"] is not cloned["a"] + + +def test_clone_sparse_matrices(): + sparse_matrix_classes = [ + cls + for name in dir(sp) + if name.endswith("_matrix") and type(cls := getattr(sp, name)) is type + ] + + for cls in sparse_matrix_classes: + sparse_matrix = cls(np.eye(5)) + clf = MyEstimator(empty=sparse_matrix) + clf_cloned = clone(clf) + assert clf.empty.__class__ is clf_cloned.empty.__class__ + assert_array_equal(clf.empty.toarray(), clf_cloned.empty.toarray()) + + +def test_clone_estimator_types(): + # Check that clone works for parameters that are types rather than + # instances + clf = MyEstimator(empty=MyEstimator) + clf2 = clone(clf) + + assert clf.empty is clf2.empty + + +def test_clone_class_rather_than_instance(): + # Check that clone raises expected error message when + # cloning class rather than instance + msg = "You should provide an instance of scikit-learn estimator" + with pytest.raises(TypeError, match=msg): + clone(MyEstimator) + + +def test_repr(): + # Smoke test the repr of the base estimator. 
+    my_estimator = MyEstimator()
+    repr(my_estimator)
+    test = T(K(), K())
+    assert repr(test) == "T(a=K(), b=K())"
+
+    some_est = T(a=["long_params"] * 1000)
+    assert len(repr(some_est)) == 485
+
+
+def test_str():
+    # Smoke test the str of the base estimator
+    my_estimator = MyEstimator()
+    str(my_estimator)
+
+
+def test_get_params():
+    test = T(K(), K)
+
+    assert "a__d" in test.get_params(deep=True)
+    assert "a__d" not in test.get_params(deep=False)
+
+    test.set_params(a__d=2)
+    assert test.a.d == 2
+
+    with pytest.raises(ValueError):
+        test.set_params(a__a=2)
+
+
+# TODO(1.8): Remove this test when the deprecation is removed
+def test_is_estimator_type_class():
+    with pytest.warns(FutureWarning, match="passing a class to.*is deprecated"):
+        assert is_classifier(SVC)
+
+    with pytest.warns(FutureWarning, match="passing a class to.*is deprecated"):
+        assert is_regressor(SVR)
+
+    with pytest.warns(FutureWarning, match="passing a class to.*is deprecated"):
+        assert is_clusterer(KMeans)
+
+    with pytest.warns(FutureWarning, match="passing a class to.*is deprecated"):
+        assert is_outlier_detector(IsolationForest)
+
+
+@pytest.mark.parametrize(
+    "estimator, expected_result",
+    [
+        (SVC(), True),
+        (GridSearchCV(SVC(), {"C": [0.1, 1]}), True),
+        (Pipeline([("svc", SVC())]), True),
+        (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), True),
+        (SVR(), False),
+        (GridSearchCV(SVR(), {"C": [0.1, 1]}), False),
+        (Pipeline([("svr", SVR())]), False),
+        (Pipeline([("svr_cv", GridSearchCV(SVR(), {"C": [0.1, 1]}))]), False),
+    ],
+)
+def test_is_classifier(estimator, expected_result):
+    assert is_classifier(estimator) == expected_result
+
+
+@pytest.mark.parametrize(
+    "estimator, expected_result",
+    [
+        (SVR(), True),
+        (GridSearchCV(SVR(), {"C": [0.1, 1]}), True),
+        (Pipeline([("svr", SVR())]), True),
+        (Pipeline([("svr_cv", GridSearchCV(SVR(), {"C": [0.1, 1]}))]), True),
+        (SVC(), False),
+        (GridSearchCV(SVC(), {"C": [0.1, 1]}), False),
+        (Pipeline([("svc", SVC())]), False),
+        (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), False),
+    ],
+)
+def test_is_regressor(estimator, expected_result):
+    assert is_regressor(estimator) == expected_result
+
+
+@pytest.mark.parametrize(
+    "estimator, expected_result",
+    [
+        (KMeans(), True),
+        (GridSearchCV(KMeans(), {"n_clusters": [3, 8]}), True),
+        (Pipeline([("km", KMeans())]), True),
+        (Pipeline([("km_cv", GridSearchCV(KMeans(), {"n_clusters": [3, 8]}))]), True),
+        (SVC(), False),
+        (GridSearchCV(SVC(), {"C": [0.1, 1]}), False),
+        (Pipeline([("svc", SVC())]), False),
+        (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), False),
+    ],
+)
+def test_is_clusterer(estimator, expected_result):
+    assert is_clusterer(estimator) == expected_result
+
+
+def test_set_params():
+    # test nested estimator parameter setting
+    clf = Pipeline([("svc", SVC())])
+
+    # non-existing parameter in svc
+    with pytest.raises(ValueError):
+        clf.set_params(svc__stupid_param=True)
+
+    # non-existing parameter of pipeline
+    with pytest.raises(ValueError):
+        clf.set_params(svm__stupid_param=True)
+
+    # we don't currently catch if the things in pipeline are estimators
+    # bad_pipeline = Pipeline([("bad", NoEstimator())])
+    # with pytest.raises(AttributeError):
+    #     bad_pipeline.set_params(bad__stupid_param=True)
+
+
+def test_set_params_passes_all_parameters():
+    # Make sure all parameters are passed together to set_params
+    # of nested estimator. Regression test for #9944
+
+    class TestDecisionTree(DecisionTreeClassifier):
+        def set_params(self, **kwargs):
+            super().set_params(**kwargs)
+            # expected_kwargs is in test scope
+            assert kwargs == expected_kwargs
+            return self
+
+    expected_kwargs = {"max_depth": 5, "min_samples_leaf": 2}
+    for est in [
+        Pipeline([("estimator", TestDecisionTree())]),
+        GridSearchCV(TestDecisionTree(), {}),
+    ]:
+        est.set_params(estimator__max_depth=5, estimator__min_samples_leaf=2)
+
+
+def test_set_params_updates_valid_params():
+    # Check that set_params tries to set SVC().C, not
+    # DecisionTreeClassifier().C
+    gscv = GridSearchCV(DecisionTreeClassifier(), {})
+    gscv.set_params(estimator=SVC(), estimator__C=42.0)
+    assert gscv.estimator.C == 42.0
+
+
+@pytest.mark.parametrize(
+    "tree,dataset",
+    [
+        (
+            DecisionTreeClassifier(max_depth=2, random_state=0),
+            datasets.make_classification(random_state=0),
+        ),
+        (
+            DecisionTreeRegressor(max_depth=2, random_state=0),
+            datasets.make_regression(random_state=0),
+        ),
+    ],
+)
+def test_score_sample_weight(tree, dataset):
+    rng = np.random.RandomState(0)
+    # check that the score with and without sample weights are different
+    X, y = dataset
+
+    tree.fit(X, y)
+    # generate random sample weights
+    sample_weight = rng.randint(1, 10, size=len(y))
+    score_unweighted = tree.score(X, y)
+    score_weighted = tree.score(X, y, sample_weight=sample_weight)
+    msg = "Unweighted and weighted scores are unexpectedly equal"
+    assert score_unweighted != score_weighted, msg
+
+
+def test_clone_pandas_dataframe():
+    class DummyEstimator(TransformerMixin, BaseEstimator):
+        """This is a dummy class for generating numerical features
+
+        This feature extractor extracts numerical features from pandas data
+        frame.
+
+        Parameters
+        ----------
+
+        df: pandas data frame
+            The pandas data frame parameter.
+
+        Notes
+        -----
+        """
+
+        def __init__(self, df=None, scalar_param=1):
+            self.df = df
+            self.scalar_param = scalar_param
+
+        def fit(self, X, y=None):
+            pass
+
+        def transform(self, X):
+            pass
+
+    # build and clone estimator
+    d = np.arange(10)
+    df = MockDataFrame(d)
+    e = DummyEstimator(df, scalar_param=1)
+    cloned_e = clone(e)
+
+    # the test
+    assert (e.df == cloned_e.df).values.all()
+    assert e.scalar_param == cloned_e.scalar_param
+
+
+def test_clone_protocol():
+    """Checks that clone works with `__sklearn_clone__` protocol."""
+
+    class FrozenEstimator(BaseEstimator):
+        def __init__(self, fitted_estimator):
+            self.fitted_estimator = fitted_estimator
+
+        def __getattr__(self, name):
+            return getattr(self.fitted_estimator, name)
+
+        def __sklearn_clone__(self):
+            return self
+
+        def fit(self, *args, **kwargs):
+            return self
+
+        def fit_transform(self, *args, **kwargs):
+            return self.fitted_estimator.transform(*args, **kwargs)
+
+    X = np.array([[-1, -1], [-2, -1], [-3, -2]])
+    pca = PCA().fit(X)
+    components = pca.components_
+
+    frozen_pca = FrozenEstimator(pca)
+    assert_allclose(frozen_pca.components_, components)
+
+    # Calling PCA methods such as `get_feature_names_out` still works
+    assert_array_equal(frozen_pca.get_feature_names_out(), pca.get_feature_names_out())
+
+    # Fitting on new data does not alter `components_`
+    X_new = np.asarray([[-1, 2], [3, 4], [1, 2]])
+    frozen_pca.fit(X_new)
+    assert_allclose(frozen_pca.components_, components)
+
+    # `fit_transform` does not alter state
+    frozen_pca.fit_transform(X_new)
+    assert_allclose(frozen_pca.components_, components)
+
+    # Cloning estimator is a no-op
+    clone_frozen_pca = clone(frozen_pca)
+    assert clone_frozen_pca is frozen_pca
+    assert_allclose(clone_frozen_pca.components_, components)
+
+
+def test_pickle_version_warning_is_not_raised_with_matching_version():
+    iris = datasets.load_iris()
+    tree = DecisionTreeClassifier().fit(iris.data, iris.target)
+    tree_pickle = pickle.dumps(tree)
+    assert b"_sklearn_version" in tree_pickle
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        tree_restored = pickle.loads(tree_pickle)
+
+    # test that we can predict with the restored decision tree classifier
+    score_of_original = tree.score(iris.data, iris.target)
+    score_of_restored = tree_restored.score(iris.data, iris.target)
+    assert score_of_original == score_of_restored
+
+
+class TreeBadVersion(DecisionTreeClassifier):
+    def __getstate__(self):
+        return dict(self.__dict__.items(), _sklearn_version="something")
+
+
+pickle_error_message = (
+    "Trying to unpickle estimator {estimator} from "
+    "version {old_version} when using version "
+    "{current_version}. This might "
+    "lead to breaking code or invalid results. "
+    "Use at your own risk."
+)
+
+
+def test_pickle_version_warning_is_issued_upon_different_version():
+    iris = datasets.load_iris()
+    tree = TreeBadVersion().fit(iris.data, iris.target)
+    tree_pickle_other = pickle.dumps(tree)
+    message = pickle_error_message.format(
+        estimator="TreeBadVersion",
+        old_version="something",
+        current_version=sklearn.__version__,
+    )
+    with pytest.warns(UserWarning, match=message) as warning_record:
+        pickle.loads(tree_pickle_other)
+
+    message = warning_record.list[0].message
+    assert isinstance(message, InconsistentVersionWarning)
+    assert message.estimator_name == "TreeBadVersion"
+    assert message.original_sklearn_version == "something"
+    assert message.current_sklearn_version == sklearn.__version__
+
+
+class TreeNoVersion(DecisionTreeClassifier):
+    def __getstate__(self):
+        return self.__dict__
+
+
+def test_pickle_version_warning_is_issued_when_no_version_info_in_pickle():
+    iris = datasets.load_iris()
+    # TreeNoVersion has no getstate, like pre-0.18
+    tree = TreeNoVersion().fit(iris.data, iris.target)
+
+    tree_pickle_noversion = pickle.dumps(tree)
+    assert b"_sklearn_version" not in tree_pickle_noversion
+    message = pickle_error_message.format(
+        estimator="TreeNoVersion",
+        old_version="pre-0.18",
+        current_version=sklearn.__version__,
+    )
+    # check we got the warning about using pre-0.18 pickle
+    with pytest.warns(UserWarning, match=message):
+        pickle.loads(tree_pickle_noversion)
+
+
+def test_pickle_version_no_warning_is_issued_with_non_sklearn_estimator():
+    iris = datasets.load_iris()
+    tree = TreeNoVersion().fit(iris.data, iris.target)
+    tree_pickle_noversion = pickle.dumps(tree)
+    try:
+        module_backup = TreeNoVersion.__module__
+        TreeNoVersion.__module__ = "notsklearn"
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("error")
+
+            pickle.loads(tree_pickle_noversion)
+    finally:
+        TreeNoVersion.__module__ = module_backup
+
+
+class DontPickleAttributeMixin:
+    def __getstate__(self):
+        data = self.__dict__.copy()
+        data["_attribute_not_pickled"] = None
+        return data
+
+    def __setstate__(self, state):
+        state["_restored"] = True
+        self.__dict__.update(state)
+
+
+class MultiInheritanceEstimator(DontPickleAttributeMixin, BaseEstimator):
+    def __init__(self, attribute_pickled=5):
+        self.attribute_pickled = attribute_pickled
+        self._attribute_not_pickled = None
+
+
+def test_pickling_when_getstate_is_overwritten_by_mixin():
+    estimator = MultiInheritanceEstimator()
+    estimator._attribute_not_pickled = "this attribute should not be pickled"
+
+    serialized = pickle.dumps(estimator)
+    estimator_restored = pickle.loads(serialized)
+    assert estimator_restored.attribute_pickled == 5
+    assert estimator_restored._attribute_not_pickled is None
+    assert estimator_restored._restored
+
+
+def test_pickling_when_getstate_is_overwritten_by_mixin_outside_of_sklearn():
+    try:
+        estimator = MultiInheritanceEstimator()
+        text = "this attribute should not be pickled"
+        estimator._attribute_not_pickled = text
+        old_mod = type(estimator).__module__
+        type(estimator).__module__ = "notsklearn"
+
+        serialized = estimator.__getstate__()
+        assert serialized == {"_attribute_not_pickled": None, "attribute_pickled": 5}
+
+        serialized["attribute_pickled"] = 4
+        estimator.__setstate__(serialized)
+        assert estimator.attribute_pickled == 4
+        assert estimator._restored
+    finally:
+        type(estimator).__module__ = old_mod
+
+
+class SingleInheritanceEstimator(BaseEstimator):
+    def __init__(self, attribute_pickled=5):
+        self.attribute_pickled = attribute_pickled
+        self._attribute_not_pickled = None
+
+    def __getstate__(self):
+        state = super().__getstate__()
+        state["_attribute_not_pickled"] = None
+        return state
+
+
+def test_pickling_works_when_getstate_is_overwritten_in_the_child_class():
+    estimator = SingleInheritanceEstimator()
+    estimator._attribute_not_pickled = "this attribute should not be pickled"
+
+    serialized = pickle.dumps(estimator)
+    estimator_restored = pickle.loads(serialized)
+    assert estimator_restored.attribute_pickled == 5
+    assert estimator_restored._attribute_not_pickled is None
+
+
+def test_tag_inheritance():
+    # test that changing tags by inheritance is not allowed
+
+    nan_tag_est = NaNTag()
+    no_nan_tag_est = NoNaNTag()
+    assert nan_tag_est.__sklearn_tags__().input_tags.allow_nan
+    assert not no_nan_tag_est.__sklearn_tags__().input_tags.allow_nan
+
+    redefine_tags_est = OverrideTag()
+    assert not redefine_tags_est.__sklearn_tags__().input_tags.allow_nan
+
+    diamond_tag_est = DiamondOverwriteTag()
+    assert diamond_tag_est.__sklearn_tags__().input_tags.allow_nan
+
+    inherit_diamond_tag_est = InheritDiamondOverwriteTag()
+    assert inherit_diamond_tag_est.__sklearn_tags__().input_tags.allow_nan
+
+
+def test_raises_on_get_params_non_attribute():
+    class MyEstimator(BaseEstimator):
+        def __init__(self, param=5):
+            pass
+
+        def fit(self, X, y=None):
+            return self
+
+    est = MyEstimator()
+    msg = "'MyEstimator' object has no attribute 'param'"
+
+    with pytest.raises(AttributeError, match=msg):
+        est.get_params()
+
+
+def test_repr_mimebundle_():
+    # Checks the display configuration flag controls the json output
+    tree = DecisionTreeClassifier()
+    output = tree._repr_mimebundle_()
+    assert "text/plain" in output
+    assert "text/html" in output
+
+    with config_context(display="text"):
+        output = tree._repr_mimebundle_()
+        assert "text/plain" in output
+        assert "text/html" not in output
+
+
+def test_repr_html_wraps():
+    # Checks the display configuration flag controls the html output
+    tree = DecisionTreeClassifier()
+
+    output = tree._repr_html_()
+    assert "