From d43d7d61c159a63fd2c8ffebca505b9b3ae41a4e Mon Sep 17 00:00:00 2001
From: Linus Sommer <95619282+linus-md@users.noreply.github.com>
Date: Thu, 18 Jan 2024 23:22:43 +0100
Subject: [PATCH 01/32] DOC: Added drop down menus to `1.8` Cross
 Decomposition (#27916)

---
 doc/modules/cross_decomposition.rst | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/doc/modules/cross_decomposition.rst b/doc/modules/cross_decomposition.rst
index 337a7bcd250bb..8f8d217f87144 100644
--- a/doc/modules/cross_decomposition.rst
+++ b/doc/modules/cross_decomposition.rst
@@ -92,9 +92,9 @@ Step *a)* may be performed in two ways: either by computing the whole SVD of
 values, or by directly computing the singular vectors using the power method
 (cf section 11.3 in [1]_), which corresponds to the `'nipals'` option of the
 `algorithm` parameter.
-
-Transforming data
-^^^^^^^^^^^^^^^^^
+|details-start|
+**Transforming data**
+|details-split|
 
 To transform :math:`X` into :math:`\bar{X}`, we need to find a projection
 matrix :math:`P` such that :math:`\bar{X} = XP`. We know that for the
@@ -106,9 +106,11 @@ training data, :math:`\Xi = XP`, and :math:`X = \Xi \Gamma^T`. Setting
 Similarly, :math:`Y` can be transformed using the rotation matrix
 :math:`V(\Delta^T V)^{-1}`, accessed via the `y_rotations_` attribute.
 
+|details-end|
 
-Predicting the targets Y
-^^^^^^^^^^^^^^^^^^^^^^^^
+|details-start|
+**Predicting the targets Y**
+|details-split|
 
 To predict the targets of some data :math:`X`, we are looking for a
 coefficient matrix :math:`\beta \in R^{d \times t}` such that :math:`Y =
@@ -125,6 +127,8 @@ P \Delta^T`, and as a result the coefficient matrix :math:`\beta = \alpha P
 
 :math:`\beta` can be accessed through the `coef_` attribute.
 
+|details-end|
+
 PLSSVD
 ------
 
@@ -180,14 +184,17 @@ Since :class:`CCA` involves the inversion of :math:`X_k^TX_k` and
 :math:`Y_k^TY_k`, this estimator can be unstable if the number of features or
 targets is greater than the number of samples.
 
-
-.. topic:: Reference:
+|details-start|
+**Reference**
+|details-split|
 
    .. [1] `A survey of Partial Least Squares (PLS) methods, with emphasis on
       the two-block case
      `_
      JA Wegelin
 
+|details-end|
+
 .. topic:: Examples:
 
    * :ref:`sphx_glr_auto_examples_cross_decomposition_plot_compare_cross_decomposition.py`
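The rotation and coefficient identities this patch folds into dropdowns can be
checked numerically. A minimal sketch, assuming scikit-learn >= 1.3 (where
`coef_` has shape `(n_targets, n_features)`) and `scale=False` so that only
centering is involved; none of this code is part of the patch:

    import numpy as np
    from sklearn.cross_decomposition import PLSRegression

    rng = np.random.RandomState(0)
    X, Y = rng.randn(50, 4), rng.randn(50, 2)
    pls = PLSRegression(n_components=2, scale=False).fit(X, Y)

    # "Transforming data": X_bar is the centered X times the rotation matrix.
    assert np.allclose(pls.transform(X), (X - X.mean(0)) @ pls.x_rotations_)

    # "Predicting the targets Y": predictions come from the coefficient matrix.
    assert np.allclose(
        pls.predict(X), (X - X.mean(0)) @ pls.coef_.T + pls.intercept_
    )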
From 26dfe833aa5122997a6b66197df0e03629a45e3a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
 <34657725+jeremiedbb@users.noreply.github.com>
Date: Fri, 19 Jan 2024 06:51:23 +0100
Subject: [PATCH 02/32] Fix prevent infinite loop in KMeans (#28165)

---
 doc/whats_new/v1.4.rst                |  3 +++
 sklearn/cluster/_k_means_common.pyx   | 16 ++++++++++++++++
 sklearn/cluster/tests/test_k_means.py | 18 ++++++++++++++++++
 3 files changed, 37 insertions(+)

diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
index c674a8619e076..ee47bae7b1f5b 100644
--- a/doc/whats_new/v1.4.rst
+++ b/doc/whats_new/v1.4.rst
@@ -20,6 +20,9 @@ Changelog
   :pr:`28121` by :user:`Pietro Peterlongo ` and
   :user:`Yao Xiao `.
 
+- |Fix| Avoid infinite loop in :class:`cluster.KMeans` when the number of clusters is
+  larger than the number of non-duplicate samples.
+  :pr:`28165` by :user:`Jérémie du Boisberranger `.
 
 .. _changes_1_4:

diff --git a/sklearn/cluster/_k_means_common.pyx b/sklearn/cluster/_k_means_common.pyx
index 151af55076b7b..7c9c1bb54eaae 100644
--- a/sklearn/cluster/_k_means_common.pyx
+++ b/sklearn/cluster/_k_means_common.pyx
@@ -192,6 +192,11 @@ cpdef void _relocate_empty_clusters_dense(
         int new_cluster_id, old_cluster_id, far_idx, idx, k
         floating weight
 
+    if np.max(distances) == 0:
+        # Happens when there are more clusters than non-duplicate samples. Relocating
+        # is pointless in this case.
+        return
+
     for idx in range(n_empty):
         new_cluster_id = empty_clusters[idx]
 
@@ -241,6 +246,11 @@ cpdef void _relocate_empty_clusters_sparse(
                 X_indices[X_indptr[i]: X_indptr[i + 1]],
                 centers_old[j], centers_squared_norms[j], True)
 
+    if np.max(distances) == 0:
+        # Happens when there are more clusters than non-duplicate samples. Relocating
+        # is pointless in this case.
+        return
+
     cdef:
         int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32)
 
@@ -274,12 +284,18 @@ cdef void _average_centers(
         int n_features = centers.shape[1]
         int j, k
         floating alpha
+        int argmax_weight = np.argmax(weight_in_clusters)
 
     for j in range(n_clusters):
         if weight_in_clusters[j] > 0:
             alpha = 1.0 / weight_in_clusters[j]
             for k in range(n_features):
                 centers[j, k] *= alpha
+        else:
+            # For convenience, we avoid setting empty clusters at the origin but place
+            # them at the location of the biggest cluster.
+            for k in range(n_features):
+                centers[j, k] = centers[argmax_weight, k]
 
 
 cdef void _center_shift(
diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py
index 5b0c7ab9aace8..4a112a30b29ed 100644
--- a/sklearn/cluster/tests/test_k_means.py
+++ b/sklearn/cluster/tests/test_k_means.py
@@ -1352,3 +1352,21 @@ def test_sample_weight_zero(init, global_random_seed):
     # (i.e. be at a distance=0 from it)
     d = euclidean_distances(X[::2], clusters_weighted)
     assert not np.any(np.isclose(d, 0))
+
+
+@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids)
+@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"])
+def test_relocating_with_duplicates(algorithm, array_constr):
+    """Check that kmeans stops when there are more centers than non-duplicate samples.
+
+    Non-regression test for issue:
+    https://github.com/scikit-learn/scikit-learn/issues/28055
+    """
+    X = np.array([[0, 0], [1, 1], [1, 1], [1, 0], [0, 1]])
+    km = KMeans(n_clusters=5, init=X, algorithm=algorithm)
+
+    msg = r"Number of distinct clusters \(4\) found smaller than n_clusters \(5\)"
+    with pytest.warns(ConvergenceWarning, match=msg):
+        km.fit(array_constr(X))
+
+    assert km.n_iter_ == 1
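Seen from user code, the fix turns a hang into a warning plus a usable model.
A minimal sketch, assuming a build that includes this patch; the 5-sample,
4-distinct-point setup mirrors issue #28055:

    import numpy as np
    from sklearn.cluster import KMeans

    # 5 requested clusters, but only 4 distinct samples.
    X = np.array([[0, 0], [1, 1], [1, 1], [1, 0], [0, 1]])
    km = KMeans(n_clusters=5, init=X, n_init=1)

    km.fit(X)  # emits a ConvergenceWarning instead of looping forever
    print(km.n_iter_)  # 1: relocation is skipped, so Lloyd stops immediately

The empty fifth center ends up at the location of the biggest cluster rather
than at the origin, per the `_average_centers` change above.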
From 2da6d17bb472524b883d81afa4a85bd7a1c89d60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?=
Date: Fri, 19 Jan 2024 07:32:04 +0100
Subject: [PATCH 03/32] CI Remove temporary work-around related to scipy and
 pandas development wheel installing numpy<2 (#28163)

---
 build_tools/azure/install.sh | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh
index 5bd4112a1820b..df20e27b3c068 100755
--- a/build_tools/azure/install.sh
+++ b/build_tools/azure/install.sh
@@ -47,6 +47,16 @@ pre_python_environment_install() {
 
 }
 
+check_packages_dev_version() {
+    for package in $@; do
+        package_version=$(python -c "import $package; print($package.__version__)")
+        if ! [[ $package_version =~ "dev" ]]; then
+            echo "$package is not a development version: $package_version"
+            exit 1
+        fi
+    done
+}
+
 python_environment_install_and_activate() {
     if [[ "$DISTRIB" == "conda"* ]]; then
         # Install/update conda with the libmamba solver because the legacy
@@ -71,12 +81,10 @@ python_environment_install_and_activate() {
     if [[ "$DISTRIB" == "conda-pip-scipy-dev" ]]; then
         echo "Installing development dependency wheels"
         dev_anaconda_url=https://pypi.anaconda.org/scientific-python-nightly-wheels/simple
-        pip install --pre --upgrade --timeout=60 --extra-index $dev_anaconda_url numpy pandas scipy
+        dev_packages="numpy scipy pandas"
+        pip install --pre --upgrade --timeout=60 --extra-index $dev_anaconda_url $dev_packages
 
-        # XXX: at the time of writing, installing scipy or pandas from the dev
-        # wheels forces the numpy dependency to be < 2.0.0. Let's force the
-        # installation of numpy dev wheels instead.
-        pip install --pre --upgrade --timeout=60 --extra-index $dev_anaconda_url numpy
+        check_packages_dev_version $dev_packages
 
         echo "Installing Cython from latest sources"
         pip install https://github.com/cython/cython/archive/master.zip
From 21fcab7223257d01dab5397424de9057128d5467 Mon Sep 17 00:00:00 2001
From: Andrei Dzis
Date: Fri, 19 Jan 2024 13:11:23 +0300
Subject: [PATCH 04/32] DOC Added relation between ROC-AUC and Gini in
 docstring of roc_auc_score (#28156)

Co-authored-by: Guillaume Lemaitre
---
 sklearn/metrics/_ranking.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index 4a2e7aa1b78a3..a117a5427a996 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -538,6 +538,21 @@ class scores must correspond to the order of ``labels``,
     RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic
         (ROC) curve given the true and predicted values.
 
+    Notes
+    -----
+    The Gini Coefficient is a summary measure of the ranking ability of binary
+    classifiers. It is expressed using the area under the ROC curve as follows:
+
+    G = 2 * AUC - 1
+
+    where G is the Gini coefficient and AUC is the ROC-AUC score. This normalisation
+    will ensure that random guessing will yield a score of 0 in expectation, and it is
+    upper bounded by 1.
+
+    Note that there is another version of the Gini coefficient for regressors of a
+    continuous positive target variable. In this case, AUC is taken over the Lorenz
+    curve instead of the ROC [6]_.
+
     References
     ----------
     .. [1] `Wikipedia entry for the Receiver operating characteristic
@@ -558,6 +573,8 @@ class scores must correspond to the order of ``labels``,
            Under the ROC Curve for Multiple Class Classification Problems.
            Machine Learning, 45(2), 171-186.
            `_
+    .. [6] `Wikipedia entry for the Gini coefficient
+           `_
 
     Examples
     --------
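The identity added in the Notes section can be exercised directly. A short
sketch with made-up scores (illustrative only, not part of the patch):

    from sklearn.metrics import roc_auc_score

    y_true = [0, 0, 1, 1]
    y_score = [0.1, 0.4, 0.35, 0.8]

    auc = roc_auc_score(y_true, y_score)  # 0.75
    gini = 2 * auc - 1                    # 0.5
    # A random-guessing classifier has AUC 0.5, hence Gini 0; a perfect
    # ranker has AUC 1.0, hence Gini 1.
    print(auc, gini)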
From a3c8da18af46da0d0e32027dacb20501647b078a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
 <34657725+jeremiedbb@users.noreply.github.com>
Date: Fri, 19 Jan 2024 13:01:11 +0100
Subject: [PATCH 05/32] MAINT Update SECURITY.md for 1.4.0 (#28182)

---
 SECURITY.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/SECURITY.md b/SECURITY.md
index 721f2041c2b85..3f291e7a566f8 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -4,8 +4,8 @@
 
 | Version   | Supported          |
 | --------- | ------------------ |
-| 1.3.2     | :white_check_mark: |
-| < 1.3.2   | :x:                |
+| 1.4.0     | :white_check_mark: |
+| < 1.4.0   | :x:                |
 
 ## Reporting a Vulnerability

From 5c7e831306e0a087c2b6af6913fa5b3c402f6d67 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 19 Jan 2024 13:58:02 +0100
Subject: [PATCH 06/32] DOC use list for the ridge_regression docstring
 (#28168)

---
 sklearn/linear_model/_ridge.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py
index c4f52c68e697e..5ce4a8c2fd3b8 100644
--- a/sklearn/linear_model/_ridge.py
+++ b/sklearn/linear_model/_ridge.py
@@ -552,14 +552,15 @@ def ridge_regression(
 
     Examples
     --------
+    >>> import numpy as np
     >>> from sklearn.datasets import make_regression
     >>> from sklearn.linear_model import ridge_regression
-    >>> X, y = make_regression(
-    ...     n_features=4, n_informative=2, shuffle=False, random_state=0
-    ... )
+    >>> rng = np.random.RandomState(0)
+    >>> X = rng.randn(100, 4)
+    >>> y = 2.0 * X[:, 0] - 1.0 * X[:, 1] + 0.1 * rng.standard_normal(100)
     >>> coef, intercept = ridge_regression(X, y, alpha=1.0, return_intercept=True)
-    >>> coef
-    array([20.2..., 33.7..., 0.1..., 0.0...])
+    >>> list(coef)
+    [1.97..., -1.00..., -0.0..., -0.0...]
     >>> intercept
     -0.0...

From 66a6551786c3d257a7b4f0b23a705f52f868c235 Mon Sep 17 00:00:00 2001
From: Andrei Dzis
Date: Fri, 19 Jan 2024 23:15:37 +0300
Subject: [PATCH 07/32] DOC Fix for roc_auc_score documentation (#28190)

---
 sklearn/metrics/_ranking.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index a117a5427a996..4a960a2f4402a 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -549,10 +549,6 @@ class scores must correspond to the order of ``labels``,
     will ensure that random guessing will yield a score of 0 in expectation, and it is
     upper bounded by 1.
 
-    Note that there is another version of the Gini coefficient for regressors of a
-    continuous positive target variable. In this case, AUC is taken over the Lorenz
-    curve instead of the ROC [6]_.
-
     References
     ----------
     .. [1] `Wikipedia entry for the Receiver operating characteristic

From 2020648edfdbdeb4797465434ed4afd6e79ce2ed Mon Sep 17 00:00:00 2001
From: 101AlexMartin <101071686+101AlexMartin@users.noreply.github.com>
Date: Sat, 20 Jan 2024 10:53:07 +0100
Subject: [PATCH 08/32] MNT changed order pre-commits hooks following ruff
 recommendation (#28062)

Co-authored-by: Alejandro Martin
---
 .pre-commit-config.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index abffbbe149f2c..506e3ab4fe64e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -5,16 +5,16 @@ repos:
       - id: check-yaml
       - id: end-of-file-fixer
       - id: trailing-whitespace
-- repo: https://github.com/psf/black
-  rev: 23.3.0
-  hooks:
-    - id: black
 - repo: https://github.com/astral-sh/ruff-pre-commit
   # Ruff version.
   rev: v0.0.272
   hooks:
     - id: ruff
       args: ["--fix", "--show-source"]
+- repo: https://github.com/psf/black
+  rev: 23.3.0
+  hooks:
+    - id: black
 - repo: https://github.com/pre-commit/mirrors-mypy
   rev: v1.3.0
   hooks:

From 6a1022353103cefb93258f503b087d821262a1b6 Mon Sep 17 00:00:00 2001
From: Rodrigo Romero <69991220+rromer07@users.noreply.github.com>
Date: Sat, 20 Jan 2024 06:48:55 -0500
Subject: [PATCH 09/32] DOC add docstring example to
 `sklearn.metrics.consensus_score` (#28193)

---
 sklearn/metrics/cluster/_bicluster.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/sklearn/metrics/cluster/_bicluster.py b/sklearn/metrics/cluster/_bicluster.py
index b9ca47c9b91aa..713d0bee8fa2e 100644
--- a/sklearn/metrics/cluster/_bicluster.py
+++ b/sklearn/metrics/cluster/_bicluster.py
@@ -89,6 +89,14 @@ def consensus_score(a, b, *, similarity="jaccard"):
     * Hochreiter, Bodenhofer, et. al., 2010. `FABIA: factor analysis
       for bicluster acquisition
       `__.
+
+    Examples
+    --------
+    >>> from sklearn.metrics import consensus_score
+    >>> a = ([[True, False], [False, True]], [[False, True], [True, False]])
+    >>> b = ([[False, True], [True, False]], [[True, False], [False, True]])
+    >>> consensus_score(a, b, similarity='jaccard')
+    1.0
     """
     if similarity == "jaccard":
         similarity = _jaccard

From 836690a401057572ef7d3478a9a3aa78dfa1447b Mon Sep 17 00:00:00 2001
From: Rodrigo Romero <69991220+rromer07@users.noreply.github.com>
Date: Sat, 20 Jan 2024 14:42:16 -0500
Subject: [PATCH 10/32] DOC add docstring example to
 `sklearn.metrics.coverage_error` (#28196)

Co-authored-by: Guillaume Lemaitre
---
 sklearn/metrics/_ranking.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index 4a960a2f4402a..74ae6dcf04299 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -1300,6 +1300,14 @@ def coverage_error(y_true, y_score, *, sample_weight=None):
     .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010).
            Mining multi-label data. In Data mining and knowledge
           discovery handbook (pp. 667-685). Springer US.
+
+    Examples
+    --------
+    >>> from sklearn.metrics import coverage_error
+    >>> y_true = [[1, 0, 0], [0, 1, 1]]
+    >>> y_score = [[1, 0, 0], [0, 1, 1]]
+    >>> coverage_error(y_true, y_score)
+    1.5
     """
     y_true = check_array(y_true, ensure_2d=True)
     y_score = check_array(y_score, ensure_2d=True)

From 897c0c570511be4b7912a335052ed479ac5ca1f3 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen
Date: Sat, 20 Jan 2024 21:08:36 +0100
Subject: [PATCH 11/32] ENH improve HGBT predict classes (#27844)

Co-authored-by: Guillaume Lemaitre
---
 doc/whats_new/v1.4.rst                                |  4 ++++
 .../_hist_gradient_boosting/gradient_boosting.py      | 16 +++++++++++++---
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
index ee47bae7b1f5b..d832e4b508359 100644
--- a/doc/whats_new/v1.4.rst
+++ b/doc/whats_new/v1.4.rst
@@ -494,6 +494,10 @@ Changelog
   which allows to retrieve the training sample indices used for each tree estimator.
   :pr:`26736` by :user:`Adam Li `.
 
+- |Efficiency| Improves runtime of `predict` of
+  :class:`ensemble.HistGradientBoostingClassifier` by avoiding calling `predict_proba`.
+  :pr:`27844` by :user:`Christian Lorentzen `.
+
 - |Fix| Fixes :class:`ensemble.IsolationForest` when the input is a sparse matrix
   and `contamination` is set to a float value.
   :pr:`27645` by :user:`Guillaume Lemaitre `.

diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index 0837d19407030..698fd0629d02e 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -2137,7 +2137,13 @@ def predict(self, X):
             The predicted classes.
         """
         # TODO: This could be done in parallel
-        encoded_classes = np.argmax(self.predict_proba(X), axis=1)
+        raw_predictions = self._raw_predict(X)
+        if raw_predictions.shape[1] == 1:
+            # np.argmax([0.5, 0.5]) is 0, not 1. Therefore "> 0" not ">= 0" to be
+            # consistent with the multiclass case.
+            encoded_classes = (raw_predictions.ravel() > 0).astype(int)
+        else:
+            encoded_classes = np.argmax(raw_predictions, axis=1)
         return self.classes_[encoded_classes]
 
     def staged_predict(self, X):
@@ -2158,8 +2164,12 @@ def staged_predict(self, X):
         y : generator of ndarray of shape (n_samples,)
             The predicted classes of the input samples, for each iteration.
         """
-        for proba in self.staged_predict_proba(X):
-            encoded_classes = np.argmax(proba, axis=1)
+        for raw_predictions in self._staged_raw_predict(X):
+            if raw_predictions.shape[1] == 1:
+                # np.argmax([0, 0]) is 0, not 1, therefore "> 0" not ">= 0"
+                encoded_classes = (raw_predictions.ravel() > 0).astype(int)
+            else:
+                encoded_classes = np.argmax(raw_predictions, axis=1)
             yield self.classes_.take(encoded_classes, axis=0)
 
     def predict_proba(self, X):
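The refactoring is pure bookkeeping: classes are still chosen exactly as an
argmax over `predict_proba` would choose them, only without materialising the
probabilities. A minimal consistency sketch (illustrative code, not from the
patch):

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import HistGradientBoostingClassifier

    X, y = make_classification(n_samples=200, random_state=0)
    clf = HistGradientBoostingClassifier(max_iter=20, random_state=0).fit(X, y)

    # predict now thresholds the raw scores directly, but must keep matching
    # the argmax of predict_proba.
    expected = clf.classes_[np.argmax(clf.predict_proba(X), axis=1)]
    assert np.array_equal(clf.predict(X), expected)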
From b4754ba7eeacf1519fb827392d99207d38011627 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan"
Date: Mon, 22 Jan 2024 02:31:13 -0500
Subject: [PATCH 12/32] ENH Checks pandas and polars directly (#28195)

---
 doc/whats_new/v1.4.rst      |  3 +++
 sklearn/utils/validation.py | 26 ++++++++++----------------
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
index d832e4b508359..98bfcd2d96f54 100644
--- a/doc/whats_new/v1.4.rst
+++ b/doc/whats_new/v1.4.rst
@@ -24,6 +24,9 @@ Changelog
   larger than the number of non-duplicate samples.
   :pr:`28165` by :user:`Jérémie du Boisberranger `.
 
+- |Enhancement| Pandas and Polars dataframes are validated directly without ducktyping
+  checks. :pr:`28195` by `Thomas Fan`_.
+
 .. _changes_1_4:

diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 6531a9da3404b..43f553eb2d2d5 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -2070,26 +2070,20 @@ def _check_method_params(X, params, indices=None):
 
 def _is_pandas_df(X):
     """Return True if the X is a pandas dataframe."""
-    if hasattr(X, "columns") and hasattr(X, "iloc"):
-        # Likely a pandas DataFrame, we explicitly check the type to confirm.
-        try:
-            pd = sys.modules["pandas"]
-        except KeyError:
-            return False
-        return isinstance(X, pd.DataFrame)
-    return False
+    try:
+        pd = sys.modules["pandas"]
+    except KeyError:
+        return False
+    return isinstance(X, pd.DataFrame)
 
 
 def _is_polars_df(X):
     """Return True if the X is a polars dataframe."""
-    if hasattr(X, "columns") and hasattr(X, "schema"):
-        # Likely a polars DataFrame, we explicitly check the type to confirm.
-        try:
-            pl = sys.modules["polars"]
-        except KeyError:
-            return False
-        return isinstance(X, pl.DataFrame)
-    return False
+    try:
+        pl = sys.modules["polars"]
+    except KeyError:
+        return False
+    return isinstance(X, pl.DataFrame)
 
 
 def _get_feature_names(X):
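The simplified helpers rely on `sys.modules`: if a library has never been
imported, none of its objects can exist, so the lookup short-circuits without
importing anything. The helpers themselves are private scikit-learn API; the
sketch below re-implements the same pattern for illustration only:

    import sys

    def is_pandas_df(obj):
        # Cheap check: if pandas was never imported, obj cannot be a pandas
        # DataFrame, and we avoid triggering a costly import ourselves.
        pd = sys.modules.get("pandas")
        return pd is not None and isinstance(obj, pd.DataFrame)

    print(is_pandas_df([1, 2, 3]))  # False, without importing pandas

An isinstance check is also stricter than the old duck typing: objects that
merely expose `columns` and `iloc` attributes no longer pass as dataframes.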
From 69cef4adc1d689828958328598712e8b2937971d Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 22 Jan 2024 10:53:04 +0100
Subject: [PATCH 13/32] FIX _convert_container should be able to convert from
 sparse to sparse (#28185)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Loïc Estève
---
 sklearn/utils/_testing.py           | 40 ++++++++++++++++++--------------
 sklearn/utils/tests/test_testing.py | 29 +++++++++++++++++++++
 2 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py
index b49622627c7ae..bb4da452712d2 100644
--- a/sklearn/utils/_testing.py
+++ b/sklearn/utils/_testing.py
@@ -775,8 +775,6 @@ def _convert_container(
         return tuple(np.asarray(container, dtype=dtype).tolist())
     elif constructor_name == "array":
         return np.asarray(container, dtype=dtype)
-    elif constructor_name == "sparse":
-        return sp.sparse.csr_matrix(np.atleast_2d(container), dtype=dtype)
     elif constructor_name in ("pandas", "dataframe"):
         pd = pytest.importorskip("pandas", minversion=minversion)
         result = pd.DataFrame(container, columns=columns_name, dtype=dtype, copy=False)
@@ -813,22 +811,28 @@ def _convert_container(
         return pd.Index(container, dtype=dtype)
     elif constructor_name == "slice":
         return slice(container[0], container[1])
-    elif constructor_name == "sparse_csr":
-        return sp.sparse.csr_matrix(np.atleast_2d(container), dtype=dtype)
-    elif constructor_name == "sparse_csr_array":
-        if sp_version >= parse_version("1.8"):
-            return sp.sparse.csr_array(np.atleast_2d(container), dtype=dtype)
-        raise ValueError(
-            f"sparse_csr_array is only available with scipy>=1.8.0, got {sp_version}"
-        )
-    elif constructor_name == "sparse_csc":
-        return sp.sparse.csc_matrix(np.atleast_2d(container), dtype=dtype)
-    elif constructor_name == "sparse_csc_array":
-        if sp_version >= parse_version("1.8"):
-            return sp.sparse.csc_array(np.atleast_2d(container), dtype=dtype)
-        raise ValueError(
-            f"sparse_csc_array is only available with scipy>=1.8.0, got {sp_version}"
-        )
+    elif "sparse" in constructor_name:
+        if not sp.sparse.issparse(container):
+            # For scipy >= 1.13, sparse array constructed from 1d array may be
+            # 1d or raise an exception. To avoid this, we make sure that the
+            # input container is 2d. For more details, see
+            # https://github.com/scipy/scipy/pull/18530#issuecomment-1878005149
+            container = np.atleast_2d(container)
+
+        if "array" in constructor_name and sp_version < parse_version("1.8"):
+            raise ValueError(
+                f"{constructor_name} is only available with scipy>=1.8.0, got "
+                f"{sp_version}"
+            )
+        if constructor_name in ("sparse", "sparse_csr"):
+            # sparse and sparse_csr are equivalent for legacy reasons
+            return sp.sparse.csr_matrix(container, dtype=dtype)
+        elif constructor_name == "sparse_csr_array":
+            return sp.sparse.csr_array(container, dtype=dtype)
+        elif constructor_name == "sparse_csc":
+            return sp.sparse.csc_matrix(container, dtype=dtype)
+        elif constructor_name == "sparse_csc_array":
+            return sp.sparse.csc_array(container, dtype=dtype)
 
 
 def raises(expected_exc_type, match=None, may_pass=False, err_msg=None):
diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py
index f24b4de928201..c6132afd0c1d4 100644
--- a/sklearn/utils/tests/test_testing.py
+++ b/sklearn/utils/tests/test_testing.py
@@ -845,3 +845,32 @@ def test_assert_run_python_script_without_output():
         match="output was not supposed to match.+got.+something to stderr",
     ):
         assert_run_python_script_without_output(code, pattern="to.+stderr")
+
+
+@pytest.mark.parametrize(
+    "constructor_name",
+    [
+        "sparse_csr",
+        "sparse_csc",
+        pytest.param(
+            "sparse_csr_array",
+            marks=pytest.mark.skipif(
+                sp_version < parse_version("1.8"),
+                reason="sparse arrays are available as of scipy 1.8.0",
+            ),
+        ),
+        pytest.param(
+            "sparse_csc_array",
+            marks=pytest.mark.skipif(
+                sp_version < parse_version("1.8"),
+                reason="sparse arrays are available as of scipy 1.8.0",
+            ),
+        ),
+    ],
+)
+def test_convert_container_sparse_to_sparse(constructor_name):
+    """Non-regression test to check that we can still convert a sparse container
+    from a given format to another format.
+    """
+    X_sparse = sparse.random(10, 10, density=0.1, format="csr")
+    _convert_container(X_sparse, constructor_name)
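The `np.atleast_2d` guard addresses the scipy behaviour referenced in the
comment above: depending on the scipy version, building a sparse *array* from
a 1d input may produce a 1d sparse array or raise. A small sketch of the
work-around in isolation (illustrative; the exact 1d behaviour varies across
scipy releases):

    import numpy as np
    import scipy.sparse as sp

    data = [0, 1, 2]

    # Promoting to 2d first gives a predictable (1, 3) sparse container
    # regardless of scipy version.
    X = sp.csr_matrix(np.atleast_2d(data))
    print(X.shape)  # (1, 3)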
From 1df773fe12d54beaed1136d7b040571e51f17205 Mon Sep 17 00:00:00 2001
From: Anderson Nelson
Date: Mon, 22 Jan 2024 05:16:30 -0500
Subject: [PATCH 14/32] DOC Add docstring examples for covariance module
 (#28192)

Co-authored-by: Guillaume Lemaitre
---
 sklearn/covariance/_shrunk_covariance.py | 37 ++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py
index 3a79afa30729f..5df229260b03c 100644
--- a/sklearn/covariance/_shrunk_covariance.py
+++ b/sklearn/covariance/_shrunk_covariance.py
@@ -134,6 +134,18 @@ def shrunk_covariance(emp_cov, shrinkage=0.1):
         (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
 
     where `mu = trace(cov) / n_features`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.datasets import make_gaussian_quantiles
+    >>> from sklearn.covariance import empirical_covariance, shrunk_covariance
+    >>> real_cov = np.array([[.8, .3], [.3, .4]])
+    >>> rng = np.random.RandomState(0)
+    >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=500)
+    >>> shrunk_covariance(empirical_covariance(X))
+    array([[0.73..., 0.25...],
+           [0.25..., 0.41...]])
     """
     emp_cov = check_array(emp_cov, allow_nd=True)
     n_features = emp_cov.shape[-1]
@@ -316,6 +328,17 @@ def ledoit_wolf_shrinkage(X, assume_centered=False, block_size=1000):
         (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
 
     where mu = trace(cov) / n_features
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.covariance import ledoit_wolf_shrinkage
+    >>> real_cov = np.array([[.4, .2], [.2, .8]])
+    >>> rng = np.random.RandomState(0)
+    >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=50)
+    >>> shrinkage_coefficient = ledoit_wolf_shrinkage(X)
+    >>> shrinkage_coefficient
+    0.23...
     """
     X = check_array(X)
     # for only one feature, the result is the same whatever the shrinkage
@@ -419,6 +442,20 @@ def ledoit_wolf(X, *, assume_centered=False, block_size=1000):
         (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
 
     where mu = trace(cov) / n_features
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.covariance import empirical_covariance, ledoit_wolf
+    >>> real_cov = np.array([[.4, .2], [.2, .8]])
+    >>> rng = np.random.RandomState(0)
+    >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=50)
+    >>> covariance, shrinkage = ledoit_wolf(X)
+    >>> covariance
+    array([[0.44..., 0.16...],
+           [0.16..., 0.80...]])
+    >>> shrinkage
+    0.23...
     """
     estimator = LedoitWolf(
         assume_centered=assume_centered,

From 55eb8900b44d62cf665444258adf4a3ae29926a1 Mon Sep 17 00:00:00 2001
From: Shubham <134207725+shubhamparmar1@users.noreply.github.com>
Date: Mon, 22 Jan 2024 15:51:08 +0530
Subject: [PATCH 15/32] DOC Add docstring examples for utils functions
 (#28181)

Co-authored-by: Guillaume Lemaitre
---
 sklearn/utils/_estimator_html_repr.py |  7 ++++++
 sklearn/utils/estimator_checks.py     |  7 ++++++
 sklearn/utils/extmath.py              | 33 +++++++++++++++++++++++++--
 3 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/sklearn/utils/_estimator_html_repr.py b/sklearn/utils/_estimator_html_repr.py
index dd51a8bbb71de..5e465234f516b 100644
--- a/sklearn/utils/_estimator_html_repr.py
+++ b/sklearn/utils/_estimator_html_repr.py
@@ -329,6 +329,13 @@ def estimator_html_repr(estimator):
     -------
     html: str
         HTML representation of estimator.
+
+    Examples
+    --------
+    >>> from sklearn.utils._estimator_html_repr import estimator_html_repr
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> estimator_html_repr(LogisticRegression())
+    '<style>