From c44457af96a924683b6a65bd3fd6c285043bdec6 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 10 Jul 2024 18:33:02 +0200 Subject: [PATCH 01/35] ENH verbose >= 2 for per iteration info in HGBT (#28179) --- doc/whats_new/v1.6.rst | 10 ++++++++-- .../_hist_gradient_boosting/gradient_boosting.py | 10 ++++++---- .../tests/test_gradient_boosting.py | 2 +- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index 62a6613e0a460..4b6a0263187da 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -71,7 +71,7 @@ more details. :class:`ensemble.StackingRegressor` now support metadata routing and pass ``**fit_params`` to the underlying estimators via their `fit` methods. :pr:`28701` by :user:`Stefanie Senger `. - + - |Feature| :class:`compose.TransformedTargetRegressor` now supports metadata routing in its `fit` and `predict` methods and routes the corresponding params to the underlying regressor. @@ -142,6 +142,12 @@ Changelog by parallelizing the initial search for bin thresholds :pr:`28064` by :user:`Christian Lorentzen `. +- |Enhancement| The verbosity of :class:`ensemble.HistGradientBoostingClassifier` + and :class:`ensemble.HistGradientBoostingRegressor` got a more granular control. Now, + `verbose = 1` prints only summary messages, `verbose >= 2` prints the full + information as before. + :pr:`28179` by :user:`Christian Lorentzen `. + - |Efficiency| :class:`ensemble.IsolationForest` now runs parallel jobs during :term:`predict` offering a speedup of up to 2-4x on sample sizes larger than 2000 using `joblib`. @@ -150,7 +156,7 @@ Changelog - |Feature| :class:`ensemble.ExtraTreesClassifier` and :class:`ensemble.ExtraTreesRegressor` now support missing-values in the data matrix `X`. Missing-values are handled by randomly moving all of - the samples to the left, or right child node as the tree is traversed. + the samples to the left, or right child node as the tree is traversed. :pr:`28268` by :user:`Adam Li `. :mod:`sklearn.impute` diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 78f8456e969de..990834a626f89 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -857,7 +857,7 @@ def fit(self, X, y, sample_weight=None): ) for iteration in range(begin_at_stage, self.max_iter): - if self.verbose: + if self.verbose >= 2: iteration_start_time = time() print( "[{}/{}] ".format(iteration + 1, self.max_iter), end="", flush=True @@ -987,7 +987,7 @@ def fit(self, X, y, sample_weight=None): raw_predictions_val=raw_predictions_val, ) - if self.verbose: + if self.verbose >= 2: self._print_iteration_stats(iteration_start_time) # maybe we could also early stop if all the trees are stumps? @@ -1617,7 +1617,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): iterations to be considered an improvement upon the reference score. verbose : int, default=0 The verbosity level. If not zero, print some information about the - fitting process. + fitting process. ``1`` prints only summary info, ``2`` prints info per + iteration. 
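To make the effect of the new verbosity levels concrete, here is a minimal usage sketch of the behaviour this patch introduces; the `make_regression` data and the `max_iter` value are arbitrary placeholders, and the exact messages printed come from the estimator itself rather than from anything shown here.

```python
# Minimal sketch (assumed usage): after this patch, verbose=1 should print only
# summary messages, while verbose >= 2 also prints per-iteration statistics.
from sklearn.datasets import make_regression
from sklearn.ensemble import HistGradientBoostingRegressor

X, y = make_regression(n_samples=1_000, n_features=10, random_state=0)

# Summary-level output only.
HistGradientBoostingRegressor(max_iter=20, verbose=1).fit(X, y)

# Per-iteration output, matching what any non-zero verbose printed before this patch.
HistGradientBoostingRegressor(max_iter=20, verbose=2).fit(X, y)
```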
random_state : int, RandomState instance or None, default=None Pseudo-random number generator to control the subsampling in the binning process, and the train/validation data split if early stopping @@ -1996,7 +1997,8 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): considered an improvement upon the reference score. verbose : int, default=0 The verbosity level. If not zero, print some information about the - fitting process. + fitting process. ``1`` prints only summary info, ``2`` prints info per + iteration. random_state : int, RandomState instance or None, default=None Pseudo-random number generator to control the subsampling in the binning process, and the train/validation data split if early stopping diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index eedf5e73549c2..b5711413f9b75 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -160,7 +160,7 @@ def test_early_stopping_classification( X, y = data gb = HistGradientBoostingClassifier( - verbose=1, # just for coverage + verbose=2, # just for coverage min_samples_leaf=5, # easier to overfit fast scoring=scoring, tol=tol, From ab9f748e699a88af40c0baacc28e158960f86404 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Wed, 10 Jul 2024 23:38:43 +0200 Subject: [PATCH 02/35] CI Update pylatest-pip-openblas-pandas build to Python 3.11 (#29444) --- ...latest_pip_openblas_pandas_environment.yml | 2 +- ...st_pip_openblas_pandas_linux-64_conda.lock | 47 +++++++++---------- .../update_environments_and_lock_files.py | 7 +-- 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/build_tools/azure/pylatest_pip_openblas_pandas_environment.yml b/build_tools/azure/pylatest_pip_openblas_pandas_environment.yml index c0d6aeaa717c0..2d9ca394a6ac9 100644 --- a/build_tools/azure/pylatest_pip_openblas_pandas_environment.yml +++ b/build_tools/azure/pylatest_pip_openblas_pandas_environment.yml @@ -4,7 +4,7 @@ channels: - defaults dependencies: - - python=3.9 + - python=3.11 - ccache - pip - pip: diff --git a/build_tools/azure/pylatest_pip_openblas_pandas_linux-64_conda.lock b/build_tools/azure/pylatest_pip_openblas_pandas_linux-64_conda.lock index ac4c92a671ed4..8145a497caa86 100644 --- a/build_tools/azure/pylatest_pip_openblas_pandas_linux-64_conda.lock +++ b/build_tools/azure/pylatest_pip_openblas_pandas_linux-64_conda.lock @@ -1,6 +1,6 @@ # Generated by conda-lock. 
# platform: linux-64 -# input_hash: 11d97b96088b6b1eaf3b774050152e7899f0a6ab757350df2efd44b2de3a5f75 +# input_hash: 11829a15aa51e3a3ad9479d8b5b953c0af47dd8e86d725e58d55cbcfe68c6d5e @EXPLICIT https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda#c3473ff8bdb3d124ed5ff11ec380d6f9 https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2024.3.11-h06a4308_0.conda#08529eb3504712baabcbda266a19feb7 @@ -10,7 +10,9 @@ https://repo.anaconda.com/pkgs/main/linux-64/libgomp-11.2.0-h1234567_1.conda#b37 https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-11.2.0-h1234567_1.conda#57623d10a70e09e1d048c2b2b6f4e2dd https://repo.anaconda.com/pkgs/main/linux-64/_openmp_mutex-5.1-1_gnu.conda#71d281e9c2192cb3fa425655a8defb85 https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-11.2.0-h1234567_1.conda#a87728dabf3151fb9cfa990bd2eb0464 +https://repo.anaconda.com/pkgs/main/linux-64/bzip2-1.0.8-h5eee18b_6.conda#f21a3ff51c1b271977f53ce956a69297 https://repo.anaconda.com/pkgs/main/linux-64/libffi-3.4.4-h6a678d5_1.conda#70646cc713f0c43926cfdcfe9b695fe0 +https://repo.anaconda.com/pkgs/main/linux-64/libuuid-1.41.5-h5eee18b_0.conda#4a6a2354414c9080327274aa514e5299 https://repo.anaconda.com/pkgs/main/linux-64/ncurses-6.4-h6a678d5_0.conda#5558eec6e2191741a92f832ea826251c https://repo.anaconda.com/pkgs/main/linux-64/openssl-3.0.14-h5eee18b_0.conda#37b6dad6aa49000a4230a9f0cad172f6 https://repo.anaconda.com/pkgs/main/linux-64/xz-5.4.6-h5eee18b_1.conda#1562802f843297ee776a50b9329597ed @@ -19,33 +21,33 @@ https://repo.anaconda.com/pkgs/main/linux-64/ccache-3.7.9-hfe4627d_0.conda#bef6f https://repo.anaconda.com/pkgs/main/linux-64/readline-8.2-h5eee18b_0.conda#be42180685cce6e6b0329201d9f48efb https://repo.anaconda.com/pkgs/main/linux-64/tk-8.6.14-h39e8969_0.conda#78dbc5e3c69143ebc037fc5d5b22e597 https://repo.anaconda.com/pkgs/main/linux-64/sqlite-3.45.3-h5eee18b_0.conda#acf93d6aceb74d6110e20b44cc45939e -https://repo.anaconda.com/pkgs/main/linux-64/python-3.9.19-h955ad1f_1.conda#4b453281859c293c9d577271f3b18a0d -https://repo.anaconda.com/pkgs/main/linux-64/setuptools-69.5.1-py39h06a4308_0.conda#3eb144d481b39c0fbbced789dd9b76b3 -https://repo.anaconda.com/pkgs/main/linux-64/wheel-0.43.0-py39h06a4308_0.conda#40bb60408c7433d767fd8c65b35bc4a0 -https://repo.anaconda.com/pkgs/main/linux-64/pip-24.0-py39h06a4308_0.conda#7f8ce3af15cfecd12e4dda8c5cef5fb7 +https://repo.anaconda.com/pkgs/main/linux-64/python-3.11.9-h955ad1f_0.conda#5668a8845dd35bbbc9663c8f217a2ab8 +https://repo.anaconda.com/pkgs/main/linux-64/setuptools-69.5.1-py311h06a4308_0.conda#0989470c81841dfcb22c7bbb40f543c5 +https://repo.anaconda.com/pkgs/main/linux-64/wheel-0.43.0-py311h06a4308_0.conda#ec915b5ff89bdbcea7ef943d9e296967 +https://repo.anaconda.com/pkgs/main/linux-64/pip-24.0-py311h06a4308_0.conda#84aef4db159f0daf63751d87d7d6ca56 # pip alabaster @ https://files.pythonhosted.org/packages/32/34/d4e1c02d3bee589efb5dfa17f88ea08bdb3e3eac12bc475462aec52ed223/alabaster-0.7.16-py3-none-any.whl#sha256=b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92 # pip array-api-compat @ https://files.pythonhosted.org/packages/05/ae/2f11031bb9f819f6efaaa66b720b37928fbb0087161fcbae3465ae374a18/array_api_compat-1.7.1-py3-none-any.whl#sha256=6974f51775972f39edbca39e08f1c2e43c51401c093a0fea5ac7159875095d8a # pip babel @ https://files.pythonhosted.org/packages/27/45/377f7e32a5c93d94cd56542349b34efab5ca3f9e2fd5a68c5e93169aa32d/Babel-2.15.0-py3-none-any.whl#sha256=08706bdad8d0a3413266ab61bd6c34d0c28d6e1e7badf40a2cebe67644e2e1fb # pip certifi @ 
https://files.pythonhosted.org/packages/1c/d5/c84e1a17bf61d4df64ca866a1c9a913874b4e9bdc131ec689a0ad013fb36/certifi-2024.7.4-py3-none-any.whl#sha256=c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90 -# pip charset-normalizer @ https://files.pythonhosted.org/packages/98/69/5d8751b4b670d623aa7a47bef061d69c279e9f922f6705147983aa76c3ce/charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796 +# pip charset-normalizer @ https://files.pythonhosted.org/packages/40/26/f35951c45070edc957ba40a5b1db3cf60a9dbb1b350c2d5bef03e01e61de/charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8 +# pip coverage @ https://files.pythonhosted.org/packages/1e/62/e33595d35c9fa7cbcca5df2c3745b595532ec94b68c49ca2877629c4aca1/coverage-7.5.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=ed550e7442f278af76d9d65af48069f1fb84c9f745ae249c1a183c1e9d1b025c # pip cycler @ https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl#sha256=85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30 -# pip cython @ https://files.pythonhosted.org/packages/a7/f5/3dde4d96076888ceaa981827b098274c2b45ddd4b20d75a8cfaa92b91eec/Cython-3.0.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=651a15a8534ebfb9b58cb0b87c269c70984b6f9c88bfe65e4f635f0e3f07dfcd +# pip cython @ https://files.pythonhosted.org/packages/45/82/077c13035d6f45d8b8b74d67e9f73f2bfc54ef8d1f79572790f6f7d2b4f5/Cython-3.0.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=38d40fa1324ac47c04483d151f5e092406a147eac88a18aec789cf01c089c3f2 # pip docutils @ https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl#sha256=dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2 -# pip exceptiongroup @ https://files.pythonhosted.org/packages/01/90/79fe92dd413a9cab314ef5c591b5aa9b9ba787ae4cadab75055b0ae00b33/exceptiongroup-1.2.1-py3-none-any.whl#sha256=5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad # pip execnet @ https://files.pythonhosted.org/packages/43/09/2aea36ff60d16dd8879bdb2f5b3ee0ba8d08cbbdcdfe870e695ce3784385/execnet-2.1.1-py3-none-any.whl#sha256=26dee51f1b80cebd6d0ca8e74dd8745419761d3bef34163928cbebbdc4749fdc -# pip fonttools @ https://files.pythonhosted.org/packages/7b/30/ad4483dfc5a1999f26b7bc5edc311576f433a3e00dd8aea01f2099c3a29f/fonttools-4.53.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=84ec3fb43befb54be490147b4a922b5314e16372a643004f182babee9f9c3407 +# pip fonttools @ https://files.pythonhosted.org/packages/a4/22/0a0ad59d9367997fd74a00ad2e88d10559122e09f105e94d34c155aecc0a/fonttools-4.53.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=bee32ea8765e859670c4447b0817514ca79054463b6b79784b08a8df3a4d78e3 # pip idna @ https://files.pythonhosted.org/packages/e5/3e/741d8c82801c347547f8a2a06aa57dbb1992be9e948df2ea0eda2c8b79e8/idna-3.7-py3-none-any.whl#sha256=82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # pip imagesize @ 
https://files.pythonhosted.org/packages/ff/62/85c4c919272577931d407be5ba5d71c20f0b616d31a0befe0ae45bb79abd/imagesize-1.4.1-py2.py3-none-any.whl#sha256=0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b # pip iniconfig @ https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl#sha256=b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374 # pip joblib @ https://files.pythonhosted.org/packages/91/29/df4b9b42f2be0b623cbd5e2140cafcaa2bef0759a00b7b70104dcfe2fb51/joblib-1.4.2-py3-none-any.whl#sha256=06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6 -# pip kiwisolver @ https://files.pythonhosted.org/packages/c0/a8/841594f11d0b88d8aeb26991bc4dac38baa909dc58d0c4262a4f7893bcbf/kiwisolver-1.4.5-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl#sha256=6c3bd3cde54cafb87d74d8db50b909705c62b17c2099b8f2e25b461882e544ff -# pip markupsafe @ https://files.pythonhosted.org/packages/5f/5a/360da85076688755ea0cceb92472923086993e86b5613bbae9fbc14136b0/MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3 +# pip kiwisolver @ https://files.pythonhosted.org/packages/17/ba/17a706b232308e65f57deeccae503c268292e6a091313f6ce833a23093ea/kiwisolver-1.4.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=040c1aebeda72197ef477a906782b5ab0d387642e93bda547336b8957c61022e +# pip markupsafe @ https://files.pythonhosted.org/packages/97/18/c30da5e7a0e7f4603abfc6780574131221d9148f323752c2755d48abad30/MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5 # pip meson @ https://files.pythonhosted.org/packages/44/b2/d4433391a7c5e94a39b50ca7295a8ceba736e7c72c455752a60122f52453/meson-1.4.1-py3-none-any.whl#sha256=d5acc3abae2dad3c70ddcbd10acac92b78b144d34d43f40f5b8ac31dfd8a826a -# pip networkx @ https://files.pythonhosted.org/packages/d5/f0/8fbc882ca80cf077f1b246c0e3c3465f7f415439bdea6b899f6b19f61f70/networkx-3.2.1-py3-none-any.whl#sha256=f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2 +# pip networkx @ https://files.pythonhosted.org/packages/38/e9/5f72929373e1a0e8d142a130f3f97e6ff920070f87f91c4e13e40e0fba5a/networkx-3.3-py3-none-any.whl#sha256=28575580c6ebdaf4505b22c6256a2b9de86b316dc63ba9e93abde3d78dfdbcf2 # pip ninja @ https://files.pythonhosted.org/packages/6d/92/8d7aebd4430ab5ff65df2bfee6d5745f95c004284db2d8ca76dcbfd9de47/ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl#sha256=84502ec98f02a037a169c4b0d5d86075eaf6afc55e1879003d6cab51ced2ea4b -# pip numpy @ https://files.pythonhosted.org/packages/87/d3/74e627205462a170f39e7d7ddd2b4166a0d8ab163377592c7f4fa935cc8c/numpy-2.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=821eedb7165ead9eebdb569986968b541f9908979c2da8a4967ecac4439bae3d +# pip numpy @ https://files.pythonhosted.org/packages/d1/27/2a7bd6855dc717aeec5f553073a3c426b9c816126555f8e616392eab856b/numpy-2.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=a7039a136017eaa92c1848152827e1424701532ca8e8967fe480fe1569dae581 # pip packaging @ https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl#sha256=5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 -# pip pillow @ 
https://files.pythonhosted.org/packages/bc/a8/8655557c9c7202b8abbd001f61ff36711cefaf750debcaa1c24d154ef602/pillow-10.4.0-cp39-cp39-manylinux_2_28_x86_64.whl#sha256=b2724fdb354a868ddf9a880cb84d102da914e99119211ef7ecbdc613b8c96b3c +# pip pillow @ https://files.pythonhosted.org/packages/ba/e5/8c68ff608a4203085158cff5cc2a3c534ec384536d9438c405ed6370d080/pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl#sha256=76a911dfe51a36041f2e756b00f96ed84677cdeb75d25c767f296c1c1eda1319 # pip pluggy @ https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl#sha256=44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669 # pip pygments @ https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl#sha256=b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a # pip pyparsing @ https://files.pythonhosted.org/packages/9d/ea/6d76df31432a0e6fdf81681a895f009a4bb47b3c39036db3e1b528191d52/pyparsing-3.1.2-py3-none-any.whl#sha256=f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742 @@ -60,31 +62,26 @@ https://repo.anaconda.com/pkgs/main/linux-64/pip-24.0-py39h06a4308_0.conda#7f8ce # pip sphinxcontrib-serializinghtml @ https://files.pythonhosted.org/packages/38/24/228bb903ea87b9e08ab33470e6102402a644127108c7117ac9c00d849f82/sphinxcontrib_serializinghtml-1.1.10-py3-none-any.whl#sha256=326369b8df80a7d2d8d7f99aa5ac577f51ea51556ed974e7716cfd4fca3f6cb7 # pip tabulate @ https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl#sha256=024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f # pip threadpoolctl @ https://files.pythonhosted.org/packages/4b/2c/ffbf7a134b9ab11a67b0cf0726453cedd9c5043a4fe7a35d1cefa9a1bcfb/threadpoolctl-3.5.0-py3-none-any.whl#sha256=56c1e26c150397e58c4926da8eeee87533b1e32bef131bd4bf6a2f45f3185467 -# pip tomli @ https://files.pythonhosted.org/packages/97/75/10a9ebee3fd790d20926a90a2547f0bf78f371b2f13aa822c759680ca7b9/tomli-2.0.1-py3-none-any.whl#sha256=939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc # pip tzdata @ https://files.pythonhosted.org/packages/65/58/f9c9e6be752e9fcb8b6a0ee9fb87e6e7a1f6bcab2cdc73f02bb7ba91ada0/tzdata-2024.1-py2.py3-none-any.whl#sha256=9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252 # pip urllib3 @ https://files.pythonhosted.org/packages/ca/1c/89ffc63a9605b583d5df2be791a27bc1a42b7c32bab68d3c8f2f73a98cd4/urllib3-2.2.2-py3-none-any.whl#sha256=a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472 -# pip zipp @ https://files.pythonhosted.org/packages/20/38/f5c473fe9b90c8debdd29ea68d5add0289f1936d6f923b6b9cc0b931194c/zipp-3.19.2-py3-none-any.whl#sha256=f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c # pip array-api-strict @ https://files.pythonhosted.org/packages/08/06/aba69bce257fd1cda0d1db616c12728af0f46878a5cc1923fcbb94201947/array_api_strict-2.0.1-py3-none-any.whl#sha256=f74cbf0d0c182fcb45c5ee7f28f9c7b77e6281610dfbbdd63be60b1a5a7872b3 -# pip contourpy @ https://files.pythonhosted.org/packages/31/a2/2f12e3a6e45935ff694654b710961b03310b0e1ec997ee9f416d3c873f87/contourpy-1.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=e1d59258c3c67c865435d8fbeb35f8c59b8bef3d6f46c1f29f6123556af28445 -# pip coverage @ 
https://files.pythonhosted.org/packages/c4/b4/0cbc18998613f8caaec793ad5878d2450382dfac80e65d352fb7cd9cc1dc/coverage-7.5.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=dbc5958cb471e5a5af41b0ddaea96a37e74ed289535e8deca404811f6cb0bc3d +# pip contourpy @ https://files.pythonhosted.org/packages/ee/c0/9bd123d676eb61750e116a2cd915b06483fc406143cfc36c7f263f0f5368/contourpy-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=d4492d82b3bc7fbb7e3610747b159869468079fe149ec5c4d771fa1f614a14df # pip imageio @ https://files.pythonhosted.org/packages/3d/84/f1647217231f6cc46883e5d26e870cc3e1520d458ecd52d6df750810d53c/imageio-2.34.2-py3-none-any.whl#sha256=a0bb27ec9d5bab36a9f4835e51b21d2cb099e1f78451441f94687ff3404b79f8 -# pip importlib-metadata @ https://files.pythonhosted.org/packages/dc/ef/38766b2edb096260d9b1b6ad35adaa0bce3b0567abb452b21eb074af88c4/importlib_metadata-8.0.0-py3-none-any.whl#sha256=15584cf2b1bf449d98ff8a6ff1abef57bf20f3ac6454f431736cd3e660921b2f -# pip importlib-resources @ https://files.pythonhosted.org/packages/75/06/4df55e1b7b112d183f65db9503bff189e97179b256e1ea450a3c365241e0/importlib_resources-6.4.0-py3-none-any.whl#sha256=50d10f043df931902d4194ea07ec57960f66a80449ff867bfe782b4c486ba78c # pip jinja2 @ https://files.pythonhosted.org/packages/31/80/3a54838c3fb461f6fec263ebf3a3a41771bd05190238de3486aae8540c36/jinja2-3.1.4-py3-none-any.whl#sha256=bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d # pip lazy-loader @ https://files.pythonhosted.org/packages/83/60/d497a310bde3f01cb805196ac61b7ad6dc5dcf8dce66634dc34364b20b4f/lazy_loader-0.4-py3-none-any.whl#sha256=342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc # pip pyproject-metadata @ https://files.pythonhosted.org/packages/aa/5f/bb5970d3d04173b46c9037109f7f05fc8904ff5be073ee49bb6ff00301bc/pyproject_metadata-0.8.0-py3-none-any.whl#sha256=ad858d448e1d3a1fb408ac5bac9ea7743e7a8bbb472f2693aaa334d2db42f526 # pip pytest @ https://files.pythonhosted.org/packages/4e/e7/81ebdd666d3bff6670d27349b5053605d83d55548e6bd5711f3b0ae7dd23/pytest-8.2.2-py3-none-any.whl#sha256=c434598117762e2bd304e526244f67bf66bbd7b5d6cf22138be51ff661980343 # pip python-dateutil @ https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl#sha256=a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427 # pip requests @ https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl#sha256=70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 -# pip scipy @ https://files.pythonhosted.org/packages/35/f5/d0ad1a96f80962ba65e2ce1de6a1e59edecd1f0a7b55990ed208848012e0/scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=637e98dcf185ba7f8e663e122ebf908c4702420477ae52a04f9908707456ba4d +# pip scipy @ https://files.pythonhosted.org/packages/89/bb/80c9c98d887c855710fd31fc5ae5574133e98203b3475b07579251803662/scipy-1.14.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=9e3154691b9f7ed73778d746da2df67a19d046a6c8087c8b385bc4cdb2cfca74 # pip tifffile @ https://files.pythonhosted.org/packages/d2/d7/ca95f347442e82700f591f3608e336596ee607daecbcad6a7ebd16ff5de4/tifffile-2024.7.2-py3-none-any.whl#sha256=5a2ee608c9cc1f2e044d943dacebddc71d4827b6fad150ef4c644b7aefbe2d1a # pip lightgbm @ 
https://files.pythonhosted.org/packages/f2/3d/4f152cf694aec100ab63b4a5547f2dbfbea59ab39d9375c89bed9775e47d/lightgbm-4.4.0-py3-none-manylinux_2_28_x86_64.whl#sha256=8700b41f637717d36763a282d280b8d4722a87103030b7f0f373b96da0225022 -# pip matplotlib @ https://files.pythonhosted.org/packages/8e/67/e75134cb83d2e533e46d72e2033a413772efdc18291beb981f5d574a829f/matplotlib-3.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=4db17fea0ae3aceb8e9ac69c7e3051bae0b3d083bfec932240f9bf5d0197a049 +# pip matplotlib @ https://files.pythonhosted.org/packages/b8/63/cef838d92c1918ae28afd12b8aeaa9c104a0686cf6447aa0546f7c6dd1f0/matplotlib-3.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=ab38a4f3772523179b2f772103d8030215b318fef6360cb40558f585bf3d017f # pip meson-python @ https://files.pythonhosted.org/packages/91/c0/104cb6244c83fe6bc3886f144cc433db0c0c78efac5dc00e409a5a08c87d/meson_python-0.16.0-py3-none-any.whl#sha256=842dc9f5dc29e55fc769ff1b6fe328412fe6c870220fc321060a1d2d395e69e8 -# pip pandas @ https://files.pythonhosted.org/packages/bb/30/f6f1f1ac36250f50c421b1b6af08c35e5a8b5a84385ef928625336b93e6f/pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921 -# pip pyamg @ https://files.pythonhosted.org/packages/de/b6/411b3de91fb23aebee8082af5c970f8345e06d8357ce1c33464837130770/pyamg-5.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=40acf38eb733dd33f054f2b9a6aa3cee1786b7a5420b602787f2d38eca8051ac +# pip pandas @ https://files.pythonhosted.org/packages/fc/a5/4d82be566f069d7a9a702dcdf6f9106df0e0b042e738043c0cc7ddd7e3f6/pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee +# pip pyamg @ https://files.pythonhosted.org/packages/d3/e8/6898b3b791f369605012e896ed903b6626f3bd1208c6a647d7219c070209/pyamg-5.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=679a5904eac3a4880288c8c0e6a29f110a2627ea15a443a4e9d5997c7dc5fab6 # pip pytest-cov @ https://files.pythonhosted.org/packages/78/3a/af5b4fa5961d9a1e6237b530eb87dd04aea6eb83da09d2a4073d81b54ccf/pytest_cov-5.0.0-py3-none-any.whl#sha256=4f0764a1219df53214206bf1feea4633c3b558a2925c8b59f144f682861ce652 # pip pytest-xdist @ https://files.pythonhosted.org/packages/6d/82/1d96bf03ee4c0fdc3c0cbe61470070e659ca78dc0086fb88b66c185e2449/pytest_xdist-3.6.1-py3-none-any.whl#sha256=9ed4adfb68a016610848639bb7e02c9352d5d9f03d04809919e2dafc3be4cca7 -# pip scikit-image @ https://files.pythonhosted.org/packages/f0/cc/1a58efefb9b17c60d15626b33416728003028d5d51f0521482151a222560/scikit_image-0.24.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=4688c18bd7ec33c08d7bf0fd19549be246d90d5f2c1d795a89986629af0a1e83 +# pip scikit-image @ https://files.pythonhosted.org/packages/ad/96/138484302b8ec9a69cdf65e8d4ab47a640a3b1a8ea3c437e1da3e1a5a6b8/scikit_image-0.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=fa27b3a0dbad807b966b8db2d78da734cb812ca4787f7fbb143764800ce2fa9c # pip sphinx @ https://files.pythonhosted.org/packages/b4/fa/130c32ed94cf270e3d0b9ded16fb7b2c8fea86fa7263c29a696a30c1dde7/sphinx-7.3.7-py3-none-any.whl#sha256=413f75440be4cacf328f580b4274ada4565fb2187d696a84970c23f77b64d8c3 # pip numpydoc @ https://files.pythonhosted.org/packages/f0/fa/dcfe0f65660661db757ee9ebd84e170ff98edd5d80235f62457d9088f85f/numpydoc-1.7.0-py3-none-any.whl#sha256=5a56419d931310d79a06cfc2a126d1558700feeb9b4f3d8dcae1a8134be829c9 
diff --git a/build_tools/update_environments_and_lock_files.py b/build_tools/update_environments_and_lock_files.py index a7f99c91a0735..ac42bf55c8ed5 100644 --- a/build_tools/update_environments_and_lock_files.py +++ b/build_tools/update_environments_and_lock_files.py @@ -230,9 +230,10 @@ def remove_from(alist, to_remove): + ["array-api-compat", "array-api-strict"] ), "package_constraints": { - # XXX: we would like to use the latest version of Python but this makes - # the CI much slower. We need to investigate why. - "python": "3.9", + # XXX: we would like to use the latest Python version, but for now using + # Python 3.12 makes the CI much slower so we use Python 3.11. See + # https://github.com/scikit-learn/scikit-learn/pull/29444#issuecomment-2219550662. + "python": "3.11", }, }, { From 7eb7effae192f8fa9e29e14507970f5a7da6088c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 11 Jul 2024 08:34:46 +0200 Subject: [PATCH 03/35] BLD Remove support for setuptools build (#29400) Co-authored-by: Olivier Grisel --- .codecov.yml | 1 - .coveragerc | 1 - .github/workflows/wheels.yml | 3 - Makefile | 15 +- azure-pipelines.yml | 3 +- build_tools/azure/install.sh | 14 +- ...in_conda_defaults_openblas_environment.yml | 2 + ...onda_defaults_openblas_linux-64_conda.lock | 9 +- build_tools/circle/build_doc.sh | 3 - build_tools/cirrus/arm_wheel.yml | 1 - build_tools/cirrus/build_test_arm.sh | 4 - .../update_environments_and_lock_files.py | 5 +- doc/whats_new/v1.6.rst | 9 + maint_tools/check_pxd_in_installation.py | 60 -- setup.py | 627 ------------------ sklearn/_build_utils/__init__.py | 116 ---- sklearn/_build_utils/openmp_helpers.py | 127 ---- sklearn/_build_utils/pre_build_helpers.py | 75 --- sklearn/ensemble/tests/test_forest.py | 3 +- sklearn/tests/test_common.py | 35 +- 20 files changed, 32 insertions(+), 1081 deletions(-) delete mode 100644 maint_tools/check_pxd_in_installation.py delete mode 100755 setup.py delete mode 100644 sklearn/_build_utils/openmp_helpers.py delete mode 100644 sklearn/_build_utils/pre_build_helpers.py diff --git a/.codecov.yml b/.codecov.yml index 54ce77b9c1b0e..f4ecd6e7d8fee 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -30,5 +30,4 @@ ignore: - "sklearn/_build_utils" - "sklearn/__check_build" - "sklearn/_min_dependencies.py" -- "**/setup.py" - "**/conftest.py" diff --git a/.coveragerc b/.coveragerc index a8601458a0b07..31f9fa1b4ceae 100644 --- a/.coveragerc +++ b/.coveragerc @@ -6,4 +6,3 @@ omit = */sklearn/externals/* */sklearn/_build_utils/* */benchmarks/* - **/setup.py diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index efeec5aa95a0d..ea52057f70c66 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -164,7 +164,6 @@ jobs: CIBW_PRERELEASE_PYTHONS: ${{ matrix.prerelease_pythons }} CIBW_FREE_THREADED_SUPPORT: ${{ matrix.free_threaded_support }} CIBW_ENVIRONMENT: SKLEARN_SKIP_NETWORK_TESTS=1 - SKLEARN_BUILD_PARALLEL=3 CIBW_BUILD: cp${{ matrix.python }}-${{ matrix.platform_id }} CIBW_ARCHS: all CIBW_MANYLINUX_X86_64_IMAGE: ${{ matrix.manylinux_image }} @@ -216,8 +215,6 @@ jobs: - name: Build source distribution run: bash build_tools/github/build_source.sh - env: - SKLEARN_BUILD_PARALLEL: 3 - name: Test source distribution run: bash build_tools/github/test_source.sh diff --git a/Makefile b/Makefile index 8ddf6a188ef6d..66c7a6c0e93a4 100644 --- a/Makefile +++ b/Makefile @@ -7,9 +7,7 @@ all: @echo "Please use 'make ' where is one of" @echo " dev build scikit-learn with Meson" @echo " clean 
clean scikit-learn Meson build. Very rarely needed," - @echo " one use case is when switching back to setuptools" - @echo " dev-setuptools build scikit-learn with setuptools (deprecated)" - @echo " clean-setuptools clean scikit-learn setuptools build (deprecated)" + @echo " since meson-python recompiles on import." .PHONY: all @@ -23,14 +21,7 @@ clean: clean-meson clean-meson: pip uninstall -y scikit-learn # It seems in some cases removing the folder avoids weird compilation - # errors (e.g. when switching from numpy>=2 to numpy<2). For some + # errors (e.g. when switching from numpy>=2 to numpy<2). For some # reason ninja clean -C $(DEFAULT_MESON_BUILD_DIR) is not - # enough + # enough. rm -rf $(DEFAULT_MESON_BUILD_DIR) - -dev-setuptools: - $(PYTHON) setup.py build_ext -i - -clean-setuptools: - $(PYTHON) setup.py clean - rm -rf dist diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e8e2956107797..3887be64be4a9 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -208,7 +208,6 @@ jobs: SKLEARN_ENABLE_DEBUG_CYTHON_DIRECTIVES: '1' SKLEARN_RUN_FLOAT32_TESTS: '1' SKLEARN_TESTS_GLOBAL_RANDOM_SEED: '2' # non-default seed - BUILD_WITH_SETUPTOOLS: 'true' # Linux environment to test the latest available dependencies. # It runs tests requiring lightgbm, pandas and PyAMG. pylatest_pip_openblas_pandas: @@ -236,7 +235,7 @@ jobs: ) matrix: debian_atlas_32bit: - DOCKER_CONTAINER: 'i386/debian:11.2' + DOCKER_CONTAINER: 'i386/debian:12' DISTRIB: 'debian-32' COVERAGE: "true" LOCK_FILE: './build_tools/azure/debian_atlas_32bit_lock.txt' diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 3b8d0dae87a55..73e732e35a05f 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -107,10 +107,6 @@ scikit_learn_install() { setup_ccache show_installed_libraries - # Set parallelism to 3 to overlap IO bound tasks with CPU bound tasks on CI - # workers with 2 cores when building the compiled extensions of scikit-learn. - export SKLEARN_BUILD_PARALLEL=3 - if [[ "$UNAMESTR" == "Darwin" && "$SKLEARN_TEST_NO_OPENMP" == "true" ]]; then # Without openmp, we use the system clang. Here we use /usr/bin/ar # instead because llvm-ar errors @@ -129,9 +125,7 @@ scikit_learn_install() { export LDFLAGS="$LDFLAGS -Wl,--sysroot=/" fi - if [[ "$BUILD_WITH_SETUPTOOLS" == "true" ]]; then - python setup.py develop - elif [[ "$PIP_BUILD_ISOLATION" == "true" ]]; then + if [[ "$PIP_BUILD_ISOLATION" == "true" ]]; then # Check that pip can automatically build scikit-learn with the build # dependencies specified in pyproject.toml using an isolated build # environment: @@ -143,12 +137,6 @@ scikit_learn_install() { # toolchain ADDITIONAL_PIP_OPTIONS='-Csetup-args=--vsenv' fi - # TODO Always add --check-build-dependencies when all CI builds have - # pip >= 22.1.1. At the time of writing, two CI builds (debian32_atlas and - # ubuntu_atlas) have an older pip - if pip install --help | grep check-build-dependencies; then - ADDITIONAL_PIP_OPTIONS="$ADDITIONAL_PIP_OPTIONS --check-build-dependencies" - fi # Use the pre-installed build dependencies and build directly in the # current environment. pip install --verbose --no-build-isolation --editable . 
$ADDITIONAL_PIP_OPTIONS diff --git a/build_tools/azure/pymin_conda_defaults_openblas_environment.yml b/build_tools/azure/pymin_conda_defaults_openblas_environment.yml index a82ba18e27980..83b0627ff296f 100644 --- a/build_tools/azure/pymin_conda_defaults_openblas_environment.yml +++ b/build_tools/azure/pymin_conda_defaults_openblas_environment.yml @@ -15,9 +15,11 @@ dependencies: - pytest - pytest-xdist - pillow + - ninja - pytest-cov - coverage - ccache - pip - pip: - threadpoolctl==3.1.0 # min + - meson-python==0.16.0 # min diff --git a/build_tools/azure/pymin_conda_defaults_openblas_linux-64_conda.lock b/build_tools/azure/pymin_conda_defaults_openblas_linux-64_conda.lock index 5eb168a898f32..95289022eccec 100644 --- a/build_tools/azure/pymin_conda_defaults_openblas_linux-64_conda.lock +++ b/build_tools/azure/pymin_conda_defaults_openblas_linux-64_conda.lock @@ -1,6 +1,6 @@ # Generated by conda-lock. # platform: linux-64 -# input_hash: 244b6a064d3785ea62baaf9436848821d153846b455c2976f5e811182e848c83 +# input_hash: e4db53ad2240ff5f57679dd93701c30b6712ac3a43ec04e18b74132f2948b4cd @EXPLICIT https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda#c3473ff8bdb3d124ed5ff11ec380d6f9 https://repo.anaconda.com/pkgs/main/linux-64/blas-1.0-openblas.conda#9ddfcaef10d79366c90128f5dc444be8 @@ -28,6 +28,7 @@ https://repo.anaconda.com/pkgs/main/linux-64/libwebp-base-1.3.2-h5eee18b_0.conda https://repo.anaconda.com/pkgs/main/linux-64/libxcb-1.15-h7f8727e_0.conda#ada518dcadd6aaee9aae47ba9a671553 https://repo.anaconda.com/pkgs/main/linux-64/lz4-c-1.9.4-h6a678d5_1.conda#2ee58861f2b92b868ce761abb831819d https://repo.anaconda.com/pkgs/main/linux-64/ncurses-6.4-h6a678d5_0.conda#5558eec6e2191741a92f832ea826251c +https://repo.anaconda.com/pkgs/main/linux-64/ninja-base-1.10.2-hd09550d_5.conda#09dcbad622d58caaeefe46cd399f0a76 https://repo.anaconda.com/pkgs/main/linux-64/openssl-3.0.14-h5eee18b_0.conda#37b6dad6aa49000a4230a9f0cad172f6 https://repo.anaconda.com/pkgs/main/linux-64/xz-5.4.6-h5eee18b_1.conda#1562802f843297ee776a50b9329597ed https://repo.anaconda.com/pkgs/main/linux-64/zlib-1.2.13-h5eee18b_1.conda#92e42d8310108b0a440fb2e60b2b2a25 @@ -56,7 +57,7 @@ https://repo.anaconda.com/pkgs/main/linux-64/libclang-14.0.6-default_hc6dbbc7_1. 
https://repo.anaconda.com/pkgs/main/linux-64/libpq-12.17-hdbd6064_0.conda#6bed363e25859faff66bf546a11c10e8 https://repo.anaconda.com/pkgs/main/linux-64/openjpeg-2.4.0-h9ca470c_1.conda#dfd4b36eb8ddaffeca0ab412de63c3e2 https://repo.anaconda.com/pkgs/main/linux-64/python-3.9.19-h955ad1f_1.conda#4b453281859c293c9d577271f3b18a0d -https://repo.anaconda.com/pkgs/main/linux-64/certifi-2024.6.2-py39h06a4308_0.conda#738daf43271605d7291ecae0e8cac41c +https://repo.anaconda.com/pkgs/main/linux-64/certifi-2024.7.4-py39h06a4308_0.conda#add87fa3b69a43e4e9ea1e619b267c4b https://repo.anaconda.com/pkgs/main/noarch/cycler-0.11.0-pyhd3eb1b0_0.conda#f5e365d2cdb66d547eb8c3ab93843aab https://repo.anaconda.com/pkgs/main/linux-64/cython-3.0.10-py39h5eee18b_0.conda#1419a658ed2b4d5c3ac1964f33143b64 https://repo.anaconda.com/pkgs/main/linux-64/exceptiongroup-1.2.0-py39h06a4308_0.conda#960e2cb83ac5134df8e593a130aa11af @@ -66,6 +67,7 @@ https://repo.anaconda.com/pkgs/main/noarch/iniconfig-1.1.1-pyhd3eb1b0_0.tar.bz2# https://repo.anaconda.com/pkgs/main/linux-64/joblib-1.2.0-py39h06a4308_0.conda#ac1f5687d70aa1128cbecb26bc9e559d https://repo.anaconda.com/pkgs/main/linux-64/kiwisolver-1.4.4-py39h6a678d5_0.conda#3d57aedbfbd054ce57fb3c1e4448828c https://repo.anaconda.com/pkgs/main/linux-64/mysql-5.7.24-h721c034_2.conda#dfc19ca2466d275c4c1f73b62c57f37b +https://repo.anaconda.com/pkgs/main/linux-64/ninja-1.10.2-h06a4308_5.conda#6fc219bbc4c8dbb9060b5b7fe31ae83d https://repo.anaconda.com/pkgs/main/linux-64/numpy-base-1.21.6-py39h375b286_1.conda#0061d9193658774ab79fc85d143a94fc https://repo.anaconda.com/pkgs/main/linux-64/packaging-24.1-py39h06a4308_0.conda#e80d41ffc9450162ef10cbbb9b4ec7e9 https://repo.anaconda.com/pkgs/main/linux-64/pillow-10.3.0-py39h5eee18b_0.conda#b346d6c71267c1553b6c18d3db5fdf6d @@ -96,4 +98,7 @@ https://repo.anaconda.com/pkgs/main/linux-64/pyamg-4.2.3-py39h79cecc1_0.conda#af https://repo.anaconda.com/pkgs/main/linux-64/qt-main-5.15.2-h53bd1ea_10.conda#bd0c79e82df6323f638bdcb871891b61 https://repo.anaconda.com/pkgs/main/linux-64/pyqt-5.15.10-py39h6a678d5_0.conda#52da5ff9b1144b078d2f41bab0b213f2 https://repo.anaconda.com/pkgs/main/linux-64/matplotlib-3.3.4-py39h06a4308_0.conda#384fc5e01ebfcf30e7161119d3029b5a +# pip meson @ https://files.pythonhosted.org/packages/44/b2/d4433391a7c5e94a39b50ca7295a8ceba736e7c72c455752a60122f52453/meson-1.4.1-py3-none-any.whl#sha256=d5acc3abae2dad3c70ddcbd10acac92b78b144d34d43f40f5b8ac31dfd8a826a # pip threadpoolctl @ https://files.pythonhosted.org/packages/61/cf/6e354304bcb9c6413c4e02a747b600061c21d38ba51e7e544ac7bc66aecc/threadpoolctl-3.1.0-py3-none-any.whl#sha256=8b99adda265feb6773280df41eece7b2e6561b772d21ffd52e372f999024907b +# pip pyproject-metadata @ https://files.pythonhosted.org/packages/aa/5f/bb5970d3d04173b46c9037109f7f05fc8904ff5be073ee49bb6ff00301bc/pyproject_metadata-0.8.0-py3-none-any.whl#sha256=ad858d448e1d3a1fb408ac5bac9ea7743e7a8bbb472f2693aaa334d2db42f526 +# pip meson-python @ https://files.pythonhosted.org/packages/91/c0/104cb6244c83fe6bc3886f144cc433db0c0c78efac5dc00e409a5a08c87d/meson_python-0.16.0-py3-none-any.whl#sha256=842dc9f5dc29e55fc769ff1b6fe328412fe6c870220fc321060a1d2d395e69e8 diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 014ac0fac8d7a..5555468d88b18 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -176,9 +176,6 @@ conda activate $CONDA_ENV_NAME show_installed_libraries -# Set parallelism to 3 to overlap IO bound tasks with CPU bound tasks on CI -# workers with 2 cores when 
building the compiled extensions of scikit-learn. -export SKLEARN_BUILD_PARALLEL=3 pip install -e . --no-build-isolation echo "ccache build summary:" diff --git a/build_tools/cirrus/arm_wheel.yml b/build_tools/cirrus/arm_wheel.yml index c3dfcfbc53ad9..aad1770188335 100644 --- a/build_tools/cirrus/arm_wheel.yml +++ b/build_tools/cirrus/arm_wheel.yml @@ -8,7 +8,6 @@ linux_arm64_wheel_task: memory: 4G env: CIBW_ENVIRONMENT: SKLEARN_SKIP_NETWORK_TESTS=1 - SKLEARN_BUILD_PARALLEL=5 CIBW_TEST_COMMAND: bash {project}/build_tools/wheels/test_wheels.sh CIBW_TEST_REQUIRES: pytest pandas threadpoolctl pytest-xdist CIBW_BUILD_VERBOSITY: 1 diff --git a/build_tools/cirrus/build_test_arm.sh b/build_tools/cirrus/build_test_arm.sh index 7ab95200bee50..b406a1673a13a 100755 --- a/build_tools/cirrus/build_test_arm.sh +++ b/build_tools/cirrus/build_test_arm.sh @@ -37,10 +37,6 @@ setup_ccache python --version -# Set parallelism to $N_CORES + 1 to overlap IO bound tasks with CPU bound tasks on CI -# workers with $N_CORES cores when building the compiled extensions of scikit-learn. -export SKLEARN_BUILD_PARALLEL=$(($N_CORES + 1)) - # Disable the build isolation and build in the tree so that the same folder can be # cached between CI runs. pip install --verbose --no-build-isolation . diff --git a/build_tools/update_environments_and_lock_files.py b/build_tools/update_environments_and_lock_files.py index ac42bf55c8ed5..68a593f65e495 100644 --- a/build_tools/update_environments_and_lock_files.py +++ b/build_tools/update_environments_and_lock_files.py @@ -179,7 +179,7 @@ def remove_from(alist, to_remove): "channels": ["defaults"], "conda_dependencies": remove_from( common_dependencies, - ["pandas", "threadpoolctl", "pip", "ninja", "meson-python"], + ["pandas", "threadpoolctl", "pip", "meson-python"], ) + ["ccache"], "package_constraints": { @@ -191,10 +191,11 @@ def remove_from(alist, to_remove): "cython": "min", "joblib": "min", "threadpoolctl": "min", + "meson-python": "min", }, # TODO: put pip dependencies back to conda dependencies when required # version is available on the defaults channel. - "pip_dependencies": ["threadpoolctl"], + "pip_dependencies": ["threadpoolctl", "meson-python"], }, { "name": "pymin_conda_forge_openblas_ubuntu_2204", diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index 4b6a0263187da..f87a85c52e0a2 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -89,6 +89,15 @@ more details. passed to the underlying estimators via their respective methods. :pr:`28494` by :user:`Adam Li `. +Dropping support for building with setuptools +--------------------------------------------- + +From scikit-learn 1.6 onwards, support for building with setuptools has been +removed. Meson is the only supported way to build scikit-learn, see +:ref:`Building from source ` for more details. 
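As a point of reference (an editorial aside, not part of the changelog text), the Meson-only build flow after this change boils down to an editable install without build isolation; the commands below simply mirror what `build_tools/azure/install.sh`, `build_tools/circle/build_doc.sh` and the Makefile in this patch use, under the assumption that the build backend declared in `pyproject.toml` is meson-python.

```bash
# Sketch of the Meson-based build after this patch, mirroring the CI scripts;
# the build dependencies come from pyproject.toml, no setup.py is involved.
pip install --verbose --no-build-isolation --editable .

# Or use the Makefile target kept for local development ("build scikit-learn with Meson").
make dev
```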
+ +:pr:`29400` by :user:`Loïc Estève ` + Dropping official support for PyPy ---------------------------------- diff --git a/maint_tools/check_pxd_in_installation.py b/maint_tools/check_pxd_in_installation.py deleted file mode 100644 index 380edbd6350b6..0000000000000 --- a/maint_tools/check_pxd_in_installation.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Utility for testing presence and usability of .pxd files in the installation - -Usage: ------- -python check_pxd_in_installation.py path/to/install_dir/of/scikit-learn -""" - -import os -import pathlib -import subprocess -import sys -import tempfile -import textwrap - -sklearn_dir = pathlib.Path(sys.argv[1]) -pxd_files = list(sklearn_dir.glob("**/*.pxd")) - -print("> Found pxd files:") -for pxd_file in pxd_files: - print(" -", pxd_file) - -print("\n> Trying to compile a cython extension cimporting all corresponding modules\n") -with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = pathlib.Path(tmpdir) - # A cython test file which cimports all modules corresponding to found - # pxd files. - # e.g. sklearn/tree/_utils.pxd becomes `cimport sklearn.tree._utils` - with open(tmpdir / "tst.pyx", "w") as f: - for pxd_file in pxd_files: - to_import = str(pxd_file.relative_to(sklearn_dir)) - to_import = to_import.replace(os.path.sep, ".") - to_import = to_import.replace(".pxd", "") - f.write("cimport sklearn." + to_import + "\n") - - # A basic setup file to build the test file. - # We set the language to c++ and we use numpy.get_include() because - # some modules require it. - with open(tmpdir / "setup_tst.py", "w") as f: - f.write( - textwrap.dedent( - """ - from setuptools import setup, Extension - from Cython.Build import cythonize - import numpy - - extensions = [Extension("tst", - sources=["tst.pyx"], - language="c++", - include_dirs=[numpy.get_include()])] - - setup(ext_modules=cythonize(extensions)) - """ - ) - ) - - subprocess.run( - ["python", "setup_tst.py", "build_ext", "-i"], check=True, cwd=tmpdir - ) - - print("\n> Compilation succeeded !") diff --git a/setup.py b/setup.py deleted file mode 100755 index 7e67210736305..0000000000000 --- a/setup.py +++ /dev/null @@ -1,627 +0,0 @@ -#! /usr/bin/env python -# -# Authors: The scikit-learn developers -# License: 3-clause BSD - -import importlib -import os -import platform -import shutil -import sys -import traceback -from os.path import join - -from setuptools import Command, Extension, setup -from setuptools.command.build_ext import build_ext - -try: - import builtins -except ImportError: - # Python 2 compat: just to be able to declare that Python >=3.8 is needed. - import __builtin__ as builtins - -# This is a bit (!) hackish: we are setting a global variable so that the main -# sklearn __init__ can detect if it is being loaded by the setup routine, to -# avoid attempting to load components that aren't built yet. -# TODO: can this be simplified or removed since the switch to setuptools -# away from numpy.distutils? 
-builtins.__SKLEARN_SETUP__ = True - - -DISTNAME = "scikit-learn" -DESCRIPTION = "A set of python modules for machine learning and data mining" -with open("README.rst") as f: - LONG_DESCRIPTION = f.read() -MAINTAINER = "scikit-learn developers" -MAINTAINER_EMAIL = "scikit-learn@python.org" -URL = "https://scikit-learn.org" -DOWNLOAD_URL = "https://pypi.org/project/scikit-learn/#files" -LICENSE = "new BSD" -PROJECT_URLS = { - "Bug Tracker": "https://github.com/scikit-learn/scikit-learn/issues", - "Documentation": "https://scikit-learn.org/stable/documentation.html", - "Source Code": "https://github.com/scikit-learn/scikit-learn", -} - -# We can actually import a restricted version of sklearn that -# does not need the compiled code -import sklearn # noqa -import sklearn._min_dependencies as min_deps # noqa -from sklearn._build_utils import _check_cython_version # noqa -from sklearn.externals._packaging.version import parse as parse_version # noqa - - -VERSION = sklearn.__version__ - -# Custom clean command to remove build artifacts - - -class CleanCommand(Command): - description = "Remove build artifacts from the source tree" - - user_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass - - def run(self): - # Remove c files if we are not within a sdist package - cwd = os.path.abspath(os.path.dirname(__file__)) - remove_c_files = not os.path.exists(os.path.join(cwd, "PKG-INFO")) - if remove_c_files: - print("Will remove generated .c files") - if os.path.exists("build"): - shutil.rmtree("build") - for dirpath, dirnames, filenames in os.walk("sklearn"): - for filename in filenames: - root, extension = os.path.splitext(filename) - - if extension in [".so", ".pyd", ".dll", ".pyc"]: - os.unlink(os.path.join(dirpath, filename)) - - if remove_c_files and extension in [".c", ".cpp"]: - pyx_file = str.replace(filename, extension, ".pyx") - if os.path.exists(os.path.join(dirpath, pyx_file)): - os.unlink(os.path.join(dirpath, filename)) - - if remove_c_files and extension == ".tp": - if os.path.exists(os.path.join(dirpath, root)): - os.unlink(os.path.join(dirpath, root)) - - for dirname in dirnames: - if dirname == "__pycache__": - shutil.rmtree(os.path.join(dirpath, dirname)) - - -# Custom build_ext command to set OpenMP compile flags depending on os and -# compiler. Also makes it possible to set the parallelism level via -# and environment variable (useful for the wheel building CI). -# build_ext has to be imported after setuptools - - -class build_ext_subclass(build_ext): - def finalize_options(self): - build_ext.finalize_options(self) - if self.parallel is None: - # Do not override self.parallel if already defined by - # command-line flag (--parallel or -j) - - parallel = os.environ.get("SKLEARN_BUILD_PARALLEL") - if parallel: - self.parallel = int(parallel) - if self.parallel: - print("setting parallel=%d " % self.parallel) - - def build_extensions(self): - from sklearn._build_utils.openmp_helpers import get_openmp_flag - - # Always use NumPy 1.7 C API for all compiled extensions. 
- # See: https://numpy.org/doc/stable/reference/c-api/deprecations.html - DEFINE_MACRO_NUMPY_C_API = ( - "NPY_NO_DEPRECATED_API", - "NPY_1_7_API_VERSION", - ) - for ext in self.extensions: - ext.define_macros.append(DEFINE_MACRO_NUMPY_C_API) - - if sklearn._OPENMP_SUPPORTED: - openmp_flag = get_openmp_flag() - - for e in self.extensions: - e.extra_compile_args += openmp_flag - e.extra_link_args += openmp_flag - - build_ext.build_extensions(self) - - def run(self): - # Specifying `build_clib` allows running `python setup.py develop` - # fully from a fresh clone. - self.run_command("build_clib") - build_ext.run(self) - - -cmdclass = { - "clean": CleanCommand, - "build_ext": build_ext_subclass, -} - - -def check_package_status(package, min_version): - """ - Returns a dictionary containing a boolean specifying whether given package - is up-to-date, along with the version string (empty string if - not installed). - """ - package_status = {} - try: - module = importlib.import_module(package) - package_version = module.__version__ - package_status["up_to_date"] = parse_version(package_version) >= parse_version( - min_version - ) - package_status["version"] = package_version - except ImportError: - traceback.print_exc() - package_status["up_to_date"] = False - package_status["version"] = "" - - req_str = "scikit-learn requires {} >= {}.\n".format(package, min_version) - - instructions = ( - "Installation instructions are available on the " - "scikit-learn website: " - "https://scikit-learn.org/stable/install.html\n" - ) - - if package_status["up_to_date"] is False: - if package_status["version"]: - raise ImportError( - "Your installation of {} {} is out-of-date.\n{}{}".format( - package, package_status["version"], req_str, instructions - ) - ) - else: - raise ImportError( - "{} is not installed.\n{}{}".format(package, req_str, instructions) - ) - - -extension_config = { - "__check_build": [ - {"sources": ["_check_build.pyx"]}, - ], - "": [ - {"sources": ["_isotonic.pyx"]}, - ], - "_loss": [ - {"sources": ["_loss.pyx.tp"]}, - ], - "cluster": [ - {"sources": ["_dbscan_inner.pyx"], "language": "c++"}, - {"sources": ["_hierarchical_fast.pyx"], "language": "c++", "include_np": True}, - {"sources": ["_k_means_common.pyx"], "include_np": True}, - {"sources": ["_k_means_lloyd.pyx"], "include_np": True}, - {"sources": ["_k_means_elkan.pyx"], "include_np": True}, - {"sources": ["_k_means_minibatch.pyx"], "include_np": True}, - ], - "cluster._hdbscan": [ - {"sources": ["_linkage.pyx"], "include_np": True}, - {"sources": ["_reachability.pyx"], "include_np": True}, - {"sources": ["_tree.pyx"], "include_np": True}, - ], - "datasets": [ - { - "sources": ["_svmlight_format_fast.pyx"], - "include_np": True, - "compile_for_pypy": False, - } - ], - "decomposition": [ - {"sources": ["_online_lda_fast.pyx"]}, - {"sources": ["_cdnmf_fast.pyx"], "include_np": True}, - ], - "ensemble": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, - ], - "ensemble._hist_gradient_boosting": [ - {"sources": ["_gradient_boosting.pyx"]}, - {"sources": ["histogram.pyx"]}, - {"sources": ["splitting.pyx"]}, - {"sources": ["_binning.pyx"]}, - {"sources": ["_predictor.pyx"]}, - {"sources": ["_bitset.pyx"]}, - {"sources": ["common.pyx"]}, - ], - "feature_extraction": [ - {"sources": ["_hashing_fast.pyx"], "language": "c++", "include_np": True}, - ], - "linear_model": [ - {"sources": ["_cd_fast.pyx"]}, - {"sources": ["_sgd_fast.pyx.tp"]}, - {"sources": ["_sag_fast.pyx.tp"]}, - ], - "manifold": [ - {"sources": ["_utils.pyx"]}, - 
{"sources": ["_barnes_hut_tsne.pyx"], "include_np": True}, - ], - "metrics": [ - {"sources": ["_pairwise_fast.pyx"]}, - { - "sources": ["_dist_metrics.pyx.tp", "_dist_metrics.pxd.tp"], - "include_np": True, - }, - ], - "metrics.cluster": [ - {"sources": ["_expected_mutual_info_fast.pyx"]}, - ], - "metrics._pairwise_distances_reduction": [ - { - "sources": ["_datasets_pair.pyx.tp", "_datasets_pair.pxd.tp"], - "language": "c++", - "include_np": True, - "extra_compile_args": ["-std=c++11"], - }, - { - "sources": ["_middle_term_computer.pyx.tp", "_middle_term_computer.pxd.tp"], - "language": "c++", - "extra_compile_args": ["-std=c++11"], - }, - { - "sources": ["_base.pyx.tp", "_base.pxd.tp"], - "language": "c++", - "include_np": True, - "extra_compile_args": ["-std=c++11"], - }, - { - "sources": ["_argkmin.pyx.tp", "_argkmin.pxd.tp"], - "language": "c++", - "include_np": True, - "extra_compile_args": ["-std=c++11"], - }, - { - "sources": ["_argkmin_classmode.pyx.tp"], - "language": "c++", - "include_np": True, - "extra_compile_args": ["-std=c++11"], - }, - { - "sources": ["_radius_neighbors.pyx.tp", "_radius_neighbors.pxd.tp"], - "language": "c++", - "include_np": True, - "extra_compile_args": ["-std=c++11"], - }, - { - "sources": ["_radius_neighbors_classmode.pyx.tp"], - "language": "c++", - "include_np": True, - "extra_compile_args": ["-std=c++11"], - }, - ], - "preprocessing": [ - {"sources": ["_csr_polynomial_expansion.pyx"]}, - { - "sources": ["_target_encoder_fast.pyx"], - "language": "c++", - "extra_compile_args": ["-std=c++11"], - }, - ], - "neighbors": [ - {"sources": ["_binary_tree.pxi.tp"], "include_np": True}, - {"sources": ["_ball_tree.pyx.tp"], "include_np": True}, - {"sources": ["_kd_tree.pyx.tp"], "include_np": True}, - {"sources": ["_partition_nodes.pyx"], "language": "c++", "include_np": True}, - {"sources": ["_quad_tree.pyx"], "include_np": True}, - ], - "svm": [ - { - "sources": ["_newrand.pyx"], - "include_dirs": [join("src", "newrand")], - "language": "c++", - # Use C++11 random number generator fix - "extra_compile_args": ["-std=c++11"], - }, - { - "sources": ["_libsvm.pyx"], - "depends": [ - join("src", "libsvm", "libsvm_helper.c"), - join("src", "libsvm", "libsvm_template.cpp"), - join("src", "libsvm", "svm.cpp"), - join("src", "libsvm", "svm.h"), - join("src", "newrand", "newrand.h"), - ], - "include_dirs": [ - join("src", "libsvm"), - join("src", "newrand"), - ], - "libraries": ["libsvm-skl"], - "extra_link_args": ["-lstdc++"], - }, - { - "sources": ["_liblinear.pyx"], - "libraries": ["liblinear-skl"], - "include_dirs": [ - join("src", "liblinear"), - join("src", "newrand"), - join("..", "utils"), - ], - "depends": [ - join("src", "liblinear", "tron.h"), - join("src", "liblinear", "linear.h"), - join("src", "liblinear", "liblinear_helper.c"), - join("src", "newrand", "newrand.h"), - ], - "extra_link_args": ["-lstdc++"], - }, - { - "sources": ["_libsvm_sparse.pyx"], - "libraries": ["libsvm-skl"], - "include_dirs": [ - join("src", "libsvm"), - join("src", "newrand"), - ], - "depends": [ - join("src", "libsvm", "svm.h"), - join("src", "newrand", "newrand.h"), - join("src", "libsvm", "libsvm_sparse_helper.c"), - ], - "extra_link_args": ["-lstdc++"], - }, - ], - "tree": [ - { - "sources": ["_tree.pyx"], - "language": "c++", - "include_np": True, - "optimization_level": "O3", - }, - {"sources": ["_splitter.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_criterion.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_utils.pyx"], 
"include_np": True, "optimization_level": "O3"}, - ], - "utils": [ - {"sources": ["sparsefuncs_fast.pyx"]}, - {"sources": ["_cython_blas.pyx"]}, - {"sources": ["arrayfuncs.pyx"]}, - { - "sources": ["murmurhash.pyx", join("src", "MurmurHash3.cpp")], - "include_dirs": ["src"], - }, - {"sources": ["_fast_dict.pyx"], "language": "c++"}, - {"sources": ["_openmp_helpers.pyx"]}, - {"sources": ["_seq_dataset.pyx.tp", "_seq_dataset.pxd.tp"]}, - {"sources": ["_weight_vector.pyx.tp", "_weight_vector.pxd.tp"]}, - {"sources": ["_random.pyx"]}, - {"sources": ["_typedefs.pyx"]}, - {"sources": ["_heap.pyx"]}, - {"sources": ["_sorting.pyx"]}, - {"sources": ["_vector_sentinel.pyx"], "language": "c++", "include_np": True}, - {"sources": ["_isfinite.pyx"]}, - ], -} - -# Paths in `libraries` must be relative to the root directory because `libraries` is -# passed directly to `setup` -libraries = [ - ( - "libsvm-skl", - { - "sources": [ - join("sklearn", "svm", "src", "libsvm", "libsvm_template.cpp"), - ], - "depends": [ - join("sklearn", "svm", "src", "libsvm", "svm.cpp"), - join("sklearn", "svm", "src", "libsvm", "svm.h"), - join("sklearn", "svm", "src", "newrand", "newrand.h"), - ], - # Use C++11 to use the random number generator fix - "extra_compiler_args": ["-std=c++11"], - "extra_link_args": ["-lstdc++"], - }, - ), - ( - "liblinear-skl", - { - "sources": [ - join("sklearn", "svm", "src", "liblinear", "linear.cpp"), - join("sklearn", "svm", "src", "liblinear", "tron.cpp"), - ], - "depends": [ - join("sklearn", "svm", "src", "liblinear", "linear.h"), - join("sklearn", "svm", "src", "liblinear", "tron.h"), - join("sklearn", "svm", "src", "newrand", "newrand.h"), - ], - # Use C++11 to use the random number generator fix - "extra_compiler_args": ["-std=c++11"], - "extra_link_args": ["-lstdc++"], - }, - ), -] - - -def configure_extension_modules(): - # Skip cythonization as we do not want to include the generated - # C/C++ files in the release tarballs as they are not necessarily - # forward compatible with future versions of Python for instance. 
- if "sdist" in sys.argv or "--help" in sys.argv: - return [] - - import numpy - - from sklearn._build_utils import cythonize_extensions, gen_from_templates - - is_pypy = platform.python_implementation() == "PyPy" - np_include = numpy.get_include() - default_optimization_level = "O2" - - if os.name == "posix": - default_libraries = ["m"] - else: - default_libraries = [] - - default_extra_compile_args = [] - build_with_debug_symbols = ( - os.environ.get("SKLEARN_BUILD_ENABLE_DEBUG_SYMBOLS", "0") != "0" - ) - if os.name == "posix": - if build_with_debug_symbols: - default_extra_compile_args.append("-g") - else: - # Setting -g0 will strip symbols, reducing the binary size of extensions - default_extra_compile_args.append("-g0") - - cython_exts = [] - for submodule, extensions in extension_config.items(): - submodule_parts = submodule.split(".") - parent_dir = join("sklearn", *submodule_parts) - for extension in extensions: - if is_pypy and not extension.get("compile_for_pypy", True): - continue - - # Generate files with Tempita - tempita_sources = [] - sources = [] - for source in extension["sources"]: - source = join(parent_dir, source) - new_source_path, path_ext = os.path.splitext(source) - - if path_ext != ".tp": - sources.append(source) - continue - - # `source` is a Tempita file - tempita_sources.append(source) - - # Only include source files that are pyx files - if os.path.splitext(new_source_path)[-1] == ".pyx": - sources.append(new_source_path) - - gen_from_templates(tempita_sources) - - # Do not progress if we only have a tempita file which we don't - # want to include like the .pxi.tp extension. In such a case - # sources would be empty. - if not sources: - continue - - # By convention, our extensions always use the name of the first source - source_name = os.path.splitext(os.path.basename(sources[0]))[0] - if submodule: - name_parts = ["sklearn", submodule, source_name] - else: - name_parts = ["sklearn", source_name] - name = ".".join(name_parts) - - # Make paths start from the root directory - include_dirs = [ - join(parent_dir, include_dir) - for include_dir in extension.get("include_dirs", []) - ] - if extension.get("include_np", False): - include_dirs.append(np_include) - - depends = [ - join(parent_dir, depend) for depend in extension.get("depends", []) - ] - - extra_compile_args = ( - extension.get("extra_compile_args", []) + default_extra_compile_args - ) - optimization_level = extension.get( - "optimization_level", default_optimization_level - ) - if os.name == "posix": - extra_compile_args.append(f"-{optimization_level}") - else: - extra_compile_args.append(f"/{optimization_level}") - - libraries_ext = extension.get("libraries", []) + default_libraries - - new_ext = Extension( - name=name, - sources=sources, - language=extension.get("language", None), - include_dirs=include_dirs, - libraries=libraries_ext, - depends=depends, - extra_link_args=extension.get("extra_link_args", None), - extra_compile_args=extra_compile_args, - ) - cython_exts.append(new_ext) - - return cythonize_extensions(cython_exts) - - -def setup_package(): - python_requires = ">=3.9" - required_python_version = (3, 9) - - metadata = dict( - name=DISTNAME, - maintainer=MAINTAINER, - maintainer_email=MAINTAINER_EMAIL, - description=DESCRIPTION, - license=LICENSE, - url=URL, - download_url=DOWNLOAD_URL, - project_urls=PROJECT_URLS, - version=VERSION, - long_description=LONG_DESCRIPTION, - classifiers=[ - "Intended Audience :: Science/Research", - "Intended Audience :: Developers", - "License :: OSI 
Approved :: BSD License", - "Programming Language :: C", - "Programming Language :: Python", - "Topic :: Software Development", - "Topic :: Scientific/Engineering", - "Development Status :: 5 - Production/Stable", - "Operating System :: Microsoft :: Windows", - "Operating System :: POSIX", - "Operating System :: Unix", - "Operating System :: MacOS", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: Implementation :: CPython", - ], - cmdclass=cmdclass, - python_requires=python_requires, - install_requires=min_deps.tag_to_packages["install"], - package_data={ - "": ["*.csv", "*.gz", "*.txt", "*.pxd", "*.rst", "*.jpg", "*.css"] - }, - zip_safe=False, # the package can run out of an .egg file - extras_require={ - key: min_deps.tag_to_packages[key] - for key in ["examples", "docs", "tests", "benchmark"] - }, - ) - - commands = [arg for arg in sys.argv[1:] if not arg.startswith("-")] - if not all( - command in ("egg_info", "dist_info", "clean", "check") for command in commands - ): - if sys.version_info < required_python_version: - required_version = "%d.%d" % required_python_version - raise RuntimeError( - "Scikit-learn requires Python %s or later. The current" - " Python version is %s installed in %s." - % (required_version, platform.python_version(), sys.executable) - ) - - check_package_status("numpy", min_deps.NUMPY_MIN_VERSION) - check_package_status("scipy", min_deps.SCIPY_MIN_VERSION) - - _check_cython_version() - metadata["ext_modules"] = configure_extension_modules() - metadata["libraries"] = libraries - setup(**metadata) - - -if __name__ == "__main__": - setup_package() diff --git a/sklearn/_build_utils/__init__.py b/sklearn/_build_utils/__init__.py index ceb72441000c3..e69de29bb2d1d 100644 --- a/sklearn/_build_utils/__init__.py +++ b/sklearn/_build_utils/__init__.py @@ -1,116 +0,0 @@ -""" -Utilities useful during the build. -""" - -# author: Andy Mueller, Gael Varoquaux -# license: BSD - - -import contextlib -import os - -import sklearn - -from .._min_dependencies import CYTHON_MIN_VERSION -from ..externals._packaging.version import parse -from .openmp_helpers import check_openmp_support -from .pre_build_helpers import basic_check_build - -DEFAULT_ROOT = "sklearn" - - -def _check_cython_version(): - message = ( - "Please install Cython with a version >= {0} in order " - "to build a scikit-learn from source." - ).format(CYTHON_MIN_VERSION) - try: - import Cython - except ModuleNotFoundError as e: - # Re-raise with more informative error message instead: - raise ModuleNotFoundError(message) from e - - if parse(Cython.__version__) < parse(CYTHON_MIN_VERSION): - message += " The current version of Cython is {} installed in {}.".format( - Cython.__version__, Cython.__path__ - ) - raise ValueError(message) - - -def cythonize_extensions(extension): - """Check that a recent Cython is available and cythonize extensions""" - _check_cython_version() - from Cython.Build import cythonize - - # Fast fail before cythonization if compiler fails compiling basic test - # code even without OpenMP - basic_check_build() - - # check simple compilation with OpenMP. If it fails scikit-learn will be - # built without OpenMP and the test test_openmp_supported in the test suite - # will fail. 
- # `check_openmp_support` compiles a small test program to see if the - # compilers are properly configured to build with OpenMP. This is expensive - # and we only want to call this function once. - # The result of this check is cached as a private attribute on the sklearn - # module (only at build-time) to be used in the build_ext subclass defined - # in the top-level setup.py file to actually build the compiled extensions - # with OpenMP flags if needed. - sklearn._OPENMP_SUPPORTED = check_openmp_support() - - n_jobs = 1 - with contextlib.suppress(ImportError): - import joblib - - n_jobs = joblib.cpu_count() - - # Additional checks for Cython - cython_enable_debug_directives = ( - os.environ.get("SKLEARN_ENABLE_DEBUG_CYTHON_DIRECTIVES", "0") != "0" - ) - - compiler_directives = { - "language_level": 3, - "boundscheck": cython_enable_debug_directives, - "wraparound": False, - "initializedcheck": False, - "nonecheck": False, - "cdivision": True, - "profile": False, - } - - return cythonize( - extension, - nthreads=n_jobs, - compiler_directives=compiler_directives, - annotate=False, - ) - - -def gen_from_templates(templates): - """Generate cython files from a list of templates""" - # Lazy import because cython is not a runtime dependency. - from Cython import Tempita - - for template in templates: - outfile = template.replace(".tp", "") - - # if the template is not updated, no need to output the cython file - if not ( - os.path.exists(outfile) - and os.stat(template).st_mtime < os.stat(outfile).st_mtime - ): - with open(template, "r") as f: - tmpl = f.read() - - tmpl_ = Tempita.sub(tmpl) - - warn_msg = ( - "# WARNING: Do not edit this file directly.\n" - f"# It is automatically generated from {template!r}.\n" - "# Changes must be made there.\n\n" - ) - - with open(outfile, "w") as f: - f.write(warn_msg) - f.write(tmpl_) diff --git a/sklearn/_build_utils/openmp_helpers.py b/sklearn/_build_utils/openmp_helpers.py deleted file mode 100644 index 66e6089e33fef..0000000000000 --- a/sklearn/_build_utils/openmp_helpers.py +++ /dev/null @@ -1,127 +0,0 @@ -"""Helpers for OpenMP support during the build.""" - -# This code is adapted for a large part from the astropy openmp helpers, which -# can be found at: https://github.com/astropy/extension-helpers/blob/master/extension_helpers/_openmp_helpers.py # noqa - - -import os -import sys -import textwrap -import warnings - -from .pre_build_helpers import compile_test_program - - -def get_openmp_flag(): - if sys.platform == "win32": - return ["/openmp"] - elif sys.platform == "darwin" and "openmp" in os.getenv("CPPFLAGS", ""): - # -fopenmp can't be passed as compile flag when using Apple-clang. - # OpenMP support has to be enabled during preprocessing. 
- # - # For example, our macOS wheel build jobs use the following environment - # variables to build with Apple-clang and the brew installed "libomp": - # - # export CPPFLAGS="$CPPFLAGS -Xpreprocessor -fopenmp" - # export CFLAGS="$CFLAGS -I/usr/local/opt/libomp/include" - # export CXXFLAGS="$CXXFLAGS -I/usr/local/opt/libomp/include" - # export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib - # -L/usr/local/opt/libomp/lib -lomp" - return [] - # Default flag for GCC and clang: - return ["-fopenmp"] - - -def check_openmp_support(): - """Check whether OpenMP test code can be compiled and run""" - if "PYODIDE" in os.environ: - # Pyodide doesn't support OpenMP - return False - - code = textwrap.dedent( - """\ - #include - #include - int main(void) { - #pragma omp parallel - printf("nthreads=%d\\n", omp_get_num_threads()); - return 0; - } - """ - ) - - extra_preargs = os.getenv("LDFLAGS", None) - if extra_preargs is not None: - extra_preargs = extra_preargs.strip().split(" ") - # FIXME: temporary fix to link against system libraries on linux - # "-Wl,--sysroot=/" should be removed - extra_preargs = [ - flag - for flag in extra_preargs - if flag.startswith(("-L", "-Wl,-rpath", "-l", "-Wl,--sysroot=/")) - ] - - extra_postargs = get_openmp_flag() - - openmp_exception = None - try: - output = compile_test_program( - code, extra_preargs=extra_preargs, extra_postargs=extra_postargs - ) - - if output and "nthreads=" in output[0]: - nthreads = int(output[0].strip().split("=")[1]) - openmp_supported = len(output) == nthreads - elif "PYTHON_CROSSENV" in os.environ: - # Since we can't run the test program when cross-compiling - # assume that openmp is supported if the program can be - # compiled. - openmp_supported = True - else: - openmp_supported = False - - except Exception as exception: - # We could be more specific and only catch: CompileError, LinkError, - # and subprocess.CalledProcessError. - # setuptools introduced CompileError and LinkError, but that requires - # version 61.1. Even the latest version of Ubuntu (22.04LTS) only - # ships with 59.6. So for now we catch all exceptions and reraise a - # generic exception with the original error message instead: - openmp_supported = False - openmp_exception = exception - - if not openmp_supported: - if os.getenv("SKLEARN_FAIL_NO_OPENMP"): - raise Exception( - "Failed to build scikit-learn with OpenMP support" - ) from openmp_exception - else: - message = textwrap.dedent( - """ - - *********** - * WARNING * - *********** - - It seems that scikit-learn cannot be built with OpenMP. - - - Make sure you have followed the installation instructions: - - https://scikit-learn.org/dev/developers/advanced_installation.html - - - If your compiler supports OpenMP but you still see this - message, please submit a bug report at: - - https://github.com/scikit-learn/scikit-learn/issues - - - The build will continue with OpenMP-based parallelism - disabled. Note however that some estimators will run in - sequential mode instead of leveraging thread-based - parallelism. 
- - *** - """ - ) - warnings.warn(message) - - return openmp_supported diff --git a/sklearn/_build_utils/pre_build_helpers.py b/sklearn/_build_utils/pre_build_helpers.py deleted file mode 100644 index 73adb26f5416b..0000000000000 --- a/sklearn/_build_utils/pre_build_helpers.py +++ /dev/null @@ -1,75 +0,0 @@ -"""Helpers to check build environment before actual build of scikit-learn""" - -import glob -import os -import subprocess -import sys -import tempfile -import textwrap - - -def compile_test_program(code, extra_preargs=None, extra_postargs=None): - """Check that some C code can be compiled and run""" - from setuptools.command.build_ext import customize_compiler, new_compiler - - ccompiler = new_compiler() - customize_compiler(ccompiler) - - start_dir = os.path.abspath(".") - - with tempfile.TemporaryDirectory() as tmp_dir: - try: - os.chdir(tmp_dir) - - # Write test program - with open("test_program.c", "w") as f: - f.write(code) - - os.mkdir("objects") - - # Compile, test program - ccompiler.compile( - ["test_program.c"], output_dir="objects", extra_postargs=extra_postargs - ) - - # Link test program - objects = glob.glob(os.path.join("objects", "*" + ccompiler.obj_extension)) - ccompiler.link_executable( - objects, - "test_program", - extra_preargs=extra_preargs, - extra_postargs=extra_postargs, - ) - - if "PYTHON_CROSSENV" not in os.environ: - # Run test program if not cross compiling - # will raise a CalledProcessError if return code was non-zero - output = subprocess.check_output("./test_program") - output = output.decode(sys.stdout.encoding or "utf-8").splitlines() - else: - # Return an empty output if we are cross compiling - # as we cannot run the test_program - output = [] - except Exception: - raise - finally: - os.chdir(start_dir) - - return output - - -def basic_check_build(): - """Check basic compilation and linking of C code""" - if "PYODIDE" in os.environ: - # The following check won't work in pyodide - return - - code = textwrap.dedent( - """\ - #include - int main(void) { - return 0; - } - """ - ) - compile_test_program(code) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index f2f687c2fb1d2..a750282a3139c 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -512,7 +512,8 @@ def test_forest_classifier_oob( test_score = classifier.score(X_test, y_test) assert classifier.oob_score_ >= lower_bound_accuracy - assert abs(test_score - classifier.oob_score_) <= 0.1 + abs_diff = abs(test_score - classifier.oob_score_) + assert abs_diff <= 0.11, f"{abs_diff=} is greater than 0.11" assert hasattr(classifier, "oob_score_") assert not hasattr(classifier, "oob_prediction_") diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 47af38a563a77..67ba6397655c8 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -8,12 +8,10 @@ import os import pkgutil import re -import sys import warnings from functools import partial from inspect import isgenerator, signature from itertools import chain, product -from pathlib import Path import numpy as np import pytest @@ -176,31 +174,6 @@ def test_check_estimator_generate_only(): assert isgenerator(all_instance_gen_checks) -def test_setup_py_check(): - pytest.importorskip("setuptools") - # Smoke test `python setup.py check` command run at the root of the - # scikit-learn source tree. 
- cwd = os.getcwd() - setup_path = Path(sklearn.__file__).parent.parent - setup_filename = os.path.join(setup_path, "setup.py") - if not os.path.exists(setup_filename): - pytest.skip("setup.py not available") - try: - os.chdir(setup_path) - old_argv = sys.argv - sys.argv = ["setup.py", "check"] - - with warnings.catch_warnings(): - # The configuration spits out warnings when not finding - # Blas/Atlas development headers - warnings.simplefilter("ignore", UserWarning) - with open("setup.py") as f: - exec(f.read(), dict(__name__="__main__")) - finally: - sys.argv = old_argv - os.chdir(cwd) - - def _tested_linear_classifiers(): classifiers = all_estimators(type_filter="classifier") @@ -235,7 +208,7 @@ def test_import_all_consistency(): for modname in submods + ["sklearn"]: if ".tests." in modname: continue - # Avoid test suite depending on setuptools + # Avoid test suite depending on build dependencies, for example Cython if "sklearn._build_utils" in modname: continue package = __import__(modname, fromlist="dummy") @@ -247,7 +220,7 @@ def test_import_all_consistency(): def test_root_import_all_completeness(): sklearn_path = [os.path.dirname(sklearn.__file__)] - EXCEPTIONS = ("utils", "tests", "base", "setup", "conftest") + EXCEPTIONS = ("utils", "tests", "base", "conftest") for _, modname, _ in pkgutil.walk_packages( path=sklearn_path, onerror=lambda _: None ): @@ -288,9 +261,9 @@ def test_all_tests_are_importable(): assert missing_tests == [], ( "{0} do not have `tests` subpackages. " "Perhaps they require " - "__init__.py or an add_subpackage directive " + "__init__.py or a meson.build " "in the parent " - "setup.py".format(missing_tests) + "directory".format(missing_tests) ) From 20c7bd0248a0cce656606f2b87383abc9110afb1 Mon Sep 17 00:00:00 2001 From: Anurag Varma Date: Thu, 11 Jul 2024 13:25:15 +0530 Subject: [PATCH 04/35] FIX Improve error message when RepeatedStratifiedKFold.split is called without a y argument (#29402) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Loïc Estève Co-authored-by: Lucy Liu --- doc/whats_new/v1.6.rst | 3 ++ sklearn/model_selection/_split.py | 37 +++++++++++++++++++++ sklearn/model_selection/tests/test_split.py | 15 +++++++++ 3 files changed, 55 insertions(+) diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index f87a85c52e0a2..065be410a6273 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -210,6 +210,9 @@ Changelog estimator without re-fitting it. :pr:`29067` by :user:`Guillaume Lemaitre `. +- |Fix| Improve error message when :func:`model_selection.RepeatedStratifiedKFold.split` is called without a `y` argument + :pr:`29402` by :user:`Anurag Varma `. + :mod:`sklearn.neighbors` ........................ diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 2ab621e78dd69..bfd741eee5811 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1769,6 +1769,43 @@ def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): n_splits=n_splits, ) + def split(self, X, y, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Note that providing ``y`` is sufficient to generate the splits and + hence ``np.zeros(n_samples)`` may be used as a placeholder for + ``X`` instead of actual training data. 
+ + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + Stratification is done based on the y labels. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + """ + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) + return super().split(X, y, groups=groups) + class BaseShuffleSplit(_MetadataRequester, metaclass=ABCMeta): """Base class for *ShuffleSplit. diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index fa425a5e6a18b..4e594499ae59a 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -86,6 +86,12 @@ ALL_SPLITTERS = NO_GROUP_SPLITTERS + GROUP_SPLITTERS # type: ignore +SPLITTERS_REQUIRING_TARGET = [ + StratifiedKFold(), + StratifiedShuffleSplit(), + RepeatedStratifiedKFold(), +] + X = np.ones(10) y = np.arange(10) // 2 test_groups = ( @@ -2054,3 +2060,12 @@ def test_no_group_splitters_warns_with_groups(cv): with pytest.warns(UserWarning, match=msg): cv.split(X, y, groups=groups) + + +@pytest.mark.parametrize( + "cv", SPLITTERS_REQUIRING_TARGET, ids=[str(cv) for cv in SPLITTERS_REQUIRING_TARGET] +) +def test_stratified_splitter_without_y(cv): + msg = "missing 1 required positional argument: 'y'" + with pytest.raises(TypeError, match=msg): + cv.split(X) From afee65a7e534c771e5465539b88c139f02ecb1ba Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 11 Jul 2024 09:25:19 -0400 Subject: [PATCH 05/35] FEAT SLEP006 `permutation_test_score` to support metadata routing (#29266) --- doc/metadata_routing.rst | 2 +- doc/whats_new/v1.6.rst | 4 + sklearn/model_selection/_validation.py | 109 ++++++++++++++++-- .../model_selection/tests/test_validation.py | 11 +- 4 files changed, 113 insertions(+), 13 deletions(-) diff --git a/doc/metadata_routing.rst b/doc/metadata_routing.rst index 440300b7e44bb..31dae6813bda5 100644 --- a/doc/metadata_routing.rst +++ b/doc/metadata_routing.rst @@ -301,6 +301,7 @@ Meta-estimators and functions supporting metadata routing: - :class:`sklearn.model_selection.HalvingGridSearchCV` - :class:`sklearn.model_selection.HalvingRandomSearchCV` - :class:`sklearn.model_selection.RandomizedSearchCV` +- :class:`sklearn.model_selection.permutation_test_score` - :func:`sklearn.model_selection.cross_validate` - :func:`sklearn.model_selection.cross_val_score` - :func:`sklearn.model_selection.cross_val_predict` @@ -324,4 +325,3 @@ Meta-estimators and tools not supporting metadata routing yet: - :class:`sklearn.feature_selection.RFE` - :class:`sklearn.feature_selection.RFECV` - :class:`sklearn.feature_selection.SequentialFeatureSelector` -- :class:`sklearn.model_selection.permutation_test_score` diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index 065be410a6273..4e669f32c2f71 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -89,6 +89,10 @@ more details. passed to the underlying estimators via their respective methods. :pr:`28494` by :user:`Adam Li `. +- |Feature| :func:`model_selection.permutation_test_score` now supports metadata routing + for the `fit` method of its estimator and for its underlying CV splitter and scorer. + :pr:`29266` by :user:`Adam Li `. 
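To make the routing behaviour described in the changelog entry above concrete, here is a minimal usage sketch. The synthetic data, the ``GroupKFold`` splitter and the group labels are illustrative assumptions; the point is that, with routing enabled, ``groups`` travels through ``params`` rather than through the dedicated keyword::

    import numpy as np

    from sklearn import set_config
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GroupKFold, permutation_test_score

    set_config(enable_metadata_routing=True)

    X, y = make_classification(n_samples=60, random_state=0)
    groups = np.repeat(np.arange(6), 10)  # hypothetical group labels

    # With routing enabled, metadata such as ``groups`` is passed via ``params``
    # and routed to the splitter (and, if requested, to the estimator and scorer).
    score, perm_scores, pvalue = permutation_test_score(
        LogisticRegression(),
        X,
        y,
        cv=GroupKFold(n_splits=3),
        n_permutations=30,
        params={"groups": groups},
    )
    print(f"score={score:.3f}, p-value={pvalue:.3f}")

    set_config(enable_metadata_routing=False)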
+
 
 Dropping support for building with setuptools
 ---------------------------------------------
diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py
index 698e8255c6138..ddc9b542b0a5e 100644
--- a/sklearn/model_selection/_validation.py
+++ b/sklearn/model_selection/_validation.py
@@ -1493,6 +1493,7 @@ def _check_is_permutation(indices, n_samples):
         "verbose": ["verbose"],
         "scoring": [StrOptions(set(get_scorer_names())), callable, None],
         "fit_params": [dict, None],
+        "params": [dict, None],
     },
     prefer_skip_nested_validation=False,  # estimator is not validated yet
 )
@@ -1509,6 +1510,7 @@ def permutation_test_score(
     verbose=0,
     scoring=None,
     fit_params=None,
+    params=None,
 ):
     """Evaluate the significance of a cross-validated score with permutations.
 
@@ -1548,6 +1550,13 @@ def permutation_test_score(
         cross-validator uses them for grouping the samples while splitting
         the dataset into train/test set.
 
+        .. versionchanged:: 1.6
+            ``groups`` can only be passed if metadata routing is not enabled
+            via ``sklearn.set_config(enable_metadata_routing=True)``. When routing
+            is enabled, pass ``groups`` alongside other metadata via the ``params``
+            argument instead. E.g.:
+            ``permutation_test_score(..., params={'groups': groups})``.
+
     cv : int, cross-validation generator or an iterable, default=None
         Determines the cross-validation splitting strategy.
         Possible inputs for cv are:
@@ -1594,7 +1603,24 @@ def permutation_test_score(
     fit_params : dict, default=None
         Parameters to pass to the fit method of the estimator.
 
-        .. versionadded:: 0.24
+        .. deprecated:: 1.6
+            This parameter is deprecated and will be removed in version 1.8. Use
+            ``params`` instead.
+
+    params : dict, default=None
+        Parameters to pass to the `fit` method of the estimator, the scorer
+        and the cv splitter.
+
+        - If `enable_metadata_routing=False` (default):
+          Parameters directly passed to the `fit` method of the estimator.
+
+        - If `enable_metadata_routing=True`:
+          Parameters safely routed to the `fit` method of the estimator,
+          `cv` object and `scorer`.
+          See :ref:`Metadata Routing User Guide ` for more
+          details.
+
+        .. versionadded:: 1.6
 
     Returns
     -------
@@ -1643,26 +1669,86 @@ def permutation_test_score(
     >>> print(f"P-value: {pvalue:.3f}")
     P-value: 0.010
     """
+    params = _check_params_groups_deprecation(fit_params, params, groups, "1.8")
+
     X, y, groups = indexable(X, y, groups)
 
     cv = check_cv(cv, y, classifier=is_classifier(estimator))
     scorer = check_scoring(estimator, scoring=scoring)
     random_state = check_random_state(random_state)
 
+    if _routing_enabled():
+        router = (
+            MetadataRouter(owner="permutation_test_score")
+            .add(
+                estimator=estimator,
+                # TODO(SLEP6): also pass metadata to the predict method for
+                # scoring?
+                method_mapping=MethodMapping().add(caller="fit", callee="fit"),
+            )
+            .add(
+                splitter=cv,
+                method_mapping=MethodMapping().add(caller="fit", callee="split"),
+            )
+            .add(
+                scorer=scorer,
+                method_mapping=MethodMapping().add(caller="fit", callee="score"),
+            )
+        )
+
+        try:
+            routed_params = process_routing(router, "fit", **params)
+        except UnsetMetadataPassedError as e:
+            # The default exception would mention `fit` since in the above
+            # `process_routing` code, we pass `fit` as the caller. However,
+            # the user is not calling `fit` directly, so we change the message
+            # to make it more suitable for this case.
+ unrequested_params = sorted(e.unrequested_params) + raise UnsetMetadataPassedError( + message=( + f"{unrequested_params} are passed to `permutation_test_score`" + " but are not explicitly set as requested or not requested" + " for permutation_test_score's" + f" estimator: {estimator.__class__.__name__}. Call" + " `.set_fit_request({{metadata}}=True)` on the estimator for" + f" each metadata in {unrequested_params} that you" + " want to use and `metadata=False` for not using it. See the" + " Metadata Routing User guide" + " for more" + " information." + ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + + else: + routed_params = Bunch() + routed_params.estimator = Bunch(fit=params) + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.scorer = Bunch(score={}) + # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. score = _permutation_test_score( - clone(estimator), X, y, groups, cv, scorer, fit_params=fit_params + clone(estimator), + X, + y, + cv, + scorer, + split_params=routed_params.splitter.split, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, ) permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)( delayed(_permutation_test_score)( clone(estimator), X, _shuffle(y, groups, random_state), - groups, cv, scorer, - fit_params=fit_params, + split_params=routed_params.splitter.split, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, ) for _ in range(n_permutations) ) @@ -1671,17 +1757,22 @@ def permutation_test_score( return score, permutation_scores, pvalue -def _permutation_test_score(estimator, X, y, groups, cv, scorer, fit_params): +def _permutation_test_score( + estimator, X, y, cv, scorer, split_params, fit_params, score_params +): """Auxiliary function for permutation_test_score""" # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} + score_params = score_params if score_params is not None else {} + avg_score = [] - for train, test in cv.split(X, y, groups): + for train, test in cv.split(X, y, **split_params): X_train, y_train = _safe_split(estimator, X, y, train) X_test, y_test = _safe_split(estimator, X, y, test, train) - fit_params = _check_method_params(X, params=fit_params, indices=train) - estimator.fit(X_train, y_train, **fit_params) - avg_score.append(scorer(estimator, X_test, y_test)) + fit_params_train = _check_method_params(X, params=fit_params, indices=train) + score_params_test = _check_method_params(X, params=score_params, indices=test) + estimator.fit(X_train, y_train, **fit_params_train) + avg_score.append(scorer(estimator, X_test, y_test, **score_params_test)) return np.mean(avg_score) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 4fca9e0c42e3c..33d4d366bf17a 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -862,7 +862,7 @@ def test_permutation_test_score_allow_nans(): permutation_test_score(p, X, y) -def test_permutation_test_score_fit_params(): +def test_permutation_test_score_params(): X = np.arange(100).reshape(10, 10) y = np.array([0] * 5 + [1] * 5) clf = CheckingClassifier(expected_sample_weight=True) @@ -873,8 +873,8 @@ def test_permutation_test_score_fit_params(): err_msg = r"sample_weight.shape == \(1,\), expected \(8,\)!" 
with pytest.raises(ValueError, match=err_msg): - permutation_test_score(clf, X, y, fit_params={"sample_weight": np.ones(1)}) - permutation_test_score(clf, X, y, fit_params={"sample_weight": np.ones(10)}) + permutation_test_score(clf, X, y, params={"sample_weight": np.ones(1)}) + permutation_test_score(clf, X, y, params={"sample_weight": np.ones(10)}) def test_cross_val_score_allow_nans(): @@ -2495,6 +2495,7 @@ def test_cross_validate_return_indices(global_random_seed): (cross_val_score, {}), (cross_val_predict, {}), (learning_curve, {}), + (permutation_test_score, {}), (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), ], ) @@ -2526,6 +2527,7 @@ def test_fit_param_deprecation(func, extra_args): (cross_val_score, {}), (cross_val_predict, {}), (learning_curve, {}), + (permutation_test_score, {}), (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), ], ) @@ -2551,6 +2553,7 @@ def test_groups_with_routing_validation(func, extra_args): (cross_val_score, {}), (cross_val_predict, {}), (learning_curve, {}), + (permutation_test_score, {}), (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), ], ) @@ -2576,6 +2579,7 @@ def test_passed_unrequested_metadata(func, extra_args): (cross_val_score, {}), (cross_val_predict, {}), (learning_curve, {}), + (permutation_test_score, {}), (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), ], ) @@ -2609,6 +2613,7 @@ def test_validation_functions_routing(func, extra_args): cross_val_score: dict(scoring=scorer), learning_curve: dict(scoring=scorer), validation_curve: dict(scoring=scorer), + permutation_test_score: dict(scoring=scorer), cross_val_predict: dict(), } From 2b2e2903e5635dd93a741c955a87260fb69cfc3d Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 11 Jul 2024 15:32:52 +0200 Subject: [PATCH 06/35] ENH fetch_file to fetch data files by URL with retries, checksuming and local caching (#29354) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Guillaume Lemaitre Co-authored-by: Loïc Estève --- doc/whats_new/v1.6.rst | 9 + .../plot_time_series_lagged_features.py | 860 +++++++++--------- sklearn/datasets/__init__.py | 2 + sklearn/datasets/_base.py | 207 ++++- sklearn/datasets/tests/test_base.py | 265 ++++++ 5 files changed, 892 insertions(+), 451 deletions(-) diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index 4e669f32c2f71..d7d3a71eba636 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -139,6 +139,15 @@ Changelog on the input data. :pr:`29124` by :user:`Yao Xiao `. + +:mod:`sklearn.datasets` +....................... + +- |Feature| :func:`datasets.fetch_file` allows downloading arbitrary data-file + from the web. It handles local caching, integrity checks with SHA256 digests + and automatic retries in case of HTTP errors. :pr:`29354` by :user:`Olivier + Grisel `. + :mod:`sklearn.discriminant_analysis` .................................... 
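As a usage illustration of the new loader referenced in the changelog entry above, here is a minimal sketch; the URL and SHA256 digest are the ones quoted for the Bike Sharing parquet file later in this patch::

    from sklearn.datasets import fetch_file

    # Downloads the file on first use, verifies the SHA256 digest, caches it
    # locally and retries on transient HTTP errors; returns the local path.
    data_path = fetch_file(
        "https://openml1.win.tue.nl/datasets/0004/44063/dataset_44063.pq",
        sha256="d120af76829af0d256338dc6dd4be5df4fd1f35bf3a283cab66a51c1c6abd06a",
    )
    print(data_path)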
diff --git a/examples/applications/plot_time_series_lagged_features.py b/examples/applications/plot_time_series_lagged_features.py index 9159825cbbd43..2efc12acae276 100644 --- a/examples/applications/plot_time_series_lagged_features.py +++ b/examples/applications/plot_time_series_lagged_features.py @@ -1,425 +1,435 @@ -""" -=========================================== -Lagged features for time series forecasting -=========================================== - -This example demonstrates how Polars-engineered lagged features can be used -for time series forecasting with -:class:`~sklearn.ensemble.HistGradientBoostingRegressor` on the Bike Sharing -Demand dataset. - -See the example on -:ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py` -for some data exploration on this dataset and a demo on periodic feature -engineering. - -""" - -# %% -# Analyzing the Bike Sharing Demand dataset -# ----------------------------------------- -# -# We start by loading the data from the OpenML repository -# as a pandas dataframe. This will be replaced with Polars -# once `fetch_openml` adds a native support for it. -# We convert to Polars for feature engineering, as it automatically caches -# common subexpressions which are reused in multiple expressions -# (like `pl.col("count").shift(1)` below). See -# https://docs.pola.rs/user-guide/lazy/optimizations/ for more information. - -import numpy as np -import polars as pl - -from sklearn.datasets import fetch_openml - -pl.Config.set_fmt_str_lengths(20) - -bike_sharing = fetch_openml( - "Bike_Sharing_Demand", version=2, as_frame=True, parser="pandas" -) -df = bike_sharing.frame -df = pl.DataFrame({col: df[col].to_numpy() for col in df.columns}) - -# %% -# Next, we take a look at the statistical summary of the dataset -# so that we can better understand the data that we are working with. -import polars.selectors as cs - -summary = df.select(cs.numeric()).describe() -summary - -# %% -# Let us look at the count of the seasons `"fall"`, `"spring"`, `"summer"` -# and `"winter"` present in the dataset to confirm they are balanced. - -import matplotlib.pyplot as plt - -df["season"].value_counts() - - -# %% -# Generating Polars-engineered lagged features -# -------------------------------------------- -# Let's consider the problem of predicting the demand at the -# next hour given past demands. Since the demand is a continuous -# variable, one could intuitively use any regression model. However, we do -# not have the usual `(X_train, y_train)` dataset. Instead, we just have -# the `y_train` demand data sequentially organized by time. -lagged_df = df.select( - "count", - *[pl.col("count").shift(i).alias(f"lagged_count_{i}h") for i in [1, 2, 3]], - lagged_count_1d=pl.col("count").shift(24), - lagged_count_1d_1h=pl.col("count").shift(24 + 1), - lagged_count_7d=pl.col("count").shift(7 * 24), - lagged_count_7d_1h=pl.col("count").shift(7 * 24 + 1), - lagged_mean_24h=pl.col("count").shift(1).rolling_mean(24), - lagged_max_24h=pl.col("count").shift(1).rolling_max(24), - lagged_min_24h=pl.col("count").shift(1).rolling_min(24), - lagged_mean_7d=pl.col("count").shift(1).rolling_mean(7 * 24), - lagged_max_7d=pl.col("count").shift(1).rolling_max(7 * 24), - lagged_min_7d=pl.col("count").shift(1).rolling_min(7 * 24), -) -lagged_df.tail(10) - -# %% -# Watch out however, the first lines have undefined values because their own -# past is unknown. 
This depends on how much lag we used: -lagged_df.head(10) - -# %% -# We can now separate the lagged features in a matrix `X` and the target variable -# (the counts to predict) in an array of the same first dimension `y`. -lagged_df = lagged_df.drop_nulls() -X = lagged_df.drop("count") -y = lagged_df["count"] -print("X shape: {}\ny shape: {}".format(X.shape, y.shape)) - -# %% -# Naive evaluation of the next hour bike demand regression -# -------------------------------------------------------- -# Let's randomly split our tabularized dataset to train a gradient -# boosting regression tree (GBRT) model and evaluate it using Mean -# Absolute Percentage Error (MAPE). If our model is aimed at forecasting -# (i.e., predicting future data from past data), we should not use training -# data that are ulterior to the testing data. In time series machine learning -# the "i.i.d" (independent and identically distributed) assumption does not -# hold true as the data points are not independent and have a temporal -# relationship. -from sklearn.ensemble import HistGradientBoostingRegressor -from sklearn.model_selection import train_test_split - -X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42 -) - -model = HistGradientBoostingRegressor().fit(X_train, y_train) - -# %% -# Taking a look at the performance of the model. -from sklearn.metrics import mean_absolute_percentage_error - -y_pred = model.predict(X_test) -mean_absolute_percentage_error(y_test, y_pred) - -# %% -# Proper next hour forecasting evaluation -# --------------------------------------- -# Let's use a proper evaluation splitting strategies that takes into account -# the temporal structure of the dataset to evaluate our model's ability to -# predict data points in the future (to avoid cheating by reading values from -# the lagged features in the training set). -from sklearn.model_selection import TimeSeriesSplit - -ts_cv = TimeSeriesSplit( - n_splits=3, # to keep the notebook fast enough on common laptops - gap=48, # 2 days data gap between train and test - max_train_size=10000, # keep train sets of comparable sizes - test_size=3000, # for 2 or 3 digits of precision in scores -) -all_splits = list(ts_cv.split(X, y)) - -# %% -# Training the model and evaluating its performance based on MAPE. -train_idx, test_idx = all_splits[0] -X_train, X_test = X[train_idx, :], X[test_idx, :] -y_train, y_test = y[train_idx], y[test_idx] - -model = HistGradientBoostingRegressor().fit(X_train, y_train) -y_pred = model.predict(X_test) -mean_absolute_percentage_error(y_test, y_pred) - -# %% -# The generalization error measured via a shuffled trained test split -# is too optimistic. The generalization via a time-based split is likely to -# be more representative of the true performance of the regression model. -# Let's assess this variability of our error evaluation with proper -# cross-validation: -from sklearn.model_selection import cross_val_score - -cv_mape_scores = -cross_val_score( - model, X, y, cv=ts_cv, scoring="neg_mean_absolute_percentage_error" -) -cv_mape_scores - -# %% -# The variability across splits is quite large! In a real life setting -# it would be advised to use more splits to better assess the variability. -# Let's report the mean CV scores and their standard deviation from now on. -print(f"CV MAPE: {cv_mape_scores.mean():.3f} ± {cv_mape_scores.std():.3f}") - -# %% -# We can compute several combinations of evaluation metrics and loss functions, -# which are reported a bit below. 
-from collections import defaultdict - -from sklearn.metrics import ( - make_scorer, - mean_absolute_error, - mean_pinball_loss, - root_mean_squared_error, -) -from sklearn.model_selection import cross_validate - - -def consolidate_scores(cv_results, scores, metric): - if metric == "MAPE": - scores[metric].append(f"{value.mean():.2f} ± {value.std():.2f}") - else: - scores[metric].append(f"{value.mean():.1f} ± {value.std():.1f}") - - return scores - - -scoring = { - "MAPE": make_scorer(mean_absolute_percentage_error), - "RMSE": make_scorer(root_mean_squared_error), - "MAE": make_scorer(mean_absolute_error), - "pinball_loss_05": make_scorer(mean_pinball_loss, alpha=0.05), - "pinball_loss_50": make_scorer(mean_pinball_loss, alpha=0.50), - "pinball_loss_95": make_scorer(mean_pinball_loss, alpha=0.95), -} -loss_functions = ["squared_error", "poisson", "absolute_error"] -scores = defaultdict(list) -for loss_func in loss_functions: - model = HistGradientBoostingRegressor(loss=loss_func) - cv_results = cross_validate( - model, - X, - y, - cv=ts_cv, - scoring=scoring, - n_jobs=2, - ) - time = cv_results["fit_time"] - scores["loss"].append(loss_func) - scores["fit_time"].append(f"{time.mean():.2f} ± {time.std():.2f} s") - - for key, value in cv_results.items(): - if key.startswith("test_"): - metric = key.split("test_")[1] - scores = consolidate_scores(cv_results, scores, metric) - - -# %% -# Modeling predictive uncertainty via quantile regression -# ------------------------------------------------------- -# Instead of modeling the expected value of the distribution of -# :math:`Y|X` like the least squares and Poisson losses do, one could try to -# estimate quantiles of the conditional distribution. -# -# :math:`Y|X=x_i` is expected to be a random variable for a given data point -# :math:`x_i` because we expect that the number of rentals cannot be 100% -# accurately predicted from the features. It can be influenced by other -# variables not properly captured by the existing lagged features. For -# instance whether or not it will rain in the next hour cannot be fully -# anticipated from the past hours bike rental data. This is what we -# call aleatoric uncertainty. -# -# Quantile regression makes it possible to give a finer description of that -# distribution without making strong assumptions on its shape. -quantile_list = [0.05, 0.5, 0.95] - -for quantile in quantile_list: - model = HistGradientBoostingRegressor(loss="quantile", quantile=quantile) - cv_results = cross_validate( - model, - X, - y, - cv=ts_cv, - scoring=scoring, - n_jobs=2, - ) - time = cv_results["fit_time"] - scores["fit_time"].append(f"{time.mean():.2f} ± {time.std():.2f} s") - - scores["loss"].append(f"quantile {int(quantile*100)}") - for key, value in cv_results.items(): - if key.startswith("test_"): - metric = key.split("test_")[1] - scores = consolidate_scores(cv_results, scores, metric) - -scores_df = pl.DataFrame(scores) -scores_df - - -# %% -# Let us take a look at the losses that minimise each metric. 
-def min_arg(col): - col_split = pl.col(col).str.split(" ") - return pl.arg_sort_by( - col_split.list.get(0).cast(pl.Float64), - col_split.list.get(2).cast(pl.Float64), - ).first() - - -scores_df.select( - pl.col("loss").get(min_arg(col_name)).alias(col_name) - for col_name in scores_df.columns - if col_name != "loss" -) - -# %% -# Even if the score distributions overlap due to the variance in the dataset, -# it is true that the average RMSE is lower when `loss="squared_error"`, whereas -# the average MAPE is lower when `loss="absolute_error"` as expected. That is -# also the case for the Mean Pinball Loss with the quantiles 5 and 95. The score -# corresponding to the 50 quantile loss is overlapping with the score obtained -# by minimizing other loss functions, which is also the case for the MAE. -# -# A qualitative look at the predictions -# ------------------------------------- -# We can now visualize the performance of the model with regards -# to the 5th percentile, median and the 95th percentile: -all_splits = list(ts_cv.split(X, y)) -train_idx, test_idx = all_splits[0] - -X_train, X_test = X[train_idx, :], X[test_idx, :] -y_train, y_test = y[train_idx], y[test_idx] - -max_iter = 50 -gbrt_mean_poisson = HistGradientBoostingRegressor(loss="poisson", max_iter=max_iter) -gbrt_mean_poisson.fit(X_train, y_train) -mean_predictions = gbrt_mean_poisson.predict(X_test) - -gbrt_median = HistGradientBoostingRegressor( - loss="quantile", quantile=0.5, max_iter=max_iter -) -gbrt_median.fit(X_train, y_train) -median_predictions = gbrt_median.predict(X_test) - -gbrt_percentile_5 = HistGradientBoostingRegressor( - loss="quantile", quantile=0.05, max_iter=max_iter -) -gbrt_percentile_5.fit(X_train, y_train) -percentile_5_predictions = gbrt_percentile_5.predict(X_test) - -gbrt_percentile_95 = HistGradientBoostingRegressor( - loss="quantile", quantile=0.95, max_iter=max_iter -) -gbrt_percentile_95.fit(X_train, y_train) -percentile_95_predictions = gbrt_percentile_95.predict(X_test) - -# %% -# We can now take a look at the predictions made by the regression models: -last_hours = slice(-96, None) -fig, ax = plt.subplots(figsize=(15, 7)) -plt.title("Predictions by regression models") -ax.plot( - y_test[last_hours], - "x-", - alpha=0.2, - label="Actual demand", - color="black", -) -ax.plot( - median_predictions[last_hours], - "^-", - label="GBRT median", -) -ax.plot( - mean_predictions[last_hours], - "x-", - label="GBRT mean (Poisson)", -) -ax.fill_between( - np.arange(96), - percentile_5_predictions[last_hours], - percentile_95_predictions[last_hours], - alpha=0.3, - label="GBRT 90% interval", -) -_ = ax.legend() - -# %% -# Here it's interesting to notice that the blue area between the 5% and 95% -# percentile estimators has a width that varies with the time of the day: -# -# - At night, the blue band is much narrower: the pair of models is quite -# certain that there will be a small number of bike rentals. And furthermore -# these seem correct in the sense that the actual demand stays in that blue -# band. -# - During the day, the blue band is much wider: the uncertainty grows, probably -# because of the variability of the weather that can have a very large impact, -# especially on week-ends. -# - We can also see that during week-days, the commute pattern is still visible in -# the 5% and 95% estimations. -# - Finally, it is expected that 10% of the time, the actual demand does not lie -# between the 5% and 95% percentile estimates. 
On this test span, the actual -# demand seems to be higher, especially during the rush hours. It might reveal that -# our 95% percentile estimator underestimates the demand peaks. This could be be -# quantitatively confirmed by computing empirical coverage numbers as done in -# the :ref:`calibration of confidence intervals `. -# -# Looking at the performance of non-linear regression models vs -# the best models: -from sklearn.metrics import PredictionErrorDisplay - -fig, axes = plt.subplots(ncols=3, figsize=(15, 6), sharey=True) -fig.suptitle("Non-linear regression models") -predictions = [ - median_predictions, - percentile_5_predictions, - percentile_95_predictions, -] -labels = [ - "Median", - "5th percentile", - "95th percentile", -] -for ax, pred, label in zip(axes, predictions, labels): - PredictionErrorDisplay.from_predictions( - y_true=y_test, - y_pred=pred, - kind="residual_vs_predicted", - scatter_kwargs={"alpha": 0.3}, - ax=ax, - ) - ax.set(xlabel="Predicted demand", ylabel="True demand") - ax.legend(["Best model", label]) - -plt.show() - -# %% -# Conclusion -# ---------- -# Through this example we explored time series forecasting using lagged -# features. We compared a naive regression (using the standardized -# :class:`~sklearn.model_selection.train_test_split`) with a proper time -# series evaluation strategy using -# :class:`~sklearn.model_selection.TimeSeriesSplit`. We observed that the -# model trained using :class:`~sklearn.model_selection.train_test_split`, -# having a default value of `shuffle` set to `True` produced an overly -# optimistic Mean Average Percentage Error (MAPE). The results -# produced from the time-based split better represent the performance -# of our time-series regression model. We also analyzed the predictive uncertainty -# of our model via Quantile Regression. Predictions based on the 5th and -# 95th percentile using `loss="quantile"` provide us with a quantitative estimate -# of the uncertainty of the forecasts made by our time series regression model. -# Uncertainty estimation can also be performed -# using `MAPIE `_, -# that provides an implementation based on recent work on conformal prediction -# methods and estimates both aleatoric and epistemic uncertainty at the same time. -# Furthermore, functionalities provided -# by `sktime `_ -# can be used to extend scikit-learn estimators by making use of recursive time -# series forecasting, that enables dynamic predictions of future values. +""" +=========================================== +Lagged features for time series forecasting +=========================================== + +This example demonstrates how Polars-engineered lagged features can be used +for time series forecasting with +:class:`~sklearn.ensemble.HistGradientBoostingRegressor` on the Bike Sharing +Demand dataset. + +See the example on +:ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py` +for some data exploration on this dataset and a demo on periodic feature +engineering. + +""" + +# %% +# Analyzing the Bike Sharing Demand dataset +# ----------------------------------------- +# +# We start by loading the data from the OpenML repository as a raw parquet file +# to illustrate how to work with an arbitrary parquet file instead of hiding this +# step in a convenience tool such as `sklearn.datasets.fetch_openml`. 
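For contrast with the raw-file approach demonstrated below, the convenience route alluded to in the sentence above is roughly what the previous version of this example did::

    from sklearn.datasets import fetch_openml

    # Convenience loader: downloads *and* parses the dataset in one call, hiding
    # the raw parquet handling that the reworked example shows explicitly.
    bike_sharing = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True)
    df = bike_sharing.frame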
+# +# The URL of the parquet file can be found in the JSON description of the +# Bike Sharing Demand dataset with id 44063 on openml.org +# (https://openml.org/search?type=data&status=active&id=44063). +# +# The `sha256` hash of the file is also provided to ensure the integrity of the +# downloaded file. +import numpy as np +import polars as pl + +from sklearn.datasets import fetch_file + +pl.Config.set_fmt_str_lengths(20) + +bike_sharing_data_file = fetch_file( + "https://openml1.win.tue.nl/datasets/0004/44063/dataset_44063.pq", + sha256="d120af76829af0d256338dc6dd4be5df4fd1f35bf3a283cab66a51c1c6abd06a", +) +bike_sharing_data_file + +# %% +# We load the parquet file with Polars for feature engineering. Polars +# automatically caches common subexpressions which are reused in multiple +# expressions (like `pl.col("count").shift(1)` below). See +# https://docs.pola.rs/user-guide/lazy/optimizations/ for more information. + +df = pl.read_parquet(bike_sharing_data_file) + +# %% +# Next, we take a look at the statistical summary of the dataset +# so that we can better understand the data that we are working with. +import polars.selectors as cs + +summary = df.select(cs.numeric()).describe() +summary + +# %% +# Let us look at the count of the seasons `"fall"`, `"spring"`, `"summer"` +# and `"winter"` present in the dataset to confirm they are balanced. + +import matplotlib.pyplot as plt + +df["season"].value_counts() + + +# %% +# Generating Polars-engineered lagged features +# -------------------------------------------- +# Let's consider the problem of predicting the demand at the +# next hour given past demands. Since the demand is a continuous +# variable, one could intuitively use any regression model. However, we do +# not have the usual `(X_train, y_train)` dataset. Instead, we just have +# the `y_train` demand data sequentially organized by time. +lagged_df = df.select( + "count", + *[pl.col("count").shift(i).alias(f"lagged_count_{i}h") for i in [1, 2, 3]], + lagged_count_1d=pl.col("count").shift(24), + lagged_count_1d_1h=pl.col("count").shift(24 + 1), + lagged_count_7d=pl.col("count").shift(7 * 24), + lagged_count_7d_1h=pl.col("count").shift(7 * 24 + 1), + lagged_mean_24h=pl.col("count").shift(1).rolling_mean(24), + lagged_max_24h=pl.col("count").shift(1).rolling_max(24), + lagged_min_24h=pl.col("count").shift(1).rolling_min(24), + lagged_mean_7d=pl.col("count").shift(1).rolling_mean(7 * 24), + lagged_max_7d=pl.col("count").shift(1).rolling_max(7 * 24), + lagged_min_7d=pl.col("count").shift(1).rolling_min(7 * 24), +) +lagged_df.tail(10) + +# %% +# Watch out however, the first lines have undefined values because their own +# past is unknown. This depends on how much lag we used: +lagged_df.head(10) + +# %% +# We can now separate the lagged features in a matrix `X` and the target variable +# (the counts to predict) in an array of the same first dimension `y`. +lagged_df = lagged_df.drop_nulls() +X = lagged_df.drop("count") +y = lagged_df["count"] +print("X shape: {}\ny shape: {}".format(X.shape, y.shape)) + +# %% +# Naive evaluation of the next hour bike demand regression +# -------------------------------------------------------- +# Let's randomly split our tabularized dataset to train a gradient +# boosting regression tree (GBRT) model and evaluate it using Mean +# Absolute Percentage Error (MAPE). If our model is aimed at forecasting +# (i.e., predicting future data from past data), we should not use training +# data that are ulterior to the testing data. 
In time series machine learning +# the "i.i.d" (independent and identically distributed) assumption does not +# hold true as the data points are not independent and have a temporal +# relationship. +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42 +) + +model = HistGradientBoostingRegressor().fit(X_train, y_train) + +# %% +# Taking a look at the performance of the model. +from sklearn.metrics import mean_absolute_percentage_error + +y_pred = model.predict(X_test) +mean_absolute_percentage_error(y_test, y_pred) + +# %% +# Proper next hour forecasting evaluation +# --------------------------------------- +# Let's use a proper evaluation splitting strategies that takes into account +# the temporal structure of the dataset to evaluate our model's ability to +# predict data points in the future (to avoid cheating by reading values from +# the lagged features in the training set). +from sklearn.model_selection import TimeSeriesSplit + +ts_cv = TimeSeriesSplit( + n_splits=3, # to keep the notebook fast enough on common laptops + gap=48, # 2 days data gap between train and test + max_train_size=10000, # keep train sets of comparable sizes + test_size=3000, # for 2 or 3 digits of precision in scores +) +all_splits = list(ts_cv.split(X, y)) + +# %% +# Training the model and evaluating its performance based on MAPE. +train_idx, test_idx = all_splits[0] +X_train, X_test = X[train_idx, :], X[test_idx, :] +y_train, y_test = y[train_idx], y[test_idx] + +model = HistGradientBoostingRegressor().fit(X_train, y_train) +y_pred = model.predict(X_test) +mean_absolute_percentage_error(y_test, y_pred) + +# %% +# The generalization error measured via a shuffled trained test split +# is too optimistic. The generalization via a time-based split is likely to +# be more representative of the true performance of the regression model. +# Let's assess this variability of our error evaluation with proper +# cross-validation: +from sklearn.model_selection import cross_val_score + +cv_mape_scores = -cross_val_score( + model, X, y, cv=ts_cv, scoring="neg_mean_absolute_percentage_error" +) +cv_mape_scores + +# %% +# The variability across splits is quite large! In a real life setting +# it would be advised to use more splits to better assess the variability. +# Let's report the mean CV scores and their standard deviation from now on. +print(f"CV MAPE: {cv_mape_scores.mean():.3f} ± {cv_mape_scores.std():.3f}") + +# %% +# We can compute several combinations of evaluation metrics and loss functions, +# which are reported a bit below. 
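The scoring setup defined in the next cell uses ``mean_pinball_loss`` at the 0.05, 0.50 and 0.95 levels. A small self-contained sanity check of what that metric computes, using illustrative toy arrays::

    import numpy as np

    from sklearn.metrics import mean_pinball_loss

    y_true = np.array([1.0, 2.0, 3.0])
    y_pred = np.array([1.5, 1.5, 1.5])
    alpha = 0.95

    # Pinball (quantile) loss: alpha * (y - q) when the quantile q under-predicts,
    # (1 - alpha) * (q - y) when it over-predicts, averaged over samples.
    diff = y_true - y_pred
    manual = np.mean(np.maximum(alpha * diff, (alpha - 1) * diff))
    assert np.isclose(manual, mean_pinball_loss(y_true, y_pred, alpha=alpha))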
+from collections import defaultdict + +from sklearn.metrics import ( + make_scorer, + mean_absolute_error, + mean_pinball_loss, + root_mean_squared_error, +) +from sklearn.model_selection import cross_validate + + +def consolidate_scores(cv_results, scores, metric): + if metric == "MAPE": + scores[metric].append(f"{value.mean():.2f} ± {value.std():.2f}") + else: + scores[metric].append(f"{value.mean():.1f} ± {value.std():.1f}") + + return scores + + +scoring = { + "MAPE": make_scorer(mean_absolute_percentage_error), + "RMSE": make_scorer(root_mean_squared_error), + "MAE": make_scorer(mean_absolute_error), + "pinball_loss_05": make_scorer(mean_pinball_loss, alpha=0.05), + "pinball_loss_50": make_scorer(mean_pinball_loss, alpha=0.50), + "pinball_loss_95": make_scorer(mean_pinball_loss, alpha=0.95), +} +loss_functions = ["squared_error", "poisson", "absolute_error"] +scores = defaultdict(list) +for loss_func in loss_functions: + model = HistGradientBoostingRegressor(loss=loss_func) + cv_results = cross_validate( + model, + X, + y, + cv=ts_cv, + scoring=scoring, + n_jobs=2, + ) + time = cv_results["fit_time"] + scores["loss"].append(loss_func) + scores["fit_time"].append(f"{time.mean():.2f} ± {time.std():.2f} s") + + for key, value in cv_results.items(): + if key.startswith("test_"): + metric = key.split("test_")[1] + scores = consolidate_scores(cv_results, scores, metric) + + +# %% +# Modeling predictive uncertainty via quantile regression +# ------------------------------------------------------- +# Instead of modeling the expected value of the distribution of +# :math:`Y|X` like the least squares and Poisson losses do, one could try to +# estimate quantiles of the conditional distribution. +# +# :math:`Y|X=x_i` is expected to be a random variable for a given data point +# :math:`x_i` because we expect that the number of rentals cannot be 100% +# accurately predicted from the features. It can be influenced by other +# variables not properly captured by the existing lagged features. For +# instance whether or not it will rain in the next hour cannot be fully +# anticipated from the past hours bike rental data. This is what we +# call aleatoric uncertainty. +# +# Quantile regression makes it possible to give a finer description of that +# distribution without making strong assumptions on its shape. +quantile_list = [0.05, 0.5, 0.95] + +for quantile in quantile_list: + model = HistGradientBoostingRegressor(loss="quantile", quantile=quantile) + cv_results = cross_validate( + model, + X, + y, + cv=ts_cv, + scoring=scoring, + n_jobs=2, + ) + time = cv_results["fit_time"] + scores["fit_time"].append(f"{time.mean():.2f} ± {time.std():.2f} s") + + scores["loss"].append(f"quantile {int(quantile*100)}") + for key, value in cv_results.items(): + if key.startswith("test_"): + metric = key.split("test_")[1] + scores = consolidate_scores(cv_results, scores, metric) + +scores_df = pl.DataFrame(scores) +scores_df + + +# %% +# Let us take a look at the losses that minimise each metric. 
+def min_arg(col): + col_split = pl.col(col).str.split(" ") + return pl.arg_sort_by( + col_split.list.get(0).cast(pl.Float64), + col_split.list.get(2).cast(pl.Float64), + ).first() + + +scores_df.select( + pl.col("loss").get(min_arg(col_name)).alias(col_name) + for col_name in scores_df.columns + if col_name != "loss" +) + +# %% +# Even if the score distributions overlap due to the variance in the dataset, +# it is true that the average RMSE is lower when `loss="squared_error"`, whereas +# the average MAPE is lower when `loss="absolute_error"` as expected. That is +# also the case for the Mean Pinball Loss with the quantiles 5 and 95. The score +# corresponding to the 50 quantile loss is overlapping with the score obtained +# by minimizing other loss functions, which is also the case for the MAE. +# +# A qualitative look at the predictions +# ------------------------------------- +# We can now visualize the performance of the model with regards +# to the 5th percentile, median and the 95th percentile: +all_splits = list(ts_cv.split(X, y)) +train_idx, test_idx = all_splits[0] + +X_train, X_test = X[train_idx, :], X[test_idx, :] +y_train, y_test = y[train_idx], y[test_idx] + +max_iter = 50 +gbrt_mean_poisson = HistGradientBoostingRegressor(loss="poisson", max_iter=max_iter) +gbrt_mean_poisson.fit(X_train, y_train) +mean_predictions = gbrt_mean_poisson.predict(X_test) + +gbrt_median = HistGradientBoostingRegressor( + loss="quantile", quantile=0.5, max_iter=max_iter +) +gbrt_median.fit(X_train, y_train) +median_predictions = gbrt_median.predict(X_test) + +gbrt_percentile_5 = HistGradientBoostingRegressor( + loss="quantile", quantile=0.05, max_iter=max_iter +) +gbrt_percentile_5.fit(X_train, y_train) +percentile_5_predictions = gbrt_percentile_5.predict(X_test) + +gbrt_percentile_95 = HistGradientBoostingRegressor( + loss="quantile", quantile=0.95, max_iter=max_iter +) +gbrt_percentile_95.fit(X_train, y_train) +percentile_95_predictions = gbrt_percentile_95.predict(X_test) + +# %% +# We can now take a look at the predictions made by the regression models: +last_hours = slice(-96, None) +fig, ax = plt.subplots(figsize=(15, 7)) +plt.title("Predictions by regression models") +ax.plot( + y_test[last_hours], + "x-", + alpha=0.2, + label="Actual demand", + color="black", +) +ax.plot( + median_predictions[last_hours], + "^-", + label="GBRT median", +) +ax.plot( + mean_predictions[last_hours], + "x-", + label="GBRT mean (Poisson)", +) +ax.fill_between( + np.arange(96), + percentile_5_predictions[last_hours], + percentile_95_predictions[last_hours], + alpha=0.3, + label="GBRT 90% interval", +) +_ = ax.legend() + +# %% +# Here it's interesting to notice that the blue area between the 5% and 95% +# percentile estimators has a width that varies with the time of the day: +# +# - At night, the blue band is much narrower: the pair of models is quite +# certain that there will be a small number of bike rentals. And furthermore +# these seem correct in the sense that the actual demand stays in that blue +# band. +# - During the day, the blue band is much wider: the uncertainty grows, probably +# because of the variability of the weather that can have a very large impact, +# especially on week-ends. +# - We can also see that during week-days, the commute pattern is still visible in +# the 5% and 95% estimations. +# - Finally, it is expected that 10% of the time, the actual demand does not lie +# between the 5% and 95% percentile estimates. 
On this test span, the actual
+# demand seems to be higher, especially during the rush hours. It might reveal that
+# our 95% percentile estimator underestimates the demand peaks. This could be
+# quantitatively confirmed by computing empirical coverage numbers as done in
+# the :ref:`calibration of confidence intervals ` (a minimal sketch of such a
+# coverage check is appended at the end of this example).
+#
+# Looking at the performance of non-linear regression models vs
+# the best models:
+from sklearn.metrics import PredictionErrorDisplay
+
+fig, axes = plt.subplots(ncols=3, figsize=(15, 6), sharey=True)
+fig.suptitle("Non-linear regression models")
+predictions = [
+    median_predictions,
+    percentile_5_predictions,
+    percentile_95_predictions,
+]
+labels = [
+    "Median",
+    "5th percentile",
+    "95th percentile",
+]
+for ax, pred, label in zip(axes, predictions, labels):
+    PredictionErrorDisplay.from_predictions(
+        y_true=y_test,
+        y_pred=pred,
+        kind="residual_vs_predicted",
+        scatter_kwargs={"alpha": 0.3},
+        ax=ax,
+    )
+    ax.set(xlabel="Predicted demand", ylabel="True demand")
+    ax.legend(["Best model", label])
+
+plt.show()
+
+# %%
+# Conclusion
+# ----------
+# Through this example we explored time series forecasting using lagged
+# features. We compared a naive regression (using the standardized
+# :class:`~sklearn.model_selection.train_test_split`) with a proper time
+# series evaluation strategy using
+# :class:`~sklearn.model_selection.TimeSeriesSplit`. We observed that the
+# model trained using :class:`~sklearn.model_selection.train_test_split`,
+# with its default value of `shuffle` set to `True`, produced an overly
+# optimistic Mean Absolute Percentage Error (MAPE). The results
+# produced from the time-based split better represent the performance
+# of our time-series regression model. We also analyzed the predictive uncertainty
+# of our model via Quantile Regression. Predictions based on the 5th and
+# 95th percentile using `loss="quantile"` provide us with a quantitative estimate
+# of the uncertainty of the forecasts made by our time series regression model.
+# Uncertainty estimation can also be performed
+# using `MAPIE `_,
+# which provides an implementation based on recent work on conformal prediction
+# methods and estimates both aleatoric and epistemic uncertainty at the same time.
+# Furthermore, functionalities provided
+# by `sktime `_
+# can be used to extend scikit-learn estimators by making use of recursive time
+# series forecasting, which enables dynamic predictions of future values.
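+
+# %%
+# The following cell is a minimal sketch, not a definitive analysis, of the
+# empirical coverage check mentioned above. It only assumes the ``y_test``
+# values and the 5%/95% quantile predictions computed in the previous cells,
+# and counts how often the observed demand falls inside the estimated 90%
+# interval; a value close to 0.9 would support the calibration of the pair of
+# quantile models.
+coverage = np.mean(
+    (y_test >= percentile_5_predictions) & (y_test <= percentile_95_predictions)
+)
+print(f"Empirical coverage of the 90% interval: {coverage:.3f}")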
diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py index 58cddb099faff..f41b8fa7e73fb 100644 --- a/sklearn/datasets/__init__.py +++ b/sklearn/datasets/__init__.py @@ -4,6 +4,7 @@ from ._base import ( clear_data_home, + fetch_file, get_data_home, load_breast_cancer, load_diabetes, @@ -57,6 +58,7 @@ "dump_svmlight_file", "fetch_20newsgroups", "fetch_20newsgroups_vectorized", + "fetch_file", "fetch_lfw_pairs", "fetch_lfw_people", "fetch_olivetti_faces", diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index 7dd2f181dee12..62055d296402b 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -8,8 +8,10 @@ import gzip import hashlib import os +import re import shutil import time +import unicodedata import warnings from collections import namedtuple from importlib import resources @@ -17,7 +19,9 @@ from os import environ, listdir, makedirs from os.path import expanduser, isdir, join, splitext from pathlib import Path +from tempfile import NamedTemporaryFile from urllib.error import URLError +from urllib.parse import urlparse from urllib.request import urlretrieve import numpy as np @@ -1427,20 +1431,26 @@ def _sha256(path): def _fetch_remote(remote, dirname=None, n_retries=3, delay=1): - """Helper function to download a remote dataset into path + """Helper function to download a remote dataset. Fetch a dataset pointed by remote's url, save into path using remote's - filename and ensure its integrity based on the SHA256 Checksum of the + filename and ensure its integrity based on the SHA256 checksum of the downloaded file. + .. versionchanged:: 1.6 + + If the file already exists locally and the SHA256 checksums match, the + path to the local file is returned without re-downloading. + Parameters ---------- remote : RemoteFileMetadata Named tuple containing remote dataset meta information: url, filename - and checksum + and checksum. - dirname : str - Directory to save the file to. + dirname : str or Path, default=None + Directory to save the file to. If None, the current working directory + is used. n_retries : int, default=3 Number of retries when HTTP errors are encountered. @@ -1454,28 +1464,173 @@ def _fetch_remote(remote, dirname=None, n_retries=3, delay=1): Returns ------- - file_path: str + file_path: Path Full path of the created file. """ + if dirname is None: + folder_path = Path(".") + else: + folder_path = Path(dirname) + + file_path = folder_path / remote.filename + + if file_path.exists(): + if remote.checksum is None: + return file_path + + checksum = _sha256(file_path) + if checksum == remote.checksum: + return file_path + else: + warnings.warn( + f"SHA256 checksum of existing local file {file_path.name} " + f"({checksum}) differs from expected ({remote.checksum}): " + f"re-downloading from {remote.url} ." + ) + + # We create a temporary file dedicated to this particular download to avoid + # conflicts with parallel downloads. If the download is successful, the + # temporary file is atomically renamed to the final file path (with + # `shutil.move`). We therefore pass `delete=False` to `NamedTemporaryFile`. + # Otherwise, garbage collecting temp_file would raise an error when + # attempting to delete a file that was already renamed. If the download + # fails or the result does not match the expected SHA256 digest, the + # temporary file is removed manually in the except block. 
+ temp_file = NamedTemporaryFile( + prefix=remote.filename + ".part_", dir=folder_path, delete=False + ) + # Note that Python 3.12's `delete_on_close=True` is ignored as we set + # `delete=False` explicitly. So after this line the empty temporary file still + # exists on disk to make sure that it's uniquely reserved for this specific call of + # `_fetch_remote` and therefore it protects against any corruption by parallel + # calls. + temp_file.close() + try: + temp_file_path = Path(temp_file.name) + while True: + try: + urlretrieve(remote.url, temp_file_path) + break + except (URLError, TimeoutError): + if n_retries == 0: + # If no more retries are left, re-raise the caught exception. + raise + warnings.warn(f"Retry downloading from url: {remote.url}") + n_retries -= 1 + time.sleep(delay) + + checksum = _sha256(temp_file_path) + if remote.checksum is not None and remote.checksum != checksum: + raise OSError( + f"The SHA256 checksum of {remote.filename} ({checksum}) " + f"differs from expected ({remote.checksum})." + ) + except (Exception, KeyboardInterrupt): + os.unlink(temp_file.name) + raise + + # The following renaming is atomic whenever temp_file_path and + # file_path are on the same filesystem. This should be the case most of + # the time, but we still use shutil.move instead of os.rename in case + # they are not. + shutil.move(temp_file_path, file_path) - file_path = remote.filename if dirname is None else join(dirname, remote.filename) - while True: - try: - urlretrieve(remote.url, file_path) - break - except (URLError, TimeoutError): - if n_retries == 0: - # If no more retries are left, re-raise the caught exception. - raise - warnings.warn(f"Retry downloading from url: {remote.url}") - n_retries -= 1 - time.sleep(delay) - - checksum = _sha256(file_path) - if remote.checksum != checksum: - raise OSError( - "{} has an SHA256 checksum ({}) " - "differing from expected ({}), " - "file may be corrupted.".format(file_path, checksum, remote.checksum) - ) return file_path + + +def _filter_filename(value, filter_dots=True): + """Derive a name that is safe to use as filename from the given string. + + Adapted from the `slugify` function of django: + https://github.com/django/django/blob/master/django/utils/text.py + + Convert spaces or repeated dashes to single dashes. Replace characters that + aren't alphanumerics, underscores, hyphens or dots by underscores. Convert + to lowercase. Also strip leading and trailing whitespace, dashes, and + underscores. + """ + value = unicodedata.normalize("NFKD", value).lower() + if filter_dots: + value = re.sub(r"[^\w\s-]+", "_", value) + else: + value = re.sub(r"[^.\w\s-]+", "_", value) + value = re.sub(r"[\s-]+", "-", value) + return value.strip("-_.") + + +def _derive_folder_and_filename_from_url(url): + parsed_url = urlparse(url) + if not parsed_url.hostname: + raise ValueError(f"Invalid URL: {url}") + folder_components = [_filter_filename(parsed_url.hostname, filter_dots=False)] + path = parsed_url.path + + if "/" in path: + base_folder, raw_filename = path.rsplit("/", 1) + + base_folder = _filter_filename(base_folder) + if base_folder: + folder_components.append(base_folder) + else: + raw_filename = path + + filename = _filter_filename(raw_filename, filter_dots=False) + if not filename: + filename = "downloaded_file" + + return "/".join(folder_components), filename + + +def fetch_file( + url, folder=None, local_filename=None, sha256=None, n_retries=3, delay=1 +): + """Fetch a file from the web if not already present in the local folder. 
+ + If the file already exists locally (and the SHA256 checksums match when + provided), the path to the local file is returned without re-downloading. + + .. versionadded:: 1.6 + + Parameters + ---------- + url : str + URL of the file to download. + + folder : str or Path, default=None + Directory to save the file to. If None, the file is downloaded in a + folder with a name derived from the URL host name and path under + scikit-learn data home folder. + + local_filename : str, default=None + Name of the file to save. If None, the filename is inferred from the + URL. + + sha256 : str, default=None + SHA256 checksum of the file. If None, no checksum is verified. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + delay : int, default=1 + Number of seconds between retries. + + Returns + ------- + file_path : Path + Full path of the downloaded file. + """ + folder_from_url, filename_from_url = _derive_folder_and_filename_from_url(url) + + if local_filename is None: + local_filename = filename_from_url + + if folder is None: + folder = Path(get_data_home()) / folder_from_url + makedirs(folder, exist_ok=True) + + remote_metadata = RemoteFileMetadata( + filename=local_filename, url=url, checksum=sha256 + ) + return _fetch_remote( + remote_metadata, dirname=folder, n_retries=n_retries, delay=delay + ) diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index b79f8c47c55c5..8b5231f68abdd 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -1,5 +1,7 @@ +import hashlib import io import os +import re import shutil import tempfile import warnings @@ -9,12 +11,14 @@ from pickle import dumps, loads from unittest.mock import Mock from urllib.error import HTTPError +from urllib.parse import urlparse import numpy as np import pytest from sklearn.datasets import ( clear_data_home, + fetch_file, get_data_home, load_breast_cancer, load_diabetes, @@ -28,6 +32,7 @@ ) from sklearn.datasets._base import ( RemoteFileMetadata, + _derive_folder_and_filename_from_url, _fetch_remote, load_csv_data, load_gzip_compressed_csv_data, @@ -391,3 +396,263 @@ def test_fetch_remote_raise_warnings_with_invalid_url(monkeypatch): for r in record: assert str(r.message) == f"Retry downloading from url: {url}" assert len(record) == 3 + + +def test_derive_folder_and_filename_from_url(): + folder, filename = _derive_folder_and_filename_from_url( + "https://example.com/file.tar.gz" + ) + assert folder == "example.com" + assert filename == "file.tar.gz" + + folder, filename = _derive_folder_and_filename_from_url( + "https://example.com/نمونه نماینده.data" + ) + assert folder == "example.com" + assert filename == "نمونه-نماینده.data" + + folder, filename = _derive_folder_and_filename_from_url( + "https://example.com/path/to-/.file.tar.gz" + ) + assert folder == "example.com/path_to" + assert filename == "file.tar.gz" + + folder, filename = _derive_folder_and_filename_from_url("https://example.com/") + assert folder == "example.com" + assert filename == "downloaded_file" + + folder, filename = _derive_folder_and_filename_from_url("https://example.com") + assert folder == "example.com" + assert filename == "downloaded_file" + + folder, filename = _derive_folder_and_filename_from_url( + "https://example.com/path/@to/data.json?param=value" + ) + assert folder == "example.com/path_to" + assert filename == "data.json" + + folder, filename = _derive_folder_and_filename_from_url( + 
"https://example.com/path/@@to._/-_.data.json.#anchor" + ) + assert folder == "example.com/path_to" + assert filename == "data.json" + + folder, filename = _derive_folder_and_filename_from_url( + "https://example.com//some_file.txt" + ) + assert folder == "example.com" + assert filename == "some_file.txt" + + folder, filename = _derive_folder_and_filename_from_url( + "http://example/../some_file.txt" + ) + assert folder == "example" + assert filename == "some_file.txt" + + folder, filename = _derive_folder_and_filename_from_url( + "https://example.com/!.'.,/some_file.txt" + ) + assert folder == "example.com" + assert filename == "some_file.txt" + + folder, filename = _derive_folder_and_filename_from_url( + "https://example.com/a/!.'.,/b/some_file.txt" + ) + assert folder == "example.com/a_b" + assert filename == "some_file.txt" + + folder, filename = _derive_folder_and_filename_from_url("https://example.com/!.'.,") + assert folder == "example.com" + assert filename == "downloaded_file" + + with pytest.raises(ValueError, match="Invalid URL"): + _derive_folder_and_filename_from_url("https:/../") + + +def _mock_urlretrieve(server_side): + def _urlretrieve_mock(url, local_path): + server_root = Path(server_side) + file_path = urlparse(url).path.strip("/") + if not (server_root / file_path).exists(): + raise HTTPError(url, 404, "Not Found", None, None) + shutil.copy(server_root / file_path, local_path) + + return Mock(side_effect=_urlretrieve_mock) + + +def test_fetch_file_using_data_home(monkeypatch, tmpdir): + tmpdir = Path(tmpdir) + server_side = tmpdir / "server_side" + server_side.mkdir() + data_file = server_side / "data.jsonl" + server_data = '{"a": 1, "b": 2}\n' + data_file.write_text(server_data, encoding="utf-8") + + server_subfolder = server_side / "subfolder" + server_subfolder.mkdir() + other_data_file = server_subfolder / "other_file.txt" + other_data_file.write_text("Some important text data.", encoding="utf-8") + + data_home = tmpdir / "data_home" + data_home.mkdir() + + urlretrieve_mock = _mock_urlretrieve(server_side) + monkeypatch.setattr("sklearn.datasets._base.urlretrieve", urlretrieve_mock) + + monkeypatch.setattr( + "sklearn.datasets._base.get_data_home", Mock(return_value=data_home) + ) + fetched_file_path = fetch_file( + "https://example.com/data.jsonl", + ) + assert fetched_file_path == data_home / "example.com" / "data.jsonl" + assert fetched_file_path.read_text(encoding="utf-8") == server_data + + fetched_file_path = fetch_file( + "https://example.com/subfolder/other_file.txt", + ) + assert ( + fetched_file_path == data_home / "example.com" / "subfolder" / "other_file.txt" + ) + assert fetched_file_path.read_text(encoding="utf-8") == other_data_file.read_text( + "utf-8" + ) + + expected_warning_msg = re.escape( + "Retry downloading from url: https://example.com/subfolder/invalid.txt" + ) + with pytest.raises(HTTPError): + with pytest.warns(match=expected_warning_msg): + fetch_file( + "https://example.com/subfolder/invalid.txt", + delay=0, + ) + + local_subfolder = data_home / "example.com" / "subfolder" + assert sorted(local_subfolder.iterdir()) == [local_subfolder / "other_file.txt"] + + +def test_fetch_file_without_sha256(monkeypatch, tmpdir): + server_side = tmpdir.mkdir("server_side") + data_file = Path(server_side / "data.jsonl") + server_data = '{"a": 1, "b": 2}\n' + data_file.write_text(server_data, encoding="utf-8") + + client_side = tmpdir.mkdir("client_side") + + urlretrieve_mock = _mock_urlretrieve(server_side) + 
monkeypatch.setattr("sklearn.datasets._base.urlretrieve", urlretrieve_mock) + + # The first call should trigger a download: + fetched_file_path = fetch_file( + "https://example.com/data.jsonl", + folder=client_side, + ) + assert fetched_file_path == client_side / "data.jsonl" + assert fetched_file_path.read_text(encoding="utf-8") == server_data + assert urlretrieve_mock.call_count == 1 + + # Fetching again the same file to the same folder should do nothing: + fetched_file_path = fetch_file( + "https://example.com/data.jsonl", + folder=client_side, + ) + assert fetched_file_path == client_side / "data.jsonl" + assert fetched_file_path.read_text(encoding="utf-8") == server_data + assert urlretrieve_mock.call_count == 1 + + # Deleting and calling again should re-download + fetched_file_path.unlink() + fetched_file_path = fetch_file( + "https://example.com/data.jsonl", + folder=client_side, + ) + assert fetched_file_path == client_side / "data.jsonl" + assert fetched_file_path.read_text(encoding="utf-8") == server_data + assert urlretrieve_mock.call_count == 2 + + +def test_fetch_file_with_sha256(monkeypatch, tmpdir): + server_side = tmpdir.mkdir("server_side") + data_file = Path(server_side / "data.jsonl") + server_data = '{"a": 1, "b": 2}\n' + data_file.write_text(server_data, encoding="utf-8") + expected_sha256 = hashlib.sha256(data_file.read_bytes()).hexdigest() + + client_side = tmpdir.mkdir("client_side") + + urlretrieve_mock = _mock_urlretrieve(server_side) + monkeypatch.setattr("sklearn.datasets._base.urlretrieve", urlretrieve_mock) + + # The first call should trigger a download. + fetched_file_path = fetch_file( + "https://example.com/data.jsonl", folder=client_side, sha256=expected_sha256 + ) + assert fetched_file_path == client_side / "data.jsonl" + assert fetched_file_path.read_text(encoding="utf-8") == server_data + assert urlretrieve_mock.call_count == 1 + + # Fetching again the same file to the same folder should do nothing when + # the sha256 match: + fetched_file_path = fetch_file( + "https://example.com/data.jsonl", folder=client_side, sha256=expected_sha256 + ) + assert fetched_file_path == client_side / "data.jsonl" + assert fetched_file_path.read_text(encoding="utf-8") == server_data + assert urlretrieve_mock.call_count == 1 + + # Corrupting the local data should yield a warning and trigger a new download: + fetched_file_path.write_text("corrupted contents", encoding="utf-8") + expected_msg = ( + r"SHA256 checksum of existing local file data.jsonl " + rf"\(.*\) differs from expected \({expected_sha256}\): " + r"re-downloading from https://example.com/data.jsonl \." 
+ ) + with pytest.warns(match=expected_msg): + fetched_file_path = fetch_file( + "https://example.com/data.jsonl", folder=client_side, sha256=expected_sha256 + ) + assert fetched_file_path == client_side / "data.jsonl" + assert fetched_file_path.read_text(encoding="utf-8") == server_data + assert urlretrieve_mock.call_count == 2 + + # Calling again should do nothing: + fetched_file_path = fetch_file( + "https://example.com/data.jsonl", folder=client_side, sha256=expected_sha256 + ) + assert fetched_file_path == client_side / "data.jsonl" + assert fetched_file_path.read_text(encoding="utf-8") == server_data + assert urlretrieve_mock.call_count == 2 + + # Deleting the local file and calling again should redownload without warning: + fetched_file_path.unlink() + fetched_file_path = fetch_file( + "https://example.com/data.jsonl", folder=client_side, sha256=expected_sha256 + ) + assert fetched_file_path == client_side / "data.jsonl" + assert fetched_file_path.read_text(encoding="utf-8") == server_data + assert urlretrieve_mock.call_count == 3 + + # Calling without a sha256 should also work without redownloading: + fetched_file_path = fetch_file( + "https://example.com/data.jsonl", + folder=client_side, + ) + assert fetched_file_path == client_side / "data.jsonl" + assert fetched_file_path.read_text(encoding="utf-8") == server_data + assert urlretrieve_mock.call_count == 3 + + # Calling with a wrong sha256 should raise an informative exception: + non_matching_sha256 = "deadbabecafebeef" + expected_warning_msg = "differs from expected" + expected_error_msg = re.escape( + f"The SHA256 checksum of data.jsonl ({expected_sha256}) differs from " + f"expected ({non_matching_sha256})." + ) + with pytest.raises(OSError, match=expected_error_msg): + with pytest.warns(match=expected_warning_msg): + fetch_file( + "https://example.com/data.jsonl", + folder=client_side, + sha256=non_matching_sha256, + ) From e7af1955724167a382df8fbab6112a84b3fa4d5a Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Thu, 11 Jul 2024 20:54:29 +0500 Subject: [PATCH 07/35] ENH Array API support for euclidean_distances and rbf_kernel (#29433) Co-authored-by: Olivier Grisel --- doc/modules/array_api.rst | 18 ++++++ doc/whats_new/v1.6.rst | 4 +- sklearn/decomposition/_base.py | 8 +-- sklearn/metrics/pairwise.py | 71 +++++++++++++++-------- sklearn/metrics/tests/test_common.py | 4 ++ sklearn/utils/_array_api.py | 83 ++++++++++++++++++++++----- sklearn/utils/extmath.py | 8 ++- sklearn/utils/tests/test_array_api.py | 34 +++++++++++ 8 files changed, 184 insertions(+), 46 deletions(-) diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst index a51ee60e47e04..9afedeb7ccecb 100644 --- a/doc/modules/array_api.rst +++ b/doc/modules/array_api.rst @@ -123,7 +123,9 @@ Metrics - :func:`sklearn.metrics.pairwise.additive_chi2_kernel` - :func:`sklearn.metrics.pairwise.chi2_kernel` - :func:`sklearn.metrics.pairwise.cosine_similarity` +- :func:`sklearn.metrics.pairwise.euclidean_distances` (see :ref:`device_support_for_float64`) - :func:`sklearn.metrics.pairwise.paired_cosine_distances` +- :func:`sklearn.metrics.pairwise.rbf_kernel` (see :ref:`device_support_for_float64`) - :func:`sklearn.metrics.r2_score` - :func:`sklearn.metrics.zero_one_loss` @@ -172,6 +174,8 @@ automatically skipped. Therefore it's important to run the tests with the pip install array-api-compat # and other libraries as needed pytest -k "array_api" -v +.. 
_mps_support: + Note on MPS device support -------------------------- @@ -191,3 +195,17 @@ To enable the MPS support in PyTorch, set the environment variable At the time of writing all scikit-learn tests should pass, however, the computational speed is not necessarily better than with the CPU device. + +.. _device_support_for_float64: + +Note on device support for ``float64`` +-------------------------------------- + +Certain operations within scikit-learn will automatically perform operations +on floating-point values with `float64` precision to prevent overflows and ensure +correctness (e.g., :func:`metrics.pairwise.euclidean_distances`). However, +certain combinations of array namespaces and devices, such as `PyTorch on MPS` +(see :ref:`mps_support`) do not support the `float64` data type. In these cases, +scikit-learn will revert to using the `float32` data type instead. This can result in +different behavior (typically numerically unstable results) compared to not using array +API dispatching or using a device with `float64` support. diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index d7d3a71eba636..3971f60eb5f4b 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -43,7 +43,9 @@ See :ref:`array_api` for more details. - :func:`sklearn.metrics.pairwise.additive_chi2_kernel` :pr:`29144` by :user:`Yaroslav Korobko `; - :func:`sklearn.metrics.pairwise.chi2_kernel` :pr:`29267` by :user:`Yaroslav Korobko `; - :func:`sklearn.metrics.pairwise.cosine_similarity` :pr:`29014` by :user:`Edoardo Abati `; -- :func:`sklearn.metrics.pairwise.paired_cosine_distances` :pr:`29112` by :user:`Edoardo Abati `. +- :func:`sklearn.metrics.pairwise.euclidean_distances` :pr:`29433` by :user:`Omar Salman `; +- :func:`sklearn.metrics.pairwise.paired_cosine_distances` :pr:`29112` by :user:`Edoardo Abati `; +- :func:`sklearn.metrics.pairwise.rbf_kernel` :pr:`29433` by :user:`Omar Salman `. 
**Classes:** diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py index f2d0ad663569a..970294efe0184 100644 --- a/sklearn/decomposition/_base.py +++ b/sklearn/decomposition/_base.py @@ -9,7 +9,7 @@ from scipy import linalg from ..base import BaseEstimator, ClassNamePrefixFeaturesOutMixin, TransformerMixin -from ..utils._array_api import _add_to_diagonal, device, get_namespace +from ..utils._array_api import _fill_or_add_to_diagonal, device, get_namespace from ..utils.validation import check_is_fitted @@ -47,7 +47,7 @@ def get_covariance(self): xp.asarray(0.0, device=device(exp_var)), ) cov = (components_.T * exp_var_diff) @ components_ - _add_to_diagonal(cov, self.noise_variance_, xp) + _fill_or_add_to_diagonal(cov, self.noise_variance_, xp) return cov def get_precision(self): @@ -89,10 +89,10 @@ def get_precision(self): xp.asarray(0.0, device=device(exp_var)), ) precision = components_ @ components_.T / self.noise_variance_ - _add_to_diagonal(precision, 1.0 / exp_var_diff, xp) + _fill_or_add_to_diagonal(precision, 1.0 / exp_var_diff, xp) precision = components_.T @ linalg_inv(precision) @ components_ precision /= -(self.noise_variance_**2) - _add_to_diagonal(precision, 1.0 / self.noise_variance_, xp) + _fill_or_add_to_diagonal(precision, 1.0 / self.noise_variance_, xp) return precision @abstractmethod diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 9382d585a5fe7..b7db4d94c4f07 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -22,9 +22,13 @@ gen_even_slices, ) from ..utils._array_api import ( + _fill_or_add_to_diagonal, _find_matching_floating_dtype, _is_numpy_namespace, + _max_precision_float_dtype, + _modify_in_place_if_numpy, get_namespace, + get_namespace_and_device, ) from ..utils._chunking import get_chunk_n_rows from ..utils._mask import _get_mask @@ -335,13 +339,14 @@ def euclidean_distances( array([[1. ], [1.41421356]]) """ + xp, _ = get_namespace(X, Y) X, Y = check_pairwise_arrays(X, Y) if X_norm_squared is not None: X_norm_squared = check_array(X_norm_squared, ensure_2d=False) original_shape = X_norm_squared.shape if X_norm_squared.shape == (X.shape[0],): - X_norm_squared = X_norm_squared.reshape(-1, 1) + X_norm_squared = xp.reshape(X_norm_squared, (-1, 1)) if X_norm_squared.shape == (1, X.shape[0]): X_norm_squared = X_norm_squared.T if X_norm_squared.shape != (X.shape[0], 1): @@ -354,7 +359,7 @@ def euclidean_distances( Y_norm_squared = check_array(Y_norm_squared, ensure_2d=False) original_shape = Y_norm_squared.shape if Y_norm_squared.shape == (Y.shape[0],): - Y_norm_squared = Y_norm_squared.reshape(1, -1) + Y_norm_squared = xp.reshape(Y_norm_squared, (1, -1)) if Y_norm_squared.shape == (Y.shape[0], 1): Y_norm_squared = Y_norm_squared.T if Y_norm_squared.shape != (1, Y.shape[0]): @@ -375,24 +380,25 @@ def _euclidean_distances(X, Y, X_norm_squared=None, Y_norm_squared=None, squared float32, norms needs to be recomputed on upcast chunks. TODO: use a float64 accumulator in row_norms to avoid the latter. 
""" - if X_norm_squared is not None and X_norm_squared.dtype != np.float32: - XX = X_norm_squared.reshape(-1, 1) - elif X.dtype != np.float32: - XX = row_norms(X, squared=True)[:, np.newaxis] + xp, _, device_ = get_namespace_and_device(X, Y) + if X_norm_squared is not None and X_norm_squared.dtype != xp.float32: + XX = xp.reshape(X_norm_squared, (-1, 1)) + elif X.dtype != xp.float32: + XX = row_norms(X, squared=True)[:, None] else: XX = None if Y is X: YY = None if XX is None else XX.T else: - if Y_norm_squared is not None and Y_norm_squared.dtype != np.float32: - YY = Y_norm_squared.reshape(1, -1) - elif Y.dtype != np.float32: - YY = row_norms(Y, squared=True)[np.newaxis, :] + if Y_norm_squared is not None and Y_norm_squared.dtype != xp.float32: + YY = xp.reshape(Y_norm_squared, (1, -1)) + elif Y.dtype != xp.float32: + YY = row_norms(Y, squared=True)[None, :] else: YY = None - if X.dtype == np.float32 or Y.dtype == np.float32: + if X.dtype == xp.float32 or Y.dtype == xp.float32: # To minimize precision issues with float32, we compute the distance # matrix on chunks of X and Y upcast to float64 distances = _euclidean_distances_upcast(X, XX, Y, YY) @@ -401,14 +407,22 @@ def _euclidean_distances(X, Y, X_norm_squared=None, Y_norm_squared=None, squared distances = -2 * safe_sparse_dot(X, Y.T, dense_output=True) distances += XX distances += YY - np.maximum(distances, 0, out=distances) + + xp_zero = xp.asarray(0, device=device_, dtype=distances.dtype) + distances = _modify_in_place_if_numpy( + xp, xp.maximum, distances, xp_zero, out=distances + ) # Ensure that distances between vectors and themselves are set to 0.0. # This may not be the case due to floating point rounding errors. if X is Y: - np.fill_diagonal(distances, 0) + _fill_or_add_to_diagonal(distances, 0, xp=xp, add_value=False) - return distances if squared else np.sqrt(distances, out=distances) + if squared: + return distances + + distances = _modify_in_place_if_numpy(xp, xp.sqrt, distances, out=distances) + return distances @validate_params( @@ -552,15 +566,20 @@ def _euclidean_distances_upcast(X, XX=None, Y=None, YY=None, batch_size=None): X and Y are upcast to float64 by chunks, which size is chosen to limit memory increase by approximately 10% (at least 10MiB). 
""" + xp, _, device_ = get_namespace_and_device(X, Y) n_samples_X = X.shape[0] n_samples_Y = Y.shape[0] n_features = X.shape[1] - distances = np.empty((n_samples_X, n_samples_Y), dtype=np.float32) + distances = xp.empty((n_samples_X, n_samples_Y), dtype=xp.float32, device=device_) if batch_size is None: - x_density = X.nnz / np.prod(X.shape) if issparse(X) else 1 - y_density = Y.nnz / np.prod(Y.shape) if issparse(Y) else 1 + x_density = ( + X.nnz / xp.prod(X.shape) if issparse(X) else xp.asarray(1, device=device_) + ) + y_density = ( + Y.nnz / xp.prod(Y.shape) if issparse(Y) else xp.asarray(1, device=device_) + ) # Allow 10% more memory than X, Y and the distance matrix take (at # least 10MiB) @@ -580,15 +599,15 @@ def _euclidean_distances_upcast(X, XX=None, Y=None, YY=None, batch_size=None): # Hence x² + (xd+yd)kx = M, where x=batch_size, k=n_features, M=maxmem # xd=x_density and yd=y_density tmp = (x_density + y_density) * n_features - batch_size = (-tmp + np.sqrt(tmp**2 + 4 * maxmem)) / 2 + batch_size = (-tmp + xp.sqrt(tmp**2 + 4 * maxmem)) / 2 batch_size = max(int(batch_size), 1) x_batches = gen_batches(n_samples_X, batch_size) - + xp_max_float = _max_precision_float_dtype(xp=xp, device=device_) for i, x_slice in enumerate(x_batches): - X_chunk = X[x_slice].astype(np.float64) + X_chunk = xp.astype(X[x_slice], xp_max_float) if XX is None: - XX_chunk = row_norms(X_chunk, squared=True)[:, np.newaxis] + XX_chunk = row_norms(X_chunk, squared=True)[:, None] else: XX_chunk = XX[x_slice] @@ -601,9 +620,9 @@ def _euclidean_distances_upcast(X, XX=None, Y=None, YY=None, batch_size=None): d = distances[y_slice, x_slice].T else: - Y_chunk = Y[y_slice].astype(np.float64) + Y_chunk = xp.astype(Y[y_slice], xp_max_float) if YY is None: - YY_chunk = row_norms(Y_chunk, squared=True)[np.newaxis, :] + YY_chunk = row_norms(Y_chunk, squared=True)[None, :] else: YY_chunk = YY[:, y_slice] @@ -611,7 +630,7 @@ def _euclidean_distances_upcast(X, XX=None, Y=None, YY=None, batch_size=None): d += XX_chunk d += YY_chunk - distances[x_slice, y_slice] = d.astype(np.float32, copy=False) + distances[x_slice, y_slice] = xp.astype(d, xp.float32, copy=False) return distances @@ -1549,13 +1568,15 @@ def rbf_kernel(X, Y=None, gamma=None): array([[0.71..., 0.51...], [0.51..., 0.71...]]) """ + xp, _ = get_namespace(X, Y) X, Y = check_pairwise_arrays(X, Y) if gamma is None: gamma = 1.0 / X.shape[1] K = euclidean_distances(X, Y, squared=True) K *= -gamma - np.exp(K, K) # exponentiate K in-place + # exponentiate K in-place when using numpy + K = _modify_in_place_if_numpy(xp, xp.exp, K, out=K) return K diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 6110cbd3d1d13..14e96cc9fcd98 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -55,7 +55,9 @@ additive_chi2_kernel, chi2_kernel, cosine_similarity, + euclidean_distances, paired_cosine_distances, + rbf_kernel, ) from sklearn.preprocessing import LabelBinarizer from sklearn.utils import shuffle @@ -2014,6 +2016,8 @@ def check_array_api_metric_pairwise(metric, array_namespace, device, dtype_name) mean_gamma_deviance: [check_array_api_regression_metric], max_error: [check_array_api_regression_metric], chi2_kernel: [check_array_api_metric_pairwise], + euclidean_distances: [check_array_api_metric_pairwise], + rbf_kernel: [check_array_api_metric_pairwise], } diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index a00d250ab31d2..51caacb71c9e2 100644 --- 
a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -302,6 +302,15 @@ def __eq__(self, other): def isdtype(self, dtype, kind): return isdtype(dtype, kind, xp=self._namespace) + def maximum(self, x1, x2): + # TODO: Remove when `maximum` is made compatible in `array_api_compat`, + # based on the `2023.12` specification. + # https://github.com/data-apis/array-api-compat/issues/127 + x1_np = _convert_to_numpy(x1, xp=self._namespace) + x2_np = _convert_to_numpy(x2, xp=self._namespace) + x_max = numpy.maximum(x1_np, x2_np) + return self._namespace.asarray(x_max, device=device(x1, x2)) + def _check_device_cpu(device): # noqa if device not in {"cpu", None}: @@ -566,7 +575,28 @@ def get_namespace(*arrays, remove_none=True, remove_types=(str,), xp=None): def get_namespace_and_device(*array_list, remove_none=True, remove_types=(str,)): - """Combination into one single function of `get_namespace` and `device`.""" + """Combination into one single function of `get_namespace` and `device`. + + Parameters + ---------- + *array_list : array objects + Array objects. + remove_none : bool, default=True + Whether to ignore None objects passed in arrays. + remove_types : tuple or list, default=(str,) + Types to ignore in the arrays. + + Returns + ------- + namespace : module + Namespace shared by array objects. If any of the `arrays` are not arrays, + the namespace defaults to NumPy. + is_array_api_compliant : bool + True if the arrays are containers that implement the Array API spec. + Always False when array_api_dispatch=False. + device : device + `device` object (see the "Device Support" section of the array API spec). + """ array_list = _remove_non_arrays( *array_list, remove_none=remove_none, remove_types=remove_types ) @@ -592,21 +622,36 @@ def _expit(X, xp=None): return 1.0 / (1.0 + xp.exp(-X)) -def _add_to_diagonal(array, value, xp): - # Workaround for the lack of support for xp.reshape(a, shape, copy=False) in - # numpy.array_api: https://github.com/numpy/numpy/issues/23410 - value = xp.asarray(value, dtype=array.dtype) - if _is_numpy_namespace(xp): - array_np = numpy.asarray(array) - array_np.flat[:: array.shape[0] + 1] += value - return xp.asarray(array_np) - elif value.ndim == 1: - for i in range(array.shape[0]): - array[i, i] += value[i] +def _fill_or_add_to_diagonal(array, value, xp, add_value=True, wrap=False): + """Implementation to facilitate adding or assigning specified values to the + diagonal of a 2-d array. + + If ``add_value`` is `True` then the values will be added to the diagonal + elements otherwise the values will be assigned to the diagonal elements. + By default, ``add_value`` is set to `True. This is currently only + supported for 2-d arrays. + + The implementation is taken from the `numpy.fill_diagonal` function: + https://github.com/numpy/numpy/blob/v2.0.0/numpy/lib/_index_tricks_impl.py#L799-L929 + """ + if array.ndim != 2: + raise ValueError( + f"array should be 2-d. Got array with shape {tuple(array.shape)}" + ) + + value = xp.asarray(value, dtype=array.dtype, device=device(array)) + end = None + # Explicit, fast formula for the common case. For 2-d arrays, we + # accept rectangular ones. 
+ step = array.shape[1] + 1 + if not wrap: + end = array.shape[1] * array.shape[1] + + array_flat = xp.reshape(array, (-1,)) + if add_value: + array_flat[:end:step] += value else: - # scalar value - for i in range(array.shape[0]): - array[i, i] += value + array_flat[:end:step] = value def _max_precision_float_dtype(xp, device): @@ -1000,3 +1045,11 @@ def _count_nonzero(X, xp, device, axis=None, sample_weight=None): zero_scalar = xp.asarray(0, device=device, dtype=weights.dtype) return xp.sum(xp.where(X != 0, weights, zero_scalar), axis=axis) + + +def _modify_in_place_if_numpy(xp, func, *args, out=None, **kwargs): + if _is_numpy_namespace(xp): + func(*args, out=out, **kwargs) + else: + out = func(*args, **kwargs) + return out diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 717bbed76513b..7b5720473848a 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -178,6 +178,7 @@ def safe_sparse_dot(a, b, *, dense_output=False): [11, 25, 39], [17, 39, 61]]) """ + xp, _ = get_namespace(a, b) if a.ndim > 2 or b.ndim > 2: if sparse.issparse(a): # sparse is always 2D. Implies b is 3D+ @@ -193,7 +194,12 @@ def safe_sparse_dot(a, b, *, dense_output=False): ret = a_2d @ b ret = ret.reshape(*a.shape[:-1], b.shape[1]) else: - ret = np.dot(a, b) + # Alternative for `np.dot` when dealing with a or b having + # more than 2 dimensions, that works with the array api. + # If b is 1-dim then the last axis for b is taken otherwise + # if b is >= 2-dim then the second to last axis is taken. + b_axis = -1 if b.ndim == 1 else -2 + ret = xp.tensordot(a, b, axes=[-1, b_axis]) else: ret = a @ b diff --git a/sklearn/utils/tests/test_array_api.py b/sklearn/utils/tests/test_array_api.py index 71f499f7a8dae..707304edacd11 100644 --- a/sklearn/utils/tests/test_array_api.py +++ b/sklearn/utils/tests/test_array_api.py @@ -15,6 +15,7 @@ _convert_to_numpy, _count_nonzero, _estimator_with_converted_arrays, + _fill_or_add_to_diagonal, _is_numpy_namespace, _isin, _max_precision_float_dtype, @@ -112,6 +113,26 @@ def test_array_api_wrapper_astype(): assert X_converted.dtype == xp.float32 +def test_array_api_wrapper_maximum(): + """Test _ArrayAPIWrapper `maximum` for ArrayAPIs other than NumPy. + + This is mainly used to test for `cupy.array_api` but since that is + not available on our coverage-enabled PR CI, we resort to using + `array-api-strict`. 
+ """ + array_api_strict = pytest.importorskip("array_api_strict") + xp_ = _AdjustableNameAPITestWrapper(array_api_strict, "array_api_strict") + xp = _ArrayAPIWrapper(xp_) + + x1 = xp.asarray(([[1, 2, 3], [3, 9, 5]]), dtype=xp.int64) + x2 = xp.asarray(([[0, 1, 6], [8, 4, 5]]), dtype=xp.int64) + result = xp.asarray([[1, 2, 6], [8, 9, 5]], dtype=xp.int64) + + x_max = xp.maximum(x1, x2) + assert x_max.dtype == x1.dtype + assert xp.all(xp.equal(x_max, result)) + + @pytest.mark.parametrize("array_api", ["numpy", "array_api_strict"]) def test_asarray_with_order(array_api): """Test _asarray_with_order passes along order for NumPy arrays.""" @@ -624,3 +645,16 @@ def test_count_nonzero( # NumPy 2.0 has a problem with the device attribute of scalar arrays: # https://github.com/numpy/numpy/issues/26850 assert device(array_xp) == device(result) + + +@pytest.mark.parametrize( + "array_namespace, device_, dtype_name", yield_namespace_device_dtype_combinations() +) +@pytest.mark.parametrize("wrap", [True, False]) +def test_fill_or_add_to_diagonal(array_namespace, device_, dtype_name, wrap): + xp = _array_api_for_tests(array_namespace, device_) + array_np = numpy.zeros((5, 4), dtype=numpy.int64) + array_xp = xp.asarray(array_np) + _fill_or_add_to_diagonal(array_xp, value=1, xp=xp, add_value=False, wrap=wrap) + numpy.fill_diagonal(array_np, val=1, wrap=wrap) + assert_array_equal(_convert_to_numpy(array_xp, xp=xp), array_np) From b0f86e797190d1fd62331eac8b88a5b0dfa522b9 Mon Sep 17 00:00:00 2001 From: Nithish Bolleddula Date: Fri, 12 Jul 2024 01:44:26 -0700 Subject: [PATCH 08/35] MAINT Remove scipy<1.6 specific code (#29461) --- .../linear_model/plot_quantile_regression.py | 11 +--- sklearn/kernel_approximation.py | 6 +- sklearn/linear_model/_quantile.py | 29 +++------ sklearn/linear_model/tests/test_quantile.py | 62 ++++--------------- .../tests/test_precision_recall_display.py | 2 +- .../_plot/tests/test_roc_curve_display.py | 2 +- sklearn/metrics/_ranking.py | 2 +- sklearn/metrics/tests/test_pairwise.py | 35 +---------- sklearn/tests/test_docstring_parameters.py | 5 -- sklearn/utils/estimator_checks.py | 7 +-- sklearn/utils/fixes.py | 7 --- sklearn/utils/optimize.py | 7 +-- 12 files changed, 34 insertions(+), 141 deletions(-) diff --git a/examples/linear_model/plot_quantile_regression.py b/examples/linear_model/plot_quantile_regression.py index a14755173d1f7..61fd3f1c91804 100644 --- a/examples/linear_model/plot_quantile_regression.py +++ b/examples/linear_model/plot_quantile_regression.py @@ -109,11 +109,6 @@ # # We will use the quantiles at 5% and 95% to find the outliers in the training # sample beyond the central 90% interval. -from sklearn.utils.fixes import parse_version, sp_version - -# This is line is to avoid incompatibility if older SciPy version. -# You should use `solver="highs"` with recent version of SciPy. 
-solver = "highs" if sp_version >= parse_version("1.6.0") else "interior-point" # %% from sklearn.linear_model import QuantileRegressor @@ -122,7 +117,7 @@ predictions = {} out_bounds_predictions = np.zeros_like(y_true_mean, dtype=np.bool_) for quantile in quantiles: - qr = QuantileRegressor(quantile=quantile, alpha=0, solver=solver) + qr = QuantileRegressor(quantile=quantile, alpha=0) y_pred = qr.fit(X, y_normal).predict(X) predictions[quantile] = y_pred @@ -184,7 +179,7 @@ predictions = {} out_bounds_predictions = np.zeros_like(y_true_mean, dtype=np.bool_) for quantile in quantiles: - qr = QuantileRegressor(quantile=quantile, alpha=0, solver=solver) + qr = QuantileRegressor(quantile=quantile, alpha=0) y_pred = qr.fit(X, y_pareto).predict(X) predictions[quantile] = y_pred @@ -254,7 +249,7 @@ from sklearn.metrics import mean_absolute_error, mean_squared_error linear_regression = LinearRegression() -quantile_regression = QuantileRegressor(quantile=0.5, alpha=0, solver=solver) +quantile_regression = QuantileRegressor(quantile=0.5, alpha=0) y_pred_lr = linear_regression.fit(X, y_pareto).predict(X) y_pred_qr = quantile_regression.fit(X, y_pareto).predict(X) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index fb687dd85f229..2c1981295dffa 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -8,13 +8,9 @@ import numpy as np import scipy.sparse as sp +from scipy.fft import fft, ifft from scipy.linalg import svd -try: - from scipy.fft import fft, ifft -except ImportError: # scipy < 1.4 - from scipy.fftpack import fft, ifft - from .base import ( BaseEstimator, ClassNamePrefixFeaturesOutMixin, diff --git a/sklearn/linear_model/_quantile.py b/sklearn/linear_model/_quantile.py index 8a3365bfc7a51..79d2e6b67ca5e 100644 --- a/sklearn/linear_model/_quantile.py +++ b/sklearn/linear_model/_quantile.py @@ -47,7 +47,7 @@ class QuantileRegressor(LinearModel, RegressorMixin, BaseEstimator): Method used by :func:`scipy.optimize.linprog` to solve the linear programming formulation. - From `scipy>=1.6.0`, it is recommended to use the highs methods because + It is recommended to use the highs methods because they are the fastest ones. Solvers "highs-ds", "highs-ipm" and "highs" support sparse input data and, in fact, always convert to sparse csc. @@ -100,8 +100,7 @@ class QuantileRegressor(LinearModel, RegressorMixin, BaseEstimator): >>> X = rng.randn(n_samples, n_features) >>> # the two following lines are optional in practice >>> from sklearn.utils.fixes import sp_version, parse_version - >>> solver = "highs" if sp_version >= parse_version("1.6.0") else "interior-point" - >>> reg = QuantileRegressor(quantile=0.8, solver=solver).fit(X, y) + >>> reg = QuantileRegressor(quantile=0.8).fit(X, y) >>> np.mean(y <= reg.predict(X)) 0.8 """ @@ -180,30 +179,18 @@ def fit(self, X, y, sample_weight=None): # So we rescale the penalty term, which is equivalent. alpha = np.sum(sample_weight) * self.alpha - if self.solver in ( - "highs-ds", - "highs-ipm", - "highs", - ) and sp_version < parse_version("1.6.0"): + if self.solver == "interior-point" and sp_version >= parse_version("1.11.0"): raise ValueError( - f"Solver {self.solver} is only available " - f"with scipy>=1.6.0, got {sp_version}" - ) - else: - solver = self.solver - - if solver == "interior-point" and sp_version >= parse_version("1.11.0"): - raise ValueError( - f"Solver {solver} is not anymore available in SciPy >= 1.11.0." + f"Solver {self.solver} is not anymore available in SciPy >= 1.11.0." 
) - if sparse.issparse(X) and solver not in ["highs", "highs-ds", "highs-ipm"]: + if sparse.issparse(X) and self.solver not in ["highs", "highs-ds", "highs-ipm"]: raise ValueError( f"Solver {self.solver} does not support sparse X. " "Use solver 'highs' for example." ) # make default solver more stable - if self.solver_options is None and solver == "interior-point": + if self.solver_options is None and self.solver == "interior-point": solver_options = {"lstsq": True} else: solver_options = self.solver_options @@ -246,7 +233,7 @@ def fit(self, X, y, sample_weight=None): c[0] = 0 c[n_params] = 0 - if solver in ["highs", "highs-ds", "highs-ipm"]: + if self.solver in ["highs", "highs-ds", "highs-ipm"]: # Note that highs methods always use a sparse CSC memory layout internally, # even for optimization problems parametrized using dense numpy arrays. # Therefore, we work with CSC matrices as early as possible to limit @@ -271,7 +258,7 @@ def fit(self, X, y, sample_weight=None): c=c, A_eq=A_eq, b_eq=b_eq, - method=solver, + method=self.solver, options=solver_options, ) solution = result.x diff --git a/sklearn/linear_model/tests/test_quantile.py b/sklearn/linear_model/tests/test_quantile.py index 971c849dfac1d..da96593de00f2 100644 --- a/sklearn/linear_model/tests/test_quantile.py +++ b/sklearn/linear_model/tests/test_quantile.py @@ -26,11 +26,6 @@ def X_y_data(): return X, y -@pytest.fixture -def default_solver(): - return "highs" if sp_version >= parse_version("1.6.0") else "interior-point" - - @pytest.mark.skipif( parse_version(sp_version.base_version) >= parse_version("1.11"), reason="interior-point solver is not available in SciPy 1.11", @@ -47,18 +42,6 @@ def test_incompatible_solver_for_sparse_input(X_y_data, solver, csc_container): QuantileRegressor(solver=solver).fit(X_sparse, y) -@pytest.mark.parametrize("solver", ("highs-ds", "highs-ipm", "highs")) -@pytest.mark.skipif( - sp_version >= parse_version("1.6.0"), - reason="Solvers are available as of scipy 1.6.0", -) -def test_too_new_solver_methods_raise_error(X_y_data, solver): - """Test that highs solver raises for scipy<1.6.0.""" - X, y = X_y_data - with pytest.raises(ValueError, match="scipy>=1.6.0"): - QuantileRegressor(solver=solver).fit(X, y) - - @pytest.mark.parametrize( "quantile, alpha, intercept, coef", [ @@ -74,13 +57,11 @@ def test_too_new_solver_methods_raise_error(X_y_data, solver): [0.5, 100, 2, 0], ], ) -def test_quantile_toy_example(quantile, alpha, intercept, coef, default_solver): +def test_quantile_toy_example(quantile, alpha, intercept, coef): # test how different parameters affect a small intuitive example X = [[0], [1], [1]] y = [1, 2, 11] - model = QuantileRegressor( - quantile=quantile, alpha=alpha, solver=default_solver - ).fit(X, y) + model = QuantileRegressor(quantile=quantile, alpha=alpha).fit(X, y) assert_allclose(model.intercept_, intercept, atol=1e-2) if coef is not None: assert_allclose(model.coef_[0], coef, atol=1e-2) @@ -90,15 +71,13 @@ def test_quantile_toy_example(quantile, alpha, intercept, coef, default_solver): @pytest.mark.parametrize("fit_intercept", [True, False]) -def test_quantile_equals_huber_for_low_epsilon(fit_intercept, default_solver): +def test_quantile_equals_huber_for_low_epsilon(fit_intercept): X, y = make_regression(n_samples=100, n_features=20, random_state=0, noise=1.0) alpha = 1e-4 huber = HuberRegressor( epsilon=1 + 1e-4, alpha=alpha, fit_intercept=fit_intercept ).fit(X, y) - quant = QuantileRegressor( - alpha=alpha, fit_intercept=fit_intercept, solver=default_solver - ).fit(X, y) + 
quant = QuantileRegressor(alpha=alpha, fit_intercept=fit_intercept).fit(X, y) assert_allclose(huber.coef_, quant.coef_, atol=1e-1) if fit_intercept: assert huber.intercept_ == approx(quant.intercept_, abs=1e-1) @@ -107,18 +86,14 @@ def test_quantile_equals_huber_for_low_epsilon(fit_intercept, default_solver): @pytest.mark.parametrize("q", [0.5, 0.9, 0.05]) -def test_quantile_estimates_calibration(q, default_solver): +def test_quantile_estimates_calibration(q): # Test that model estimates percentage of points below the prediction X, y = make_regression(n_samples=1000, n_features=20, random_state=0, noise=1.0) - quant = QuantileRegressor( - quantile=q, - alpha=0, - solver=default_solver, - ).fit(X, y) + quant = QuantileRegressor(quantile=q, alpha=0).fit(X, y) assert np.mean(y < quant.predict(X)) == approx(q, abs=1e-2) -def test_quantile_sample_weight(default_solver): +def test_quantile_sample_weight(): # test that with unequal sample weights we still estimate weighted fraction n = 1000 X, y = make_regression(n_samples=n, n_features=5, random_state=0, noise=10.0) @@ -126,7 +101,7 @@ def test_quantile_sample_weight(default_solver): # when we increase weight of upper observations, # estimate of quantile should go up weight[y > y.mean()] = 100 - quant = QuantileRegressor(quantile=0.5, alpha=1e-8, solver=default_solver) + quant = QuantileRegressor(quantile=0.5, alpha=1e-8) quant.fit(X, y, sample_weight=weight) fraction_below = np.mean(y < quant.predict(X)) assert fraction_below > 0.5 @@ -134,12 +109,8 @@ def test_quantile_sample_weight(default_solver): assert weighted_fraction_below == approx(0.5, abs=3e-2) -@pytest.mark.skipif( - sp_version < parse_version("1.6.0"), - reason="The `highs` solver is available from the 1.6.0 scipy version", -) @pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8]) -def test_asymmetric_error(quantile, default_solver): +def test_asymmetric_error(quantile): """Test quantile regression for asymmetric distributed targets.""" n_samples = 1000 rng = np.random.RandomState(42) @@ -164,7 +135,6 @@ def test_asymmetric_error(quantile, default_solver): model = QuantileRegressor( quantile=quantile, alpha=0, - solver=default_solver, ).fit(X, y) # This test can be made to pass with any solver but in the interest # of sparing continuous integration resources, the test is performed @@ -199,7 +169,7 @@ def func(coef): @pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8]) -def test_equivariance(quantile, default_solver): +def test_equivariance(quantile): """Test equivariace of quantile regression. See Koenker (2005) Quantile Regression, Chapter 2.2.3. 
@@ -216,7 +186,7 @@ def test_equivariance(quantile, default_solver): ) # make y asymmetric y += rng.exponential(scale=100, size=y.shape) - params = dict(alpha=0, solver=default_solver) + params = dict(alpha=0) model1 = QuantileRegressor(quantile=quantile, **params).fit(X, y) # coef(q; a*y, X) = a * coef(q; y, X) @@ -264,23 +234,17 @@ def test_linprog_failure(): @skip_if_32bit -@pytest.mark.skipif( - sp_version <= parse_version("1.6.0"), - reason="Solvers are available as of scipy 1.6.0", -) @pytest.mark.parametrize( "sparse_container", CSC_CONTAINERS + CSR_CONTAINERS + COO_CONTAINERS ) @pytest.mark.parametrize("solver", ["highs", "highs-ds", "highs-ipm"]) @pytest.mark.parametrize("fit_intercept", [True, False]) -def test_sparse_input(sparse_container, solver, fit_intercept, default_solver): +def test_sparse_input(sparse_container, solver, fit_intercept): """Test that sparse and dense X give same results.""" X, y = make_regression(n_samples=100, n_features=20, random_state=1, noise=1.0) X_sparse = sparse_container(X) alpha = 1e-4 - quant_dense = QuantileRegressor( - alpha=alpha, fit_intercept=fit_intercept, solver=default_solver - ).fit(X, y) + quant_dense = QuantileRegressor(alpha=alpha, fit_intercept=fit_intercept).fit(X, y) quant_sparse = QuantileRegressor( alpha=alpha, fit_intercept=fit_intercept, solver=solver ).fit(X_sparse, y) diff --git a/sklearn/metrics/_plot/tests/test_precision_recall_display.py b/sklearn/metrics/_plot/tests/test_precision_recall_display.py index 0173e5338d722..1a5a3f70545a3 100644 --- a/sklearn/metrics/_plot/tests/test_precision_recall_display.py +++ b/sklearn/metrics/_plot/tests/test_precision_recall_display.py @@ -2,6 +2,7 @@ import numpy as np import pytest +from scipy.integrate import trapezoid from sklearn.compose import make_column_transformer from sklearn.datasets import load_breast_cancer, make_classification @@ -16,7 +17,6 @@ from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler from sklearn.utils import shuffle -from sklearn.utils.fixes import trapezoid # TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved pytestmark = pytest.mark.filterwarnings( diff --git a/sklearn/metrics/_plot/tests/test_roc_curve_display.py b/sklearn/metrics/_plot/tests/test_roc_curve_display.py index 8fd9f96576518..a4f4d81fb9ded 100644 --- a/sklearn/metrics/_plot/tests/test_roc_curve_display.py +++ b/sklearn/metrics/_plot/tests/test_roc_curve_display.py @@ -1,6 +1,7 @@ import numpy as np import pytest from numpy.testing import assert_allclose +from scipy.integrate import trapezoid from sklearn.compose import make_column_transformer from sklearn.datasets import load_breast_cancer, load_iris @@ -11,7 +12,6 @@ from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler from sklearn.utils import shuffle -from sklearn.utils.fixes import trapezoid @pytest.fixture(scope="module") diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index a0c05eacf32ad..4bc18ed72e3b3 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -16,6 +16,7 @@ from numbers import Integral, Real import numpy as np +from scipy.integrate import trapezoid from scipy.sparse import csr_matrix, issparse from scipy.stats import rankdata @@ -30,7 +31,6 @@ from ..utils._encode import _encode, _unique from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params from ..utils.extmath import stable_cumsum -from ..utils.fixes import trapezoid from ..utils.multiclass import 
type_of_target from ..utils.sparsefuncs import count_nonzero from ..utils.validation import _check_pos_label_consistency, _check_sample_weight diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 03d22e0f6d344..6408dc6ba82ea 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -2,6 +2,7 @@ from types import GeneratorType import numpy as np +import pytest from numpy import linalg from scipy.sparse import issparse from scipy.spatial.distance import ( @@ -13,15 +14,6 @@ squareform, ) -try: - from scipy.spatial.distance import wminkowski -except ImportError: - # In scipy 1.6.0, wminkowski is deprecated and minkowski - # should be used instead. - from scipy.spatial.distance import minkowski as wminkowski - -import pytest - from sklearn import config_context from sklearn.exceptions import DataConversionWarning from sklearn.metrics.pairwise import ( @@ -68,8 +60,6 @@ CSC_CONTAINERS, CSR_CONTAINERS, DOK_CONTAINERS, - parse_version, - sp_version, ) from sklearn.utils.parallel import Parallel, delayed @@ -299,7 +289,6 @@ def test_pairwise_precomputed_non_negative(): _minkowski_kwds = {"w": np.arange(1, 5).astype("double", copy=False), "p": 1} -_wminkowski_kwds = {"w": np.arange(1, 5).astype("double", copy=False), "p": 1} def callable_rbf_kernel(x, y, **kwds): @@ -313,34 +302,16 @@ def callable_rbf_kernel(x, y, **kwds): "func, metric, kwds", [ (pairwise_distances, "euclidean", {}), - pytest.param( + ( pairwise_distances, minkowski, _minkowski_kwds, ), - pytest.param( + ( pairwise_distances, "minkowski", _minkowski_kwds, ), - pytest.param( - pairwise_distances, - wminkowski, - _wminkowski_kwds, - marks=pytest.mark.skipif( - sp_version >= parse_version("1.6.0"), - reason="wminkowski is now minkowski and it has been already tested.", - ), - ), - pytest.param( - pairwise_distances, - "wminkowski", - _wminkowski_kwds, - marks=pytest.mark.skipif( - sp_version >= parse_version("1.6.0"), - reason="wminkowski is now minkowski and it has been already tested.", - ), - ), (pairwise_kernels, "polynomial", {"degree": 1}), (pairwise_kernels, callable_rbf_kernel, {"gamma": 0.1}), ], diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 420fbd8a8d7ea..0c3b0e367923a 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -33,7 +33,6 @@ _enforce_estimator_tags_X, _enforce_estimator_tags_y, ) -from sklearn.utils.fixes import parse_version, sp_version # walk_packages() ignores DeprecationWarnings, now we need to ignore # FutureWarnings @@ -228,10 +227,6 @@ def test_fit_docstring_attributes(name, Estimator): if Estimator.__name__ in ("NMF", "MiniBatchNMF"): est.set_params(n_components="auto") - if Estimator.__name__ == "QuantileRegressor": - solver = "highs" if sp_version >= parse_version("1.6.0") else "interior-point" - est.set_params(solver=solver) - # Low max iter to speed up tests: we are only interested in checking the existence # of fitted attributes. This should be invariant to whether it has converged or not. 
if "max_iter" in est.get_params(): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index d1fe1d5ab8c56..422a23bb5ef72 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -81,7 +81,7 @@ raises, set_random_state, ) -from .fixes import SPARSE_ARRAY_PRESENT, parse_version, sp_version +from .fixes import SPARSE_ARRAY_PRESENT from .validation import _num_samples, check_is_fitted, has_fit_parameter REGRESSION_DATASET = None @@ -776,11 +776,6 @@ def _set_checking_parameters(estimator): if name == "OneHotEncoder": estimator.set_params(handle_unknown="ignore") - if name == "QuantileRegressor": - # Avoid warning due to Scipy deprecating interior-point solver - solver = "highs" if sp_version >= parse_version("1.6.0") else "interior-point" - estimator.set_params(solver=solver) - if name in CROSS_DECOMPOSITION: estimator.set_params(n_components=1) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index fd64db3d8ba1b..bbf0831329fa5 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -221,13 +221,6 @@ def _sparse_nan_min_max(X, axis): from numpy import ComplexWarning, VisibleDeprecationWarning # type: ignore # noqa -# TODO: Remove when Scipy 1.6 is the minimum supported version -try: - from scipy.integrate import trapezoid # type: ignore # noqa -except ImportError: - from scipy.integrate import trapz as trapezoid # type: ignore # noqa - - # TODO: Adapt when Pandas > 2.2 is the minimum supported version def pd_fillna(pd, frame): pd_version = parse_version(pd.__version__).base_version diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index ac91110651d94..980271a572b9a 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -352,11 +352,8 @@ def _check_optimize_result(solver, result, max_iter=None, extra_warning_msg=None # handle both scipy and scikit-learn solver names if solver == "lbfgs": if result.status != 0: - try: - # The message is already decoded in scipy>=1.6.0 - result_message = result.message.decode("latin1") - except AttributeError: - result_message = result.message + result_message = result.message + warning_msg = ( "{} failed to converge (status={}):\n{}.\n\n" "Increase the number of iterations (max_iter) " From 409d187aeb7224374c1190239e189f8a1f3fb4a9 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Fri, 12 Jul 2024 11:52:18 +0200 Subject: [PATCH 09/35] CI Move label removal to a separate workflow (#29456) --- .../{cuda-gpu-ci.yml => cuda-ci.yml} | 7 ------ .github/workflows/cuda-label-remover.yml | 23 +++++++++++++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) rename .github/workflows/{cuda-gpu-ci.yml => cuda-ci.yml} (88%) create mode 100644 .github/workflows/cuda-label-remover.yml diff --git a/.github/workflows/cuda-gpu-ci.yml b/.github/workflows/cuda-ci.yml similarity index 88% rename from .github/workflows/cuda-gpu-ci.yml rename to .github/workflows/cuda-ci.yml index 802293f07189e..9124df6a57ad6 100644 --- a/.github/workflows/cuda-gpu-ci.yml +++ b/.github/workflows/cuda-ci.yml @@ -7,10 +7,6 @@ on: types: - labeled -# In order to remove the "CUDA CI" label we need to have write permissions for PRs -permissions: - pull-requests: write - jobs: tests: if: contains(github.event.pull_request.labels.*.name, 'CUDA CI') @@ -21,9 +17,6 @@ jobs: timeout-minutes: 20 name: Run Array API unit tests steps: - - uses: actions-ecosystem/action-remove-labels@v1 - with: - labels: CUDA CI - uses: actions/setup-python@v5 with: # XXX: The 3.12.4 release of Python on 
GitHub Actions is corrupted: diff --git a/.github/workflows/cuda-label-remover.yml b/.github/workflows/cuda-label-remover.yml new file mode 100644 index 0000000000000..f6a65a2c07d78 --- /dev/null +++ b/.github/workflows/cuda-label-remover.yml @@ -0,0 +1,23 @@ +name: Remove "CUDA CI" Label + +# This workflow removes the "CUDA CI" label that triggers the actual +# CUDA CI. It is separate so that we can use the `pull_request_target` +# trigger which has a API token with write access. +on: + pull_request_target: + types: + - labeled + +# In order to remove the "CUDA CI" label we need to have write permissions for PRs +permissions: + pull-requests: write + +jobs: + label-remover: + if: contains(github.event.pull_request.labels.*.name, 'CUDA CI') + name: Remove "CUDA CI" Label + runs-on: ubuntu-20.04 + steps: + - uses: actions-ecosystem/action-remove-labels@v1 + with: + labels: CUDA CI From 6bc7bc033cd9e37d93d0a5467e5f7d453aa135c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Dock=C3=A8s?= Date: Fri, 12 Jul 2024 14:42:35 +0200 Subject: [PATCH 10/35] BLD Make the version dynamic in pyproject.toml (#29399) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Loïc Estève --- doc/developers/maintainer.rst | 14 +++++++------- pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/developers/maintainer.rst b/doc/developers/maintainer.rst index ffc9b73156fa8..c38da4c68dcd1 100644 --- a/doc/developers/maintainer.rst +++ b/doc/developers/maintainer.rst @@ -105,12 +105,12 @@ This PR will be used to push commits related to the release as explained in :ref:`making_a_release`. You can also create a second PR from main and targeting main to increment the -``__version__`` variable in `sklearn/__init__.py` and in `pyproject.toml` to increment -the dev version. This means while we're in the release candidate period, the latest -stable is two versions behind the main branch, instead of one. In this PR targeting -main you should also include a new file for the matching version under the -``doc/whats_new/`` folder so PRs that target the next version can contribute their -changelog entries to this file in parallel to the release process. +``__version__`` variable in `sklearn/__init__.py` to increment the dev version. +This means while we're in the release candidate period, the latest stable is +two versions behind the main branch, instead of one. In this PR targeting main +you should also include a new file for the matching version under the +``doc/whats_new/`` folder so PRs that target the next version can contribute +their changelog entries to this file in parallel to the release process. Minor version release (also known as bug-fix release) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -211,7 +211,7 @@ Making a release enough) and to update the on-going development entry. 2. On the branch for releasing, update the version number in ``sklearn/__init__.py``, - the ``__version__`` variable, and in `pyproject.toml`. + the ``__version__`` variable. For major releases, please add a 0 at the end: `0.99.0` instead of `0.99`. 
diff --git a/pyproject.toml b/pyproject.toml index ff7df45c1d843..1b613ae561b27 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scikit-learn" -version = "1.6.dev0" +dynamic = ["version"] description = "A set of python modules for machine learning and data mining" readme = "README.rst" maintainers = [ From 97c3f3a505547dd775eae552f1b4b5e8cb9dc4bc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 12 Jul 2024 14:49:53 +0200 Subject: [PATCH 11/35] DOC improve rendering of items in LDA (#29474) --- sklearn/decomposition/_lda.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 4f91483a468a9..37b425a727a88 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -194,15 +194,14 @@ class LatentDirichletAllocation( In general, if the data size is large, the online update will be much faster than the batch update. - Valid options:: - - 'batch': Batch variational Bayes method. Use all training data in - each EM update. - Old `components_` will be overwritten in each iteration. - 'online': Online variational Bayes method. In each EM update, use - mini-batch of training data to update the ``components_`` - variable incrementally. The learning rate is controlled by the - ``learning_decay`` and the ``learning_offset`` parameters. + Valid options: + + - 'batch': Batch variational Bayes method. Use all training data in each EM + update. Old `components_` will be overwritten in each iteration. + - 'online': Online variational Bayes method. In each EM update, use mini-batch + of training data to update the ``components_`` variable incrementally. The + learning rate is controlled by the ``learning_decay`` and the + ``learning_offset`` parameters. .. versionchanged:: 0.20 The default learning method is now ``"batch"``. 
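For reference, a minimal usage sketch of the two learning methods documented in the docstring hunk above. The toy corpus and parameter values are illustrative; only the estimator parameters named in the docstring (``learning_method``, ``learning_decay``, ``learning_offset``) are taken from it::

    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.feature_extraction.text import CountVectorizer

    docs = [
        "the cat sat on the mat",
        "dogs and cats are pets",
        "stock markets rise and fall",
        "interest rates affect markets",
    ]
    X = CountVectorizer().fit_transform(docs)

    # Batch variational Bayes: every EM update uses the full training set and
    # overwrites components_.
    lda_batch = LatentDirichletAllocation(
        n_components=2, learning_method="batch", max_iter=10, random_state=0
    ).fit(X)

    # Online variational Bayes: incremental mini-batch updates whose step size
    # is controlled by learning_decay and learning_offset.
    lda_online = LatentDirichletAllocation(
        n_components=2,
        learning_method="online",
        learning_decay=0.7,
        learning_offset=10.0,
        batch_size=2,
        random_state=0,
    )
    lda_online.partial_fit(X)             # can be called repeatedly on new mini-batches
    doc_topics = lda_online.transform(X)  # shape: (n_documents, n_components)

The online variant is the natural choice when the corpus does not fit in memory, while ``"batch"`` re-estimates the topics from the full data at each iteration.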
From cc97b80fd0836a7123cbcf3893152eccd643f448 Mon Sep 17 00:00:00 2001 From: Robert Pollak Date: Fri, 12 Jul 2024 15:11:57 +0200 Subject: [PATCH 12/35] Cleanup obsolete code from example (#29478) --- .../inspection/plot_permutation_importance_multicollinear.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/inspection/plot_permutation_importance_multicollinear.py b/examples/inspection/plot_permutation_importance_multicollinear.py index a8fe52b1565d9..1ecf95d9d61d8 100644 --- a/examples/inspection/plot_permutation_importance_multicollinear.py +++ b/examples/inspection/plot_permutation_importance_multicollinear.py @@ -66,7 +66,6 @@ def plot_permutation_importance(clf, X, y, ax): mdi_importances = pd.Series(clf.feature_importances_, index=X_train.columns) tree_importance_sorted_idx = np.argsort(clf.feature_importances_) -tree_indices = np.arange(0, len(clf.feature_importances_)) + 0.5 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8)) mdi_importances.sort_values().plot.barh(ax=ax1) From 1813b4a8a7bcc94e59a66401f754134c20c7288e Mon Sep 17 00:00:00 2001 From: EmilyXinyi <52259856+EmilyXinyi@users.noreply.github.com> Date: Fri, 12 Jul 2024 09:24:47 -0400 Subject: [PATCH 13/35] array API support for cosine_distances (#29265) --- doc/modules/array_api.rst | 1 + doc/whats_new/v1.6.rst | 1 + sklearn/metrics/pairwise.py | 7 +++++-- sklearn/metrics/tests/test_common.py | 2 ++ sklearn/utils/_array_api.py | 13 +++++++++++++ 5 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst index 9afedeb7ccecb..48e99a2e4c57d 100644 --- a/doc/modules/array_api.rst +++ b/doc/modules/array_api.rst @@ -123,6 +123,7 @@ Metrics - :func:`sklearn.metrics.pairwise.additive_chi2_kernel` - :func:`sklearn.metrics.pairwise.chi2_kernel` - :func:`sklearn.metrics.pairwise.cosine_similarity` +- :func:`sklearn.metrics.pairwise.cosine_distances` - :func:`sklearn.metrics.pairwise.euclidean_distances` (see :ref:`device_support_for_float64`) - :func:`sklearn.metrics.pairwise.paired_cosine_distances` - :func:`sklearn.metrics.pairwise.rbf_kernel` (see :ref:`device_support_for_float64`) diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index 3971f60eb5f4b..4af519b278c80 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -43,6 +43,7 @@ See :ref:`array_api` for more details. - :func:`sklearn.metrics.pairwise.additive_chi2_kernel` :pr:`29144` by :user:`Yaroslav Korobko `; - :func:`sklearn.metrics.pairwise.chi2_kernel` :pr:`29267` by :user:`Yaroslav Korobko `; - :func:`sklearn.metrics.pairwise.cosine_similarity` :pr:`29014` by :user:`Edoardo Abati `; +- :func:`sklearn.metrics.pairwise.cosine_distances` :pr:`29265` by :user:`Emily Chen `; - :func:`sklearn.metrics.pairwise.euclidean_distances` :pr:`29433` by :user:`Omar Salman `; - :func:`sklearn.metrics.pairwise.paired_cosine_distances` :pr:`29112` by :user:`Edoardo Abati `; - :func:`sklearn.metrics.pairwise.rbf_kernel` :pr:`29433` by :user:`Omar Salman `. diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index b7db4d94c4f07..f8b163813d6d6 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -22,6 +22,7 @@ gen_even_slices, ) from ..utils._array_api import ( + _clip, _fill_or_add_to_diagonal, _find_matching_floating_dtype, _is_numpy_namespace, @@ -1139,15 +1140,17 @@ def cosine_distances(X, Y=None): array([[1. , 1. 
], [0.42..., 0.18...]]) """ + xp, _ = get_namespace(X, Y) + # 1.0 - cosine_similarity(X, Y) without copy S = cosine_similarity(X, Y) S *= -1 S += 1 - np.clip(S, 0, 2, out=S) + S = _clip(S, 0, 2, xp) if X is Y or Y is None: # Ensure that distances between vectors and themselves are set to 0.0. # This may not be the case due to floating point rounding errors. - np.fill_diagonal(S, 0.0) + _fill_or_add_to_diagonal(S, 0.0, xp, add_value=False) return S diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 14e96cc9fcd98..b93180aaafd87 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -54,6 +54,7 @@ from sklearn.metrics.pairwise import ( additive_chi2_kernel, chi2_kernel, + cosine_distances, cosine_similarity, euclidean_distances, paired_cosine_distances, @@ -2016,6 +2017,7 @@ def check_array_api_metric_pairwise(metric, array_namespace, device, dtype_name) mean_gamma_deviance: [check_array_api_regression_metric], max_error: [check_array_api_regression_metric], chi2_kernel: [check_array_api_metric_pairwise], + cosine_distances: [check_array_api_metric_pairwise], euclidean_distances: [check_array_api_metric_pairwise], rbf_kernel: [check_array_api_metric_pairwise], } diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 51caacb71c9e2..63c14386c04e9 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -791,6 +791,19 @@ def _nanmax(X, axis=None, xp=None): return X +def _clip(S, min_val, max_val, xp): + # TODO: remove this method and change all usage once we move to array api 2023.12 + # https://data-apis.org/array-api/2023.12/API_specification/generated/array_api.clip.html#clip + if _is_numpy_namespace(xp): + return numpy.clip(S, min_val, max_val) + else: + min_arr = xp.asarray(min_val, dtype=S.dtype) + max_arr = xp.asarray(max_val, dtype=S.dtype) + S = xp.where(S < min_arr, min_arr, S) + S = xp.where(S > max_arr, max_arr, S) + return S + + def _asarray_with_order( array, dtype=None, order=None, copy=None, *, xp=None, device=None ): From dc6c01c14433e7e86829f5073d8756661abc94d2 Mon Sep 17 00:00:00 2001 From: EmilyXinyi <52259856+EmilyXinyi@users.noreply.github.com> Date: Fri, 12 Jul 2024 10:18:35 -0400 Subject: [PATCH 14/35] array API support for mean_absolute_percentage_error (#29300) --- doc/modules/array_api.rst | 1 + doc/whats_new/v1.6.rst | 3 ++- sklearn/metrics/_regression.py | 19 ++++++++++++++----- sklearn/metrics/tests/test_common.py | 4 ++++ 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst index 48e99a2e4c57d..53411f87a5305 100644 --- a/doc/modules/array_api.rst +++ b/doc/modules/array_api.rst @@ -117,6 +117,7 @@ Metrics - :func:`sklearn.metrics.d2_tweedie_score` - :func:`sklearn.metrics.max_error` - :func:`sklearn.metrics.mean_absolute_error` +- :func:`sklearn.metrics.mean_absolute_percentage_error` - :func:`sklearn.metrics.mean_gamma_deviance` - :func:`sklearn.metrics.mean_squared_error` - :func:`sklearn.metrics.mean_tweedie_deviance` diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index 4af519b278c80..0024d979eeb19 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -37,7 +37,8 @@ See :ref:`array_api` for more details. 
- :func:`sklearn.metrics.max_error` :pr:`29212` by :user:`Edoardo Abati `; - :func:`sklearn.metrics.mean_absolute_error` :pr:`27736` by :user:`Edoardo Abati ` and :pr:`29143` by :user:`Tialo ` and :user:`Loïc Estève `; -- :func:`sklearn.metrics.mean_gamma_deviance` :pr:`29239` by :usser:`Emily Chen `; +- :func:`sklearn.metrics.mean_absolute_percentage_error` :pr:`29300` by :user:`Emily Chen `; +- :func:`sklearn.metrics.mean_gamma_deviance` :pr:`29239` by :user:`Emily Chen `; - :func:`sklearn.metrics.mean_squared_error` :pr:`29142` by :user:`Yaroslav Korobko `; - :func:`sklearn.metrics.mean_tweedie_deviance` :pr:`28106` by :user:`Thomas Li `; - :func:`sklearn.metrics.pairwise.additive_chi2_kernel` :pr:`29144` by :user:`Yaroslav Korobko `; diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 36a4638718118..482d5dc260b31 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -395,21 +395,30 @@ def mean_absolute_percentage_error( >>> mean_absolute_percentage_error(y_true, y_pred) 112589990684262.48 """ + input_arrays = [y_true, y_pred, sample_weight, multioutput] + xp, _ = get_namespace(*input_arrays) + dtype = _find_matching_floating_dtype(y_true, y_pred, sample_weight, xp=xp) + y_type, y_true, y_pred, multioutput = _check_reg_targets( y_true, y_pred, multioutput ) check_consistent_length(y_true, y_pred, sample_weight) - epsilon = np.finfo(np.float64).eps - mape = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), epsilon) - output_errors = np.average(mape, weights=sample_weight, axis=0) + epsilon = xp.asarray(xp.finfo(xp.float64).eps, dtype=dtype) + y_true_abs = xp.asarray(xp.abs(y_true), dtype=dtype) + mape = xp.asarray(xp.abs(y_pred - y_true), dtype=dtype) / xp.maximum( + y_true_abs, epsilon + ) + output_errors = _average(mape, weights=sample_weight, axis=0) if isinstance(multioutput, str): if multioutput == "raw_values": return output_errors elif multioutput == "uniform_average": - # pass None as weights to np.average: uniform mean + # pass None as weights to _average: uniform mean multioutput = None - return np.average(output_errors, weights=multioutput) + mean_absolute_percentage_error = _average(output_errors, weights=multioutput) + assert mean_absolute_percentage_error.shape == () + return float(mean_absolute_percentage_error) @validate_params( diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index b93180aaafd87..b7fa3319b118c 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -2016,6 +2016,10 @@ def check_array_api_metric_pairwise(metric, array_namespace, device, dtype_name) additive_chi2_kernel: [check_array_api_metric_pairwise], mean_gamma_deviance: [check_array_api_regression_metric], max_error: [check_array_api_regression_metric], + mean_absolute_percentage_error: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], chi2_kernel: [check_array_api_metric_pairwise], cosine_distances: [check_array_api_metric_pairwise], euclidean_distances: [check_array_api_metric_pairwise], From 13a179113a61573c45b3e657f1f74f9b106b3f7f Mon Sep 17 00:00:00 2001 From: Nithish Bolleddula Date: Sun, 14 Jul 2024 22:52:31 -0700 Subject: [PATCH 15/35] EXA Use tick_labels in boxplot for matplotlib>=3.9 (#29471) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Loïc Estève --- .../plot_gradient_boosting_regression.py | 19 +++++++++++++++---- 
...t_permutation_importance_multicollinear.py | 17 +++++++++++++---- .../plot_release_highlights_0_22_0.py | 15 +++++++++++++-- 3 files changed, 41 insertions(+), 10 deletions(-) diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py index bdcc6cca66996..68a50b7a27492 100644 --- a/examples/ensemble/plot_gradient_boosting_regression.py +++ b/examples/ensemble/plot_gradient_boosting_regression.py @@ -21,6 +21,7 @@ # Authors: The scikit-learn developers # SPDX-License-Identifier: BSD-3-Clause +import matplotlib import matplotlib.pyplot as plt import numpy as np @@ -28,6 +29,7 @@ from sklearn.inspection import permutation_importance from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split +from sklearn.utils.fixes import parse_version # %% # Load the data @@ -145,11 +147,20 @@ ) sorted_idx = result.importances_mean.argsort() plt.subplot(1, 2, 2) -plt.boxplot( - result.importances[sorted_idx].T, - vert=False, - labels=np.array(diabetes.feature_names)[sorted_idx], + +# `labels` argument in boxplot is deprecated in matplotlib 3.9 and has been +# renamed to `tick_labels`. The following code handles this, but as a +# scikit-learn user you probably can write simpler code by using `labels=...` +# (matplotlib < 3.9) or `tick_labels=...` (matplotlib >= 3.9). +tick_labels_parameter_name = ( + "tick_labels" + if parse_version(matplotlib.__version__) >= parse_version("3.9") + else "labels" ) +tick_labels_dict = { + tick_labels_parameter_name: np.array(diabetes.feature_names)[sorted_idx] +} +plt.boxplot(result.importances[sorted_idx].T, vert=False, **tick_labels_dict) plt.title("Permutation Importance (test set)") fig.tight_layout() plt.show() diff --git a/examples/inspection/plot_permutation_importance_multicollinear.py b/examples/inspection/plot_permutation_importance_multicollinear.py index 1ecf95d9d61d8..d9faca695e24d 100644 --- a/examples/inspection/plot_permutation_importance_multicollinear.py +++ b/examples/inspection/plot_permutation_importance_multicollinear.py @@ -26,18 +26,27 @@ # ------------------------------------------------------ # # First, we define a function to ease the plotting: +import matplotlib + from sklearn.inspection import permutation_importance +from sklearn.utils.fixes import parse_version def plot_permutation_importance(clf, X, y, ax): result = permutation_importance(clf, X, y, n_repeats=10, random_state=42, n_jobs=2) perm_sorted_idx = result.importances_mean.argsort() - ax.boxplot( - result.importances[perm_sorted_idx].T, - vert=False, - labels=X.columns[perm_sorted_idx], + # `labels` argument in boxplot is deprecated in matplotlib 3.9 and has been + # renamed to `tick_labels`. The following code handles this, but as a + # scikit-learn user you probably can write simpler code by using `labels=...` + # (matplotlib < 3.9) or `tick_labels=...` (matplotlib >= 3.9). 
+ tick_labels_parameter_name = ( + "tick_labels" + if parse_version(matplotlib.__version__) >= parse_version("3.9") + else "labels" ) + tick_labels_dict = {tick_labels_parameter_name: X.columns[perm_sorted_idx]} + ax.boxplot(result.importances[perm_sorted_idx].T, vert=False, **tick_labels_dict) ax.axvline(x=0, color="k", linestyle="--") return ax diff --git a/examples/release_highlights/plot_release_highlights_0_22_0.py b/examples/release_highlights/plot_release_highlights_0_22_0.py index 2e4c9185365a9..03120c781140c 100644 --- a/examples/release_highlights/plot_release_highlights_0_22_0.py +++ b/examples/release_highlights/plot_release_highlights_0_22_0.py @@ -34,6 +34,7 @@ # `plot_confusion_matrix`. Read more about this new API in the # :ref:`User Guide `. +import matplotlib import matplotlib.pyplot as plt from sklearn.datasets import make_classification @@ -43,6 +44,7 @@ from sklearn.metrics import RocCurveDisplay from sklearn.model_selection import train_test_split from sklearn.svm import SVC +from sklearn.utils.fixes import parse_version X, y = make_classification(random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) @@ -117,9 +119,18 @@ fig, ax = plt.subplots() sorted_idx = result.importances_mean.argsort() -ax.boxplot( - result.importances[sorted_idx].T, vert=False, labels=feature_names[sorted_idx] + +# `labels` argument in boxplot is deprecated in matplotlib 3.9 and has been +# renamed to `tick_labels`. The following code handles this, but as a +# scikit-learn user you probably can write simpler code by using `labels=...` +# (matplotlib < 3.9) or `tick_labels=...` (matplotlib >= 3.9). +tick_labels_parameter_name = ( + "tick_labels" + if parse_version(matplotlib.__version__) >= parse_version("3.9") + else "labels" ) +tick_labels_dict = {tick_labels_parameter_name: feature_names[sorted_idx]} +ax.boxplot(result.importances[sorted_idx].T, vert=False, **tick_labels_dict) ax.set_title("Permutation Importance of each feature") ax.set_ylabel("Features") fig.tight_layout() From 7ac4f895f2e58c6461c1fb6ea05004ec6d6d329b Mon Sep 17 00:00:00 2001 From: m-maggi <124086916+m-maggi@users.noreply.github.com> Date: Mon, 15 Jul 2024 10:06:33 +0200 Subject: [PATCH 16/35] DOC Elaborate on the criterion used for poisson regression using decision trees. (#29230) --- doc/modules/tree.rst | 7 ++++--- sklearn/tree/_classes.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index 371cfccfffc1b..318dd79f00504 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -552,17 +552,18 @@ Mean Squared Error: H(Q_m) = \frac{1}{n_m} \sum_{y \in Q_m} (y - \bar{y}_m)^2 -Half Poisson deviance: +Mean Poisson deviance: .. math:: - H(Q_m) = \frac{1}{n_m} \sum_{y \in Q_m} (y \log\frac{y}{\bar{y}_m} + H(Q_m) = \frac{2}{n_m} \sum_{y \in Q_m} (y \log\frac{y}{\bar{y}_m} - y + \bar{y}_m) Setting `criterion="poisson"` might be a good choice if your target is a count or a frequency (count per some unit). In any case, :math:`y >= 0` is a necessary condition to use this criterion. Note that it fits much slower than -the MSE criterion. +the MSE criterion. For performance reasons the actual implementation minimizes +the half mean poisson deviance, i.e. the mean poisson deviance divided by 2. 
Mean Absolute Error: diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 61c572554b3b6..29352d080414d 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1098,7 +1098,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): mean squared error with Friedman's improvement score for potential splits, "absolute_error" for the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and "poisson" which - uses reduction in Poisson deviance to find splits. + uses reduction in the half mean Poisson deviance to find splits. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. From 40d6d54d219e63a141de2084ffbc30a5da480bd3 Mon Sep 17 00:00:00 2001 From: scikit-learn-bot Date: Mon, 15 Jul 2024 02:21:34 -0700 Subject: [PATCH 17/35] :lock: :robot: CI Update lock files for cirrus-arm CI build(s) :lock: :robot: (#29485) Co-authored-by: Lock file bot --- ...pymin_conda_forge_linux-aarch64_conda.lock | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/build_tools/cirrus/pymin_conda_forge_linux-aarch64_conda.lock b/build_tools/cirrus/pymin_conda_forge_linux-aarch64_conda.lock index 5de3fa818fd24..fadf5540acae1 100644 --- a/build_tools/cirrus/pymin_conda_forge_linux-aarch64_conda.lock +++ b/build_tools/cirrus/pymin_conda_forge_linux-aarch64_conda.lock @@ -8,7 +8,7 @@ https://conda.anaconda.org/conda-forge/linux-aarch64/python_abi-3.9-4_cp39.conda https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda#161081fc7cec0bfda0d86d7cb595f8d8 https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-2_kmp_llvm.tar.bz2#98a1185182fec3c434069fa74e6473d6 https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-ng-14.1.0-he277a41_0.conda#47ecd1292a3fd78b616640b35dd9632c -https://conda.anaconda.org/conda-forge/linux-aarch64/bzip2-1.0.8-h31becfc_5.conda#a64e35f01e0b7a2a152eca87d33b9c87 +https://conda.anaconda.org/conda-forge/linux-aarch64/bzip2-1.0.8-h68df207_7.conda#56398c28220513b9ea13d7b450acfb20 https://conda.anaconda.org/conda-forge/linux-aarch64/libbrotlicommon-1.1.0-h31becfc_1.conda#1b219fd801eddb7a94df5bd001053ad9 https://conda.anaconda.org/conda-forge/linux-aarch64/libdeflate-1.20-h31becfc_0.conda#018592a3d691662f451f89d0de474a20 https://conda.anaconda.org/conda-forge/linux-aarch64/libffi-3.4.2-h3557bc0_5.tar.bz2#dddd85f4d52121fab0a8b099c5e06501 @@ -34,35 +34,36 @@ https://conda.anaconda.org/conda-forge/linux-aarch64/libpng-1.6.43-h194ca79_0.co https://conda.anaconda.org/conda-forge/linux-aarch64/libsqlite-3.46.0-hf51ef55_0.conda#a8ae63fd6fb7d007f74ef3df95e5edf3 https://conda.anaconda.org/conda-forge/linux-aarch64/libxcb-1.16-h7935292_0.conda#93c0136e9cba96657339dfe25fba4da7 https://conda.anaconda.org/conda-forge/linux-aarch64/ninja-1.12.1-h70be974_0.conda#216635cea46498d8045c7cf0f03eaf72 +https://conda.anaconda.org/conda-forge/linux-aarch64/qhull-2020.2-h70be974_5.conda#bb138086d938e2b64f5f364945793ebf https://conda.anaconda.org/conda-forge/linux-aarch64/readline-8.2-h8fc344f_1.conda#105eb1e16bf83bfb2eb380a48032b655 https://conda.anaconda.org/conda-forge/linux-aarch64/tk-8.6.13-h194ca79_0.conda#f75105e0585851f818e0009dd1dde4dc https://conda.anaconda.org/conda-forge/linux-aarch64/zstd-1.5.6-h02f22dd_0.conda#be8d5f8cf21aed237b8b182ea86b3dd6 https://conda.anaconda.org/conda-forge/linux-aarch64/brotli-bin-1.1.0-h31becfc_1.conda#9e4a13596ab651ea8d77aae023d0ce3f 
https://conda.anaconda.org/conda-forge/linux-aarch64/freetype-2.12.1-hf0a5ef3_2.conda#a5ab74c5bd158c3d5532b66d8d83d907 https://conda.anaconda.org/conda-forge/linux-aarch64/libhiredis-1.0.2-h05efe27_0.tar.bz2#a87f068744fd20334cd41489eb163bee -https://conda.anaconda.org/conda-forge/linux-aarch64/libopenblas-0.3.27-pthreads_h5a5ec62_0.conda#ffecca8f4f31cd50b92c0e6e6bfe4416 +https://conda.anaconda.org/conda-forge/linux-aarch64/libopenblas-0.3.27-pthreads_h076ed1e_1.conda#cc0a15e3a6f92f454b6132ca6aca8e8d https://conda.anaconda.org/conda-forge/linux-aarch64/libtiff-4.6.0-hf980d43_3.conda#b6f3abf5726ae33094bee238b4eb492f https://conda.anaconda.org/conda-forge/linux-aarch64/llvm-openmp-18.1.8-hb063fc5_0.conda#f0cf07feda9ed87092833cd8fca012f5 https://conda.anaconda.org/conda-forge/linux-aarch64/python-3.9.19-h4ac3b42_0_cpython.conda#1501507cd9451472ec8900d587ce872f https://conda.anaconda.org/conda-forge/linux-aarch64/brotli-1.1.0-h31becfc_1.conda#e41f5862ac746428407f3fd44d2ed01f https://conda.anaconda.org/conda-forge/linux-aarch64/ccache-4.10.1-ha3bccff_0.conda#7cd24a038d2727b5e6377975237a6cfa -https://conda.anaconda.org/conda-forge/noarch/certifi-2024.6.2-pyhd8ed1ab_0.conda#8821ec1c8fcdc9e1d291d7b9f6e9968a +https://conda.anaconda.org/conda-forge/noarch/certifi-2024.7.4-pyhd8ed1ab_0.conda#24e7fd6ca65997938fff9e5ab6f653e4 https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_0.tar.bz2#3faab06a954c2a04039983f2c4a50d99 https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_0.conda#5cd86562580f274031ede6aa6aa24441 https://conda.anaconda.org/conda-forge/linux-aarch64/cython-3.0.10-py39h387a81e_0.conda#0e917a89f77c978d152099357bd75b22 -https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.2.0-pyhd8ed1ab_2.conda#8d652ea2ee8eaee02ed8dc820bc794aa +https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.2.2-pyhd8ed1ab_0.conda#d02ae936e42063ca46af6cdad2dbd1e0 https://conda.anaconda.org/conda-forge/noarch/execnet-2.1.1-pyhd8ed1ab_0.conda#15dda3cdbf330abfe9f555d22f66db46 https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_0.conda#f800d2da156d08e289b14e87e43c1ae5 https://conda.anaconda.org/conda-forge/linux-aarch64/kiwisolver-1.4.5-py39had2cf8c_1.conda#ddb99610f7b950fdd5ff2aff19136363 https://conda.anaconda.org/conda-forge/linux-aarch64/lcms2-2.16-h922389a_0.conda#ffdd8267a04c515e7ce69c727b051414 https://conda.anaconda.org/conda-forge/linux-aarch64/libblas-3.9.0-22_linuxaarch64_openblas.conda#068ab33f2382cda4dd0b72a715ad33b5 https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyh9f0ad1d_0.tar.bz2#2ba8498c1018c1e9c61eb99b973dfe19 -https://conda.anaconda.org/conda-forge/linux-aarch64/openblas-0.3.27-pthreads_h339cbfa_0.conda#cb06c34a3056f59e9e244c20836add8a +https://conda.anaconda.org/conda-forge/linux-aarch64/openblas-0.3.27-pthreads_hd33deab_1.conda#70c0aa7d1dd049fffae952bfe8f2c4e9 https://conda.anaconda.org/conda-forge/linux-aarch64/openjpeg-2.5.2-h0d9d63b_0.conda#fd2898519e839d5ceb778343f39a3176 https://conda.anaconda.org/conda-forge/noarch/packaging-24.1-pyhd8ed1ab_0.conda#cbe1bb1f21567018ce595d9c2be0f0db https://conda.anaconda.org/conda-forge/noarch/pluggy-1.5.0-pyhd8ed1ab_0.conda#d3483c8fc2dc2cc3f5cf43e26d60cabf https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.1.2-pyhd8ed1ab_0.conda#b9a4dacf97241704529131a0dfc0494f -https://conda.anaconda.org/conda-forge/noarch/setuptools-70.1.1-pyhd8ed1ab_0.conda#985e9e86e1b0fc75a74a9bfab9309ef7 
+https://conda.anaconda.org/conda-forge/noarch/setuptools-70.3.0-pyhd8ed1ab_0.conda#693bb57e8f92120caa956898065f3627 https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2#e5f25f8dbc060e9a8d912e432202afc2 https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.5.0-pyhc1e730c_0.conda#df68d78237980a159bd7149f33c0e8fd https://conda.anaconda.org/conda-forge/noarch/tomli-2.0.1-pyhd8ed1ab_0.tar.bz2#5844808ffab9ebdb694585b50ba02a96 @@ -75,7 +76,7 @@ https://conda.anaconda.org/conda-forge/noarch/importlib_resources-6.4.0-pyhd8ed1 https://conda.anaconda.org/conda-forge/noarch/joblib-1.4.2-pyhd8ed1ab_0.conda#25df261d4523d9f9783bcdb7208d872f https://conda.anaconda.org/conda-forge/linux-aarch64/libcblas-3.9.0-22_linuxaarch64_openblas.conda#fbe7fe553f2cc78a0311e009b26f180d https://conda.anaconda.org/conda-forge/linux-aarch64/liblapack-3.9.0-22_linuxaarch64_openblas.conda#8c709d281609792c39b1d5c0241f90f1 -https://conda.anaconda.org/conda-forge/noarch/meson-1.4.1-pyhd8ed1ab_0.conda#714ca123839eeebb25d12b443067ea64 +https://conda.anaconda.org/conda-forge/noarch/meson-1.5.0-pyhd8ed1ab_0.conda#9d971c5bf99aed063664d6650e7e7ed8 https://conda.anaconda.org/conda-forge/linux-aarch64/pillow-10.4.0-py39h4a8821f_0.conda#318861157594972acc05a8715d3018a8 https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda#f586ac1e56c8638b64f9c8122a7b8a67 https://conda.anaconda.org/conda-forge/noarch/pyproject-metadata-0.8.0-pyhd8ed1ab_0.conda#573fe09d7bd0cd4bcc210d8369b5ca47 @@ -90,5 +91,5 @@ https://conda.anaconda.org/conda-forge/linux-aarch64/blas-devel-3.9.0-22_linuxaa https://conda.anaconda.org/conda-forge/linux-aarch64/contourpy-1.2.1-py39hd16970a_0.conda#66b9718539ecdd38876b0176c315bcad https://conda.anaconda.org/conda-forge/linux-aarch64/scipy-1.13.1-py39hb921187_0.conda#1aac9080de661e03d286f18fb71e5240 https://conda.anaconda.org/conda-forge/linux-aarch64/blas-2.122-openblas.conda#65bc48b3bc85f8eeeab54311443a83aa -https://conda.anaconda.org/conda-forge/linux-aarch64/matplotlib-base-3.8.4-py39hf44f4b6_2.conda#fadf734d38ed608c9f0b5c91fe79cfb4 -https://conda.anaconda.org/conda-forge/linux-aarch64/matplotlib-3.8.4-py39ha65689a_2.conda#c0472e3c4b3f007de6d643317c30963b +https://conda.anaconda.org/conda-forge/linux-aarch64/matplotlib-base-3.9.1-py39hf3ba65a_0.conda#1bd99011ababf4f9d0976271b23b179f +https://conda.anaconda.org/conda-forge/linux-aarch64/matplotlib-3.9.1-py39ha65689a_0.conda#677115251fc0a0cd1665318573e2307a From 3b7879d3113a62a36a04e06832422e05fe061732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Mon, 15 Jul 2024 11:22:55 +0200 Subject: [PATCH 18/35] MAINT Remove MANIFEST.in that was a setuptools thing (#29482) --- MANIFEST.in | 36 ------------------------------------ 1 file changed, 36 deletions(-) delete mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 1596d4cd011df..0000000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,36 +0,0 @@ -include *.rst -include *.build -recursive-include sklearn *.build -recursive-include doc * -recursive-include examples * -recursive-include sklearn *.c *.cpp *.h *.pyx *.pxd *.pxi *.tp -recursive-include sklearn/datasets *.csv *.csv.gz *.rst *.jpg *.txt *.arff.gz *.json.gz -include COPYING -include README.rst -include pyproject.toml -include sklearn/externals/README -include sklearn/svm/src/liblinear/COPYRIGHT -include sklearn/svm/src/libsvm/LIBSVM_CHANGES -include conftest.py -include Makefile -include MANIFEST.in -include .coveragerc - -# exclude from sdist 
-recursive-exclude asv_benchmarks * -recursive-exclude benchmarks * -recursive-exclude build_tools * -recursive-exclude maint_tools * -recursive-exclude benchmarks * -recursive-exclude .binder * -recursive-exclude .circleci * -exclude .cirrus.star -exclude .codecov.yml -exclude .git-blame-ignore-revs -exclude .mailmap -exclude .pre-commit-config.yaml -exclude azure-pipelines.yml -exclude CODE_OF_CONDUCT.md -exclude CONTRIBUTING.md -exclude SECURITY.md -exclude PULL_REQUEST_TEMPLATE.md From 55c3125efb849fe654699b08b9664143dd5e4623 Mon Sep 17 00:00:00 2001 From: scikit-learn-bot Date: Mon, 15 Jul 2024 03:00:25 -0700 Subject: [PATCH 19/35] :lock: :robot: CI Update lock files for scipy-dev CI build(s) :lock: :robot: (#29484) Co-authored-by: Lock file bot --- .../azure/pylatest_pip_scipy_dev_linux-64_conda.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build_tools/azure/pylatest_pip_scipy_dev_linux-64_conda.lock b/build_tools/azure/pylatest_pip_scipy_dev_linux-64_conda.lock index 71d55df86a35d..0223829c4eac0 100644 --- a/build_tools/azure/pylatest_pip_scipy_dev_linux-64_conda.lock +++ b/build_tools/azure/pylatest_pip_scipy_dev_linux-64_conda.lock @@ -3,7 +3,7 @@ # input_hash: 8a4a203136d97ff3b2c8657fce2dd2228215bfbf9c1cfbe271e401f934bdf1a7 @EXPLICIT https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda#c3473ff8bdb3d124ed5ff11ec380d6f9 -https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2024.3.11-h06a4308_0.conda#08529eb3504712baabcbda266a19feb7 +https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2024.7.2-h06a4308_0.conda#5c6799c01e9be4c7ba294f6530b2d562 https://repo.anaconda.com/pkgs/main/linux-64/ld_impl_linux-64-2.38-h1181459_1.conda#68eedfd9c06f2b0e6888d8db345b7f5b https://repo.anaconda.com/pkgs/main/noarch/tzdata-2024a-h04d1e81_0.conda#452af53adae0a5b06eb5d05c707b2f25 https://repo.anaconda.com/pkgs/main/linux-64/libgomp-11.2.0-h1234567_1.conda#b372c0eea9b60732fdae4b817a63c8cd @@ -30,14 +30,14 @@ https://repo.anaconda.com/pkgs/main/linux-64/pip-24.0-py312h06a4308_0.conda#6d96 # pip babel @ https://files.pythonhosted.org/packages/27/45/377f7e32a5c93d94cd56542349b34efab5ca3f9e2fd5a68c5e93169aa32d/Babel-2.15.0-py3-none-any.whl#sha256=08706bdad8d0a3413266ab61bd6c34d0c28d6e1e7badf40a2cebe67644e2e1fb # pip certifi @ https://files.pythonhosted.org/packages/1c/d5/c84e1a17bf61d4df64ca866a1c9a913874b4e9bdc131ec689a0ad013fb36/certifi-2024.7.4-py3-none-any.whl#sha256=c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90 # pip charset-normalizer @ https://files.pythonhosted.org/packages/ee/fb/14d30eb4956408ee3ae09ad34299131fb383c47df355ddb428a7331cfa1e/charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b -# pip coverage @ https://files.pythonhosted.org/packages/88/52/7054710a881b09d295e93b9889ac204c241a6847a8c05555fc6e1d8799d5/coverage-7.5.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=5013ed890dc917cef2c9f765c4c6a8ae9df983cd60dbb635df8ed9f4ebc9f555 +# pip coverage @ https://files.pythonhosted.org/packages/f2/aa/0419103c357bfd95a65d7b2e2249f9f1d79194241c5e87819cd81d36b96c/coverage-7.6.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=0086cd4fc71b7d485ac93ca4239c8f75732c2ae3ba83f6be1c9be59d9e2c6382 # pip docutils @ 
https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl#sha256=dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2 # pip execnet @ https://files.pythonhosted.org/packages/43/09/2aea36ff60d16dd8879bdb2f5b3ee0ba8d08cbbdcdfe870e695ce3784385/execnet-2.1.1-py3-none-any.whl#sha256=26dee51f1b80cebd6d0ca8e74dd8745419761d3bef34163928cbebbdc4749fdc # pip idna @ https://files.pythonhosted.org/packages/e5/3e/741d8c82801c347547f8a2a06aa57dbb1992be9e948df2ea0eda2c8b79e8/idna-3.7-py3-none-any.whl#sha256=82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # pip imagesize @ https://files.pythonhosted.org/packages/ff/62/85c4c919272577931d407be5ba5d71c20f0b616d31a0befe0ae45bb79abd/imagesize-1.4.1-py2.py3-none-any.whl#sha256=0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b # pip iniconfig @ https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl#sha256=b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374 # pip markupsafe @ https://files.pythonhosted.org/packages/0a/0d/2454f072fae3b5a137c119abf15465d1771319dfe9e4acbb31722a0fff91/MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5 -# pip meson @ https://files.pythonhosted.org/packages/44/b2/d4433391a7c5e94a39b50ca7295a8ceba736e7c72c455752a60122f52453/meson-1.4.1-py3-none-any.whl#sha256=d5acc3abae2dad3c70ddcbd10acac92b78b144d34d43f40f5b8ac31dfd8a826a +# pip meson @ https://files.pythonhosted.org/packages/d6/97/852cc27c460d5fae07c6d7e4a2744f5684760358de3456b5ee9f130b8f57/meson-1.5.0.tar.gz#sha256=45d7b8653c1e5139df35b33be2dd5b2d040c5b2c6129f9a7c890d507e33312b8 # pip ninja @ https://files.pythonhosted.org/packages/6d/92/8d7aebd4430ab5ff65df2bfee6d5745f95c004284db2d8ca76dcbfd9de47/ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl#sha256=84502ec98f02a037a169c4b0d5d86075eaf6afc55e1879003d6cab51ced2ea4b # pip packaging @ https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl#sha256=5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 # pip platformdirs @ https://files.pythonhosted.org/packages/68/13/2aa1f0e1364feb2c9ef45302f387ac0bd81484e9c9a4c5688a322fbdfd08/platformdirs-4.2.2-py3-none-any.whl#sha256=2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee From bed36b20a202e0c8411ead6cbc20e63d9f7794ec Mon Sep 17 00:00:00 2001 From: Tim Head Date: Mon, 15 Jul 2024 14:00:46 +0200 Subject: [PATCH 20/35] Fix changed return type dtype for array API (#29488) --- ...a_forge_cuda_array-api_linux-64_conda.lock | 123 +++++++++--------- sklearn/decomposition/_base.py | 2 +- sklearn/discriminant_analysis.py | 2 +- 3 files changed, 64 insertions(+), 63 deletions(-) diff --git a/build_tools/github/pylatest_conda_forge_cuda_array-api_linux-64_conda.lock b/build_tools/github/pylatest_conda_forge_cuda_array-api_linux-64_conda.lock index 91e3fae1b21ad..cf3d9bdb39ac4 100644 --- a/build_tools/github/pylatest_conda_forge_cuda_array-api_linux-64_conda.lock +++ b/build_tools/github/pylatest_conda_forge_cuda_array-api_linux-64_conda.lock @@ -3,7 +3,7 @@ # input_hash: 7044e24fc9243a244c265e4b8c44e1304a8f55cd0cfa2d036ead6f92921d624e @EXPLICIT 
https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 -https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.6.2-hbcca054_0.conda#847c3c2905cc467cea52c24f9cfa8080 +https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.7.4-hbcca054_0.conda#23ab7665c5f63cfb9f1f6195256daac6 https://conda.anaconda.org/conda-forge/noarch/cuda-version-12.1-h1d6eff3_3.conda#913018efd4acd03c48f15cb60d2bbf97 https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2#0c96522c6bdaed4b1566d11387caaf45 https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2#34893075a5c9e55cdafac56607368fc6 @@ -24,12 +24,12 @@ https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-12.1.105-h595 https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-0.tar.bz2#f766549260d6815b0c52253f1fb1bb29 https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2#fee5683a3f04bd15cbd8318b096a27ab https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_kmp_llvm.tar.bz2#562b26ba2e19059551a811e72ab7f793 -https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-13.2.0-h77fa898_13.conda#9358cdd61ef0d600d2a0dde2d53b006c +https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.1.0-h77fa898_0.conda#ca0fad6a41ddaef54a153b78eccb5037 https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.12-h4ab18f5_0.conda#7ed427f0871fd41cb1d9c17727c17589 https://conda.anaconda.org/conda-forge/linux-64/attr-2.5.1-h166bdaf_1.tar.bz2#d9c69a24ad678ffce24c6543a0176b00 -https://conda.anaconda.org/conda-forge/linux-64/aws-c-common-0.9.19-h4ab18f5_0.conda#c6dedd5eab2236f4abb59ade9fb7fd44 -https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hd590300_5.conda#69b8b6202a07720f448be700e300ccf4 -https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.28.1-hd590300_0.conda#dcde58ff9a1f30b0037a2315d1846d1f +https://conda.anaconda.org/conda-forge/linux-64/aws-c-common-0.9.23-h4ab18f5_0.conda#94d61ae2b2b701008a9d52ce6bbead27 +https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda#62ee74e96c5ebb0af99386de58cf9553 +https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.32.1-h4bc722e_0.conda#7ed005e0df2bd50406b110b48ac4587b https://conda.anaconda.org/conda-forge/linux-64/gettext-tools-0.22.5-h59595ed_2.conda#985f2f453fb72408d6b6f1be0f324033 https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3 https://conda.anaconda.org/conda-forge/linux-64/lame-3.100-h166bdaf_1003.tar.bz2#a8832b479f93521a9e7b5b743803be51 @@ -39,13 +39,13 @@ https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda#172b https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.6.2-h59595ed_0.conda#e7ba12deb7020dd080c6c70e7b6f6a3d https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2#d645c6d2ac96843a2bfaccd2d62b3ac3 https://conda.anaconda.org/conda-forge/linux-64/libgettextpo-0.22.5-h59595ed_2.conda#172bcc51059416e7ce99e7b528cede83 -https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-13.2.0-h3d2ce59_13.conda#1e380198685bc1e993bbbc4b579f5916 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-14.1.0-hc5f4f2c_0.conda#6456c2620c990cd8dde2428a27ba0bc5 https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.17-hd590300_2.conda#d66573916ffcf376178462f1b61c941e 
https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.0.0-hd590300_1.conda#ea25936bb4080d843790b586850f82b8 https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda#30fd6e37fe21f86f4bd26d6ee73eeec7 -https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.4-h7f98852_1.tar.bz2#6e8cc2173440d77708196c5b93771680 +https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.5-h4ab18f5_0.conda#601bfb4b3c6f0b844443bb81a56651e0 https://conda.anaconda.org/conda-forge/linux-64/libopus-1.3.1-h7f98852_1.tar.bz2#15345e56d527b330e1cacbdf58676e8f -https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-13.2.0-hc0a3c3a_13.conda#1053882642ed5bbc799e1e866ff86826 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-14.1.0-hc0a3c3a_0.conda#1cb187a157136398ddbaae90713e2498 https://conda.anaconda.org/conda-forge/linux-64/libutf8proc-2.8.0-h166bdaf_0.tar.bz2#ede4266dc02e875fe1ea77b25dd43747 https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.4.0-hd590300_0.conda#b26e8aa824079e1be0294e7152ca4559 @@ -53,7 +53,7 @@ https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.cond https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-h4ab18f5_1.conda#57d7dc60e9325e3de37ff8dffd18e814 https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h59595ed_0.conda#fcea371545eda051b6deafb24889fc69 https://conda.anaconda.org/conda-forge/linux-64/ocl-icd-2.3.2-hd590300_1.conda#c66f837ac65e4d1cdeb80e2a1d5fcc3d -https://conda.anaconda.org/conda-forge/linux-64/openssl-3.3.1-h4ab18f5_0.conda#a41fa0e391cc9e0d6b78ac69ca047a6c +https://conda.anaconda.org/conda-forge/linux-64/openssl-3.3.1-h4ab18f5_1.conda#b1e9d076f14e8d776213fd5047b4c3d9 https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-h36c2ea0_1001.tar.bz2#22dad4df6e8630e8dff2428f6f6a7036 https://conda.anaconda.org/conda-forge/linux-64/xorg-kbproto-1.0.7-h7f98852_1002.tar.bz2#4b230e8381279d76131116660f5a241a https://conda.anaconda.org/conda-forge/linux-64/xorg-libice-1.1.1-hd590300_0.conda#b462a33c0be1421532f28bfe8f4a7514 @@ -65,10 +65,10 @@ https://conda.anaconda.org/conda-forge/linux-64/xorg-xf86vidmodeproto-2.3.1-h7f9 https://conda.anaconda.org/conda-forge/linux-64/xorg-xproto-7.0.31-h7f98852_1007.tar.bz2#b4a4381d54784606820704f7b5f05a15 https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2#2161070d867d1b1204ea749c8eec4ef0 https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h7f98852_2.tar.bz2#4cb3ad778ec2d5a7acbdf254eb1c42ae -https://conda.anaconda.org/conda-forge/linux-64/aws-c-cal-0.6.15-h88a6e22_0.conda#50eabf107100f8f929bc3246ea63fa08 -https://conda.anaconda.org/conda-forge/linux-64/aws-c-compression-0.2.18-h83b837d_6.conda#3e572eacd0ce99a59e1bb9c260ad5b20 -https://conda.anaconda.org/conda-forge/linux-64/aws-c-sdkutils-0.1.16-h83b837d_2.conda#f40c698b4ea90f7fedd187c6639c818b -https://conda.anaconda.org/conda-forge/linux-64/aws-checksums-0.1.18-h83b837d_6.conda#7995cb937bdac5913c8904fed6b3729d +https://conda.anaconda.org/conda-forge/linux-64/aws-c-cal-0.7.1-h87b94db_1.conda#2d76d2cfdcfe2d5c3883d33d8be919e7 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-compression-0.2.18-he027950_7.conda#11e5cb0b426772974f6416545baee0ce +https://conda.anaconda.org/conda-forge/linux-64/aws-c-sdkutils-0.1.16-he027950_3.conda#adbf0c44ca88a3cded175cd809a106b6 
+https://conda.anaconda.org/conda-forge/linux-64/aws-checksums-0.1.18-he027950_7.conda#95611b325a9728ed68b8f7eef2dd3feb https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-12.1.105-hd3aeb46_0.conda#e2ab3aeff4d18c82b3e7025a2ec3cecc https://conda.anaconda.org/conda-forge/linux-64/cuda-cupti-12.1.105-h59595ed_0.conda#37400196a2a9d83a1a79ed763189ce32 https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-12.1.105-hd3aeb46_0.conda#361041b17b31f25e60ac43127f52bd3a @@ -80,7 +80,7 @@ https://conda.anaconda.org/conda-forge/linux-64/gmp-6.3.0-hac33072_2.conda#c94a5 https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.13-h59595ed_1003.conda#f87c7b7c2cb45f323ffbce941c78ab7c https://conda.anaconda.org/conda-forge/linux-64/icu-73.2-h59595ed_0.conda#cc47e1facc155f91abd89b11e48e72ff https://conda.anaconda.org/conda-forge/linux-64/lerc-4.0.0-h27087fc_0.tar.bz2#76bbff344f0134279f225174e9064c8f -https://conda.anaconda.org/conda-forge/linux-64/libabseil-20240116.2-cxx17_h59595ed_0.conda#682bdbe046a68f749769b492f3625c5c +https://conda.anaconda.org/conda-forge/linux-64/libabseil-20240116.2-cxx17_he02047a_1.conda#c48fc56ec03229f294176923c3265c05 https://conda.anaconda.org/conda-forge/linux-64/libasprintf-0.22.5-h661eb56_2.conda#dd197c968bf9760bba0031888d431ede https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.1.0-hd590300_1.conda#f07002e225d7a60a694d42a7bf5ff53f https://conda.anaconda.org/conda-forge/linux-64/libbrotlienc-1.1.0-hd590300_1.conda#5fc11c6020d421960607d821310fcd4d @@ -91,7 +91,7 @@ https://conda.anaconda.org/conda-forge/linux-64/libcurand-10.3.2.106-hd3aeb46_0. https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20191231-he28a2e2_2.tar.bz2#4d331e44109e3f0e19b4cb8f9b82f3e1 https://conda.anaconda.org/conda-forge/linux-64/libevent-2.1.12-hf998b51_1.conda#a1cfcc585f0c42bf8d5546bb1dfb668d https://conda.anaconda.org/conda-forge/linux-64/libgettextpo-devel-0.22.5-h59595ed_2.conda#b63d9b6da3653179a278077f0de20014 -https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-13.2.0-h69a702a_13.conda#516e66b26eea14e7e322fe99e88e0f02 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-14.1.0-h69a702a_0.conda#f4ca84fbd6d06b0a052fb2d5b96dde41 https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.58.0-h47da74e_1.conda#700ac6ea6d53d5510591c4344d5c989a https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-12.1.105-hd3aeb46_0.conda#ed70b41cca6446cab43b0069bf17bd9c https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.43-h2797004_0.conda#009981dd9cfcaa4dbfa25ffaed86bcae @@ -107,30 +107,31 @@ https://conda.anaconda.org/conda-forge/linux-64/ninja-1.12.1-h297d8ca_0.conda#3a https://conda.anaconda.org/conda-forge/linux-64/nspr-4.35-h27087fc_0.conda#da0ec11a6454ae19bff5b02ed881a2b1 https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.44-h0f59acf_0.conda#3914f7ac1761dce57102c72ca7c35d01 https://conda.anaconda.org/conda-forge/linux-64/pixman-0.43.2-h59595ed_0.conda#71004cbf7924e19c02746ccde9fd7123 +https://conda.anaconda.org/conda-forge/linux-64/qhull-2020.2-h434a139_5.conda#353823361b1d27eb3960efb076dfcaf6 https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda#47d31b792659ce70f470b5c82fdfb7a4 -https://conda.anaconda.org/conda-forge/linux-64/s2n-1.4.16-he19d79f_0.conda#de1cf82e46578faf7de8c23efe5d7be4 -https://conda.anaconda.org/conda-forge/linux-64/snappy-1.2.0-hdb0a2a9_1.conda#843bbb8ace1d64ac50d64639ff38b014 +https://conda.anaconda.org/conda-forge/linux-64/s2n-1.4.17-he19d79f_0.conda#e25ac9bf10f8e6aa67727b1cdbe762ef 
+https://conda.anaconda.org/conda-forge/linux-64/snappy-1.2.1-ha2e4443_0.conda#6b7dcc7349efd123d493d2dbe85a045f https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda#d453b98d9c83e71da0741bb0ff4d76bc https://conda.anaconda.org/conda-forge/linux-64/xorg-libsm-1.2.4-h7391055_0.conda#93ee23f12bc2e684548181256edd2cf6 https://conda.anaconda.org/conda-forge/linux-64/zlib-1.3.1-h4ab18f5_1.conda#9653f1bf3766164d0e65fa723cabbc54 https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.6-ha6fb4c9_0.conda#4d056880988120e29d75bfff282e0f45 -https://conda.anaconda.org/conda-forge/linux-64/aws-c-io-0.14.9-h2d549f9_2.conda#5a828631479163d88e419fd6841139c4 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-io-0.14.10-h826b7d6_1.conda#6961646dded770513a781de4cd5c1fe1 https://conda.anaconda.org/conda-forge/linux-64/brotli-bin-1.1.0-hd590300_1.conda#39f910d205726805a958da408ca194ba https://conda.anaconda.org/nvidia/linux-64/cuda-libraries-12.1.0-0.tar.bz2#8c08238819848e471a6213db526dbf15 https://conda.anaconda.org/conda-forge/linux-64/freetype-2.12.1-h267a509_2.conda#9ae35c3d96db2c94ce0cef86efdfa2cb https://conda.anaconda.org/conda-forge/linux-64/glog-0.7.1-hbabe93e_0.conda#ff862eebdfeb2fd048ae9dc92510baca -https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.2-h659d440_0.conda#cd95826dbd331ed1be26bdf401432844 +https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.3-h659f571_0.conda#3f43953b7d3fb3aaa1d0d0723d91e368 https://conda.anaconda.org/conda-forge/linux-64/libasprintf-devel-0.22.5-h661eb56_2.conda#02e41ab5834dcdcc8590cf29d9526f50 -https://conda.anaconda.org/conda-forge/linux-64/libglib-2.80.2-h8a4344b_1.conda#9c406bb3d4dac2b358873e6462496d09 +https://conda.anaconda.org/conda-forge/linux-64/libglib-2.80.3-h8a4344b_1.conda#6ea440297aacee4893f02ad759e6ffbc https://conda.anaconda.org/conda-forge/linux-64/libhiredis-1.0.2-h2cc385e_0.tar.bz2#b34907d3a81a3cd8095ee83d174c074a https://conda.anaconda.org/conda-forge/linux-64/libprotobuf-4.25.3-h08a7969_0.conda#6945825cebd2aeb16af4c69d97c32c13 https://conda.anaconda.org/conda-forge/linux-64/libre2-11-2023.09.01-h5a48ba9_2.conda#41c69fba59d495e8cf5ffda48a607e35 https://conda.anaconda.org/conda-forge/linux-64/libthrift-0.19.0-hb90f79a_1.conda#8cdb7d41faa0260875ba92414c487e2d https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.6.0-h1dd3fc0_3.conda#66f03896ffbe1a110ffda05c7a856504 -https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.12.7-hc051c1a_1.conda#340278ded8b0dc3a73f3660bbb0adbc6 +https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.12.7-h4c95cb1_3.conda#0ac9aff6010a7751961c8e4b863a40e7 https://conda.anaconda.org/conda-forge/linux-64/mpfr-4.2.1-h9458935_1.conda#8083b20f566639c22f78bcd6ca35b276 https://conda.anaconda.org/conda-forge/linux-64/mysql-libs-8.3.0-hca2cd23_4.conda#1b50eebe2a738a3146c154d2eceaa8b6 -https://conda.anaconda.org/conda-forge/linux-64/nss-3.101-h593d115_0.conda#b24ab6abea1bdc28d646336a03d15392 +https://conda.anaconda.org/conda-forge/linux-64/nss-3.102-h593d115_0.conda#40e5e48c55a45621c4399ca9236406b7 https://conda.anaconda.org/conda-forge/linux-64/python-3.12.4-h194c7f8_0_cpython.conda#d73490214f536cccb5819e9873048c92 https://conda.anaconda.org/conda-forge/linux-64/xcb-util-0.4.1-hb711507_2.conda#8637c3e5821654d0edf97e2b0404b443 https://conda.anaconda.org/conda-forge/linux-64/xcb-util-keysyms-0.4.1-hb711507_0.conda#ad748ccca349aec3e91743e08b5e2b50 @@ -138,29 +139,29 @@ https://conda.anaconda.org/conda-forge/linux-64/xcb-util-renderutil-0.3.10-hb711 
https://conda.anaconda.org/conda-forge/linux-64/xcb-util-wm-0.4.2-hb711507_0.conda#608e0ef8256b81d04456e8d211eee3e8 https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.9-hb711507_1.conda#4a6d410296d7e39f00bacdee7df046e9 https://conda.anaconda.org/conda-forge/noarch/array-api-compat-1.7.1-pyhd8ed1ab_0.conda#8791d81c38f676a7c08c76546800bf70 -https://conda.anaconda.org/conda-forge/linux-64/aws-c-event-stream-0.4.2-h0cbf018_13.conda#15351eccac4eda2b5fd38bbbdae78bdf -https://conda.anaconda.org/conda-forge/linux-64/aws-c-http-0.8.2-h360477d_2.conda#a820cb648906f7f30076c66dd46b1790 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-event-stream-0.4.2-h7671281_15.conda#3b45b0da170f515de8be68155e14955a +https://conda.anaconda.org/conda-forge/linux-64/aws-c-http-0.8.2-he17ee6b_6.conda#4e3d1bb2ade85619ac2163e695c2cc1b https://conda.anaconda.org/conda-forge/linux-64/brotli-1.1.0-hd590300_1.conda#f27a24d46e3ea7b70a1f98e50c62508f -https://conda.anaconda.org/conda-forge/linux-64/ccache-4.9.1-h1fcd64f_0.conda#3620f564bcf28c3524951b6f64f5c5ac -https://conda.anaconda.org/conda-forge/noarch/certifi-2024.6.2-pyhd8ed1ab_0.conda#8821ec1c8fcdc9e1d291d7b9f6e9968a +https://conda.anaconda.org/conda-forge/linux-64/ccache-4.10.1-h065aff2_0.conda#d6b48c138e0c8170a6fe9c136e063540 +https://conda.anaconda.org/conda-forge/noarch/certifi-2024.7.4-pyhd8ed1ab_0.conda#24e7fd6ca65997938fff9e5ab6f653e4 https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_0.tar.bz2#3faab06a954c2a04039983f2c4a50d99 https://conda.anaconda.org/nvidia/linux-64/cuda-runtime-12.1.0-0.tar.bz2#95e8c2f09ec28cce7cdecd6200b5d26e https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_0.conda#5cd86562580f274031ede6aa6aa24441 https://conda.anaconda.org/conda-forge/linux-64/cython-3.0.10-py312h30efb56_0.conda#b119273bff37284cbcb9281c1e85e67d https://conda.anaconda.org/conda-forge/linux-64/dbus-1.13.6-h5008d03_3.tar.bz2#ecfff944ba3960ecb334b9a2663d708d -https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.2.0-pyhd8ed1ab_2.conda#8d652ea2ee8eaee02ed8dc820bc794aa +https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.2.2-pyhd8ed1ab_0.conda#d02ae936e42063ca46af6cdad2dbd1e0 https://conda.anaconda.org/conda-forge/noarch/execnet-2.1.1-pyhd8ed1ab_0.conda#15dda3cdbf330abfe9f555d22f66db46 https://conda.anaconda.org/conda-forge/linux-64/fastrlock-0.8.2-py312h30efb56_2.conda#7065ec5a4909f925e305b77e505b0aec https://conda.anaconda.org/conda-forge/noarch/filelock-3.15.4-pyhd8ed1ab_0.conda#0e7e4388e9d5283e22b35a9443bdbcc9 https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.14.2-h14ed4e7_0.conda#0f69b688f52ff6da70bccb7ff7001d1d https://conda.anaconda.org/conda-forge/linux-64/gettext-0.22.5-h59595ed_2.conda#219ba82e95d7614cf7140d2a4afc0926 -https://conda.anaconda.org/conda-forge/linux-64/glib-tools-2.80.2-h73ef956_1.conda#1daf2cc7054ff71b9a05485f2562cbb4 +https://conda.anaconda.org/conda-forge/linux-64/glib-tools-2.80.3-h73ef956_1.conda#99701cdc9a25a333d15265d1d243b2dc https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_0.conda#f800d2da156d08e289b14e87e43c1ae5 https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.4.5-py312h8572e83_1.conda#c1e71f2bc05d8e8e033aefac2c490d05 https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.16-hb7c19ff_0.conda#51bb7010fc86f70eee639b4bb7a894f5 https://conda.anaconda.org/conda-forge/linux-64/libcups-2.3.3-h4637d8d_4.conda#d4529f4dff3057982a7617c7ac58fde3 
-https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.8.0-hca28451_0.conda#f21c27f076a07907e70c49bb57bd0f20 -https://conda.anaconda.org/conda-forge/linux-64/libhwloc-2.10.0-default_h5622ce7_1001.conda#fc2d5b79c2d3f8568fbab31db7ae02f3 +https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.8.0-hca28451_1.conda#b8afb3e3cb3423cc445cf611ab95fdb0 +https://conda.anaconda.org/conda-forge/linux-64/libhwloc-2.11.1-default_hecaa2ac_1000.conda#f54aeebefb5c5ff84eca4fb05ca8aa3a https://conda.anaconda.org/conda-forge/linux-64/libllvm15-15.0.7-hb3ce162_4.conda#8a35df3cbc0c8b12cc8af9473ae75eef https://conda.anaconda.org/conda-forge/linux-64/libllvm18-18.1.8-hc9dba70_0.conda#f94ed0c5953c78dcca7adb953f4c5bfb https://conda.anaconda.org/conda-forge/linux-64/libpq-16.3-ha72fbe1_0.conda#bac737ae28b79cfbafd515258d97d29e @@ -179,7 +180,7 @@ https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2024.1-pyhd8ed1ab_0. https://conda.anaconda.org/conda-forge/noarch/pytz-2024.1-pyhd8ed1ab_0.conda#3eeeeb9e4827ace8c0c1419c85d590ad https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.1-py312h98912ed_1.conda#e3fd78d8d490af1d84763b9fe3f2e552 https://conda.anaconda.org/conda-forge/linux-64/re2-2023.09.01-h7f4b329_2.conda#8f70e36268dea8eb666ef14c29bd3cda -https://conda.anaconda.org/conda-forge/noarch/setuptools-70.1.0-pyhd8ed1ab_0.conda#258e66f95f814d51ada2a1fe9274039b +https://conda.anaconda.org/conda-forge/noarch/setuptools-70.3.0-pyhd8ed1ab_0.conda#693bb57e8f92120caa956898065f3627 https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2#e5f25f8dbc060e9a8d912e432202afc2 https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.5.0-pyhc1e730c_0.conda#df68d78237980a159bd7149f33c0e8fd https://conda.anaconda.org/conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_0.tar.bz2#f832c45a477c78bebd107098db465095 @@ -191,78 +192,78 @@ https://conda.anaconda.org/conda-forge/linux-64/xcb-util-image-0.4.0-hb711507_2. 
https://conda.anaconda.org/conda-forge/linux-64/xkeyboard-config-2.42-h4ab18f5_0.conda#b193af204da1bfb8c13882d131a14bd2 https://conda.anaconda.org/conda-forge/linux-64/xorg-libxext-1.3.4-h0b41bf4_2.conda#82b6df12252e6f32402b96dacc656fec https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrender-0.9.11-hd590300_0.conda#ed67c36f215b310412b2af935bf3e530 -https://conda.anaconda.org/conda-forge/linux-64/aws-c-auth-0.7.22-h9137712_5.conda#ea86de440f848596543ff58030e5272d -https://conda.anaconda.org/conda-forge/linux-64/aws-c-mqtt-0.10.4-hf85b563_6.conda#845ddce9934691f5c34ad13d7313ba29 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-auth-0.7.22-hbd3ac97_10.conda#7ca4abcc98c7521c02f4e8809bbe40df +https://conda.anaconda.org/conda-forge/linux-64/aws-c-mqtt-0.10.4-hcd6a914_8.conda#b81c45867558446640306507498b2c6b https://conda.anaconda.org/conda-forge/linux-64/azure-core-cpp-1.12.0-h830ed8b_0.conda#320d066f9cad598854f4af32c7c82931 https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.0-hbb29018_2.conda#b6d90276c5aee9b4407dd94eb0cd40a8 -https://conda.anaconda.org/conda-forge/linux-64/coverage-7.5.4-py312h9a8786e_0.conda#b40224324679d1966a9fafbd602b28f3 -https://conda.anaconda.org/conda-forge/linux-64/fonttools-4.53.0-py312h9a8786e_0.conda#8490346e9d5efd7a6869582aa0c95b25 -https://conda.anaconda.org/conda-forge/linux-64/glib-2.80.2-h8a4344b_1.conda#dad336abc079b9a38dc10087231619cd +https://conda.anaconda.org/conda-forge/linux-64/coverage-7.6.0-py312h41a817b_0.conda#66c68c204a3eaabc3b4221f1c4bcebbe +https://conda.anaconda.org/conda-forge/linux-64/fonttools-4.53.1-py312h41a817b_0.conda#da921c56bcf69a8b97216ecec0cc4015 +https://conda.anaconda.org/conda-forge/linux-64/glib-2.80.3-h8a4344b_1.conda#a3acc4920c9ca19cb6b295028d606477 https://conda.anaconda.org/conda-forge/linux-64/gmpy2-2.1.5-py312h1d5cde6_1.conda#27abd7664bc87595bd98b6306b8393d1 https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.4-pyhd8ed1ab_0.conda#7b86ecb7d3557821c649b3c31e3eb9f2 https://conda.anaconda.org/conda-forge/noarch/joblib-1.4.2-pyhd8ed1ab_0.conda#25df261d4523d9f9783bcdb7208d872f https://conda.anaconda.org/conda-forge/linux-64/libclang-cpp15-15.0.7-default_h127d8a8_5.conda#d0a9633b53cdc319b8a1a532ae7822b8 https://conda.anaconda.org/conda-forge/linux-64/libclang13-18.1.8-default_h6ae225f_0.conda#28ad2db5c14d2e23d7962b8389e2cc0b https://conda.anaconda.org/conda-forge/linux-64/libflac-1.4.3-h59595ed_0.conda#ee48bf17cc83a00f59ca1494d5646869 -https://conda.anaconda.org/conda-forge/linux-64/libgpg-error-1.49-h4f305b6_0.conda#dfcfd72c7a430d3616763ecfbefe4ca9 +https://conda.anaconda.org/conda-forge/linux-64/libgpg-error-1.50-h4f305b6_0.conda#0d7ff1a8e69565ca3add6925e18e708f https://conda.anaconda.org/conda-forge/linux-64/libgrpc-1.62.2-h15f2491_0.conda#8dabe607748cb3d7002ad73cd06f1325 https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.7.0-h2c5496b_1.conda#e2eaefa4de2b7237af7c907b8bbc760a -https://conda.anaconda.org/conda-forge/noarch/meson-1.4.1-pyhd8ed1ab_0.conda#714ca123839eeebb25d12b443067ea64 -https://conda.anaconda.org/conda-forge/linux-64/pillow-10.3.0-py312h287a98d_1.conda#b1325cda3f250f9f842180607054e6ed +https://conda.anaconda.org/conda-forge/noarch/meson-1.5.0-pyhd8ed1ab_0.conda#9d971c5bf99aed063664d6650e7e7ed8 +https://conda.anaconda.org/conda-forge/linux-64/pillow-10.4.0-py312h287a98d_0.conda#59ea71eed98aee0bebbbdd3b118167c7 https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda#f586ac1e56c8638b64f9c8122a7b8a67 
https://conda.anaconda.org/conda-forge/noarch/pyproject-metadata-0.8.0-pyhd8ed1ab_0.conda#573fe09d7bd0cd4bcc210d8369b5ca47 https://conda.anaconda.org/conda-forge/noarch/pytest-8.2.2-pyhd8ed1ab_0.conda#0f3f49c22c7ef3a1195fa61dad3c43be https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0-pyhd8ed1ab_0.conda#2cf4264fffb9e6eff6031c5b6884d61c https://conda.anaconda.org/pytorch/linux-64/pytorch-cuda-12.1-ha16c6d3_5.tar.bz2#ffc0937cf6ba3ffb299b0c256accc53f https://conda.anaconda.org/conda-forge/linux-64/sip-6.7.12-py312h30efb56_0.conda#32633871002ee9902f747d2236e0d122 -https://conda.anaconda.org/conda-forge/linux-64/tbb-2021.12.0-h297d8ca_1.conda#3ff978d8994f591818a506640c6a7071 -https://conda.anaconda.org/conda-forge/linux-64/aws-c-s3-0.5.10-h679ed35_3.conda#8cb40f80d08389f6aaf68cf86581ed02 +https://conda.anaconda.org/conda-forge/linux-64/tbb-2021.12.0-h434a139_3.conda#c667c11d1e488a38220ede8a34441bff +https://conda.anaconda.org/conda-forge/linux-64/aws-c-s3-0.6.0-h365ddd8_2.conda#22339cf124753bafda336167f80e7860 https://conda.anaconda.org/conda-forge/linux-64/azure-identity-cpp-1.8.0-hdb0d106_1.conda#a297ffb4b505f51d0f58352c5c13971b https://conda.anaconda.org/conda-forge/linux-64/azure-storage-common-cpp-12.6.0-he3f277c_1.conda#8a10bb068b138dd473300b5fe34a1865 https://conda.anaconda.org/conda-forge/linux-64/gstreamer-1.24.5-haf2f30d_0.conda#c5252c02592373fa8caf5a5327165a89 https://conda.anaconda.org/conda-forge/linux-64/harfbuzz-8.5.0-hfac3d4d_0.conda#f5126317dd0ce0ba26945e411ecc6960 -https://conda.anaconda.org/conda-forge/linux-64/libgcrypt-1.10.3-hd590300_0.conda#32d16ad533c59bb0a3c5ffaf16110829 -https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-2.25.0-h2736e30_0.conda#1bbc13a65b92eafde06dbdf0ef3658cd +https://conda.anaconda.org/conda-forge/linux-64/libgcrypt-1.11.0-h4ab18f5_0.conda#0a00e32cabe3e571c0611387e7bc2042 +https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-2.26.0-h26d7fe4_0.conda#7b9d4c93870fb2d644168071d4d76afb https://conda.anaconda.org/conda-forge/linux-64/libsndfile-1.2.2-hc60ed4a_1.conda#ef1910918dd895516a769ed36b5b3a4e https://conda.anaconda.org/conda-forge/noarch/meson-python-0.16.0-pyh0c530f3_0.conda#e16f0dbf502da873be9f9adb0dc52547 https://conda.anaconda.org/conda-forge/linux-64/mkl-2022.1.0-h84fe81f_915.tar.bz2#b9c8f925797a93dbff45e1626b025a6b https://conda.anaconda.org/conda-forge/linux-64/pyqt5-sip-12.12.2-py312h30efb56_5.conda#8a2a122dc4fe14d8cff38f1cf426381f https://conda.anaconda.org/conda-forge/noarch/pytest-cov-5.0.0-pyhd8ed1ab_0.conda#c54c0107057d67ddf077751339ec2c63 https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.6.1-pyhd8ed1ab_0.conda#b39568655c127a9c4a44d178ac99b6d0 -https://conda.anaconda.org/conda-forge/noarch/sympy-1.12.1-pypyh2585a3b_103.conda#4af9db19148140eb2ff3b2a93697063b -https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.26.12-h8bc9c4d_0.conda#ec9824a9e18425707af48d21820970f1 +https://conda.anaconda.org/conda-forge/noarch/sympy-1.13.0-pypyh2585a3b_103.conda#be7ad175eb670a83ff575f86e53c57fb +https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.27.3-hda66527_2.conda#734875312c8196feecc91f89856da612 https://conda.anaconda.org/conda-forge/linux-64/azure-storage-blobs-cpp-12.11.0-ha67cba7_1.conda#f03bba57b85a5b3ac443a871787fc429 https://conda.anaconda.org/conda-forge/linux-64/gst-plugins-base-1.24.5-hbaaba92_0.conda#4a485842570569ba754863b2c083b346 https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-16_linux64_mkl.tar.bz2#85f61af03fd291dae33150ffe89dc09a 
-https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-2.25.0-h3d9a0c8_0.conda#5e3f7cfcfd74065847da8f8598ff81d3 +https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-2.26.0-ha262f82_0.conda#89b53708fd67762b26c38c8ecc5d323d https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-255-h3516f8a_1.conda#3366af27f0b593544a6cd453c7932ac5 https://conda.anaconda.org/conda-forge/linux-64/mkl-devel-2022.1.0-ha770c72_916.tar.bz2#69ba49e445f87aea2cba343a71a35ca2 -https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.329-hf74b5d1_5.conda#3d82493d6b434cc47fc9302f3cc11a09 +https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.329-h46c3b66_9.conda#c840f07ec58dc0b06041e7f36550a539 https://conda.anaconda.org/conda-forge/linux-64/azure-storage-files-datalake-cpp-12.10.0-h29b5301_1.conda#bb35c23b178fc17b9e4458766f91da7f https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-16_linux64_mkl.tar.bz2#361bf757b95488de76c4f123805742d3 https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-16_linux64_mkl.tar.bz2#a2f166748917d6d6e4707841ca1f519e https://conda.anaconda.org/conda-forge/linux-64/pulseaudio-client-17.0-hb77b528_0.conda#07f45f1be1c25345faddb8db0de8039b -https://conda.anaconda.org/conda-forge/linux-64/libarrow-16.1.0-h4a673ee_10_cpu.conda#c737ba625b762cc4cbe7c68d27e8d2e1 +https://conda.anaconda.org/conda-forge/linux-64/libarrow-16.1.0-h34456a7_14_cpu.conda#9f76c33cbcbacc87a9555da65713681c https://conda.anaconda.org/conda-forge/linux-64/liblapacke-3.9.0-16_linux64_mkl.tar.bz2#44ccc4d4dca6a8d57fa17442bc64b5a1 -https://conda.anaconda.org/conda-forge/linux-64/numpy-1.26.4-py312heda63a1_0.conda#d8285bea2a350f63fab23bf460221f3f +https://conda.anaconda.org/conda-forge/linux-64/numpy-2.0.0-py312h22e1c76_0.conda#7956c7d65f87aecaba720af6088e72c3 https://conda.anaconda.org/conda-forge/linux-64/qt-main-5.15.8-ha2b5568_22.conda#15de976572f24032540236006d6d0e9f -https://conda.anaconda.org/conda-forge/noarch/array-api-strict-1.1.1-pyhd8ed1ab_0.conda#941bbcd64d1a7b44aeb497f468fc85b4 +https://conda.anaconda.org/conda-forge/noarch/array-api-strict-2.0.1-pyhd8ed1ab_0.conda#2c00d29e0e276f2d32dfe20e698b8eeb https://conda.anaconda.org/conda-forge/linux-64/blas-devel-3.9.0-16_linux64_mkl.tar.bz2#3f92c1c9e1c0e183462c5071aa02cae1 https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.2.1-py312h8572e83_0.conda#12c6a831ef734f0b2dd4caff514cbb7f https://conda.anaconda.org/conda-forge/linux-64/cupy-core-13.2.0-py312hd074ebb_0.conda#75e3cec7a83b84e6955b908b9cd97cb6 -https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-16.1.0-hac33072_10_cpu.conda#1283e2eecd89d1e06c33d004451a4a9e -https://conda.anaconda.org/conda-forge/linux-64/libparquet-16.1.0-h6a7eafb_10_cpu.conda#a65776bbdae47c8b725f77dbed54c5d2 +https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-16.1.0-he02047a_14_cpu.conda#491c2677e91ada55bfe283dbd14d1aac +https://conda.anaconda.org/conda-forge/linux-64/libparquet-16.1.0-h9e5060d_14_cpu.conda#9fc891cd8f6bd24ba3db2c12fd1528e9 https://conda.anaconda.org/conda-forge/linux-64/pandas-2.2.2-py312h1d6d2e6_1.conda#ae00b61f3000d2284d1f2584d4dfafa8 -https://conda.anaconda.org/conda-forge/linux-64/polars-0.20.31-py312hc7f843c_0.conda#c37ecb115967f1056ec360708913fdf1 -https://conda.anaconda.org/conda-forge/linux-64/pyarrow-core-16.1.0-py312h70856f0_3_cpu.conda#7f93c5a99083e2a26a301db64f44acb8 +https://conda.anaconda.org/conda-forge/linux-64/polars-1.1.0-py312he319279_0.conda#097cadac45fd8c90ef7bbb8776733ec4 
+https://conda.anaconda.org/conda-forge/linux-64/pyarrow-core-16.1.0-py312h70856f0_4_cpu.conda#6971b04df592bd625eebd5bfb1d9fc93 https://conda.anaconda.org/conda-forge/linux-64/pyqt-5.15.9-py312h949fe66_5.conda#f6548a564e2d01b2a42020259503945b -https://conda.anaconda.org/conda-forge/linux-64/scipy-1.13.1-py312hc2bc53b_0.conda#864b2399a9c998e17d1a9a4e0c601285 +https://conda.anaconda.org/conda-forge/linux-64/scipy-1.14.0-py312hc2bc53b_1.conda#eae80145f63aa04a02dda456d4883b46 https://conda.anaconda.org/conda-forge/linux-64/blas-2.116-mkl.tar.bz2#c196a26abf6b4f132c88828ab7c2231c https://conda.anaconda.org/conda-forge/linux-64/cupy-13.2.0-py312had87585_0.conda#ce0020d5af7542d12dc022b34613dad3 -https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-16.1.0-hac33072_10_cpu.conda#49d2f8911e30844309aaf1fe221f0d66 -https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.8.4-py312h20ab3a6_2.conda#fbfe798f83f0d66410903ad8f40d5283 -https://conda.anaconda.org/conda-forge/linux-64/pyamg-5.1.0-py312h389efb2_1.conda#323587ece55d7578e88b37fb43e91ac6 -https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-16.1.0-h7e0c224_10_cpu.conda#d3aa33ea25ffdc1147134b202c84158d -https://conda.anaconda.org/conda-forge/linux-64/matplotlib-3.8.4-py312h7900ff3_2.conda#ac26198045dff11c94202bb3e1bdc132 +https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-16.1.0-he02047a_14_cpu.conda#4381fff40c7bcf7ca0142ed4b0120dbc +https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.9.1-py312h9201f00_0.conda#e1dc3a7d999666f5c58cbb391940e235 +https://conda.anaconda.org/conda-forge/linux-64/pyamg-5.2.1-py312h389efb2_0.conda#37038b979f8be9666d90a852879368fb +https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-16.1.0-hc9a23c6_14_cpu.conda#c35b4b76394d7414888fd4d66d7ae96a +https://conda.anaconda.org/conda-forge/linux-64/matplotlib-3.9.1-py312h7900ff3_0.conda#a5031dbd62fa2f33e180f5d7f331b6ea https://conda.anaconda.org/pytorch/linux-64/pytorch-2.3.1-py3.12_cuda12.1_cudnn8.9.2_0.tar.bz2#8806dd010a45f7eb4af40a24ff99de47 -https://conda.anaconda.org/conda-forge/linux-64/pyarrow-16.1.0-py312h9cebb41_3.conda#185d19647c3f7ddbdad8331911042763 +https://conda.anaconda.org/conda-forge/linux-64/pyarrow-16.1.0-py312h9cebb41_4.conda#2097b6ae7186e10c9aab1228636b804f diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py index 970294efe0184..fdffda67077c9 100644 --- a/sklearn/decomposition/_base.py +++ b/sklearn/decomposition/_base.py @@ -44,7 +44,7 @@ def get_covariance(self): exp_var_diff = xp.where( exp_var > self.noise_variance_, exp_var_diff, - xp.asarray(0.0, device=device(exp_var)), + xp.asarray(0.0, device=device(exp_var), dtype=exp_var.dtype), ) cov = (components_.T * exp_var_diff) @ components_ _fill_or_add_to_diagonal(cov, self.noise_variance_, xp) diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index caad2de01b135..d7187274bf921 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -518,7 +518,7 @@ def _solve_svd(self, X, y): std = xp.std(Xc, axis=0) # avoid division by zero in normalization std[std == 0] = 1.0 - fac = xp.asarray(1.0 / (n_samples - n_classes)) + fac = xp.asarray(1.0 / (n_samples - n_classes), dtype=X.dtype) # 2) Within variance scaling X = xp.sqrt(fac) * (Xc / std) From d79cb58c464f0b54bf0f0286c725d2df837574d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Mon, 15 Jul 2024 14:34:09 +0200 Subject: [PATCH 21/35] BLD Check build dependencies in 
meson.build (#28721) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- Makefile | 2 +- doc/developers/advanced_installation.rst | 1 - meson.build | 1 + pyproject.toml | 2 +- sklearn/meson.build | 35 ++++++++++++++++++++++++ 5 files changed, 38 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 66c7a6c0e93a4..eb6ec39edcbdc 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ all: dev: dev-meson dev-meson: - pip install --verbose --no-build-isolation --editable . --check-build-dependencies --config-settings editable-verbose=true + pip install --verbose --no-build-isolation --editable . --config-settings editable-verbose=true clean: clean-meson diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst index bad6ccf9039ad..8946c25f50fb5 100644 --- a/doc/developers/advanced_installation.rst +++ b/doc/developers/advanced_installation.rst @@ -99,7 +99,6 @@ feature, code or documentation improvement). pip install --editable . \ --verbose --no-build-isolation \ - --check-build-dependencies \ --config-settings editable-verbose=true #. Check that the installed scikit-learn has a version number ending with diff --git a/meson.build b/meson.build index b6b3652a82268..9902d3fe189d2 100644 --- a/meson.build +++ b/meson.build @@ -13,6 +13,7 @@ project( cc = meson.get_compiler('c') cpp = meson.get_compiler('cpp') +cython = meson.get_compiler('cython') # Check compiler is recent enough (see "Toolchain Roadmap" for details) if cc.get_id() == 'gcc' diff --git a/pyproject.toml b/pyproject.toml index 1b613ae561b27..e253dfe311487 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -98,7 +98,7 @@ build-backend = "mesonpy" requires = [ "meson-python>=0.16.0", "Cython>=3.0.10", - "numpy>=1.25", + "numpy>=2", "scipy>=1.6.0", ] diff --git a/sklearn/meson.build b/sklearn/meson.build index 4552082801337..b9ecb5afad898 100644 --- a/sklearn/meson.build +++ b/sklearn/meson.build @@ -18,6 +18,41 @@ if is_mingw add_project_arguments('-mlong-double-64', language: 'c') endif +# Only check build dependencies version when not cross-compiling, as running +# Python interpreter can be tricky in cross-compilation settings. 
For more +# details, see https://docs.scipy.org/doc/scipy/building/cross_compilation.html +if not meson.is_cross_build() + if not py.version().version_compare('>=3.9') + error('scikit-learn requires Python>=3.9, got ' + py.version() + ' instead') + endif + + cython_min_version = run_command(py, ['_min_dependencies.py', 'cython'], check: true).stdout().strip() + if not cython.version().version_compare('>=' + cython_min_version) + error('scikit-learn requires Cython>=' + cython_min_version + ', got ' + cython.version() + ' instead') + endif + + meson_python_version = run_command(py, + ['-c', 'import mesonpy; print(mesonpy.__version__)'], check: true).stdout().strip() + meson_python_min_version = run_command(py, ['_min_dependencies.py', 'meson-python'], check: true).stdout().strip() + if not meson_python_version.version_compare('>=' + meson_python_min_version) + error('scikit-learn requires meson-python>=' + meson_python_min_version + ', got ' + meson_python_version + ' instead') + endif + + numpy_version = run_command(py, + ['-c', 'import numpy; print(numpy.__version__)'], check: true).stdout().strip() + numpy_min_version = run_command(py, ['_min_dependencies.py', 'numpy'], check: true).stdout().strip() + if not numpy_version.version_compare('>=' + numpy_min_version) + error('scikit-learn requires numpy>=' + numpy_min_version + ', got ' + numpy_version + ' instead') + endif + + scipy_version = run_command(py, + ['-c', 'import scipy; print(scipy.__version__)'], check: true).stdout().strip() + scipy_min_version = run_command(py, ['_min_dependencies.py', 'scipy'], check: true).stdout().strip() + if not scipy_version.version_compare('>=' + scipy_min_version) + error('scikit-learn requires scipy>=' + scipy_min_version + ', got ' + scipy_version + ' instead') + endif +endif + # Adapted from scipy, each project seems to have its own tweaks for this. One # day using dependency('numpy') will be a thing, see # https://github.com/mesonbuild/meson/issues/9598. From b204dba63a69bdef9a15bcc0953b0b080b400ada Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Tue, 16 Jul 2024 14:16:58 +0200 Subject: [PATCH 22/35] BENCH Don't use setuptools based build commands (#29500) --- asv_benchmarks/asv.conf.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/asv_benchmarks/asv.conf.json b/asv_benchmarks/asv.conf.json index ba7b12011acec..21770d656eb98 100644 --- a/asv_benchmarks/asv.conf.json +++ b/asv_benchmarks/asv.conf.json @@ -20,10 +20,10 @@ // Customizable commands for building, installing, and // uninstalling the project. See asv.conf.json documentation. - // - // "install_command": ["python -mpip install {wheel_file}"], - // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], - // "build_command": ["python -m build --wheel -o {build_cache_dir} {build_dir}"], + "install_command": ["python -mpip install {wheel_file}"], + "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], + "build_command": ["python -m build --wheel -o {build_cache_dir} {build_dir}"], + // List of branches to benchmark. If not provided, defaults to "master // (for git) or "default" (for mercurial). "branches": ["main"], @@ -72,12 +72,12 @@ // those due to dependency changes. 
// "matrix": { - "numpy": ["1.25.2"], - "scipy": ["1.11.2"], + "numpy": ["2.0.0"], + "scipy": ["1.14.0"], "cython": ["3.0.10"], "joblib": ["1.3.2"], "threadpoolctl": ["3.2.0"], - "pandas": ["2.1.0"] + "pandas": ["2.2.2"] }, // Combinations of libraries/python versions can be excluded/included From 379a2f196f8794bc18289c37e99e13058238a08b Mon Sep 17 00:00:00 2001 From: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com> Date: Tue, 16 Jul 2024 15:48:17 +0200 Subject: [PATCH 23/35] DOC improved documentation for `MethodPair` and `RouterMappingPair` (#29489) --- sklearn/utils/_metadata_requests.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/sklearn/utils/_metadata_requests.py b/sklearn/utils/_metadata_requests.py index a558ed8e0abfe..cb2fb03050c39 100644 --- a/sklearn/utils/_metadata_requests.py +++ b/sklearn/utils/_metadata_requests.py @@ -4,7 +4,7 @@ In order to better understand the components implemented in this file, one needs to understand their relationship to one another. -The only relevant public API for end users are the ``set_{method}_request``, +The only relevant public API for end users are the ``set_{method}_request`` methods, e.g. ``estimator.set_fit_request(sample_weight=True)``. However, third-party developers and users who implement custom meta-estimators, need to deal with the objects implemented in this file. @@ -59,10 +59,10 @@ To give the above representation some structure, we use the following objects: -- ``(caller, callee)`` is a namedtuple called ``MethodPair`` +- ``(caller=..., callee=...)`` is a namedtuple called ``MethodPair`` -- The list of ``MethodPair`` stored in the ``mapping`` field is a - ``MethodMapping`` object +- The list of ``MethodPair`` stored in the ``mapping`` field of a `RouterMappingPair` is + a ``MethodMapping`` object - ``(mapping=..., router=...)`` is a namedtuple called ``RouterMappingPair`` @@ -686,13 +686,14 @@ def __str__(self): # This section includes all objects required for MetadataRouter which is used # in routers, returned by their ``get_metadata_routing``. -# This namedtuple is used to store a (mapping, routing) pair. Mapping is a -# MethodMapping object, and routing is the output of `get_metadata_routing`. -# MetadataRouter stores a collection of these namedtuples. +# `RouterMappingPair` is used to store a (mapping, router) tuple where `mapping` is a +# `MethodMapping` object and `router` is the output of `get_metadata_routing`. +# `MetadataRouter` stores a collection of `RouterMappingPair` objects in its +# `_route_mappings` attribute. RouterMappingPair = namedtuple("RouterMappingPair", ["mapping", "router"]) -# A namedtuple storing a single method route. A collection of these namedtuples -# is stored in a MetadataRouter. +# `MethodPair` is used to store a single method routing. `MethodMapping` stores a list +# of `MethodPair` objects in its `_routes` attribute. MethodPair = namedtuple("MethodPair", ["caller", "callee"]) @@ -700,11 +701,11 @@ class MethodMapping: """Stores the mapping between caller and callee methods for a router. This class is primarily used in a ``get_metadata_routing()`` of a router - object when defining the mapping between a sub-object (a sub-estimator or a - scorer) to the router's methods. It stores a collection of namedtuples. + object when defining the mapping between the router's methods and a sub-object (a + sub-estimator or a scorer). - Iterating through an instance of this class will yield named - ``MethodPair(caller, callee)`` tuples. 
+ Iterating through an instance of this class yields + ``MethodPair(caller, callee)`` instances. .. versionadded:: 1.3 """ From a4e2bfbd923302441b7fba54fbf4785f0148f7f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Wed, 17 Jul 2024 11:11:24 +0200 Subject: [PATCH 24/35] MAINT Clean-up some old warning filters (#29460) --- sklearn/cluster/tests/test_spectral.py | 17 -------- .../tests/test_factor_analysis.py | 8 +--- sklearn/ensemble/tests/test_forest.py | 2 - .../tests/test_plot_partial_dependence.py | 14 ------- .../tests/test_partial_dependence.py | 1 - .../linear_model/tests/test_least_angle.py | 9 ++--- sklearn/linear_model/tests/test_sgd.py | 2 - .../manifold/tests/test_spectral_embedding.py | 39 ------------------- sklearn/metrics/cluster/tests/test_common.py | 8 ---- sklearn/metrics/tests/test_score_objects.py | 3 -- sklearn/model_selection/tests/test_search.py | 2 - sklearn/model_selection/tests/test_split.py | 3 -- .../model_selection/tests/test_validation.py | 4 -- sklearn/preprocessing/tests/test_label.py | 2 - sklearn/tests/test_docstring_parameters.py | 4 -- sklearn/tests/test_multioutput.py | 1 - sklearn/tree/tests/test_tree.py | 1 - sklearn/utils/tests/test_multiclass.py | 1 - 18 files changed, 5 insertions(+), 116 deletions(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 689a159851f50..a1975902c0c47 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -215,23 +215,6 @@ def test_discretize(n_samples, coo_container): assert adjusted_rand_score(y_true, y_pred) > 0.8 -# TODO: Remove when pyamg does replaces sp.rand call with np.random.rand -# https://github.com/scikit-learn/scikit-learn/issues/15913 -@pytest.mark.filterwarnings( - "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*" -) -# TODO: Remove when pyamg removes the use of np.float -@pytest.mark.filterwarnings( - "ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*" -) -# TODO: Remove when pyamg removes the use of pinv2 -@pytest.mark.filterwarnings( - "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*" -) -# TODO: Remove when pyamg removes the use of np.find_common_type -@pytest.mark.filterwarnings( - "ignore:np.find_common_type is deprecated:DeprecationWarning:pyamg.*" -) def test_spectral_clustering_with_arpack_amg_solvers(): # Test that spectral_clustering is the same for arpack and amg solver # Based on toy example from plot_segmentation_toy.py diff --git a/sklearn/decomposition/tests/test_factor_analysis.py b/sklearn/decomposition/tests/test_factor_analysis.py index 76af7f8181598..9175829695b0d 100644 --- a/sklearn/decomposition/tests/test_factor_analysis.py +++ b/sklearn/decomposition/tests/test_factor_analysis.py @@ -9,15 +9,9 @@ from sklearn.decomposition import FactorAnalysis from sklearn.decomposition._factor_analysis import _ortho_rotation from sklearn.exceptions import ConvergenceWarning -from sklearn.utils._testing import ( - assert_almost_equal, - assert_array_almost_equal, - ignore_warnings, -) +from sklearn.utils._testing import assert_almost_equal, assert_array_almost_equal -# Ignore warnings from switching to more power iterations in randomized_svd -@ignore_warnings def test_factor_analysis(global_random_seed): # Test FactorAnalysis ability to recover the data covariance structure rng = np.random.RandomState(global_random_seed) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 
a750282a3139c..aadf230fd751e 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -879,8 +879,6 @@ def test_random_trees_dense_equal(): assert_array_equal(X_transformed_sparse.toarray(), X_transformed_dense) -# Ignore warnings from switching to more power iterations in randomized_svd -@ignore_warnings def test_random_hasher(): # test random forest hashing on circles dataset # make sure that it is linearly separable. diff --git a/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py b/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py index 57fc68d07e887..e3480ec033225 100644 --- a/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py +++ b/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py @@ -42,7 +42,6 @@ def clf_diabetes(diabetes): return clf -@pytest.mark.filterwarnings("ignore:A Bunch will be returned") @pytest.mark.parametrize("grid_resolution", [10, 20]) def test_plot_partial_dependence(grid_resolution, pyplot, clf_diabetes, diabetes): # Test partial dependence plot function. @@ -114,7 +113,6 @@ def test_plot_partial_dependence(grid_resolution, pyplot, clf_diabetes, diabetes assert ax.get_ylabel() == diabetes.feature_names[2] -@pytest.mark.filterwarnings("ignore:A Bunch will be returned") @pytest.mark.parametrize( "kind, centered, subsample, shape", [ @@ -164,7 +162,6 @@ def test_plot_partial_dependence_kind( assert all([ln._y[0] != 0.0 for ln in disp.lines_.ravel() if ln is not None]) -@pytest.mark.filterwarnings("ignore:A Bunch will be returned") @pytest.mark.parametrize( "input_type, feature_names_type", [ @@ -252,7 +249,6 @@ def test_plot_partial_dependence_str_features( assert ax.get_ylabel() == "bmi" -@pytest.mark.filterwarnings("ignore:A Bunch will be returned") def test_plot_partial_dependence_custom_axes(pyplot, clf_diabetes, diabetes): grid_resolution = 25 fig, (ax1, ax2) = pyplot.subplots(1, 2) @@ -288,7 +284,6 @@ def test_plot_partial_dependence_custom_axes(pyplot, clf_diabetes, diabetes): assert ax.get_ylabel() == "bmi" -@pytest.mark.filterwarnings("ignore:A Bunch will be returned") @pytest.mark.parametrize( "kind, lines", [("average", 1), ("individual", 50), ("both", 51)] ) @@ -329,7 +324,6 @@ def test_plot_partial_dependence_passing_numpy_axes( assert len(disp2.axes_[0, 1].get_lines()) == 2 * lines -@pytest.mark.filterwarnings("ignore:A Bunch will be returned") @pytest.mark.parametrize("nrows, ncols", [(2, 2), (3, 1)]) def test_plot_partial_dependence_incorrent_num_axes( pyplot, clf_diabetes, diabetes, nrows, ncols @@ -364,7 +358,6 @@ def test_plot_partial_dependence_incorrent_num_axes( disp.plot(ax=ax_format) -@pytest.mark.filterwarnings("ignore:A Bunch will be returned") def test_plot_partial_dependence_with_same_axes(pyplot, clf_diabetes, diabetes): # The first call to plot_partial_dependence will create two new axes to # place in the space of the passed in axes, which results in a total of @@ -404,7 +397,6 @@ def test_plot_partial_dependence_with_same_axes(pyplot, clf_diabetes, diabetes): ) -@pytest.mark.filterwarnings("ignore:A Bunch will be returned") def test_plot_partial_dependence_feature_name_reuse(pyplot, clf_diabetes, diabetes): # second call to plot does not change the feature names from the first # call @@ -426,7 +418,6 @@ def test_plot_partial_dependence_feature_name_reuse(pyplot, clf_diabetes, diabet assert ax.get_xlabel() == feature_names[i] -@pytest.mark.filterwarnings("ignore:A Bunch will be returned") def test_plot_partial_dependence_multiclass(pyplot): 
grid_resolution = 25 clf_int = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -480,7 +471,6 @@ def test_plot_partial_dependence_multiclass(pyplot): multioutput_regression_data = make_regression(n_samples=50, n_targets=2, random_state=0) -@pytest.mark.filterwarnings("ignore:A Bunch will be returned") @pytest.mark.parametrize("target", [0, 1]) def test_plot_partial_dependence_multioutput(pyplot, target): # Test partial dependence plot function on multi-output input. @@ -506,7 +496,6 @@ def test_plot_partial_dependence_multioutput(pyplot, target): assert ax.get_xlabel() == f"x{i}" -@pytest.mark.filterwarnings("ignore:A Bunch will be returned") def test_plot_partial_dependence_dataframe(pyplot, clf_diabetes, diabetes): pd = pytest.importorskip("pandas") df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names) @@ -525,7 +514,6 @@ def test_plot_partial_dependence_dataframe(pyplot, clf_diabetes, diabetes): dummy_classification_data = make_classification(random_state=0) -@pytest.mark.filterwarnings("ignore:A Bunch will be returned") @pytest.mark.parametrize( "data, params, err_msg", [ @@ -619,7 +607,6 @@ def test_plot_partial_dependence_error(pyplot, data, params, err_msg): PartialDependenceDisplay.from_estimator(estimator, X, **params) -@pytest.mark.filterwarnings("ignore:A Bunch will be returned") @pytest.mark.parametrize( "params, err_msg", [ @@ -982,7 +969,6 @@ def test_partial_dependence_kind_error( ) -@pytest.mark.filterwarnings("ignore:A Bunch will be returned") @pytest.mark.parametrize( "line_kw, pd_line_kw, ice_lines_kw, expected_colors", [ diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 58d71def0252d..9768516efa492 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -480,7 +480,6 @@ def fit(self, X, y): return self -@pytest.mark.filterwarnings("ignore:A Bunch will be returned") @pytest.mark.parametrize( "estimator, params, err_msg", [ diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 50c6a7a95626e..9b4a39750e03a 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -117,20 +117,20 @@ def test_all_precomputed(): assert_array_almost_equal(expected, got) +# TODO: remove warning filter when numpy min version >= 2.0.0 @pytest.mark.filterwarnings("ignore: `rcond` parameter will change") -# numpy deprecation def test_lars_lstsq(): # Test that Lars gives least square solution at the end # of the path X1 = 3 * X # use un-normalized dataset clf = linear_model.LassoLars(alpha=0.0) clf.fit(X1, y) - coef_lstsq = np.linalg.lstsq(X1, y, rcond=None)[0] + coef_lstsq = np.linalg.lstsq(X1, y)[0] assert_array_almost_equal(clf.coef_, coef_lstsq) -@pytest.mark.filterwarnings("ignore:`rcond` parameter will change") -# numpy deprecation +# TODO: remove warning filter when numpy min version >= 2.0.0 +@pytest.mark.filterwarnings("ignore: `rcond` parameter will change") def test_lasso_gives_lstsq_solution(): # Test that Lars Lasso gives least square solution at the end # of the path @@ -408,7 +408,6 @@ def test_lars_n_nonzero_coefs(verbose=False): assert len(lars.alphas_) == 7 -@ignore_warnings def test_multitarget(): # Assure that estimators receiving multidimensional y do the right thing Y = np.vstack([y, y**2]).T diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 
795a0e62f37f8..25c1ccb60be02 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -25,7 +25,6 @@ assert_almost_equal, assert_array_almost_equal, assert_array_equal, - ignore_warnings, ) @@ -1365,7 +1364,6 @@ def test_elasticnet_convergence(klass): assert_almost_equal(cd.coef_, sgd.coef_, decimal=2, err_msg=err_msg) -@ignore_warnings @pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) def test_partial_fit(klass): third = X.shape[0] // 3 diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index 14bb13c080099..6dec35123f9cc 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -244,22 +244,6 @@ def test_spectral_embedding_callable_affinity(sparse_container, seed=36): _assert_equal_with_sign_flipping(embed_rbf, embed_callable, 0.05) -# TODO: Remove when pyamg does replaces sp.rand call with np.random.rand -# https://github.com/scikit-learn/scikit-learn/issues/15913 -@pytest.mark.filterwarnings( - "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*" -) -# TODO: Remove when pyamg removes the use of np.float -@pytest.mark.filterwarnings( - "ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*" -) -# TODO: Remove when pyamg removes the use of pinv2 -@pytest.mark.filterwarnings( - "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*" -) -@pytest.mark.filterwarnings( - "ignore:np.find_common_type is deprecated:DeprecationWarning:pyamg.*" -) @pytest.mark.skipif( not pyamg_available, reason="PyAMG is required for the tests in this function." ) @@ -319,27 +303,9 @@ def test_spectral_embedding_amg_solver(dtype, coo_container, seed=36): se_amg.fit_transform(affinity) -# TODO: Remove filterwarnings when pyamg does replaces sp.rand call with -# np.random.rand: -# https://github.com/scikit-learn/scikit-learn/issues/15913 -@pytest.mark.filterwarnings( - "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*" -) -# TODO: Remove when pyamg removes the use of np.float -@pytest.mark.filterwarnings( - "ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*" -) -# TODO: Remove when pyamg removes the use of pinv2 -@pytest.mark.filterwarnings( - "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*" -) @pytest.mark.skipif( not pyamg_available, reason="PyAMG is required for the tests in this function." 
) -# TODO: Remove when pyamg removes the use of np.find_common_type -@pytest.mark.filterwarnings( - "ignore:np.find_common_type is deprecated:DeprecationWarning:pyamg.*" -) @pytest.mark.parametrize("dtype", (np.float32, np.float64)) def test_spectral_embedding_amg_solver_failure(dtype, seed=36): # Non-regression test for amg solver failure (issue #13393 on github) @@ -360,7 +326,6 @@ def test_spectral_embedding_amg_solver_failure(dtype, seed=36): _assert_equal_with_sign_flipping(embedding, new_embedding, tol=0.05) -@pytest.mark.filterwarnings("ignore:the behavior of nmi will change in version 0.22") def test_pipeline_spectral_clustering(seed=36): # Test using pipeline to do spectral clustering random_state = np.random.RandomState(seed) @@ -509,10 +474,6 @@ def test_error_pyamg_not_available(): se_precomp.fit_transform(S) -# TODO: Remove when pyamg removes the use of np.find_common_type -@pytest.mark.filterwarnings( - "ignore:np.find_common_type is deprecated:DeprecationWarning:pyamg.*" -) @pytest.mark.parametrize("solver", ["arpack", "amg", "lobpcg"]) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_spectral_eigen_tol_auto(monkeypatch, solver, csr_container): diff --git a/sklearn/metrics/cluster/tests/test_common.py b/sklearn/metrics/cluster/tests/test_common.py index bc32b7df7f561..0570f0ac2a0f1 100644 --- a/sklearn/metrics/cluster/tests/test_common.py +++ b/sklearn/metrics/cluster/tests/test_common.py @@ -96,8 +96,6 @@ def test_symmetric_non_symmetric_union(): ) -# 0.22 AMI and NMI changes -@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize( "metric_name, y1, y2", [(name, y1, y2) for name in SYMMETRIC_METRICS] ) @@ -114,8 +112,6 @@ def test_non_symmetry(metric_name, y1, y2): assert metric(y1, y2) != pytest.approx(metric(y2, y1)) -# 0.22 AMI and NMI changes -@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("metric_name", NORMALIZED_METRICS) def test_normalized_output(metric_name): upper_bound_1 = [0, 0, 0, 1, 1, 1] @@ -135,8 +131,6 @@ def test_normalized_output(metric_name): assert not (score < 0).any() -# 0.22 AMI and NMI changes -@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS)) def test_permute_labels(metric_name): # All clustering metrics do not change score due to permutations of labels @@ -156,8 +150,6 @@ def test_permute_labels(metric_name): assert_allclose(score_1, metric(X, 1 - y_pred)) -# 0.22 AMI and NMI changes -@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS)) # For all clustering metrics Input parameters can be both # in the form of arrays lists, positive, negative or string diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index bfe8f57f92265..73bb008c47300 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -551,7 +551,6 @@ def test_supervised_cluster_scorers(): assert_almost_equal(score1, score2) -@ignore_warnings def test_raises_on_score_list(): # Test that when a list of scores is returned, we raise proper errors. 
X, y = make_blobs(random_state=0) @@ -566,7 +565,6 @@ def test_raises_on_score_list(): grid_search.fit(X, y) -@ignore_warnings def test_classification_scorer_sample_weight(): # Test that classification scorers support sample_weight or raise sensible # errors @@ -626,7 +624,6 @@ def test_classification_scorer_sample_weight(): ) -@ignore_warnings def test_regression_scorer_sample_weight(): # Test that regression scorers support sample_weight or raise sensible # errors diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 77b99747dd4be..6af7451261348 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -273,7 +273,6 @@ def test_SearchCV_with_fit_params(SearchCV): searcher.fit(X, y, spam=np.ones(10), eggs=np.zeros(10)) -@ignore_warnings def test_grid_search_no_score(): # Test grid-search on classifier that has no score function. clf = LinearSVC(random_state=0) @@ -812,7 +811,6 @@ def test_y_as_list(): assert hasattr(grid_search, "cv_results_") -@ignore_warnings def test_pandas_input(): # check cross_val_score doesn't destroy pandas dataframe types = [(MockDataFrame, MockDataFrame)] diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 4e594499ae59a..04c0296229125 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -775,7 +775,6 @@ def test_group_shuffle_split_default_test_size(train_size, exp_train, exp_test): assert len(X_test) == exp_test -@ignore_warnings def test_stratified_shuffle_split_init(): X = np.arange(7) y = np.asarray([0, 1, 1, 1, 2, 2, 2]) @@ -1153,7 +1152,6 @@ def test_leave_one_p_group_out_error_on_fewer_number_of_groups(): next(LeavePGroupsOut(n_groups=3).split(X, y, groups)) -@ignore_warnings def test_repeated_cv_value_errors(): # n_repeats is not integer or <= 0 for cv in (RepeatedKFold, RepeatedStratifiedKFold): @@ -1425,7 +1423,6 @@ def test_train_test_split_32bit_overflow(): assert y_train.size + y_test.size == big_number -@ignore_warnings def test_train_test_split_pandas(): # check train_test_split doesn't destroy pandas dataframe types = [MockDataFrame] diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 33d4d366bf17a..2b73068097e02 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -620,7 +620,6 @@ def test_cross_val_score_predict_groups(): cross_val_predict(estimator=clf, X=X, y=y, cv=cv) -@pytest.mark.filterwarnings("ignore: Using or importing the ABCs from") def test_cross_val_score_pandas(): # check cross_val_score doesn't destroy pandas dataframe types = [(MockDataFrame, MockDataFrame)] @@ -1116,8 +1115,6 @@ def test_cross_val_predict_input_types(coo_container): assert_array_equal(predictions.shape, (150,)) -@pytest.mark.filterwarnings("ignore: Using or importing the ABCs from") -# python3.7 deprecation warnings in pandas via matplotlib :-/ def test_cross_val_predict_pandas(): # check cross_val_score doesn't destroy pandas dataframe types = [(MockDataFrame, MockDataFrame)] @@ -2074,7 +2071,6 @@ def test_score_memmap(): sleep(1.0) -@pytest.mark.filterwarnings("ignore: Using or importing the ABCs from") def test_permutation_test_score_pandas(): # check permutation_test_score doesn't destroy pandas dataframe types = [(MockDataFrame, MockDataFrame)] diff --git a/sklearn/preprocessing/tests/test_label.py 
b/sklearn/preprocessing/tests/test_label.py index 90e3aa210eebb..da3079406b305 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -19,7 +19,6 @@ from sklearn.utils._testing import ( _array_api_for_tests, assert_array_equal, - ignore_warnings, ) from sklearn.utils.fixes import ( COO_CONTAINERS, @@ -143,7 +142,6 @@ def test_label_binarizer_pandas_nullable(dtype, unique_first): assert_array_equal(y_out, [[1], [0]]) -@ignore_warnings def test_label_binarizer_errors(): # Check that invalid arguments yield ValueError one_class = np.array([0, 0, 0, 0]) diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 0c3b0e367923a..d028d429abead 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -70,10 +70,6 @@ ] -# numpydoc 0.8.0's docscrape tool raises because of collections.abc under -# Python 3.7 -@pytest.mark.filterwarnings("ignore::FutureWarning") -@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_docstring_parameters(): # Test module docstring formatting diff --git a/sklearn/tests/test_multioutput.py b/sklearn/tests/test_multioutput.py index 7c32180c27682..4b055169776d0 100644 --- a/sklearn/tests/test_multioutput.py +++ b/sklearn/tests/test_multioutput.py @@ -698,7 +698,6 @@ def fit(self, X, y, sample_weight=None, **fit_params): return super().fit(X, y, sample_weight) -@pytest.mark.filterwarnings("ignore:`n_features_in_` is deprecated") @pytest.mark.parametrize( "estimator, dataset", [ diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index b59c857e3dccd..60d864a73a790 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1543,7 +1543,6 @@ def test_explicit_sparse_zeros(tree_type, csc_container, csr_container): assert_array_almost_equal(s.predict_proba(X1), d.predict_proba(X2)) -@ignore_warnings def check_raise_error_on_1d_input(name): TreeEstimator = ALL_TREES[name] diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index 95a1ea0bb0806..49f224b952d5d 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -413,7 +413,6 @@ def test_check_classification_targets(): check_classification_targets(example) -# @ignore_warnings def test_type_of_target(): for group, group_examples in EXAMPLES.items(): for example in group_examples: From 6fd2f8372227ac394cd3dd5cd96d110fa49355b8 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 18 Jul 2024 03:11:52 -0400 Subject: [PATCH 25/35] MNT convert `unsigned char` to `uint8_t` (#29510) Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 6 +++--- sklearn/tree/_splitter.pyx | 22 +++++++++++----------- sklearn/tree/_tree.pxd | 8 ++++---- sklearn/tree/_tree.pyx | 29 ++++++++++++++--------------- sklearn/tree/_utils.pxd | 4 ++-- sklearn/tree/_utils.pyx | 2 +- 6 files changed, 35 insertions(+), 36 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 6b96878fb88ad..485a18be1e522 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -5,7 +5,7 @@ from ._criterion cimport Criterion from ._tree cimport ParentInfo -from ..utils._typedefs cimport float32_t, float64_t, intp_t, int8_t, int32_t, uint32_t +from ..utils._typedefs cimport float32_t, float64_t, intp_t, int8_t, int32_t, uint8_t, uint32_t cdef struct SplitRecord: @@ -20,7 +20,7 @@ cdef struct SplitRecord: float64_t impurity_right # Impurity of 
the right split. float64_t lower_bound # Lower bound on value of both children for monotonicity float64_t upper_bound # Upper bound on value of both children for monotonicity - unsigned char missing_go_to_left # Controls if missing values go to the left node. + uint8_t missing_go_to_left # Controls if missing values go to the left node. intp_t n_missing # Number of missing values for the feature being split on cdef class Splitter: @@ -81,7 +81,7 @@ cdef class Splitter: object X, const float64_t[:, ::1] y, const float64_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, + const uint8_t[::1] missing_values_in_feature_mask, ) except -1 cdef int node_reset( diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 7bd0dbe71d0f0..ad8e3eb84ed2c 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -113,7 +113,7 @@ cdef class Splitter: object X, const float64_t[:, ::1] y, const float64_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, + const uint8_t[::1] missing_values_in_feature_mask, ) except -1: """Initialize the splitter. @@ -932,14 +932,14 @@ cdef class DensePartitioner: cdef intp_t start cdef intp_t end cdef intp_t n_missing - cdef const unsigned char[::1] missing_values_in_feature_mask + cdef const uint8_t[::1] missing_values_in_feature_mask def __init__( self, const float32_t[:, :] X, intp_t[::1] samples, float32_t[::1] feature_values, - const unsigned char[::1] missing_values_in_feature_mask, + const uint8_t[::1] missing_values_in_feature_mask, ): self.X = X self.samples = samples @@ -967,7 +967,7 @@ cdef class DensePartitioner: const float32_t[:, :] X = self.X intp_t[::1] samples = self.samples intp_t n_missing = 0 - const unsigned char[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask + const uint8_t[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask # Sort samples along that feature; by # copying the values into an array and @@ -1022,7 +1022,7 @@ cdef class DensePartitioner: float32_t max_feature_value = -INFINITY_32t float32_t[::1] feature_values = self.feature_values intp_t n_missing = 0 - const unsigned char[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask + const uint8_t[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask # We are copying the values into an array and # finding min/max of the array in a manner which utilizes the cache more @@ -1184,7 +1184,7 @@ cdef class SparsePartitioner: cdef intp_t start cdef intp_t end cdef intp_t n_missing - cdef const unsigned char[::1] missing_values_in_feature_mask + cdef const uint8_t[::1] missing_values_in_feature_mask cdef const float32_t[::1] X_data cdef const int32_t[::1] X_indices @@ -1205,7 +1205,7 @@ cdef class SparsePartitioner: intp_t[::1] samples, intp_t n_samples, float32_t[::1] feature_values, - const unsigned char[::1] missing_values_in_feature_mask, + const uint8_t[::1] missing_values_in_feature_mask, ): if not (issparse(X) and X.format == "csc"): raise ValueError("X should be in csc format") @@ -1607,7 +1607,7 @@ cdef class BestSplitter(Splitter): object X, const float64_t[:, ::1] y, const float64_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, + const uint8_t[::1] missing_values_in_feature_mask, ) except -1: Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = DensePartitioner( @@ -1635,7 +1635,7 @@ cdef class BestSparseSplitter(Splitter): object X, const 
float64_t[:, ::1] y, const float64_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, + const uint8_t[::1] missing_values_in_feature_mask, ) except -1: Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = SparsePartitioner( @@ -1663,7 +1663,7 @@ cdef class RandomSplitter(Splitter): object X, const float64_t[:, ::1] y, const float64_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, + const uint8_t[::1] missing_values_in_feature_mask, ) except -1: Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = DensePartitioner( @@ -1691,7 +1691,7 @@ cdef class RandomSparseSplitter(Splitter): object X, const float64_t[:, ::1] y, const float64_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, + const uint8_t[::1] missing_values_in_feature_mask, ) except -1: Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = SparsePartitioner( diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 275b34f187e0f..831ca38a11148 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -6,7 +6,7 @@ import numpy as np cimport numpy as cnp -from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t +from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint8_t, uint32_t from ._splitter cimport Splitter from ._splitter cimport SplitRecord @@ -21,7 +21,7 @@ cdef struct Node: float64_t impurity # Impurity of the node (i.e., the value of the criterion) intp_t n_node_samples # Number of samples at the node float64_t weighted_n_node_samples # Weighted number of samples at the node - unsigned char missing_go_to_left # Whether features have missing values + uint8_t missing_go_to_left # Whether features have missing values cdef struct ParentInfo: @@ -58,7 +58,7 @@ cdef class Tree: intp_t feature, float64_t threshold, float64_t impurity, intp_t n_node_samples, float64_t weighted_n_node_samples, - unsigned char missing_go_to_left) except -1 nogil + uint8_t missing_go_to_left) except -1 nogil cdef int _resize(self, intp_t capacity) except -1 nogil cdef int _resize_c(self, intp_t capacity=*) except -1 nogil @@ -105,7 +105,7 @@ cdef class TreeBuilder: object X, const float64_t[:, ::1] y, const float64_t[:] sample_weight=*, - const unsigned char[::1] missing_values_in_feature_mask=*, + const uint8_t[::1] missing_values_in_feature_mask=*, ) cdef _check_input( diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 1dfacbf068767..43b7770131497 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -80,7 +80,7 @@ cdef class TreeBuilder: object X, const float64_t[:, ::1] y, const float64_t[:] sample_weight=None, - const unsigned char[::1] missing_values_in_feature_mask=None, + const uint8_t[::1] missing_values_in_feature_mask=None, ): """Build a decision tree from the training set (X, y).""" pass @@ -156,7 +156,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): object X, const float64_t[:, ::1] y, const float64_t[:] sample_weight=None, - const unsigned char[::1] missing_values_in_feature_mask=None, + const uint8_t[::1] missing_values_in_feature_mask=None, ): """Build a decision tree from the training set (X, y).""" @@ -411,7 +411,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): object X, const float64_t[:, ::1] y, const float64_t[:] sample_weight=None, - const unsigned char[::1] missing_values_in_feature_mask=None, + const uint8_t[::1] 
missing_values_in_feature_mask=None, ): """Build a decision tree from the training set (X, y).""" @@ -910,7 +910,7 @@ cdef class Tree: intp_t feature, float64_t threshold, float64_t impurity, intp_t n_node_samples, float64_t weighted_n_node_samples, - unsigned char missing_go_to_left) except -1 nogil: + uint8_t missing_go_to_left) except -1 nogil: """Add a node to the tree. The new node registers itself as the child of its parent. @@ -1578,7 +1578,7 @@ cdef class _CCPPruneController: """Save metrics when pruning""" pass - cdef void after_pruning(self, unsigned char[:] in_subtree) noexcept nogil: + cdef void after_pruning(self, uint8_t[:] in_subtree) noexcept nogil: """Called after pruning""" pass @@ -1597,7 +1597,7 @@ cdef class _AlphaPruner(_CCPPruneController): # less than or equal to self.ccp_alpha return self.ccp_alpha < effective_alpha - cdef void after_pruning(self, unsigned char[:] in_subtree) noexcept nogil: + cdef void after_pruning(self, uint8_t[:] in_subtree) noexcept nogil: """Updates the number of leaves in subtree""" for i in range(in_subtree.shape[0]): if in_subtree[i]: @@ -1627,7 +1627,7 @@ cdef struct CostComplexityPruningRecord: intp_t node_idx intp_t parent -cdef _cost_complexity_prune(unsigned char[:] leaves_in_subtree, # OUT +cdef _cost_complexity_prune(uint8_t[:] leaves_in_subtree, # OUT Tree orig_tree, _CCPPruneController controller): """Perform cost complexity pruning. @@ -1640,7 +1640,7 @@ cdef _cost_complexity_prune(unsigned char[:] leaves_in_subtree, # OUT Parameters ---------- - leaves_in_subtree : unsigned char[:] + leaves_in_subtree : uint8_t[:] Output for leaves of subtree orig_tree : Tree Original tree @@ -1674,10 +1674,9 @@ cdef _cost_complexity_prune(unsigned char[:] leaves_in_subtree, # OUT intp_t parent_idx # candidate nodes that can be pruned - unsigned char[:] candidate_nodes = np.zeros(shape=n_nodes, - dtype=np.uint8) + uint8_t[:] candidate_nodes = np.zeros(shape=n_nodes, dtype=np.uint8) # nodes in subtree - unsigned char[:] in_subtree = np.ones(shape=n_nodes, dtype=np.uint8) + uint8_t[:] in_subtree = np.ones(shape=n_nodes, dtype=np.uint8) intp_t pruned_branch_node_idx float64_t subtree_alpha float64_t effective_alpha @@ -1811,7 +1810,7 @@ def _build_pruned_tree_ccp( cdef: intp_t n_nodes = orig_tree.node_count - unsigned char[:] leaves_in_subtree = np.zeros( + uint8_t[:] leaves_in_subtree = np.zeros( shape=n_nodes, dtype=np.uint8) pruning_controller = _AlphaPruner(ccp_alpha=ccp_alpha) @@ -1843,7 +1842,7 @@ def ccp_pruning_path(Tree orig_tree): corresponding alpha value in ``ccp_alphas``. """ cdef: - unsigned char[:] leaves_in_subtree = np.zeros( + uint8_t[:] leaves_in_subtree = np.zeros( shape=orig_tree.node_count, dtype=np.uint8) path_finder = _PathFinder(orig_tree.node_count) @@ -1876,7 +1875,7 @@ cdef struct BuildPrunedRecord: cdef _build_pruned_tree( Tree tree, # OUT Tree orig_tree, - const unsigned char[:] leaves_in_subtree, + const uint8_t[:] leaves_in_subtree, intp_t capacity ): """Build a pruned tree. 
@@ -1890,7 +1889,7 @@ cdef _build_pruned_tree( Location to place the pruned tree orig_tree : Tree Original tree - leaves_in_subtree : unsigned char memoryview, shape=(node_count, ) + leaves_in_subtree : uint8_t memoryview, shape=(node_count, ) Boolean mask for leaves to include in subtree capacity : intp_t Number of nodes to initially allocate in pruned tree diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 014845e67a248..de16cc65b32a9 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -6,7 +6,7 @@ cimport numpy as cnp from ._tree cimport Node from ..neighbors._quad_tree cimport Cell -from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t +from ..utils._typedefs cimport float32_t, float64_t, intp_t, uint8_t, int32_t, uint32_t cdef enum: # Max value for our rand_r replacement (near the bottom). @@ -26,7 +26,7 @@ ctypedef fused realloc_ptr: # Add pointer types here as needed. (float32_t*) (intp_t*) - (unsigned char*) + (uint8_t*) (WeightedPQueueRecord*) (float64_t*) (float64_t**) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index a68c073ac2a89..c5e936ae48eb1 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -447,7 +447,7 @@ def _any_isnan_axis0(const float32_t[:, :] X): intp_t i, j intp_t n_samples = X.shape[0] intp_t n_features = X.shape[1] - unsigned char[::1] isnan_out = np.zeros(X.shape[1], dtype=np.bool_) + uint8_t[::1] isnan_out = np.zeros(X.shape[1], dtype=np.bool_) with nogil: for i in range(n_samples): From 21ab5e11dfb258a222fc7557915acdaf0b0cca1a Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Thu, 18 Jul 2024 20:43:15 +0500 Subject: [PATCH 26/35] FIX avoid error for metrics on polars series for numpy<1.21 (#29490) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Loïc Estève --- sklearn/metrics/tests/test_common.py | 17 +++++++++++++++++ sklearn/utils/_array_api.py | 10 +++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index b7fa3319b118c..a67a254c2f4e1 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -2039,3 +2039,20 @@ def yield_metric_checker_combinations(metric_checkers=array_api_metric_checkers) @pytest.mark.parametrize("metric, check_func", yield_metric_checker_combinations()) def test_array_api_compliance(metric, array_namespace, device, dtype_name, check_func): check_func(metric, array_namespace, device, dtype_name) + + +@pytest.mark.parametrize("df_lib_name", ["pandas", "polars"]) +@pytest.mark.parametrize("metric_name", sorted(ALL_METRICS)) +def test_metrics_dataframe_series(metric_name, df_lib_name): + df_lib = pytest.importorskip(df_lib_name) + + y_pred = df_lib.Series([0.0, 1.0, 0, 1.0]) + y_true = df_lib.Series([1.0, 0.0, 0.0, 0.0]) + + metric = ALL_METRICS[metric_name] + try: + expected_metric = metric(y_pred.to_numpy(), y_true.to_numpy()) + except ValueError: + pytest.skip(f"{metric_name} can not deal with 1d inputs") + + assert_allclose(metric(y_pred, y_true), expected_metric) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 63c14386c04e9..2f8e5dddd6868 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -439,7 +439,15 @@ def reshape(self, x, shape, *, copy=None): return numpy.reshape(x, shape) def isdtype(self, dtype, kind): - return isdtype(dtype, kind, xp=self) + try: + return isdtype(dtype, kind, xp=self) + 
except TypeError: + # In older versions of numpy, data types that arise from outside + # numpy like from a Polars Series raise a TypeError. + # e.g. TypeError: Cannot interpret 'Int64' as a data type. + # Therefore, we return False. + # TODO: Remove when minimum supported version of numpy is >= 1.21. + return False def pow(self, x1, x2): return numpy.power(x1, x2) From 9d39f57399d6f1f7d8e8d4351dbc3e9244b98d28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Morais?= <15629444+ojoaomorais@users.noreply.github.com> Date: Thu, 18 Jul 2024 13:27:50 -0300 Subject: [PATCH 27/35] Fixing NeighborhoodComponentAnalysis documentation. (#29441) --- sklearn/neighbors/_nca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 8e2bf87f7182c..f9756137eff24 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -56,8 +56,8 @@ class NeighborhoodComponentsAnalysis( - `'auto'` Depending on `n_components`, the most reasonable initialization - will be chosen. If `n_components <= n_classes` we use `'lda'`, as - it uses labels information. If not, but + is chosen. If `n_components <= min(n_features, n_classes - 1)` + we use `'lda'`, as it uses labels information. If not, but `n_components < min(n_features, n_samples)`, we use `'pca'`, as it projects data in meaningful directions (those of higher variance). Otherwise, we just use `'identity'`. From 1bcddcb723b5ecb4acdaf24ba1cc6cacf31540f7 Mon Sep 17 00:00:00 2001 From: "Farid \"Freddie\" Taba" Date: Fri, 19 Jul 2024 04:55:24 -0700 Subject: [PATCH 28/35] MAINT Deprecate scoring='max_error' and replace it by scoring='neg_max_error' (#29462) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Loïc Estève --- doc/modules/model_evaluation.rst | 2 +- doc/whats_new/v1.6.rst | 4 ++++ sklearn/metrics/_scorer.py | 25 +++++++++++++++++++-- sklearn/metrics/tests/test_score_objects.py | 12 +++++++++- 4 files changed, 39 insertions(+), 4 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 080ed0c63a58c..7c2314fc3a3a7 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -92,7 +92,7 @@ Scoring Function **Regression** 'explained_variance' :func:`metrics.explained_variance_score` -'max_error' :func:`metrics.max_error` +'neg_max_error' :func:`metrics.max_error` 'neg_mean_absolute_error' :func:`metrics.mean_absolute_error` 'neg_mean_squared_error' :func:`metrics.mean_squared_error` 'neg_root_mean_squared_error' :func:`metrics.root_mean_squared_error` diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index 0024d979eeb19..9f1b8514de4a0 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -219,6 +219,10 @@ Changelog :pr:`29210` by :user:`Marc Torrellas Socastro ` and :user:`Stefanie Senger `. +- |API| scoring="neg_max_error" should be used instead of + scoring="max_error" which is now deprecated. + :pr:`29462` by :user:`Farid "Freddie" Taba `. + :mod:`sklearn.model_selection` .............................. 
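For illustration only (not part of the patch): the deprecation entry above means the negated name is now the supported spelling, while the old name keeps working until 1.8 with a warning. The estimator and synthetic data below are arbitrary choices; the snippet simply mirrors the behaviour exercised by the new test in this patch:

    # Illustrative sketch, not part of the patch series.
    import warnings
    from sklearn.datasets import make_regression
    from sklearn.metrics import get_scorer
    from sklearn.model_selection import cross_val_score
    from sklearn.tree import DecisionTreeRegressor

    X, y = make_regression(n_samples=60, n_features=3, random_state=0)

    # Non-deprecated spelling: scores are <= 0 because max_error is negated
    # so that "greater is better" holds during model selection.
    scores = cross_val_score(
        DecisionTreeRegressor(random_state=0), X, y, scoring="neg_max_error", cv=3
    )

    # The old spelling still resolves to a scorer but emits a
    # DeprecationWarning when the scorer is called, as tested below.
    reg = DecisionTreeRegressor(random_state=0).fit(X, y)
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        get_scorer("max_error")(reg, X, y)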
diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index c1a916aa0b5f3..bbc1424c335fb 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -219,6 +219,8 @@ def __init__(self, score_func, sign, kwargs, response_method="predict"): self._sign = sign self._kwargs = kwargs self._response_method = response_method + # TODO (1.8): remove in 1.8 (scoring="max_error" has been deprecated in 1.6) + self._deprecation_msg = None def _get_pos_label(self): if "pos_label" in self._kwargs: @@ -270,6 +272,12 @@ def __call__(self, estimator, X, y_true, sample_weight=None, **kwargs): score : float Score function applied to prediction of estimator on X. """ + # TODO (1.8): remove in 1.8 (scoring="max_error" has been deprecated in 1.6) + if self._deprecation_msg is not None: + warnings.warn( + self._deprecation_msg, category=DeprecationWarning, stacklevel=2 + ) + _raise_for_params(kwargs, self, None) _kwargs = copy.deepcopy(kwargs) @@ -420,7 +428,12 @@ def get_scorer(scoring): """ if isinstance(scoring, str): try: - scorer = copy.deepcopy(_SCORERS[scoring]) + if scoring == "max_error": + # TODO (1.8): scoring="max_error" has been deprecated in 1.6, + # remove in 1.8 + scorer = max_error_scorer + else: + scorer = copy.deepcopy(_SCORERS[scoring]) except KeyError: raise ValueError( "%r is not a valid scoring value. " @@ -758,7 +771,15 @@ def make_scorer( # Standard regression scores explained_variance_scorer = make_scorer(explained_variance_score) r2_scorer = make_scorer(r2_score) +neg_max_error_scorer = make_scorer(max_error, greater_is_better=False) max_error_scorer = make_scorer(max_error, greater_is_better=False) +# TODO (1.8): remove in 1.8 (scoring="max_error" has been deprecated in 1.6) +deprecation_msg = ( + "Scoring method max_error was renamed to " + "neg_max_error in version 1.6 and will " + "be removed in 1.8." 
+) +max_error_scorer._deprecation_msg = deprecation_msg neg_mean_squared_error_scorer = make_scorer(mean_squared_error, greater_is_better=False) neg_mean_squared_log_error_scorer = make_scorer( mean_squared_log_error, greater_is_better=False @@ -867,7 +888,7 @@ def negative_likelihood_ratio(y_true, y_pred): _SCORERS = dict( explained_variance=explained_variance_scorer, r2=r2_scorer, - max_error=max_error_scorer, + neg_max_error=neg_max_error_scorer, matthews_corrcoef=matthews_corrcoef_scorer, neg_median_absolute_error=neg_median_absolute_error_scorer, neg_mean_absolute_error=neg_mean_absolute_error_scorer, diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 73bb008c47300..ac4bf731ee02c 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -78,7 +78,7 @@ "mean_absolute_percentage_error", "mean_squared_error", "median_absolute_error", - "max_error", + "neg_max_error", "neg_mean_poisson_deviance", "neg_mean_gamma_deviance", ] @@ -706,6 +706,16 @@ def test_scoring_is_not_metric(): check_scoring(KMeans(), scoring=cluster_module.rand_score) +def test_deprecated_scorer(): + X, y = make_regression(n_samples=10, n_features=1, random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + reg = DecisionTreeRegressor() + reg.fit(X_train, y_train) + deprecated_scorer = get_scorer("max_error") + with pytest.warns(DeprecationWarning): + deprecated_scorer(reg, X_test, y_test) + + @pytest.mark.parametrize( ( "scorers,expected_predict_count," From a4582c078237774518af096ee142b63d9c05e05b Mon Sep 17 00:00:00 2001 From: Michael Dawson Date: Fri, 19 Jul 2024 10:46:50 -0400 Subject: [PATCH 29/35] fix typo on `LeaveOneGroupOut` documentation (#29513) --- sklearn/model_selection/_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index bfd741eee5811..e78440a9099e8 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1279,7 +1279,7 @@ class LeaveOneGroupOut(GroupsConsumerMixin, BaseCrossValidator): Provides train/test indices to split data such that each training set is comprised of all samples except ones belonging to one specific group. - Arbitrary domain specific group information is provided an array integers + Arbitrary domain specific group information is provided as an array of integers that encodes the group of each sample. 
For instance the groups could be the year of collection of the samples From 70a84ea2db68d3cab1df30ed374445be2ac67dd4 Mon Sep 17 00:00:00 2001 From: Thomas Date: Fri, 19 Jul 2024 17:55:23 +0200 Subject: [PATCH 30/35] DOC Fix documentation example sparsefuncs_fast (#29526) (#29527) --- sklearn/utils/sparsefuncs_fast.pyx | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index dbba23c615d63..23261c59de320 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -489,7 +489,11 @@ def inplace_csr_row_normalize_l1(X): -------- >>> from scipy.sparse import csr_matrix >>> from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l1 - >>> X = csr_matrix(([1.0, 2.0, 3.0], [0, 2, 3], [0, 3, 4]), shape=(3, 4)) + >>> import numpy as np + >>> indptr = np.array([0, 2, 3, 4]) + >>> indices = np.array([0, 1, 2, 3]) + >>> data = np.array([1.0, 2.0, 3.0, 4.0]) + >>> X = csr_matrix((data, indices, indptr), shape=(3, 4)) >>> X.toarray() array([[1., 2., 0., 0.], [0., 0., 3., 0.], @@ -547,7 +551,11 @@ def inplace_csr_row_normalize_l2(X): -------- >>> from scipy.sparse import csr_matrix >>> from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l2 - >>> X = csr_matrix(([1.0, 2.0, 3.0], [0, 2, 3], [0, 3, 4]), shape=(3, 4)) + >>> import numpy as np + >>> indptr = np.array([0, 2, 3, 4]) + >>> indices = np.array([0, 1, 2, 3]) + >>> data = np.array([1.0, 2.0, 3.0, 4.0]) + >>> X = csr_matrix((data, indices, indptr), shape=(3, 4)) >>> X.toarray() array([[1., 2., 0., 0.], [0., 0., 3., 0.], From 3d5e243701efe3c6d8184c72779c14d4b9b8f9ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Sat, 20 Jul 2024 02:55:14 +0200 Subject: [PATCH 31/35] MAINT Remove assert_no_warnings from tests (#29525) --- sklearn/compose/tests/test_target.py | 12 +- sklearn/metrics/tests/test_classification.py | 212 ++++++++++--------- sklearn/preprocessing/tests/test_encoders.py | 5 +- sklearn/tests/test_base.py | 12 +- sklearn/utils/_testing.py | 2 - sklearn/utils/tests/test_testing.py | 46 ++-- sklearn/utils/tests/test_validation.py | 54 +++-- 7 files changed, 177 insertions(+), 166 deletions(-) diff --git a/sklearn/compose/tests/test_target.py b/sklearn/compose/tests/test_target.py index fd885459e76d1..f16ee5a31bf67 100644 --- a/sklearn/compose/tests/test_target.py +++ b/sklearn/compose/tests/test_target.py @@ -10,7 +10,7 @@ from sklearn.linear_model import LinearRegression, OrthogonalMatchingPursuit from sklearn.pipeline import Pipeline from sklearn.preprocessing import FunctionTransformer, StandardScaler -from sklearn.utils._testing import assert_allclose, assert_no_warnings +from sklearn.utils._testing import assert_allclose friedman = datasets.make_friedman1(random_state=0) @@ -66,17 +66,17 @@ def test_transform_target_regressor_invertible(): ) with pytest.warns( UserWarning, - match=( - "The provided functions or" - " transformer are not strictly inverse of each other." 
- ), + match=(r"The provided functions.* are not strictly inverse of each other"), ): regr.fit(X, y) regr = TransformedTargetRegressor( regressor=LinearRegression(), func=np.sqrt, inverse_func=np.log ) regr.set_params(check_inverse=False) - assert_no_warnings(regr.fit, X, y) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + regr.fit(X, y) def _check_standard_scaled(y, y_pred): diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index aa612f73ef5c7..d3a94a015ca9c 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -45,7 +45,6 @@ assert_almost_equal, assert_array_almost_equal, assert_array_equal, - assert_no_warnings, ignore_warnings, ) from sklearn.utils.extmath import _nanaverage @@ -266,24 +265,24 @@ def test_precision_recall_f1_score_binary(): # individual scoring function that can be used for grid search: in the # binary class case the score is the value of the measure for the positive # class (e.g. label == 1). This is deprecated for average != 'binary'. - for kwargs, my_assert in [ - ({}, assert_no_warnings), - ({"average": "binary"}, assert_no_warnings), - ]: - ps = my_assert(precision_score, y_true, y_pred, **kwargs) - assert_array_almost_equal(ps, 0.85, 2) + for kwargs in [{}, {"average": "binary"}]: + with warnings.catch_warnings(): + warnings.simplefilter("error") - rs = my_assert(recall_score, y_true, y_pred, **kwargs) - assert_array_almost_equal(rs, 0.68, 2) + ps = precision_score(y_true, y_pred, **kwargs) + assert_array_almost_equal(ps, 0.85, 2) - fs = my_assert(f1_score, y_true, y_pred, **kwargs) - assert_array_almost_equal(fs, 0.76, 2) + rs = recall_score(y_true, y_pred, **kwargs) + assert_array_almost_equal(rs, 0.68, 2) - assert_almost_equal( - my_assert(fbeta_score, y_true, y_pred, beta=2, **kwargs), - (1 + 2**2) * ps * rs / (2**2 * ps + rs), - 2, - ) + fs = f1_score(y_true, y_pred, **kwargs) + assert_array_almost_equal(fs, 0.76, 2) + + assert_almost_equal( + fbeta_score(y_true, y_pred, beta=2, **kwargs), + (1 + 2**2) * ps * rs / (2**2 * ps + rs), + 2, + ) @ignore_warnings @@ -1919,22 +1918,23 @@ def test_precision_recall_f1_no_labels(beta, average, zero_division): y_true = np.zeros((20, 3)) y_pred = np.zeros_like(y_true) - p, r, f, s = assert_no_warnings( - precision_recall_fscore_support, - y_true, - y_pred, - average=average, - beta=beta, - zero_division=zero_division, - ) - fbeta = assert_no_warnings( - fbeta_score, - y_true, - y_pred, - beta=beta, - average=average, - zero_division=zero_division, - ) + with warnings.catch_warnings(): + warnings.simplefilter("error") + + p, r, f, s = precision_recall_fscore_support( + y_true, + y_pred, + average=average, + beta=beta, + zero_division=zero_division, + ) + fbeta = fbeta_score( + y_true, + y_pred, + beta=beta, + average=average, + zero_division=zero_division, + ) assert s is None # if zero_division = nan, check that all metrics are nan and exit @@ -1984,17 +1984,20 @@ def test_precision_recall_f1_no_labels_average_none(zero_division): # |y_i| = [0, 0, 0] # |y_hat_i| = [0, 0, 0] - p, r, f, s = assert_no_warnings( - precision_recall_fscore_support, - y_true, - y_pred, - average=None, - beta=1.0, - zero_division=zero_division, - ) - fbeta = assert_no_warnings( - fbeta_score, y_true, y_pred, beta=1.0, average=None, zero_division=zero_division - ) + with warnings.catch_warnings(): + warnings.simplefilter("error") + + p, r, f, s = precision_recall_fscore_support( + y_true, + y_pred, + 
average=None, + beta=1.0, + zero_division=zero_division, + ) + fbeta = fbeta_score( + y_true, y_pred, beta=1.0, average=None, zero_division=zero_division + ) + zero_division = np.float64(zero_division) assert_array_almost_equal(p, [zero_division, zero_division, zero_division], 2) assert_array_almost_equal(r, [zero_division, zero_division, zero_division], 2) @@ -2138,59 +2141,57 @@ def test_prf_warnings(): @pytest.mark.parametrize("zero_division", [0, 1, np.nan]) def test_prf_no_warnings_if_zero_division_set(zero_division): - # average of per-label scores - f = precision_recall_fscore_support - for average in [None, "weighted", "macro"]: - assert_no_warnings( - f, [0, 1, 2], [1, 1, 2], average=average, zero_division=zero_division - ) + with warnings.catch_warnings(): + warnings.simplefilter("error") - assert_no_warnings( - f, [1, 1, 2], [0, 1, 2], average=average, zero_division=zero_division - ) + # average of per-label scores + for average in [None, "weighted", "macro"]: + precision_recall_fscore_support( + [0, 1, 2], [1, 1, 2], average=average, zero_division=zero_division + ) - # average of per-sample scores - assert_no_warnings( - f, - np.array([[1, 0], [1, 0]]), - np.array([[1, 0], [0, 0]]), - average="samples", - zero_division=zero_division, - ) + precision_recall_fscore_support( + [1, 1, 2], [0, 1, 2], average=average, zero_division=zero_division + ) - assert_no_warnings( - f, - np.array([[1, 0], [0, 0]]), - np.array([[1, 0], [1, 0]]), - average="samples", - zero_division=zero_division, - ) + # average of per-sample scores + precision_recall_fscore_support( + np.array([[1, 0], [1, 0]]), + np.array([[1, 0], [0, 0]]), + average="samples", + zero_division=zero_division, + ) - # single score: micro-average - assert_no_warnings( - f, - np.array([[1, 1], [1, 1]]), - np.array([[0, 0], [0, 0]]), - average="micro", - zero_division=zero_division, - ) + precision_recall_fscore_support( + np.array([[1, 0], [0, 0]]), + np.array([[1, 0], [1, 0]]), + average="samples", + zero_division=zero_division, + ) - assert_no_warnings( - f, - np.array([[0, 0], [0, 0]]), - np.array([[1, 1], [1, 1]]), - average="micro", - zero_division=zero_division, - ) + # single score: micro-average + precision_recall_fscore_support( + np.array([[1, 1], [1, 1]]), + np.array([[0, 0], [0, 0]]), + average="micro", + zero_division=zero_division, + ) - # single positive label - assert_no_warnings( - f, [1, 1], [-1, -1], average="binary", zero_division=zero_division - ) + precision_recall_fscore_support( + np.array([[0, 0], [0, 0]]), + np.array([[1, 1], [1, 1]]), + average="micro", + zero_division=zero_division, + ) - assert_no_warnings( - f, [-1, -1], [1, 1], average="binary", zero_division=zero_division - ) + # single positive label + precision_recall_fscore_support( + [1, 1], [-1, -1], average="binary", zero_division=zero_division + ) + + precision_recall_fscore_support( + [-1, -1], [1, 1], average="binary", zero_division=zero_division + ) with warnings.catch_warnings(record=True) as record: warnings.simplefilter("always") @@ -2202,13 +2203,16 @@ def test_prf_no_warnings_if_zero_division_set(zero_division): @pytest.mark.parametrize("zero_division", ["warn", 0, 1, np.nan]) def test_recall_warnings(zero_division): - assert_no_warnings( - recall_score, - np.array([[1, 1], [1, 1]]), - np.array([[0, 0], [0, 0]]), - average="micro", - zero_division=zero_division, - ) + with warnings.catch_warnings(): + warnings.simplefilter("error") + + recall_score( + np.array([[1, 1], [1, 1]]), + np.array([[0, 0], [0, 0]]), + average="micro", + 
zero_division=zero_division, + ) + with warnings.catch_warnings(record=True) as record: warnings.simplefilter("always") recall_score( @@ -2266,13 +2270,15 @@ def test_precision_warnings(zero_division): " this behavior." ) - assert_no_warnings( - precision_score, - np.array([[0, 0], [0, 0]]), - np.array([[1, 1], [1, 1]]), - average="micro", - zero_division=zero_division, - ) + with warnings.catch_warnings(): + warnings.simplefilter("error") + + precision_score( + np.array([[0, 0], [0, 0]]), + np.array([[1, 1], [1, 1]]), + average="micro", + zero_division=zero_division, + ) @pytest.mark.parametrize("zero_division", ["warn", 0, 1, np.nan]) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 05acc95cf1671..9db35c2091d2b 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1,4 +1,5 @@ import re +import warnings import numpy as np import pytest @@ -796,7 +797,9 @@ def test_encoder_dtypes_pandas(): def test_one_hot_encoder_warning(): enc = OneHotEncoder() X = [["Male", 1], ["Female", 3]] - np.testing.assert_no_warnings(enc.fit_transform, X) + with warnings.catch_warnings(): + warnings.simplefilter("error") + enc.fit_transform(X) @pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")]) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index ab842e425456a..21f5ef02ab9f4 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -34,7 +34,6 @@ from sklearn.utils._testing import ( _convert_container, assert_array_equal, - assert_no_warnings, ignore_warnings, ) @@ -472,7 +471,10 @@ def test_pickle_version_warning_is_not_raised_with_matching_version(): tree = DecisionTreeClassifier().fit(iris.data, iris.target) tree_pickle = pickle.dumps(tree) assert b"_sklearn_version" in tree_pickle - tree_restored = assert_no_warnings(pickle.loads, tree_pickle) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + tree_restored = pickle.loads(tree_pickle) # test that we can predict with the restored decision tree classifier score_of_original = tree.score(iris.data, iris.target) @@ -542,7 +544,11 @@ def test_pickle_version_no_warning_is_issued_with_non_sklearn_estimator(): try: module_backup = TreeNoVersion.__module__ TreeNoVersion.__module__ = "notsklearn" - assert_no_warnings(pickle.loads, tree_pickle_noversion) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + + pickle.loads(tree_pickle_noversion) finally: TreeNoVersion.__module__ = module_backup diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 94289132b51d3..7b62c622b16ca 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -32,7 +32,6 @@ assert_array_almost_equal, assert_array_equal, assert_array_less, - assert_no_warnings, ) import sklearn @@ -61,7 +60,6 @@ "assert_approx_equal", "assert_allclose", "assert_run_python_script_without_output", - "assert_no_warnings", "SkipTest", ] diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py index 345012592b7b3..14bc62e206c7e 100644 --- a/sklearn/utils/tests/test_testing.py +++ b/sklearn/utils/tests/test_testing.py @@ -1,6 +1,5 @@ import atexit import os -import unittest import warnings import numpy as np @@ -16,7 +15,6 @@ _get_warnings_filters_info_list, assert_allclose, assert_allclose_dense_sparse, - assert_no_warnings, assert_raise_message, assert_raises, assert_raises_regex, @@ -124,8 +122,12 @@ def _multiple_warning_function(): 
warnings.warn("deprecation warning") # Check the function directly - assert_no_warnings(ignore_warnings(_warning_function)) - assert_no_warnings(ignore_warnings(_warning_function, category=DeprecationWarning)) + with warnings.catch_warnings(): + warnings.simplefilter("error") + + ignore_warnings(_warning_function) + ignore_warnings(_warning_function, category=DeprecationWarning) + with pytest.warns(DeprecationWarning): ignore_warnings(_warning_function, category=UserWarning)() @@ -140,9 +142,10 @@ def _multiple_warning_function(): assert len(record) == 1 assert isinstance(record[0].message, DeprecationWarning) - assert_no_warnings( + with warnings.catch_warnings(): + warnings.simplefilter("error") + ignore_warnings(_warning_function, category=(DeprecationWarning, UserWarning)) - ) # Check the decorator @ignore_warnings @@ -170,9 +173,13 @@ def decorator_no_deprecation_multiple_warning(): def decorator_no_user_multiple_warning(): _multiple_warning_function() - assert_no_warnings(decorator_no_warning) - assert_no_warnings(decorator_no_warning_multiple) - assert_no_warnings(decorator_no_deprecation_warning) + with warnings.catch_warnings(): + warnings.simplefilter("error") + + decorator_no_warning() + decorator_no_warning_multiple() + decorator_no_deprecation_warning() + with pytest.warns(DeprecationWarning): decorator_no_user_warning() with pytest.warns(UserWarning): @@ -205,9 +212,13 @@ def context_manager_no_user_multiple_warning(): with ignore_warnings(category=UserWarning): _multiple_warning_function() - assert_no_warnings(context_manager_no_warning) - assert_no_warnings(context_manager_no_warning_multiple) - assert_no_warnings(context_manager_no_deprecation_warning) + with warnings.catch_warnings(): + warnings.simplefilter("error") + + context_manager_no_warning() + context_manager_no_warning_multiple() + context_manager_no_deprecation_warning() + with pytest.warns(DeprecationWarning): context_manager_no_user_warning() with pytest.warns(UserWarning): @@ -230,17 +241,6 @@ def test(): pass -class TestWarns(unittest.TestCase): - def test_warn(self): - def f(): - warnings.warn("yo") - return 3 - - with pytest.raises(AssertionError): - assert_no_warnings(f) - assert assert_no_warnings(lambda x: x, 1) == 1 - - # Tests for docstrings: diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 5bde51ae514d9..ef205df5260bb 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -45,7 +45,6 @@ assert_allclose, assert_allclose_dense_sparse, assert_array_equal, - assert_no_warnings, create_memmap_backed_data, ignore_warnings, skip_if_array_api_compat_not_configured, @@ -605,39 +604,38 @@ def test_check_array_dtype_warning(): X_csc_int32 = sp.csc_matrix(X_int64, dtype=np.int32) integer_data = [X_int64, X_csc_int32] float32_data = [X_float32, X_csr_float32, X_csc_float32] - for X in integer_data: - X_checked = assert_no_warnings( - check_array, X, dtype=np.float64, accept_sparse=True - ) - assert X_checked.dtype == np.float64 + with warnings.catch_warnings(): + warnings.simplefilter("error") - for X in float32_data: - X_checked = assert_no_warnings( - check_array, X, dtype=[np.float64, np.float32], accept_sparse=True - ) - assert X_checked.dtype == np.float32 - assert X_checked is X + for X in integer_data: + X_checked = check_array(X, dtype=np.float64, accept_sparse=True) + assert X_checked.dtype == np.float64 - X_checked = assert_no_warnings( - check_array, - X, + for X in float32_data: + X_checked = check_array( + 
X, dtype=[np.float64, np.float32], accept_sparse=True + ) + assert X_checked.dtype == np.float32 + assert X_checked is X + + X_checked = check_array( + X, + dtype=[np.float64, np.float32], + accept_sparse=["csr", "dok"], + copy=True, + ) + assert X_checked.dtype == np.float32 + assert X_checked is not X + + X_checked = check_array( + X_csc_float32, dtype=[np.float64, np.float32], accept_sparse=["csr", "dok"], - copy=True, + copy=False, ) assert X_checked.dtype == np.float32 - assert X_checked is not X - - X_checked = assert_no_warnings( - check_array, - X_csc_float32, - dtype=[np.float64, np.float32], - accept_sparse=["csr", "dok"], - copy=False, - ) - assert X_checked.dtype == np.float32 - assert X_checked is not X_csc_float32 - assert X_checked.format == "csr" + assert X_checked is not X_csc_float32 + assert X_checked.format == "csr" def test_check_array_accept_sparse_type_exception(): From c55d06452c2b54fdb707079767afd3d43291ebbf Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Sat, 20 Jul 2024 02:53:10 -0600 Subject: [PATCH 32/35] ENH Adjust estimator representation beyond `maxlevels` (#29492) Co-authored-by: Adam Li --- sklearn/utils/_pprint.py | 2 +- sklearn/utils/tests/test_pprint.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/_pprint.py b/sklearn/utils/_pprint.py index 9b33cd617a5fc..5045300357306 100644 --- a/sklearn/utils/_pprint.py +++ b/sklearn/utils/_pprint.py @@ -427,7 +427,7 @@ def _safe_repr(object, context, maxlevels, level, changed_only=False): if issubclass(typ, BaseEstimator): objid = id(object) if maxlevels and level >= maxlevels: - return "{...}", False, objid in context + return f"{typ.__name__}(...)", False, objid in context if objid in context: return pprint._recursion(object), False, True context[objid] = 1 diff --git a/sklearn/utils/tests/test_pprint.py b/sklearn/utils/tests/test_pprint.py index ec48c4a012574..bef5836910787 100644 --- a/sklearn/utils/tests/test_pprint.py +++ b/sklearn/utils/tests/test_pprint.py @@ -2,6 +2,7 @@ from pprint import PrettyPrinter import numpy as np +import pytest from sklearn.utils._pprint import _EstimatorPrettyPrinter from sklearn.linear_model import LogisticRegressionCV @@ -346,6 +347,24 @@ def test_deeply_nested(print_changed_only_false): assert rfe.__repr__() == expected +@pytest.mark.parametrize( + ("print_changed_only", "expected"), + [ + (True, "RFE(estimator=RFE(...))"), + ( + False, + "RFE(estimator=RFE(...), n_features_to_select=None, step=1, verbose=0)", + ), + ], +) +def test_print_estimator_max_depth(print_changed_only, expected): + with config_context(print_changed_only=print_changed_only): + pp = _EstimatorPrettyPrinter(depth=1) + + rfe = RFE(RFE(RFE(RFE(RFE(LogisticRegression()))))) + assert pp.pformat(rfe) == expected + + def test_gridsearch(print_changed_only_false): # render a gridsearch param_grid = [ From 215be2ede050995d4b6fb00b5ef29571b4c71c50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Sat, 20 Jul 2024 18:39:56 +0200 Subject: [PATCH 33/35] MAINT Clean-up some testing utils (#29528) --- .../tests/test_coordinate_descent.py | 4 +- sklearn/tests/test_base.py | 4 +- sklearn/utils/_testing.py | 56 ------------------- sklearn/utils/estimator_checks.py | 6 +- sklearn/utils/tests/test_testing.py | 48 ---------------- 5 files changed, 5 insertions(+), 113 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 
6e2b35a5590cb..eff3dd34f70a2 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -264,9 +264,7 @@ def test_lasso_cv(): ) # check that they also give a similar MSE mse_lars = interpolate.interp1d(lars.cv_alphas_, lars.mse_path_.T) - np.testing.assert_approx_equal( - mse_lars(clf.alphas_[5]).mean(), clf.mse_path_[5].mean(), significant=2 - ) + assert_allclose(mse_lars(clf.alphas_[5]).mean(), clf.mse_path_[5].mean(), rtol=1e-2) # test set assert clf.score(X_test, y_test) > 0.99 diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 21f5ef02ab9f4..c197f1e01b702 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -326,8 +326,8 @@ def test_set_params(): # we don't currently catch if the things in pipeline are estimators # bad_pipeline = Pipeline([("bad", NoEstimator())]) - # assert_raises(AttributeError, bad_pipeline.set_params, - # bad__stupid_param=True) + # with pytest.raises(AttributeError): + # bad_pipeline.set_params(bad__stupid_param=True) def test_set_params_passes_all_parameters(): diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 7b62c622b16ca..961091e4af71a 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -20,7 +20,6 @@ from functools import wraps from inspect import signature from subprocess import STDOUT, CalledProcessError, TimeoutExpired, check_output -from unittest import TestCase import joblib import numpy as np @@ -28,7 +27,6 @@ from numpy.testing import assert_allclose as np_assert_allclose from numpy.testing import ( assert_almost_equal, - assert_approx_equal, assert_array_almost_equal, assert_array_equal, assert_array_less, @@ -51,28 +49,16 @@ ) __all__ = [ - "assert_raises", - "assert_raises_regexp", "assert_array_equal", "assert_almost_equal", "assert_array_almost_equal", "assert_array_less", - "assert_approx_equal", "assert_allclose", "assert_run_python_script_without_output", "SkipTest", ] -_dummy = TestCase("__init__") -assert_raises = _dummy.assertRaises SkipTest = unittest.case.SkipTest -assert_dict_equal = _dummy.assertDictEqual - -assert_raises_regex = _dummy.assertRaisesRegex -# assert_raises_regexp is deprecated in Python 3.4 in favor of -# assert_raises_regex but lets keep the backward compat in scikit-learn with -# the old name for now -assert_raises_regexp = assert_raises_regex def ignore_warnings(obj=None, category=Warning): @@ -176,48 +162,6 @@ def __exit__(self, *exc_info): self.log[:] = [] -def assert_raise_message(exceptions, message, function, *args, **kwargs): - """Helper function to test the message raised in an exception. - - Given an exception, a callable to raise the exception, and - a message string, tests that the correct exception is raised and - that the message is a substring of the error thrown. Used to test - that the specific message thrown during an exception is correct. - - Parameters - ---------- - exceptions : exception or tuple of exception - An Exception object. - - message : str - The error message or a substring of the error message. - - function : callable - Callable object to raise error. - - *args : the positional arguments to `function`. - - **kwargs : the keyword arguments to `function`. - """ - try: - function(*args, **kwargs) - except exceptions as e: - error_message = str(e) - if message not in error_message: - raise AssertionError( - "Error message does not include the expected" - " string: %r. 
Observed error message: %r" % (message, error_message) - ) - else: - # concatenate exception names - if isinstance(exceptions, tuple): - names = " or ".join(e.__name__ for e in exceptions) - else: - names = exceptions.__name__ - - raise AssertionError("%s not raised by %s" % (names, function.__name__)) - - def assert_allclose( actual, desired, rtol=None, atol=0.0, equal_nan=True, err_msg="", verbose=True ): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 422a23bb5ef72..1eb920f90643a 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -75,7 +75,6 @@ assert_array_almost_equal, assert_array_equal, assert_array_less, - assert_raise_message, create_memmap_backed_data, ignore_warnings, raises, @@ -1489,9 +1488,8 @@ def check_fit2d_predict1d(name, estimator_orig): for method in ["predict", "transform", "decision_function", "predict_proba"]: if hasattr(estimator, method): - assert_raise_message( - ValueError, "Reshape your data", getattr(estimator, method), X[0] - ) + with raises(ValueError, match="Reshape your data"): + getattr(estimator, method)(X[0]) def _apply_on_subsets(func, X): diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py index 14bc62e206c7e..4e7a40dae1222 100644 --- a/sklearn/utils/tests/test_testing.py +++ b/sklearn/utils/tests/test_testing.py @@ -15,9 +15,6 @@ _get_warnings_filters_info_list, assert_allclose, assert_allclose_dense_sparse, - assert_raise_message, - assert_raises, - assert_raises_regex, assert_run_python_script_without_output, check_docstring_parameters, create_memmap_backed_data, @@ -66,51 +63,6 @@ def test_assert_allclose_dense_sparse(csr_container): assert_allclose_dense_sparse(B, A) -def test_assert_raises_msg(): - with assert_raises_regex(AssertionError, "Hello world"): - with assert_raises(ValueError, msg="Hello world"): - pass - - -def test_assert_raise_message(): - def _raise_ValueError(message): - raise ValueError(message) - - def _no_raise(): - pass - - assert_raise_message(ValueError, "test", _raise_ValueError, "test") - - assert_raises( - AssertionError, - assert_raise_message, - ValueError, - "something else", - _raise_ValueError, - "test", - ) - - assert_raises( - ValueError, - assert_raise_message, - TypeError, - "something else", - _raise_ValueError, - "test", - ) - - assert_raises(AssertionError, assert_raise_message, ValueError, "test", _no_raise) - - # multiple exceptions in a tuple - assert_raises( - AssertionError, - assert_raise_message, - (ValueError, AttributeError), - "test", - _no_raise, - ) - - def test_ignore_warning(): # This check that ignore_warning decorator and context manager are working # as expected From c3fed503f712ada7d849fdf87dd088648106ca8d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 20 Jul 2024 16:23:52 -0400 Subject: [PATCH 34/35] MAINT Pull apart Splitter and Partitioner in the sklearn/tree code (#29458) Signed-off-by: Adam Li --- sklearn/tree/_partitioner.pxd | 178 +++++++ sklearn/tree/_partitioner.pyx | 816 +++++++++++++++++++++++++++++++ sklearn/tree/_splitter.pxd | 6 +- sklearn/tree/_splitter.pyx | 878 ++-------------------------------- sklearn/tree/meson.build | 3 + 5 files changed, 1035 insertions(+), 846 deletions(-) create mode 100644 sklearn/tree/_partitioner.pxd create mode 100644 sklearn/tree/_partitioner.pyx diff --git a/sklearn/tree/_partitioner.pxd b/sklearn/tree/_partitioner.pxd new file mode 100644 index 0000000000000..4ddd2a9cf9eb6 --- /dev/null +++ b/sklearn/tree/_partitioner.pxd @@ -0,0 
+1,178 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# See _partitioner.pyx for details. + +from ..utils._typedefs cimport ( + float32_t, float64_t, int8_t, int32_t, intp_t, uint8_t, uint32_t +) +from ._splitter cimport SplitRecord + + +# Mitigate precision differences between 32 bit and 64 bit +cdef float32_t FEATURE_THRESHOLD = 1e-7 + + +# We provide here the abstract interfact for a Partitioner that would be +# theoretically shared between the Dense and Sparse partitioners. However, +# we leave it commented out for now as it is not used in the current +# implementation due to the performance hit from vtable lookups when using +# inheritance based polymorphism. It is left here for future reference. +# +# Note: Instead, in `_splitter.pyx`, we define a fused type that can be used +# to represent both the dense and sparse partitioners. +# +# cdef class BasePartitioner: +# cdef intp_t[::1] samples +# cdef float32_t[::1] feature_values +# cdef intp_t start +# cdef intp_t end +# cdef intp_t n_missing +# cdef const uint8_t[::1] missing_values_in_feature_mask + +# cdef void sort_samples_and_feature_values( +# self, intp_t current_feature +# ) noexcept nogil +# cdef void init_node_split( +# self, +# intp_t start, +# intp_t end +# ) noexcept nogil +# cdef void find_min_max( +# self, +# intp_t current_feature, +# float32_t* min_feature_value_out, +# float32_t* max_feature_value_out, +# ) noexcept nogil +# cdef void next_p( +# self, +# intp_t* p_prev, +# intp_t* p +# ) noexcept nogil +# cdef intp_t partition_samples( +# self, +# float64_t current_threshold +# ) noexcept nogil +# cdef void partition_samples_final( +# self, +# intp_t best_pos, +# float64_t best_threshold, +# intp_t best_feature, +# intp_t n_missing, +# ) noexcept nogil + + +cdef class DensePartitioner: + """Partitioner specialized for dense data. + + Note that this partitioner is agnostic to the splitting strategy (best vs. random). + """ + cdef const float32_t[:, :] X + cdef intp_t[::1] samples + cdef float32_t[::1] feature_values + cdef intp_t start + cdef intp_t end + cdef intp_t n_missing + cdef const uint8_t[::1] missing_values_in_feature_mask + + cdef void sort_samples_and_feature_values( + self, intp_t current_feature + ) noexcept nogil + cdef void init_node_split( + self, + intp_t start, + intp_t end + ) noexcept nogil + cdef void find_min_max( + self, + intp_t current_feature, + float32_t* min_feature_value_out, + float32_t* max_feature_value_out, + ) noexcept nogil + cdef void next_p( + self, + intp_t* p_prev, + intp_t* p + ) noexcept nogil + cdef intp_t partition_samples( + self, + float64_t current_threshold + ) noexcept nogil + cdef void partition_samples_final( + self, + intp_t best_pos, + float64_t best_threshold, + intp_t best_feature, + intp_t n_missing, + ) noexcept nogil + + +cdef class SparsePartitioner: + """Partitioner specialized for sparse CSC data. + + Note that this partitioner is agnostic to the splitting strategy (best vs. random). 
+ """ + cdef const float32_t[::1] X_data + cdef const int32_t[::1] X_indices + cdef const int32_t[::1] X_indptr + cdef intp_t n_total_samples + cdef intp_t[::1] index_to_samples + cdef intp_t[::1] sorted_samples + cdef intp_t start_positive + cdef intp_t end_negative + cdef bint is_samples_sorted + + cdef intp_t[::1] samples + cdef float32_t[::1] feature_values + cdef intp_t start + cdef intp_t end + cdef intp_t n_missing + cdef const uint8_t[::1] missing_values_in_feature_mask + + cdef void sort_samples_and_feature_values( + self, intp_t current_feature + ) noexcept nogil + cdef void init_node_split( + self, + intp_t start, + intp_t end + ) noexcept nogil + cdef void find_min_max( + self, + intp_t current_feature, + float32_t* min_feature_value_out, + float32_t* max_feature_value_out, + ) noexcept nogil + cdef void next_p( + self, + intp_t* p_prev, + intp_t* p + ) noexcept nogil + cdef intp_t partition_samples( + self, + float64_t current_threshold + ) noexcept nogil + cdef void partition_samples_final( + self, + intp_t best_pos, + float64_t best_threshold, + intp_t best_feature, + intp_t n_missing, + ) noexcept nogil + + cdef void extract_nnz( + self, + intp_t feature + ) noexcept nogil + cdef intp_t _partition( + self, + float64_t threshold, + intp_t zero_pos + ) noexcept nogil + + +cdef void shift_missing_values_to_left_if_required( + SplitRecord* best, + intp_t[::1] samples, + intp_t end, +) noexcept nogil diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx new file mode 100644 index 0000000000000..57801c3f279ed --- /dev/null +++ b/sklearn/tree/_partitioner.pyx @@ -0,0 +1,816 @@ +"""Partition samples in the construction of a tree. + +This module contains the algorithms for moving sample indices to +the left and right child node given a split determined by the +splitting algorithm in `_splitter.pyx`. + +Partitioning is done in a way that is efficient for both dense data, +and sparse data stored in a Compressed Sparse Column (CSC) format. +""" +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from cython cimport final +from libc.math cimport isnan, log +from libc.stdlib cimport qsort +from libc.string cimport memcpy + +import numpy as np +from scipy.sparse import issparse + + +# Constant to switch between algorithm non zero value extract algorithm +# in SparsePartitioner +cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 + +# Allow for 32 bit float comparisons +cdef float32_t INFINITY_32t = np.inf + + +@final +cdef class DensePartitioner: + """Partitioner specialized for dense data. + + Note that this partitioner is agnostic to the splitting strategy (best vs. random). + """ + def __init__( + self, + const float32_t[:, :] X, + intp_t[::1] samples, + float32_t[::1] feature_values, + const uint8_t[::1] missing_values_in_feature_mask, + ): + self.X = X + self.samples = samples + self.feature_values = feature_values + self.missing_values_in_feature_mask = missing_values_in_feature_mask + + cdef inline void init_node_split(self, intp_t start, intp_t end) noexcept nogil: + """Initialize splitter at the beginning of node_split.""" + self.start = start + self.end = end + self.n_missing = 0 + + cdef inline void sort_samples_and_feature_values( + self, intp_t current_feature + ) noexcept nogil: + """Simultaneously sort based on the feature_values. + + Missing values are stored at the end of feature_values. + The number of missing values observed in feature_values is stored + in self.n_missing. 
+ """ + cdef: + intp_t i, current_end + float32_t[::1] feature_values = self.feature_values + const float32_t[:, :] X = self.X + intp_t[::1] samples = self.samples + intp_t n_missing = 0 + const uint8_t[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask + + # Sort samples along that feature; by copying the values into an array and + # sorting the array in a manner which utilizes the cache more effectively. + if missing_values_in_feature_mask is not None and missing_values_in_feature_mask[current_feature]: + i, current_end = self.start, self.end - 1 + # Missing values are placed at the end and do not participate in the sorting. + while i <= current_end: + # Finds the right-most value that is not missing so that + # it can be swapped with missing values at its left. + if isnan(X[samples[current_end], current_feature]): + n_missing += 1 + current_end -= 1 + continue + + # X[samples[current_end], current_feature] is a non-missing value + if isnan(X[samples[i], current_feature]): + samples[i], samples[current_end] = samples[current_end], samples[i] + n_missing += 1 + current_end -= 1 + + feature_values[i] = X[samples[i], current_feature] + i += 1 + else: + # When there are no missing values, we only need to copy the data into + # feature_values + for i in range(self.start, self.end): + feature_values[i] = X[samples[i], current_feature] + + sort(&feature_values[self.start], &samples[self.start], self.end - self.start - n_missing) + self.n_missing = n_missing + + cdef inline void find_min_max( + self, + intp_t current_feature, + float32_t* min_feature_value_out, + float32_t* max_feature_value_out, + ) noexcept nogil: + """Find the minimum and maximum value for current_feature. + + Missing values are stored at the end of feature_values. The number of missing + values observed in feature_values is stored in self.n_missing. + """ + cdef: + intp_t p, current_end + float32_t current_feature_value + const float32_t[:, :] X = self.X + intp_t[::1] samples = self.samples + float32_t min_feature_value = INFINITY_32t + float32_t max_feature_value = -INFINITY_32t + float32_t[::1] feature_values = self.feature_values + intp_t n_missing = 0 + const uint8_t[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask + + # We are copying the values into an array and finding min/max of the array in + # a manner which utilizes the cache more effectively. We need to also count + # the number of missing-values there are. + if missing_values_in_feature_mask is not None and missing_values_in_feature_mask[current_feature]: + p, current_end = self.start, self.end - 1 + # Missing values are placed at the end and do not participate in the + # min/max calculation. + while p <= current_end: + # Finds the right-most value that is not missing so that + # it can be swapped with missing values towards its left. 
+ if isnan(X[samples[current_end], current_feature]): + n_missing += 1 + current_end -= 1 + continue + + # X[samples[current_end], current_feature] is a non-missing value + if isnan(X[samples[p], current_feature]): + samples[p], samples[current_end] = samples[current_end], samples[p] + n_missing += 1 + current_end -= 1 + + current_feature_value = X[samples[p], current_feature] + feature_values[p] = current_feature_value + if current_feature_value < min_feature_value: + min_feature_value = current_feature_value + elif current_feature_value > max_feature_value: + max_feature_value = current_feature_value + p += 1 + else: + min_feature_value = X[samples[self.start], current_feature] + max_feature_value = min_feature_value + + feature_values[self.start] = min_feature_value + for p in range(self.start + 1, self.end): + current_feature_value = X[samples[p], current_feature] + feature_values[p] = current_feature_value + + if current_feature_value < min_feature_value: + min_feature_value = current_feature_value + elif current_feature_value > max_feature_value: + max_feature_value = current_feature_value + + min_feature_value_out[0] = min_feature_value + max_feature_value_out[0] = max_feature_value + self.n_missing = n_missing + + cdef inline void next_p(self, intp_t* p_prev, intp_t* p) noexcept nogil: + """Compute the next p_prev and p for iteratiing over feature values. + + The missing values are not included when iterating through the feature values. + """ + cdef: + float32_t[::1] feature_values = self.feature_values + intp_t end_non_missing = self.end - self.n_missing + + while ( + p[0] + 1 < end_non_missing and + feature_values[p[0] + 1] <= feature_values[p[0]] + FEATURE_THRESHOLD + ): + p[0] += 1 + + p_prev[0] = p[0] + + # By adding 1, we have + # (feature_values[p] >= end) or (feature_values[p] > feature_values[p - 1]) + p[0] += 1 + + cdef inline intp_t partition_samples( + self, + float64_t current_threshold + ) noexcept nogil: + """Partition samples for feature_values at the current_threshold.""" + cdef: + intp_t p = self.start + intp_t partition_end = self.end + intp_t[::1] samples = self.samples + float32_t[::1] feature_values = self.feature_values + + while p < partition_end: + if feature_values[p] <= current_threshold: + p += 1 + else: + partition_end -= 1 + + feature_values[p], feature_values[partition_end] = ( + feature_values[partition_end], feature_values[p] + ) + samples[p], samples[partition_end] = samples[partition_end], samples[p] + + return partition_end + + cdef inline void partition_samples_final( + self, + intp_t best_pos, + float64_t best_threshold, + intp_t best_feature, + intp_t best_n_missing, + ) noexcept nogil: + """Partition samples for X at the best_threshold and best_feature. + + If missing values are present, this method partitions `samples` + so that the `best_n_missing` missing values' indices are in the + right-most end of `samples`, that is `samples[end_non_missing:end]`. 
+ """ + cdef: + # Local invariance: start <= p <= partition_end <= end + intp_t start = self.start + intp_t p = start + intp_t end = self.end - 1 + intp_t partition_end = end - best_n_missing + intp_t[::1] samples = self.samples + const float32_t[:, :] X = self.X + float32_t current_value + + if best_n_missing != 0: + # Move samples with missing values to the end while partitioning the + # non-missing samples + while p < partition_end: + # Keep samples with missing values at the end + if isnan(X[samples[end], best_feature]): + end -= 1 + continue + + # Swap sample with missing values with the sample at the end + current_value = X[samples[p], best_feature] + if isnan(current_value): + samples[p], samples[end] = samples[end], samples[p] + end -= 1 + + # The swapped sample at the end is always a non-missing value, so + # we can continue the algorithm without checking for missingness. + current_value = X[samples[p], best_feature] + + # Partition the non-missing samples + if current_value <= best_threshold: + p += 1 + else: + samples[p], samples[partition_end] = samples[partition_end], samples[p] + partition_end -= 1 + else: + # Partitioning routine when there are no missing values + while p < partition_end: + if X[samples[p], best_feature] <= best_threshold: + p += 1 + else: + samples[p], samples[partition_end] = samples[partition_end], samples[p] + partition_end -= 1 + + +@final +cdef class SparsePartitioner: + """Partitioner specialized for sparse CSC data. + + Note that this partitioner is agnostic to the splitting strategy (best vs. random). + """ + def __init__( + self, + object X, + intp_t[::1] samples, + intp_t n_samples, + float32_t[::1] feature_values, + const uint8_t[::1] missing_values_in_feature_mask, + ): + if not (issparse(X) and X.format == "csc"): + raise ValueError("X should be in csc format") + + self.samples = samples + self.feature_values = feature_values + + # Initialize X + cdef intp_t n_total_samples = X.shape[0] + + self.X_data = X.data + self.X_indices = X.indices + self.X_indptr = X.indptr + self.n_total_samples = n_total_samples + + # Initialize auxiliary array used to perform split + self.index_to_samples = np.full(n_total_samples, fill_value=-1, dtype=np.intp) + self.sorted_samples = np.empty(n_samples, dtype=np.intp) + + cdef intp_t p + for p in range(n_samples): + self.index_to_samples[samples[p]] = p + + self.missing_values_in_feature_mask = missing_values_in_feature_mask + + cdef inline void init_node_split(self, intp_t start, intp_t end) noexcept nogil: + """Initialize splitter at the beginning of node_split.""" + self.start = start + self.end = end + self.is_samples_sorted = 0 + self.n_missing = 0 + + cdef inline void sort_samples_and_feature_values( + self, + intp_t current_feature + ) noexcept nogil: + """Simultaneously sort based on the feature_values.""" + cdef: + float32_t[::1] feature_values = self.feature_values + intp_t[::1] index_to_samples = self.index_to_samples + intp_t[::1] samples = self.samples + + self.extract_nnz(current_feature) + # Sort the positive and negative parts of `feature_values` + sort(&feature_values[self.start], &samples[self.start], self.end_negative - self.start) + if self.start_positive < self.end: + sort( + &feature_values[self.start_positive], + &samples[self.start_positive], + self.end - self.start_positive + ) + + # Update index_to_samples to take into account the sort + for p in range(self.start, self.end_negative): + index_to_samples[samples[p]] = p + for p in range(self.start_positive, self.end): + 
index_to_samples[samples[p]] = p + + # Add one or two zeros in feature_values, if there is any + if self.end_negative < self.start_positive: + self.start_positive -= 1 + feature_values[self.start_positive] = 0. + + if self.end_negative != self.start_positive: + feature_values[self.end_negative] = 0. + self.end_negative += 1 + + # XXX: When sparse supports missing values, this should be set to the + # number of missing values for current_feature + self.n_missing = 0 + + cdef inline void find_min_max( + self, + intp_t current_feature, + float32_t* min_feature_value_out, + float32_t* max_feature_value_out, + ) noexcept nogil: + """Find the minimum and maximum value for current_feature.""" + cdef: + intp_t p + float32_t current_feature_value, min_feature_value, max_feature_value + float32_t[::1] feature_values = self.feature_values + + self.extract_nnz(current_feature) + + if self.end_negative != self.start_positive: + # There is a zero + min_feature_value = 0 + max_feature_value = 0 + else: + min_feature_value = feature_values[self.start] + max_feature_value = min_feature_value + + # Find min, max in feature_values[start:end_negative] + for p in range(self.start, self.end_negative): + current_feature_value = feature_values[p] + + if current_feature_value < min_feature_value: + min_feature_value = current_feature_value + elif current_feature_value > max_feature_value: + max_feature_value = current_feature_value + + # Update min, max given feature_values[start_positive:end] + for p in range(self.start_positive, self.end): + current_feature_value = feature_values[p] + + if current_feature_value < min_feature_value: + min_feature_value = current_feature_value + elif current_feature_value > max_feature_value: + max_feature_value = current_feature_value + + min_feature_value_out[0] = min_feature_value + max_feature_value_out[0] = max_feature_value + + cdef inline void next_p(self, intp_t* p_prev, intp_t* p) noexcept nogil: + """Compute the next p_prev and p for iteratiing over feature values.""" + cdef: + intp_t p_next + float32_t[::1] feature_values = self.feature_values + + if p[0] + 1 != self.end_negative: + p_next = p[0] + 1 + else: + p_next = self.start_positive + + while (p_next < self.end and + feature_values[p_next] <= feature_values[p[0]] + FEATURE_THRESHOLD): + p[0] = p_next + if p[0] + 1 != self.end_negative: + p_next = p[0] + 1 + else: + p_next = self.start_positive + + p_prev[0] = p[0] + p[0] = p_next + + cdef inline intp_t partition_samples( + self, + float64_t current_threshold + ) noexcept nogil: + """Partition samples for feature_values at the current_threshold.""" + return self._partition(current_threshold, self.start_positive) + + cdef inline void partition_samples_final( + self, + intp_t best_pos, + float64_t best_threshold, + intp_t best_feature, + intp_t n_missing, + ) noexcept nogil: + """Partition samples for X at the best_threshold and best_feature.""" + self.extract_nnz(best_feature) + self._partition(best_threshold, best_pos) + + cdef inline intp_t _partition(self, float64_t threshold, intp_t zero_pos) noexcept nogil: + """Partition samples[start:end] based on threshold.""" + cdef: + intp_t p, partition_end + intp_t[::1] index_to_samples = self.index_to_samples + float32_t[::1] feature_values = self.feature_values + intp_t[::1] samples = self.samples + + if threshold < 0.: + p = self.start + partition_end = self.end_negative + elif threshold > 0.: + p = self.start_positive + partition_end = self.end + else: + # Data are already split + return zero_pos + + while p < 
partition_end: + if feature_values[p] <= threshold: + p += 1 + + else: + partition_end -= 1 + + feature_values[p], feature_values[partition_end] = ( + feature_values[partition_end], feature_values[p] + ) + sparse_swap(index_to_samples, samples, p, partition_end) + + return partition_end + + cdef inline void extract_nnz(self, intp_t feature) noexcept nogil: + """Extract and partition values for a given feature. + + The extracted values are partitioned between negative values + feature_values[start:end_negative[0]] and positive values + feature_values[start_positive[0]:end]. + The samples and index_to_samples are modified according to this + partition. + + The extraction corresponds to the intersection between the arrays + X_indices[indptr_start:indptr_end] and samples[start:end]. + This is done efficiently using either an index_to_samples based approach + or binary search based approach. + + Parameters + ---------- + feature : intp_t, + Index of the feature we want to extract non zero value. + """ + cdef intp_t[::1] samples = self.samples + cdef float32_t[::1] feature_values = self.feature_values + cdef intp_t indptr_start = self.X_indptr[feature], + cdef intp_t indptr_end = self.X_indptr[feature + 1] + cdef intp_t n_indices = (indptr_end - indptr_start) + cdef intp_t n_samples = self.end - self.start + cdef intp_t[::1] index_to_samples = self.index_to_samples + cdef intp_t[::1] sorted_samples = self.sorted_samples + cdef const int32_t[::1] X_indices = self.X_indices + cdef const float32_t[::1] X_data = self.X_data + + # Use binary search if n_samples * log(n_indices) < + # n_indices and index_to_samples approach otherwise. + # O(n_samples * log(n_indices)) is the running time of binary + # search and O(n_indices) is the running time of index_to_samples + # approach. + if ((1 - self.is_samples_sorted) * n_samples * log(n_samples) + + n_samples * log(n_indices) < EXTRACT_NNZ_SWITCH * n_indices): + extract_nnz_binary_search(X_indices, X_data, + indptr_start, indptr_end, + samples, self.start, self.end, + index_to_samples, + feature_values, + &self.end_negative, &self.start_positive, + sorted_samples, &self.is_samples_sorted) + + # Using an index to samples technique to extract non zero values + # index_to_samples is a mapping from X_indices to samples + else: + extract_nnz_index_to_samples(X_indices, X_data, + indptr_start, indptr_end, + samples, self.start, self.end, + index_to_samples, + feature_values, + &self.end_negative, &self.start_positive) + + +cdef int compare_SIZE_t(const void* a, const void* b) noexcept nogil: + """Comparison function for sort. + + This must return an `int` as it is used by stdlib's qsort, which expects + an `int` return value. + """ + return ((a)[0] - (b)[0]) + + +cdef inline void binary_search(const int32_t[::1] sorted_array, + int32_t start, int32_t end, + intp_t value, intp_t* index, + int32_t* new_start) noexcept nogil: + """Return the index of value in the sorted array. + + If not found, return -1. 
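[Illustration, not part of the patch] The cost heuristic in extract_nnz above chooses between the binary-search extraction and the index_to_samples scan. A rough restatement of that decision rule in plain Python, using the EXTRACT_NNZ_SWITCH constant that the patch moves out of _splitter.pyx:

from math import log

EXTRACT_NNZ_SWITCH = 0.1  # same constant as in the Cython code

def use_binary_search(n_samples, n_indices, is_samples_sorted):
    """Prefer the O(n_samples * log(n_indices)) binary-search approach
    (plus an O(n_samples * log(n_samples)) sort of the node's samples if
    they are not sorted yet) over the O(n_indices) index_to_samples scan
    only when it is expected to be cheaper."""
    sort_cost = 0.0 if is_samples_sorted else n_samples * log(n_samples)
    search_cost = n_samples * log(n_indices)
    scan_cost = EXTRACT_NNZ_SWITCH * n_indices
    return sort_cost + search_cost < scan_cost

# A node with few samples but a very dense sparse column favors binary search:
print(use_binary_search(n_samples=50, n_indices=100_000, is_samples_sorted=False))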
new_start is the last pivot + 1 + """ + cdef int32_t pivot + index[0] = -1 + while start < end: + pivot = start + (end - start) / 2 + + if sorted_array[pivot] == value: + index[0] = pivot + start = pivot + 1 + break + + if sorted_array[pivot] < value: + start = pivot + 1 + else: + end = pivot + new_start[0] = start + + +cdef inline void extract_nnz_index_to_samples(const int32_t[::1] X_indices, + const float32_t[::1] X_data, + int32_t indptr_start, + int32_t indptr_end, + intp_t[::1] samples, + intp_t start, + intp_t end, + intp_t[::1] index_to_samples, + float32_t[::1] feature_values, + intp_t* end_negative, + intp_t* start_positive) noexcept nogil: + """Extract and partition values for a feature using index_to_samples. + + Complexity is O(indptr_end - indptr_start). + """ + cdef int32_t k + cdef intp_t index + cdef intp_t end_negative_ = start + cdef intp_t start_positive_ = end + + for k in range(indptr_start, indptr_end): + if start <= index_to_samples[X_indices[k]] < end: + if X_data[k] > 0: + start_positive_ -= 1 + feature_values[start_positive_] = X_data[k] + index = index_to_samples[X_indices[k]] + sparse_swap(index_to_samples, samples, index, start_positive_) + + elif X_data[k] < 0: + feature_values[end_negative_] = X_data[k] + index = index_to_samples[X_indices[k]] + sparse_swap(index_to_samples, samples, index, end_negative_) + end_negative_ += 1 + + # Returned values + end_negative[0] = end_negative_ + start_positive[0] = start_positive_ + + +cdef inline void extract_nnz_binary_search(const int32_t[::1] X_indices, + const float32_t[::1] X_data, + int32_t indptr_start, + int32_t indptr_end, + intp_t[::1] samples, + intp_t start, + intp_t end, + intp_t[::1] index_to_samples, + float32_t[::1] feature_values, + intp_t* end_negative, + intp_t* start_positive, + intp_t[::1] sorted_samples, + bint* is_samples_sorted) noexcept nogil: + """Extract and partition values for a given feature using binary search. + + If n_samples = end - start and n_indices = indptr_end - indptr_start, + the complexity is + + O((1 - is_samples_sorted[0]) * n_samples * log(n_samples) + + n_samples * log(n_indices)). 
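[Illustration, not part of the patch] A toy version of the index_to_samples extraction described above: walk the nonzeros of one CSC column and keep only those whose row belongs to the node, negatives packed at the front of the node range and positives at the back. The real code also reorders `samples` via sparse_swap so both arrays stay aligned; that bookkeeping is omitted here.

def extract_nnz_for_node(col_indices, col_data, samples, start, end):
    """Collect the nonzero values of one sparse column restricted to
    samples[start:end], negatives first, positives last."""
    index_to_samples = {row: p for p, row in enumerate(samples)}
    feature_values = [0.0] * len(samples)
    end_negative, start_positive = start, end
    for row, value in zip(col_indices, col_data):
        p = index_to_samples.get(row, -1)
        if start <= p < end:
            if value > 0:
                start_positive -= 1
                feature_values[start_positive] = value
            elif value < 0:
                feature_values[end_negative] = value
                end_negative += 1
    return feature_values, end_negative, start_positive

# One sparse column with nonzeros in rows 1, 3 and 6:
vals, end_neg, start_pos = extract_nnz_for_node(
    col_indices=[1, 3, 6], col_data=[-2.0, 5.0, 1.5],
    samples=[3, 1, 6, 0], start=0, end=4)
print(vals, end_neg, start_pos)  # [-2.0, 0.0, 1.5, 5.0] 1 2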
+ """ + cdef intp_t n_samples + + if not is_samples_sorted[0]: + n_samples = end - start + memcpy(&sorted_samples[start], &samples[start], + n_samples * sizeof(intp_t)) + qsort(&sorted_samples[start], n_samples, sizeof(intp_t), + compare_SIZE_t) + is_samples_sorted[0] = 1 + + while (indptr_start < indptr_end and + sorted_samples[start] > X_indices[indptr_start]): + indptr_start += 1 + + while (indptr_start < indptr_end and + sorted_samples[end - 1] < X_indices[indptr_end - 1]): + indptr_end -= 1 + + cdef intp_t p = start + cdef intp_t index + cdef intp_t k + cdef intp_t end_negative_ = start + cdef intp_t start_positive_ = end + + while (p < end and indptr_start < indptr_end): + # Find index of sorted_samples[p] in X_indices + binary_search(X_indices, indptr_start, indptr_end, + sorted_samples[p], &k, &indptr_start) + + if k != -1: + # If k != -1, we have found a non zero value + + if X_data[k] > 0: + start_positive_ -= 1 + feature_values[start_positive_] = X_data[k] + index = index_to_samples[X_indices[k]] + sparse_swap(index_to_samples, samples, index, start_positive_) + + elif X_data[k] < 0: + feature_values[end_negative_] = X_data[k] + index = index_to_samples[X_indices[k]] + sparse_swap(index_to_samples, samples, index, end_negative_) + end_negative_ += 1 + p += 1 + + # Returned values + end_negative[0] = end_negative_ + start_positive[0] = start_positive_ + + +cdef inline void sparse_swap(intp_t[::1] index_to_samples, intp_t[::1] samples, + intp_t pos_1, intp_t pos_2) noexcept nogil: + """Swap sample pos_1 and pos_2 preserving sparse invariant.""" + samples[pos_1], samples[pos_2] = samples[pos_2], samples[pos_1] + index_to_samples[samples[pos_1]] = pos_1 + index_to_samples[samples[pos_2]] = pos_2 + + +cdef inline void shift_missing_values_to_left_if_required( + SplitRecord* best, + intp_t[::1] samples, + intp_t end, +) noexcept nogil: + """Shift missing value sample indices to the left of the split if required. + + Note: this should always be called at the very end because it will + move samples around, thereby affecting the criterion. + This affects the computation of the children impurity, which affects + the computation of the next node. + """ + cdef intp_t i, p, current_end + # The partitioner partitions the data such that the missing values are in + # samples[-n_missing:] for the criterion to consume. If the missing values + # are going to the right node, then the missing values are already in the + # correct position. If the missing values go left, then we move the missing + # values to samples[best.pos:best.pos+n_missing] and update `best.pos`. + if best.n_missing > 0 and best.missing_go_to_left: + for p in range(best.n_missing): + i = best.pos + p + current_end = end - 1 - p + samples[i], samples[current_end] = samples[current_end], samples[i] + best.pos += best.n_missing + + +# Sort n-element arrays pointed to by feature_values and samples, simultaneously, +# by the values in feature_values. Algorithm: Introsort (Musser, SP&E, 1997). 
+cdef inline void sort(float32_t* feature_values, intp_t* samples, intp_t n) noexcept nogil: + if n == 0: + return + cdef intp_t maxd = 2 * log(n) + introsort(feature_values, samples, n, maxd) + + +cdef inline void swap(float32_t* feature_values, intp_t* samples, + intp_t i, intp_t j) noexcept nogil: + # Helper for sort + feature_values[i], feature_values[j] = feature_values[j], feature_values[i] + samples[i], samples[j] = samples[j], samples[i] + + +cdef inline float32_t median3(float32_t* feature_values, intp_t n) noexcept nogil: + # Median of three pivot selection, after Bentley and McIlroy (1993). + # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. + cdef float32_t a = feature_values[0], b = feature_values[n / 2], c = feature_values[n - 1] + if a < b: + if b < c: + return b + elif a < c: + return c + else: + return a + elif b < c: + if a < c: + return a + else: + return c + else: + return b + + +# Introsort with median of 3 pivot selection and 3-way partition function +# (robust to repeated elements, e.g. lots of zero features). +cdef void introsort(float32_t* feature_values, intp_t *samples, + intp_t n, intp_t maxd) noexcept nogil: + cdef float32_t pivot + cdef intp_t i, l, r + + while n > 1: + if maxd <= 0: # max depth limit exceeded ("gone quadratic") + heapsort(feature_values, samples, n) + return + maxd -= 1 + + pivot = median3(feature_values, n) + + # Three-way partition. + i = l = 0 + r = n + while i < r: + if feature_values[i] < pivot: + swap(feature_values, samples, i, l) + i += 1 + l += 1 + elif feature_values[i] > pivot: + r -= 1 + swap(feature_values, samples, i, r) + else: + i += 1 + + introsort(feature_values, samples, l, maxd) + feature_values += r + samples += r + n -= r + + +cdef inline void sift_down(float32_t* feature_values, intp_t* samples, + intp_t start, intp_t end) noexcept nogil: + # Restore heap order in feature_values[start:end] by moving the max element to start. + cdef intp_t child, maxind, root + + root = start + while True: + child = root * 2 + 1 + + # find max of root, left child, right child + maxind = root + if child < end and feature_values[maxind] < feature_values[child]: + maxind = child + if child + 1 < end and feature_values[maxind] < feature_values[child + 1]: + maxind = child + 1 + + if maxind == root: + break + else: + swap(feature_values, samples, root, maxind) + root = maxind + + +cdef void heapsort(float32_t* feature_values, intp_t* samples, intp_t n) noexcept nogil: + cdef intp_t start, end + + # heapify + start = (n - 2) / 2 + end = n + while True: + sift_down(feature_values, samples, start, end) + if start == 0: + break + start -= 1 + + # sort by shrinking the heap, putting the max element immediately after it + end = n - 1 + while end > 0: + swap(feature_values, samples, 0, end) + sift_down(feature_values, samples, 0, end) + end = end - 1 diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 485a18be1e522..42c6c6d935a9c 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -2,11 +2,13 @@ # SPDX-License-Identifier: BSD-3-Clause # See _splitter.pyx for details. 
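[Illustration, not part of the patch] The introsort/heapsort helpers above sort feature_values while permuting the sample indices in lockstep. A much simpler stand-in that shows only that paired-sort effect, not the introsort algorithm itself:

def paired_sort(feature_values, samples):
    """Sort feature_values ascending and permute samples the same way."""
    order = sorted(range(len(feature_values)), key=feature_values.__getitem__)
    return [feature_values[i] for i in order], [samples[i] for i in order]

print(paired_sort([0.9, 0.1, 0.5], [10, 11, 12]))
# ([0.1, 0.5, 0.9], [11, 12, 10])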
+ +from ..utils._typedefs cimport ( + float32_t, float64_t, int8_t, int32_t, intp_t, uint8_t, uint32_t +) from ._criterion cimport Criterion from ._tree cimport ParentInfo -from ..utils._typedefs cimport float32_t, float64_t, intp_t, int8_t, int32_t, uint8_t, uint32_t - cdef struct SplitRecord: # Data to track sample split diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index ad8e3eb84ed2c..b557a4d1c6300 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -1,33 +1,49 @@ +"""Splitting algorithms in the construction of a tree. + +This module contains the main splitting algorithms for constructing a tree. +Splitting is concerned with finding the optimal partition of the data into +two groups. The impurity of the groups is minimized, and the impurity is measured +by some criterion, which is typically the Gini impurity or the entropy. Criterion +are implemented in the ``_criterion`` module. + +Splitting evaluates a subset of features (defined by `max_features` also +known as mtry in the literature). The module supports two primary types +of splitting strategies: + +- Best Split: A greedy approach to find the optimal split. This method + ensures that the best possible split is chosen by examining various + thresholds for each candidate feature. +- Random Split: A stochastic approach that selects a split randomly + from a subset of the best splits. This method is faster but does + not guarantee the optimal split. +""" # Authors: The scikit-learn developers # SPDX-License-Identifier: BSD-3-Clause -from cython cimport final -from libc.math cimport isnan -from libc.stdlib cimport qsort from libc.string cimport memcpy -from ._criterion cimport Criterion -from ._utils cimport log -from ._utils cimport rand_int -from ._utils cimport rand_uniform -from ._utils cimport RAND_R_MAX from ..utils._typedefs cimport int8_t +from ._criterion cimport Criterion +from ._partitioner cimport ( + FEATURE_THRESHOLD, DensePartitioner, SparsePartitioner, + shift_missing_values_to_left_if_required +) +from ._utils cimport RAND_R_MAX, rand_int, rand_uniform import numpy as np -from scipy.sparse import issparse - -cdef float64_t INFINITY = np.inf +# Introduce a fused-class to make it possible to share the split implementation +# between the dense and sparse cases in the node_split_best and node_split_random +# functions. The alternative would have been to use inheritance-based polymorphism +# but it would have resulted in a ~10% overall tree fitting performance +# degradation caused by the overhead frequent virtual method lookups. +ctypedef fused Partitioner: + DensePartitioner + SparsePartitioner -# Allow for 32 bit float comparisons -cdef float32_t INFINITY_32t = np.inf -# Mitigate precision differences between 32 bit and 64 bit -cdef float32_t FEATURE_THRESHOLD = 1e-7 +cdef float64_t INFINITY = np.inf -# Constant to switch between algorithm non zero value extract algorithm -# in SparsePartitioner -cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil: self.impurity_left = INFINITY @@ -249,39 +265,6 @@ cdef class Splitter: return self.criterion.node_impurity() -cdef inline void shift_missing_values_to_left_if_required( - SplitRecord* best, - intp_t[::1] samples, - intp_t end, -) noexcept nogil: - """Shift missing value sample indices to the left of the split if required. - - Note: this should always be called at the very end because it will - move samples around, thereby affecting the criterion. 
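[Illustration, not part of the patch] The new module docstring above distinguishes the best-split and random-split strategies. A toy single-feature example of the two, using Gini impurity; this is only a sketch of the idea, not scikit-learn's splitter code:

import random

def gini(labels):
    """Gini impurity of a list of 0/1 class labels."""
    if not labels:
        return 0.0
    p = sum(labels) / len(labels)
    return 2 * p * (1 - p)

def weighted_child_impurity(values, labels, threshold):
    left = [y for x, y in zip(values, labels) if x <= threshold]
    right = [y for x, y in zip(values, labels) if x > threshold]
    n = len(labels)
    return len(left) / n * gini(left) + len(right) / n * gini(right)

def best_split(values, labels):
    """Greedy strategy: try a threshold between each pair of sorted values."""
    candidates = sorted(set(values))
    thresholds = [(a + b) / 2 for a, b in zip(candidates, candidates[1:])]
    return min(thresholds, key=lambda t: weighted_child_impurity(values, labels, t))

def random_split(values, rng):
    """Stochastic strategy: draw one threshold uniformly in [min, max)."""
    return rng.uniform(min(values), max(values))

values = [0.2, 0.4, 0.6, 0.8, 1.0, 1.2]
labels = [0, 0, 0, 1, 1, 1]
print(best_split(values, labels))            # 0.7, separates the two classes
print(random_split(values, random.Random(0)))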
- This affects the computation of the children impurity, which affects - the computation of the next node. - """ - cdef intp_t i, p, current_end - # The partitioner partitions the data such that the missing values are in - # samples[-n_missing:] for the criterion to consume. If the missing values - # are going to the right node, then the missing values are already in the - # correct position. If the missing values go left, then we move the missing - # values to samples[best.pos:best.pos+n_missing] and update `best.pos`. - if best.n_missing > 0 and best.missing_go_to_left: - for p in range(best.n_missing): - i = best.pos + p - current_end = end - 1 - p - samples[i], samples[current_end] = samples[current_end], samples[i] - best.pos += best.n_missing - -# Introduce a fused-class to make it possible to share the split implementation -# between the dense and sparse cases in the node_split_best and node_split_random -# functions. The alternative would have been to use inheritance-based polymorphism -# but it would have resulted in a ~10% overall tree fitting performance -# degradation caused by the overhead frequent virtual method lookups. -ctypedef fused Partitioner: - DensePartitioner - SparsePartitioner cdef inline int node_split_best( Splitter splitter, @@ -556,119 +539,6 @@ cdef inline int node_split_best( return 0 -# Sort n-element arrays pointed to by feature_values and samples, simultaneously, -# by the values in feature_values. Algorithm: Introsort (Musser, SP&E, 1997). -cdef inline void sort(float32_t* feature_values, intp_t* samples, intp_t n) noexcept nogil: - if n == 0: - return - cdef intp_t maxd = 2 * log(n) - introsort(feature_values, samples, n, maxd) - - -cdef inline void swap(float32_t* feature_values, intp_t* samples, - intp_t i, intp_t j) noexcept nogil: - # Helper for sort - feature_values[i], feature_values[j] = feature_values[j], feature_values[i] - samples[i], samples[j] = samples[j], samples[i] - - -cdef inline float32_t median3(float32_t* feature_values, intp_t n) noexcept nogil: - # Median of three pivot selection, after Bentley and McIlroy (1993). - # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. - cdef float32_t a = feature_values[0], b = feature_values[n / 2], c = feature_values[n - 1] - if a < b: - if b < c: - return b - elif a < c: - return c - else: - return a - elif b < c: - if a < c: - return a - else: - return c - else: - return b - - -# Introsort with median of 3 pivot selection and 3-way partition function -# (robust to repeated elements, e.g. lots of zero features). -cdef void introsort(float32_t* feature_values, intp_t *samples, - intp_t n, intp_t maxd) noexcept nogil: - cdef float32_t pivot - cdef intp_t i, l, r - - while n > 1: - if maxd <= 0: # max depth limit exceeded ("gone quadratic") - heapsort(feature_values, samples, n) - return - maxd -= 1 - - pivot = median3(feature_values, n) - - # Three-way partition. - i = l = 0 - r = n - while i < r: - if feature_values[i] < pivot: - swap(feature_values, samples, i, l) - i += 1 - l += 1 - elif feature_values[i] > pivot: - r -= 1 - swap(feature_values, samples, i, r) - else: - i += 1 - - introsort(feature_values, samples, l, maxd) - feature_values += r - samples += r - n -= r - - -cdef inline void sift_down(float32_t* feature_values, intp_t* samples, - intp_t start, intp_t end) noexcept nogil: - # Restore heap order in feature_values[start:end] by moving the max element to start. 
- cdef intp_t child, maxind, root - - root = start - while True: - child = root * 2 + 1 - - # find max of root, left child, right child - maxind = root - if child < end and feature_values[maxind] < feature_values[child]: - maxind = child - if child + 1 < end and feature_values[maxind] < feature_values[child + 1]: - maxind = child + 1 - - if maxind == root: - break - else: - swap(feature_values, samples, root, maxind) - root = maxind - - -cdef void heapsort(float32_t* feature_values, intp_t* samples, intp_t n) noexcept nogil: - cdef intp_t start, end - - # heapify - start = (n - 2) / 2 - end = n - while True: - sift_down(feature_values, samples, start, end) - if start == 0: - break - start -= 1 - - # sort by shrinking the heap, putting the max element immediately after it - end = n - 1 - while end > 0: - swap(feature_values, samples, 0, end) - sift_down(feature_values, samples, 0, end) - end = end - 1 - cdef inline int node_split_random( Splitter splitter, Partitioner partitioner, @@ -919,686 +789,6 @@ cdef inline int node_split_random( return 0 -@final -cdef class DensePartitioner: - """Partitioner specialized for dense data. - - Note that this partitioner is agnostic to the splitting strategy (best vs. random). - """ - cdef: - const float32_t[:, :] X - cdef intp_t[::1] samples - cdef float32_t[::1] feature_values - cdef intp_t start - cdef intp_t end - cdef intp_t n_missing - cdef const uint8_t[::1] missing_values_in_feature_mask - - def __init__( - self, - const float32_t[:, :] X, - intp_t[::1] samples, - float32_t[::1] feature_values, - const uint8_t[::1] missing_values_in_feature_mask, - ): - self.X = X - self.samples = samples - self.feature_values = feature_values - self.missing_values_in_feature_mask = missing_values_in_feature_mask - - cdef inline void init_node_split(self, intp_t start, intp_t end) noexcept nogil: - """Initialize splitter at the beginning of node_split.""" - self.start = start - self.end = end - self.n_missing = 0 - - cdef inline void sort_samples_and_feature_values( - self, intp_t current_feature - ) noexcept nogil: - """Simultaneously sort based on the feature_values. - - Missing values are stored at the end of feature_values. - The number of missing values observed in feature_values is stored - in self.n_missing. - """ - cdef: - intp_t i, current_end - float32_t[::1] feature_values = self.feature_values - const float32_t[:, :] X = self.X - intp_t[::1] samples = self.samples - intp_t n_missing = 0 - const uint8_t[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask - - # Sort samples along that feature; by - # copying the values into an array and - # sorting the array in a manner which utilizes the cache more - # effectively. - if missing_values_in_feature_mask is not None and missing_values_in_feature_mask[current_feature]: - i, current_end = self.start, self.end - 1 - # Missing values are placed at the end and do not participate in the sorting. - while i <= current_end: - # Finds the right-most value that is not missing so that - # it can be swapped with missing values at its left. 
- if isnan(X[samples[current_end], current_feature]): - n_missing += 1 - current_end -= 1 - continue - - # X[samples[current_end], current_feature] is a non-missing value - if isnan(X[samples[i], current_feature]): - samples[i], samples[current_end] = samples[current_end], samples[i] - n_missing += 1 - current_end -= 1 - - feature_values[i] = X[samples[i], current_feature] - i += 1 - else: - # When there are no missing values, we only need to copy the data into - # feature_values - for i in range(self.start, self.end): - feature_values[i] = X[samples[i], current_feature] - - sort(&feature_values[self.start], &samples[self.start], self.end - self.start - n_missing) - self.n_missing = n_missing - - cdef inline void find_min_max( - self, - intp_t current_feature, - float32_t* min_feature_value_out, - float32_t* max_feature_value_out, - ) noexcept nogil: - """Find the minimum and maximum value for current_feature. - - Missing values are stored at the end of feature_values. - The number of missing values observed in feature_values is stored - in self.n_missing. - """ - cdef: - intp_t p, current_end - float32_t current_feature_value - const float32_t[:, :] X = self.X - intp_t[::1] samples = self.samples - float32_t min_feature_value = INFINITY_32t - float32_t max_feature_value = -INFINITY_32t - float32_t[::1] feature_values = self.feature_values - intp_t n_missing = 0 - const uint8_t[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask - - # We are copying the values into an array and - # finding min/max of the array in a manner which utilizes the cache more - # effectively. We need to also count the number of missing-values there are - if missing_values_in_feature_mask is not None and missing_values_in_feature_mask[current_feature]: - p, current_end = self.start, self.end - 1 - # Missing values are placed at the end and do not participate in the - # min/max calculation. - while p <= current_end: - # Finds the right-most value that is not missing so that - # it can be swapped with missing values towards its left. - if isnan(X[samples[current_end], current_feature]): - n_missing += 1 - current_end -= 1 - continue - - # X[samples[current_end], current_feature] is a non-missing value - if isnan(X[samples[p], current_feature]): - samples[p], samples[current_end] = samples[current_end], samples[p] - n_missing += 1 - current_end -= 1 - - current_feature_value = X[samples[p], current_feature] - feature_values[p] = current_feature_value - if current_feature_value < min_feature_value: - min_feature_value = current_feature_value - elif current_feature_value > max_feature_value: - max_feature_value = current_feature_value - p += 1 - else: - min_feature_value = X[samples[self.start], current_feature] - max_feature_value = min_feature_value - - feature_values[self.start] = min_feature_value - for p in range(self.start + 1, self.end): - current_feature_value = X[samples[p], current_feature] - feature_values[p] = current_feature_value - - if current_feature_value < min_feature_value: - min_feature_value = current_feature_value - elif current_feature_value > max_feature_value: - max_feature_value = current_feature_value - - min_feature_value_out[0] = min_feature_value - max_feature_value_out[0] = max_feature_value - self.n_missing = n_missing - - cdef inline void next_p(self, intp_t* p_prev, intp_t* p) noexcept nogil: - """Compute the next p_prev and p for iteratiing over feature values. - - The missing values are not included when iterating through the feature values. 
- """ - cdef: - float32_t[::1] feature_values = self.feature_values - intp_t end_non_missing = self.end - self.n_missing - - while ( - p[0] + 1 < end_non_missing and - feature_values[p[0] + 1] <= feature_values[p[0]] + FEATURE_THRESHOLD - ): - p[0] += 1 - - p_prev[0] = p[0] - - # By adding 1, we have - # (feature_values[p] >= end) or (feature_values[p] > feature_values[p - 1]) - p[0] += 1 - - cdef inline intp_t partition_samples( - self, - float64_t current_threshold - ) noexcept nogil: - """Partition samples for feature_values at the current_threshold.""" - cdef: - intp_t p = self.start - intp_t partition_end = self.end - intp_t[::1] samples = self.samples - float32_t[::1] feature_values = self.feature_values - - while p < partition_end: - if feature_values[p] <= current_threshold: - p += 1 - else: - partition_end -= 1 - - feature_values[p], feature_values[partition_end] = ( - feature_values[partition_end], feature_values[p] - ) - samples[p], samples[partition_end] = samples[partition_end], samples[p] - - return partition_end - - cdef inline void partition_samples_final( - self, - intp_t best_pos, - float64_t best_threshold, - intp_t best_feature, - intp_t best_n_missing, - ) noexcept nogil: - """Partition samples for X at the best_threshold and best_feature. - - If missing values are present, this method partitions `samples` - so that the `best_n_missing` missing values' indices are in the - right-most end of `samples`, that is `samples[end_non_missing:end]`. - """ - cdef: - # Local invariance: start <= p <= partition_end <= end - intp_t start = self.start - intp_t p = start - intp_t end = self.end - 1 - intp_t partition_end = end - best_n_missing - intp_t[::1] samples = self.samples - const float32_t[:, :] X = self.X - float32_t current_value - - if best_n_missing != 0: - # Move samples with missing values to the end while partitioning the - # non-missing samples - while p < partition_end: - # Keep samples with missing values at the end - if isnan(X[samples[end], best_feature]): - end -= 1 - continue - - # Swap sample with missing values with the sample at the end - current_value = X[samples[p], best_feature] - if isnan(current_value): - samples[p], samples[end] = samples[end], samples[p] - end -= 1 - - # The swapped sample at the end is always a non-missing value, so - # we can continue the algorithm without checking for missingness. - current_value = X[samples[p], best_feature] - - # Partition the non-missing samples - if current_value <= best_threshold: - p += 1 - else: - samples[p], samples[partition_end] = samples[partition_end], samples[p] - partition_end -= 1 - else: - # Partitioning routine when there are no missing values - while p < partition_end: - if X[samples[p], best_feature] <= best_threshold: - p += 1 - else: - samples[p], samples[partition_end] = samples[partition_end], samples[p] - partition_end -= 1 - - -@final -cdef class SparsePartitioner: - """Partitioner specialized for sparse CSC data. - - Note that this partitioner is agnostic to the splitting strategy (best vs. random). 
- """ - cdef intp_t[::1] samples - cdef float32_t[::1] feature_values - cdef intp_t start - cdef intp_t end - cdef intp_t n_missing - cdef const uint8_t[::1] missing_values_in_feature_mask - - cdef const float32_t[::1] X_data - cdef const int32_t[::1] X_indices - cdef const int32_t[::1] X_indptr - - cdef intp_t n_total_samples - - cdef intp_t[::1] index_to_samples - cdef intp_t[::1] sorted_samples - - cdef intp_t start_positive - cdef intp_t end_negative - cdef bint is_samples_sorted - - def __init__( - self, - object X, - intp_t[::1] samples, - intp_t n_samples, - float32_t[::1] feature_values, - const uint8_t[::1] missing_values_in_feature_mask, - ): - if not (issparse(X) and X.format == "csc"): - raise ValueError("X should be in csc format") - - self.samples = samples - self.feature_values = feature_values - - # Initialize X - cdef intp_t n_total_samples = X.shape[0] - - self.X_data = X.data - self.X_indices = X.indices - self.X_indptr = X.indptr - self.n_total_samples = n_total_samples - - # Initialize auxiliary array used to perform split - self.index_to_samples = np.full(n_total_samples, fill_value=-1, dtype=np.intp) - self.sorted_samples = np.empty(n_samples, dtype=np.intp) - - cdef intp_t p - for p in range(n_samples): - self.index_to_samples[samples[p]] = p - - self.missing_values_in_feature_mask = missing_values_in_feature_mask - - cdef inline void init_node_split(self, intp_t start, intp_t end) noexcept nogil: - """Initialize splitter at the beginning of node_split.""" - self.start = start - self.end = end - self.is_samples_sorted = 0 - self.n_missing = 0 - - cdef inline void sort_samples_and_feature_values( - self, intp_t current_feature - ) noexcept nogil: - """Simultaneously sort based on the feature_values.""" - cdef: - float32_t[::1] feature_values = self.feature_values - intp_t[::1] index_to_samples = self.index_to_samples - intp_t[::1] samples = self.samples - - self.extract_nnz(current_feature) - # Sort the positive and negative parts of `feature_values` - sort(&feature_values[self.start], &samples[self.start], self.end_negative - self.start) - if self.start_positive < self.end: - sort( - &feature_values[self.start_positive], - &samples[self.start_positive], - self.end - self.start_positive - ) - - # Update index_to_samples to take into account the sort - for p in range(self.start, self.end_negative): - index_to_samples[samples[p]] = p - for p in range(self.start_positive, self.end): - index_to_samples[samples[p]] = p - - # Add one or two zeros in feature_values, if there is any - if self.end_negative < self.start_positive: - self.start_positive -= 1 - feature_values[self.start_positive] = 0. - - if self.end_negative != self.start_positive: - feature_values[self.end_negative] = 0. 
- self.end_negative += 1 - - # XXX: When sparse supports missing values, this should be set to the - # number of missing values for current_feature - self.n_missing = 0 - - cdef inline void find_min_max( - self, - intp_t current_feature, - float32_t* min_feature_value_out, - float32_t* max_feature_value_out, - ) noexcept nogil: - """Find the minimum and maximum value for current_feature.""" - cdef: - intp_t p - float32_t current_feature_value, min_feature_value, max_feature_value - float32_t[::1] feature_values = self.feature_values - - self.extract_nnz(current_feature) - - if self.end_negative != self.start_positive: - # There is a zero - min_feature_value = 0 - max_feature_value = 0 - else: - min_feature_value = feature_values[self.start] - max_feature_value = min_feature_value - - # Find min, max in feature_values[start:end_negative] - for p in range(self.start, self.end_negative): - current_feature_value = feature_values[p] - - if current_feature_value < min_feature_value: - min_feature_value = current_feature_value - elif current_feature_value > max_feature_value: - max_feature_value = current_feature_value - - # Update min, max given feature_values[start_positive:end] - for p in range(self.start_positive, self.end): - current_feature_value = feature_values[p] - - if current_feature_value < min_feature_value: - min_feature_value = current_feature_value - elif current_feature_value > max_feature_value: - max_feature_value = current_feature_value - - min_feature_value_out[0] = min_feature_value - max_feature_value_out[0] = max_feature_value - - cdef inline void next_p(self, intp_t* p_prev, intp_t* p) noexcept nogil: - """Compute the next p_prev and p for iteratiing over feature values.""" - cdef: - intp_t p_next - float32_t[::1] feature_values = self.feature_values - - if p[0] + 1 != self.end_negative: - p_next = p[0] + 1 - else: - p_next = self.start_positive - - while (p_next < self.end and - feature_values[p_next] <= feature_values[p[0]] + FEATURE_THRESHOLD): - p[0] = p_next - if p[0] + 1 != self.end_negative: - p_next = p[0] + 1 - else: - p_next = self.start_positive - - p_prev[0] = p[0] - p[0] = p_next - - cdef inline intp_t partition_samples( - self, - float64_t current_threshold - ) noexcept nogil: - """Partition samples for feature_values at the current_threshold.""" - return self._partition(current_threshold, self.start_positive) - - cdef inline void partition_samples_final( - self, - intp_t best_pos, - float64_t best_threshold, - intp_t best_feature, - intp_t n_missing, - ) noexcept nogil: - """Partition samples for X at the best_threshold and best_feature.""" - self.extract_nnz(best_feature) - self._partition(best_threshold, best_pos) - - cdef inline intp_t _partition(self, float64_t threshold, intp_t zero_pos) noexcept nogil: - """Partition samples[start:end] based on threshold.""" - cdef: - intp_t p, partition_end - intp_t[::1] index_to_samples = self.index_to_samples - float32_t[::1] feature_values = self.feature_values - intp_t[::1] samples = self.samples - - if threshold < 0.: - p = self.start - partition_end = self.end_negative - elif threshold > 0.: - p = self.start_positive - partition_end = self.end - else: - # Data are already split - return zero_pos - - while p < partition_end: - if feature_values[p] <= threshold: - p += 1 - - else: - partition_end -= 1 - - feature_values[p], feature_values[partition_end] = ( - feature_values[partition_end], feature_values[p] - ) - sparse_swap(index_to_samples, samples, p, partition_end) - - return partition_end - - cdef inline 
void extract_nnz(self, intp_t feature) noexcept nogil: - """Extract and partition values for a given feature. - - The extracted values are partitioned between negative values - feature_values[start:end_negative[0]] and positive values - feature_values[start_positive[0]:end]. - The samples and index_to_samples are modified according to this - partition. - - The extraction corresponds to the intersection between the arrays - X_indices[indptr_start:indptr_end] and samples[start:end]. - This is done efficiently using either an index_to_samples based approach - or binary search based approach. - - Parameters - ---------- - feature : intp_t, - Index of the feature we want to extract non zero value. - """ - cdef intp_t[::1] samples = self.samples - cdef float32_t[::1] feature_values = self.feature_values - cdef intp_t indptr_start = self.X_indptr[feature], - cdef intp_t indptr_end = self.X_indptr[feature + 1] - cdef intp_t n_indices = (indptr_end - indptr_start) - cdef intp_t n_samples = self.end - self.start - cdef intp_t[::1] index_to_samples = self.index_to_samples - cdef intp_t[::1] sorted_samples = self.sorted_samples - cdef const int32_t[::1] X_indices = self.X_indices - cdef const float32_t[::1] X_data = self.X_data - - # Use binary search if n_samples * log(n_indices) < - # n_indices and index_to_samples approach otherwise. - # O(n_samples * log(n_indices)) is the running time of binary - # search and O(n_indices) is the running time of index_to_samples - # approach. - if ((1 - self.is_samples_sorted) * n_samples * log(n_samples) + - n_samples * log(n_indices) < EXTRACT_NNZ_SWITCH * n_indices): - extract_nnz_binary_search(X_indices, X_data, - indptr_start, indptr_end, - samples, self.start, self.end, - index_to_samples, - feature_values, - &self.end_negative, &self.start_positive, - sorted_samples, &self.is_samples_sorted) - - # Using an index to samples technique to extract non zero values - # index_to_samples is a mapping from X_indices to samples - else: - extract_nnz_index_to_samples(X_indices, X_data, - indptr_start, indptr_end, - samples, self.start, self.end, - index_to_samples, - feature_values, - &self.end_negative, &self.start_positive) - - -cdef int compare_SIZE_t(const void* a, const void* b) noexcept nogil: - """Comparison function for sort. - - This must return an `int` as it is used by stdlib's qsort, which expects - an `int` return value. - """ - return ((a)[0] - (b)[0]) - - -cdef inline void binary_search(const int32_t[::1] sorted_array, - int32_t start, int32_t end, - intp_t value, intp_t* index, - int32_t* new_start) noexcept nogil: - """Return the index of value in the sorted array. - - If not found, return -1. new_start is the last pivot + 1 - """ - cdef int32_t pivot - index[0] = -1 - while start < end: - pivot = start + (end - start) / 2 - - if sorted_array[pivot] == value: - index[0] = pivot - start = pivot + 1 - break - - if sorted_array[pivot] < value: - start = pivot + 1 - else: - end = pivot - new_start[0] = start - - -cdef inline void extract_nnz_index_to_samples(const int32_t[::1] X_indices, - const float32_t[::1] X_data, - int32_t indptr_start, - int32_t indptr_end, - intp_t[::1] samples, - intp_t start, - intp_t end, - intp_t[::1] index_to_samples, - float32_t[::1] feature_values, - intp_t* end_negative, - intp_t* start_positive) noexcept nogil: - """Extract and partition values for a feature using index_to_samples. - - Complexity is O(indptr_end - indptr_start). 
- """ - cdef int32_t k - cdef intp_t index - cdef intp_t end_negative_ = start - cdef intp_t start_positive_ = end - - for k in range(indptr_start, indptr_end): - if start <= index_to_samples[X_indices[k]] < end: - if X_data[k] > 0: - start_positive_ -= 1 - feature_values[start_positive_] = X_data[k] - index = index_to_samples[X_indices[k]] - sparse_swap(index_to_samples, samples, index, start_positive_) - - elif X_data[k] < 0: - feature_values[end_negative_] = X_data[k] - index = index_to_samples[X_indices[k]] - sparse_swap(index_to_samples, samples, index, end_negative_) - end_negative_ += 1 - - # Returned values - end_negative[0] = end_negative_ - start_positive[0] = start_positive_ - - -cdef inline void extract_nnz_binary_search(const int32_t[::1] X_indices, - const float32_t[::1] X_data, - int32_t indptr_start, - int32_t indptr_end, - intp_t[::1] samples, - intp_t start, - intp_t end, - intp_t[::1] index_to_samples, - float32_t[::1] feature_values, - intp_t* end_negative, - intp_t* start_positive, - intp_t[::1] sorted_samples, - bint* is_samples_sorted) noexcept nogil: - """Extract and partition values for a given feature using binary search. - - If n_samples = end - start and n_indices = indptr_end - indptr_start, - the complexity is - - O((1 - is_samples_sorted[0]) * n_samples * log(n_samples) + - n_samples * log(n_indices)). - """ - cdef intp_t n_samples - - if not is_samples_sorted[0]: - n_samples = end - start - memcpy(&sorted_samples[start], &samples[start], - n_samples * sizeof(intp_t)) - qsort(&sorted_samples[start], n_samples, sizeof(intp_t), - compare_SIZE_t) - is_samples_sorted[0] = 1 - - while (indptr_start < indptr_end and - sorted_samples[start] > X_indices[indptr_start]): - indptr_start += 1 - - while (indptr_start < indptr_end and - sorted_samples[end - 1] < X_indices[indptr_end - 1]): - indptr_end -= 1 - - cdef intp_t p = start - cdef intp_t index - cdef intp_t k - cdef intp_t end_negative_ = start - cdef intp_t start_positive_ = end - - while (p < end and indptr_start < indptr_end): - # Find index of sorted_samples[p] in X_indices - binary_search(X_indices, indptr_start, indptr_end, - sorted_samples[p], &k, &indptr_start) - - if k != -1: - # If k != -1, we have found a non zero value - - if X_data[k] > 0: - start_positive_ -= 1 - feature_values[start_positive_] = X_data[k] - index = index_to_samples[X_indices[k]] - sparse_swap(index_to_samples, samples, index, start_positive_) - - elif X_data[k] < 0: - feature_values[end_negative_] = X_data[k] - index = index_to_samples[X_indices[k]] - sparse_swap(index_to_samples, samples, index, end_negative_) - end_negative_ += 1 - p += 1 - - # Returned values - end_negative[0] = end_negative_ - start_positive[0] = start_positive_ - - -cdef inline void sparse_swap(intp_t[::1] index_to_samples, intp_t[::1] samples, - intp_t pos_1, intp_t pos_2) noexcept nogil: - """Swap sample pos_1 and pos_2 preserving sparse invariant.""" - samples[pos_1], samples[pos_2] = samples[pos_2], samples[pos_1] - index_to_samples[samples[pos_1]] = pos_1 - index_to_samples[samples[pos_2]] = pos_2 - - cdef class BestSplitter(Splitter): """Splitter for finding the best split on dense data.""" cdef DensePartitioner partitioner diff --git a/sklearn/tree/meson.build b/sklearn/tree/meson.build index 4bc4e0cf9e464..3e16af150b7ae 100644 --- a/sklearn/tree/meson.build +++ b/sklearn/tree/meson.build @@ -5,6 +5,9 @@ tree_extension_metadata = { '_splitter': {'sources': ['_splitter.pyx'], 'override_options': ['optimization=3']}, + '_partitioner': + {'sources': 
['_partitioner.pyx'], + 'override_options': ['optimization=3']}, '_criterion': {'sources': ['_criterion.pyx'], 'override_options': ['optimization=3']}, From 63e158462fdca475215f181bdfc5f732bcb8ae46 Mon Sep 17 00:00:00 2001 From: Yao Xiao <108576690+Charlie-XIAO@users.noreply.github.com> Date: Sun, 21 Jul 2024 04:35:49 +0800 Subject: [PATCH 35/35] DOC fix dropdown anchor and collapse-all button in `sphinx-design==0.6.0` (#29493) --- .../doc_min_dependencies_environment.yml | 2 +- .../doc_min_dependencies_linux-64_conda.lock | 31 +++---- doc/js/scripts/dropdown.js | 88 +++++++++---------- doc/scss/custom.scss | 28 +++--- doc/sphinxext/dropdown_anchors.py | 38 ++------ pyproject.toml | 2 +- sklearn/_min_dependencies.py | 2 +- 7 files changed, 84 insertions(+), 107 deletions(-) diff --git a/build_tools/circle/doc_min_dependencies_environment.yml b/build_tools/circle/doc_min_dependencies_environment.yml index e27c3a700fdad..84cfe2fc53d49 100644 --- a/build_tools/circle/doc_min_dependencies_environment.yml +++ b/build_tools/circle/doc_min_dependencies_environment.yml @@ -33,7 +33,7 @@ dependencies: - polars=0.20.23 # min - pooch=1.6.0 # min - sphinx-remove-toctrees=1.0.0.post1 # min - - sphinx-design=0.5.0 # min + - sphinx-design=0.6.0 # min - pydata-sphinx-theme=0.15.3 # min - pip - pip: diff --git a/build_tools/circle/doc_min_dependencies_linux-64_conda.lock b/build_tools/circle/doc_min_dependencies_linux-64_conda.lock index 7e1e34d47bd1c..4ac0cafb6c0cc 100644 --- a/build_tools/circle/doc_min_dependencies_linux-64_conda.lock +++ b/build_tools/circle/doc_min_dependencies_linux-64_conda.lock @@ -1,32 +1,33 @@ # Generated by conda-lock. # platform: linux-64 -# input_hash: 433b1585e49151feaef8c61dcbd44b6b72bc2e4c7741317e6b0795a0106fa0cf +# input_hash: c9e2f9de85f55e6ab811a43c7c2d9b5d7e61a09e812b195c8d11b698b7eac1dd @EXPLICIT https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 +https://conda.anaconda.org/conda-forge/noarch/_sysroot_linux-64_curr_repodata_hack-3-h69a702a_16.conda#1c005af0c6ff22814b7c52ee448d4bea https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.7.4-hbcca054_0.conda#23ab7665c5f63cfb9f1f6195256daac6 https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2#0c96522c6bdaed4b1566d11387caaf45 https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2#34893075a5c9e55cdafac56607368fc6 https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2#4d59c254e01d9cde7957100457e2d5fb https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_2.conda#cbbe59391138ea5ad3658c76912e147f -https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-2.6.32-he073ed8_17.conda#d731b543793afc0433c4fd593e693fce https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.40-hf3520f5_7.conda#b80f2f396ca2c28b8c14c437a4ed1e74 https://conda.anaconda.org/conda-forge/linux-64/mkl-include-2024.1.0-ha957f24_693.conda#249c91c2186d236c6d180342241db2ec https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.9-4_cp39.conda#bfe4b3259a8ac6cdf0037752904da6a7 https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda#161081fc7cec0bfda0d86d7cb595f8d8 https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-0.tar.bz2#f766549260d6815b0c52253f1fb1bb29 +https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-3.10.0-h4a8ded7_16.conda#ff7f38675b226cfb855aebfc32a13e31 
https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-12.3.0-h6b66f73_113.conda#7fc690ec9db2902e5ee90cebfdab31e7 https://conda.anaconda.org/conda-forge/linux-64/libgomp-14.1.0-h77fa898_0.conda#ae061a5ed5f05818acdf9adab72c146d https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-12.3.0-h6b66f73_113.conda#3706e34877bd82d04cb1e9e9baeb2739 -https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.12-he073ed8_17.conda#595db67e32b276298ff3d94d07d47fbf -https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.40-ha1999f0_7.conda#3f840c7ed70a96b5ebde8044b2f36f32 https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2#fee5683a3f04bd15cbd8318b096a27ab +https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.17-h4a8ded7_16.conda#223fe8a3ff6d5e78484a9d58eb34d055 +https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.40-ha1999f0_7.conda#3f840c7ed70a96b5ebde8044b2f36f32 https://conda.anaconda.org/conda-forge/linux-64/binutils-2.40-h4852527_7.conda#df53aa8418f8c289ae9b9665986034f8 https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.40-hb3c18ed_9.conda#bb3fb8553a669828501e80d13b6bd744 https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_kmp_llvm.tar.bz2#562b26ba2e19059551a811e72ab7f793 https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.1.0-h77fa898_0.conda#ca0fad6a41ddaef54a153b78eccb5037 https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.12-h4ab18f5_0.conda#7ed427f0871fd41cb1d9c17727c17589 https://conda.anaconda.org/conda-forge/linux-64/attr-2.5.1-h166bdaf_1.tar.bz2#d9c69a24ad678ffce24c6543a0176b00 -https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hd590300_5.conda#69b8b6202a07720f448be700e300ccf4 +https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda#62ee74e96c5ebb0af99386de58cf9553 https://conda.anaconda.org/conda-forge/linux-64/dav1d-1.2.1-hd590300_0.conda#418c6ca5929a611cbd69204907a83995 https://conda.anaconda.org/conda-forge/linux-64/gettext-tools-0.22.5-h59595ed_2.conda#985f2f453fb72408d6b6f1be0f324033 https://conda.anaconda.org/conda-forge/linux-64/giflib-5.2.2-hd590300_0.conda#3bf7b9fd5a7136126e0234db4b87c8b6 @@ -108,11 +109,11 @@ https://conda.anaconda.org/conda-forge/linux-64/freetype-2.12.1-h267a509_2.conda https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-12.3.0-h58ffeeb_13.conda#93325fff774c4cc8dcc8c65039cb4646 https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.3-h659f571_0.conda#3f43953b7d3fb3aaa1d0d0723d91e368 https://conda.anaconda.org/conda-forge/linux-64/libasprintf-devel-0.22.5-h661eb56_2.conda#02e41ab5834dcdcc8590cf29d9526f50 -https://conda.anaconda.org/conda-forge/linux-64/libavif16-1.0.4-h9b56c87_5.conda#fc2577679cbe608fa0e17d049d1733d0 +https://conda.anaconda.org/conda-forge/linux-64/libavif16-1.1.0-h9b56c87_0.conda#ab39000b12375e3a30ee79fea996e3c5 https://conda.anaconda.org/conda-forge/linux-64/libglib-2.80.3-h8a4344b_1.conda#6ea440297aacee4893f02ad759e6ffbc -https://conda.anaconda.org/conda-forge/linux-64/libjxl-0.10.2-hcae5a98_0.conda#901db891e1e21afd8524cd636a8c8e3b +https://conda.anaconda.org/conda-forge/linux-64/libjxl-0.10.3-h66b40c8_0.conda#a394f85083195ab8aa33911f40d76870 https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.6.0-h1dd3fc0_3.conda#66f03896ffbe1a110ffda05c7a856504 -https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.12.7-hc051c1a_1.conda#340278ded8b0dc3a73f3660bbb0adbc6 
+https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.12.7-h4c95cb1_3.conda#0ac9aff6010a7751961c8e4b863a40e7 https://conda.anaconda.org/conda-forge/linux-64/llvm-openmp-18.1.8-hf5423f3_0.conda#322be9d39e030673e105b0abb320514e https://conda.anaconda.org/conda-forge/linux-64/mysql-libs-8.3.0-hca2cd23_4.conda#1b50eebe2a738a3146c154d2eceaa8b6 https://conda.anaconda.org/conda-forge/linux-64/nss-3.102-h593d115_0.conda#40e5e48c55a45621c4399ca9236406b7 @@ -126,7 +127,7 @@ https://conda.anaconda.org/conda-forge/noarch/alabaster-0.7.16-pyhd8ed1ab_0.cond https://conda.anaconda.org/conda-forge/noarch/appdirs-1.4.4-pyh9f0ad1d_0.tar.bz2#5f095bc6454094e96f146491fd03633b https://conda.anaconda.org/conda-forge/linux-64/brotli-1.1.0-hd590300_1.conda#f27a24d46e3ea7b70a1f98e50c62508f https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py39h3d6467e_1.conda#c48418c8b35f1d59ae9ae1174812b40a -https://conda.anaconda.org/conda-forge/noarch/certifi-2024.6.2-pyhd8ed1ab_0.conda#8821ec1c8fcdc9e1d291d7b9f6e9968a +https://conda.anaconda.org/conda-forge/noarch/certifi-2024.7.4-pyhd8ed1ab_0.conda#24e7fd6ca65997938fff9e5ab6f653e4 https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.3.2-pyhd8ed1ab_0.conda#7f4a9e3fcff3f6356ae99244a014da6a https://conda.anaconda.org/conda-forge/noarch/click-8.1.7-unix_pyh707e725_0.conda#f3ad426304898027fc619827ff428eca https://conda.anaconda.org/conda-forge/noarch/cloudpickle-3.0.0-pyhd8ed1ab_0.conda#753d29fe41bb881e4b9c004f0abf973f @@ -135,7 +136,7 @@ https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_0.conda#5 https://conda.anaconda.org/conda-forge/linux-64/cython-3.0.10-py39h3d6467e_0.conda#76b5d215fb735a6dc43010ffbe78040e https://conda.anaconda.org/conda-forge/linux-64/dbus-1.13.6-h5008d03_3.tar.bz2#ecfff944ba3960ecb334b9a2663d708d https://conda.anaconda.org/conda-forge/noarch/docutils-0.21.2-pyhd8ed1ab_0.conda#e8cd5d629f65bdf0f3bb312cde14659e -https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.2.0-pyhd8ed1ab_2.conda#8d652ea2ee8eaee02ed8dc820bc794aa +https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.2.2-pyhd8ed1ab_0.conda#d02ae936e42063ca46af6cdad2dbd1e0 https://conda.anaconda.org/conda-forge/noarch/execnet-2.1.1-pyhd8ed1ab_0.conda#15dda3cdbf330abfe9f555d22f66db46 https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.14.2-h14ed4e7_0.conda#0f69b688f52ff6da70bccb7ff7001d1d https://conda.anaconda.org/conda-forge/noarch/fsspec-2024.6.1-pyhff2d567_0.conda#996bf792cdb8c0ac38ff54b9fde56841 @@ -153,7 +154,7 @@ https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_0.conda https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.4.5-py39h7633fee_1.conda#c9f74d717e5a2847a9f8b779c54130f2 https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.16-hb7c19ff_0.conda#51bb7010fc86f70eee639b4bb7a894f5 https://conda.anaconda.org/conda-forge/linux-64/libcups-2.3.3-h4637d8d_4.conda#d4529f4dff3057982a7617c7ac58fde3 -https://conda.anaconda.org/conda-forge/linux-64/libhwloc-2.11.0-default_h5622ce7_1000.conda#695ee1e435b873780efccc64362cda89 +https://conda.anaconda.org/conda-forge/linux-64/libhwloc-2.11.1-default_hecaa2ac_1000.conda#f54aeebefb5c5ff84eca4fb05ca8aa3a https://conda.anaconda.org/conda-forge/linux-64/libllvm15-15.0.7-hb3ce162_4.conda#8a35df3cbc0c8b12cc8af9473ae75eef https://conda.anaconda.org/conda-forge/linux-64/libllvm18-18.1.8-hc9dba70_0.conda#f94ed0c5953c78dcca7adb953f4c5bfb https://conda.anaconda.org/conda-forge/linux-64/libpq-16.3-ha72fbe1_0.conda#bac737ae28b79cfbafd515258d97d29e @@ -212,7 
+213,7 @@ https://conda.anaconda.org/conda-forge/linux-64/libflac-1.4.3-h59595ed_0.conda#e https://conda.anaconda.org/conda-forge/linux-64/libgpg-error-1.50-h4f305b6_0.conda#0d7ff1a8e69565ca3add6925e18e708f https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.7.0-h2c5496b_1.conda#e2eaefa4de2b7237af7c907b8bbc760a https://conda.anaconda.org/conda-forge/noarch/memory_profiler-0.61.0-pyhd8ed1ab_0.tar.bz2#8b45f9f2b2f7a98b0ec179c8991a4a9b -https://conda.anaconda.org/conda-forge/noarch/meson-1.4.1-pyhd8ed1ab_0.conda#714ca123839eeebb25d12b443067ea64 +https://conda.anaconda.org/conda-forge/noarch/meson-1.5.0-pyhd8ed1ab_0.conda#9d971c5bf99aed063664d6650e7e7ed8 https://conda.anaconda.org/conda-forge/noarch/partd-1.4.2-pyhd8ed1ab_0.conda#0badf9c54e24cecfb0ad2f99d680c163 https://conda.anaconda.org/conda-forge/linux-64/pillow-10.4.0-py39h16a7006_0.conda#d9a6b19174a6cf5185296b16f781951f https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda#f586ac1e56c8638b64f9c8122a7b8a67 @@ -221,7 +222,7 @@ https://conda.anaconda.org/conda-forge/noarch/pyproject-metadata-0.8.0-pyhd8ed1a https://conda.anaconda.org/conda-forge/noarch/pytest-8.2.2-pyhd8ed1ab_0.conda#0f3f49c22c7ef3a1195fa61dad3c43be https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0-pyhd8ed1ab_0.conda#2cf4264fffb9e6eff6031c5b6884d61c https://conda.anaconda.org/conda-forge/linux-64/sip-6.7.12-py39h3d6467e_0.conda#e667a3ab0df62c54e60e1843d2e6defb -https://conda.anaconda.org/conda-forge/linux-64/tbb-2021.12.0-h434a139_2.conda#9e78ded802220ee1f67c908cb2ef188f +https://conda.anaconda.org/conda-forge/linux-64/tbb-2021.12.0-h434a139_3.conda#c667c11d1e488a38220ede8a34441bff https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.7.0-h00ab1b0_1.conda#28de2e073db9ca9b72858bee9fb6f571 https://conda.anaconda.org/conda-forge/linux-64/fortran-compiler-1.7.0-heb67821_1.conda#cf4b0e7c4c78bb0662aed9b27c414a3c https://conda.anaconda.org/conda-forge/linux-64/gstreamer-1.24.5-haf2f30d_0.conda#c5252c02592373fa8caf5a5327165a89 @@ -233,7 +234,7 @@ https://conda.anaconda.org/conda-forge/noarch/meson-python-0.16.0-pyh0c530f3_0.c https://conda.anaconda.org/conda-forge/linux-64/mkl-2024.1.0-ha957f24_693.conda#ff0f4abf6f94e36a918f1ef4dbeb9769 https://conda.anaconda.org/conda-forge/linux-64/pyqt5-sip-12.12.2-py39h3d6467e_5.conda#93aff412f3e49fdb43361c0215cbd72d https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.6.1-pyhd8ed1ab_0.conda#b39568655c127a9c4a44d178ac99b6d0 -https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.22.0-py39h81c9582_1.conda#c1dd22d67b1f8cef888b64b688b71ffd +https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.23.0-py39h623c9ba_0.conda#a19d023682384c637cb356d270c276c0 https://conda.anaconda.org/conda-forge/linux-64/compilers-1.7.0-ha770c72_1.conda#d8d07866ac3b5b6937213c89a1874f08 https://conda.anaconda.org/conda-forge/noarch/dask-core-2024.7.0-pyhd8ed1ab_0.conda#755e47653ae38f5c50f1435af756e844 https://conda.anaconda.org/conda-forge/linux-64/gst-plugins-base-1.24.5-hbaaba92_0.conda#4a485842570569ba754863b2c083b346 @@ -270,7 +271,7 @@ https://conda.anaconda.org/conda-forge/noarch/seaborn-0.12.2-hd8ed1ab_0.conda#50 https://conda.anaconda.org/conda-forge/noarch/numpydoc-1.2-pyhd8ed1ab_0.tar.bz2#025ad7ca2c7f65007ab6b6f5d93a56eb https://conda.anaconda.org/conda-forge/noarch/pydata-sphinx-theme-0.15.3-pyhd8ed1ab_0.conda#55e445f4fcb07f2471fb0e1102d36488 https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_0.conda#ac832cc43adc79118cf6e23f1f9b8995 
-https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.5.0-pyhd8ed1ab_0.conda#264b3c697fa9cdade87eb0abe4440d54
+https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.6.0-pyhd8ed1ab_0.conda#b04f3c04e4f7939c6207dc0c0355f468
 https://conda.anaconda.org/conda-forge/noarch/sphinx-gallery-0.16.0-pyhd8ed1ab_0.conda#add28691ee89e875b190eda07929d5d4
 https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.4.0-pyhd8ed1ab_0.tar.bz2#88ee91e8679603f2a5bd036d52919cc2
 https://conda.anaconda.org/conda-forge/noarch/sphinx-remove-toctrees-1.0.0.post1-pyhd8ed1ab_0.conda#6dee8412218288a17f99f2cfffab334d
diff --git a/doc/js/scripts/dropdown.js b/doc/js/scripts/dropdown.js
index ec2e6d9419a28..d76b7f943bf8a 100644
--- a/doc/js/scripts/dropdown.js
+++ b/doc/js/scripts/dropdown.js
@@ -8,54 +8,54 @@
  * want them to show up in that case.
  */
 
-function addToggleAllButtons() {
+document.addEventListener("DOMContentLoaded", () => {
   // Get all sphinx-design dropdowns
   const allDropdowns = document.querySelectorAll("details.sd-dropdown");
 
-  function collapseAll() {
-    // Function to collapse all dropdowns on the page
-    console.log("[SK] Collapsing all dropdowns...");
-    allDropdowns.forEach((dropdown) => {
-      dropdown.removeAttribute("open");
-    });
-  }
-
-  function expandAll() {
-    // Function to expand all dropdowns on the page
-    console.log("[SK] Expanding all dropdowns...");
-    allDropdowns.forEach((dropdown) => {
-      dropdown.setAttribute("open", "");
-    });
-  }
-
-  const buttonConfigs = new Map([
-    ["up", { desc: "Collapse", action: collapseAll }],
-    ["down", { desc: "Expand", action: expandAll }],
-  ]);
-
   allDropdowns.forEach((dropdown) => {
     // Get the summary element of the dropdown, where we will place the buttons
     const summaryTitle = dropdown.querySelector("summary.sd-summary-title");
 
-    for (const [direction, config] of buttonConfigs) {
-      // Button with icon inside
-      var newButton = document.createElement("button");
-      var newIcon = document.createElement("i");
-      newIcon.classList.add("fa-solid", `fa-angles-${direction}`);
-      newButton.appendChild(newIcon);
-      // Class for styling; `sd-summary-up/down` is implemented by sphinx-design;
-      // `sk-toggle-all` is implemented by us
-      newButton.classList.add(`sd-summary-${direction}`, `sk-toggle-all`);
-      // Bootstrap tooltip configurations
-      newButton.setAttribute("data-bs-toggle", "tooltip");
-      newButton.setAttribute("data-bs-placement", "top");
-      newButton.setAttribute("data-bs-offset", "0,10");
-      newButton.setAttribute("data-bs-title", `${config.desc} all dropdowns`);
-      // Assign the collapse/expand action to the button
-      newButton.onclick = config.action;
-      // Append the button to the summary element
-      summaryTitle.appendChild(newButton);
-    }
-  });
-}
-document.addEventListener("DOMContentLoaded", addToggleAllButtons);
+    // The state marker with the toggle all icon inside
+    const newStateMarker = document.createElement("span");
+    const newIcon = document.createElement("i");
+    newIcon.classList.add("fa-solid", "fa-angles-right");
+    newStateMarker.appendChild(newIcon);
+
+    // Classes for styling; `sd-summary-state-marker` and `sd-summary-chevron-right` are
+    // implemented by sphinx-design; `sk-toggle-all` is implemented by us
+    newStateMarker.classList.add(
+      "sd-summary-state-marker",
+      "sd-summary-chevron-right",
+      "sk-toggle-all"
+    );
+
+    // Bootstrap tooltip configurations
+    newStateMarker.setAttribute("data-bs-toggle", "tooltip");
+    newStateMarker.setAttribute("data-bs-placement", "top");
+    newStateMarker.setAttribute("data-bs-offset", "0,10");
newStateMarker.setAttribute("data-bs-title", "Toggle all dropdowns"); + + // Assign the collapse/expand action to the state marker + newStateMarker.addEventListener("click", () => { + if (dropdown.open) { + console.log("[SK] Collapsing all dropdowns..."); + allDropdowns.forEach((node) => { + if (node !== dropdown) { + node.removeAttribute("open"); + } + }); + } else { + console.log("[SK] Expanding all dropdowns..."); + allDropdowns.forEach((node) => { + if (node !== dropdown) { + node.setAttribute("open", ""); + } + }); + } + }); + + // Append the state marker to the summary element + summaryTitle.insertBefore(newStateMarker, summaryTitle.lastElementChild); + }); +}); diff --git a/doc/scss/custom.scss b/doc/scss/custom.scss index ce4451fce4467..6af234b871ed6 100644 --- a/doc/scss/custom.scss +++ b/doc/scss/custom.scss @@ -86,33 +86,29 @@ code.literal { /* Dropdowns (sphinx-design) */ details.sd-dropdown { - &:hover > summary.sd-summary-title > a.headerlink { - visibility: visible; + &:hover > summary.sd-summary-title { + > .sd-summary-text > a.headerlink { + visibility: visible; + } + + > .sk-toggle-all { + opacity: 1; + } } > summary.sd-summary-title { - > a.headerlink { + > .sd-summary-text > a.headerlink { font-size: 1rem; } // See `js/scripts/dropdown.js`: this is styling the "expand/collapse all" button - > button.sk-toggle-all { + > .sk-toggle-all { color: var(--pst-sd-dropdown-color); - top: 0.9rem !important; - right: 3rem !important; + margin-right: 0.5rem; pointer-events: auto !important; - display: none; - border: none; - background: transparent; + opacity: 0; } } - - &[open] > summary.sd-summary-title:hover > .sd-summary-up.sk-toggle-all, - &:not([open]) - > summary.sd-summary-title:hover - > .sd-summary-down.sk-toggle-all { - display: block; - } } /* scikit-learn buttons */ diff --git a/doc/sphinxext/dropdown_anchors.py b/doc/sphinxext/dropdown_anchors.py index eb0b414de6ae8..a001dfa11d403 100644 --- a/doc/sphinxext/dropdown_anchors.py +++ b/doc/sphinxext/dropdown_anchors.py @@ -2,7 +2,7 @@ from docutils import nodes from sphinx.transforms.post_transforms import SphinxPostTransform -from sphinx_design.dropdown import dropdown_main, dropdown_title +from sphinx_design.dropdown import dropdown_main class DropdownAnchorAdder(SphinxPostTransform): @@ -12,26 +12,8 @@ class DropdownAnchorAdder(SphinxPostTransform): need to make sure that the old anchors still work. See the original implementation (in JS): https://github.com/scikit-learn/scikit-learn/pull/27409 - The structure of each sphinx-design dropdown node is expected to be: - - - - ...icon <-- This exists if the "icon" option of the sphinx-design - dropdown is set; we do not use it in our documentation - - ...title <-- This may contain multiple nodes, e.g. literal nodes if - there are inline codes; we use the concatenated text of - all these nodes to generate the anchor ID - - Here we insert the anchor link! - - <-- The "dropdown closed" marker - <-- The "dropdown open" marker - - - ...main contents - - + The anchor links are inserted at the end of the node with class "sd-summary-text" + which includes only the title text part of the dropdown (no icon, markers, etc). 
""" default_priority = 9999 # Apply later than everything else @@ -44,15 +26,13 @@ def run(self): anchor_id_counters = {} for sd_dropdown in self.document.findall(dropdown_main): - # Grab the dropdown title - sd_dropdown_title = sd_dropdown.next_node(dropdown_title) + # Grab the summary text node + sd_summary_text = sd_dropdown.next_node( + lambda node: "sd-summary-text" in node.get("classes", []) + ) # Concatenate the text of relevant nodes as the title text - # Since we do not have the prefix icon, the relevant nodes are the very - # first child node until the third last node (last two are markers) - title_text = "".join( - node.astext() for node in sd_dropdown_title.children[:-2] - ) + title_text = "".join(node.astext() for node in sd_summary_text.children) # The ID uses the first line, lowercased, with spaces replaced by dashes; # suffix the anchor ID with a counter if it already exists @@ -71,7 +51,7 @@ def run(self): 'title="Link to this dropdown">#' ) anchor_node = nodes.raw("", anchor_html, format="html") - sd_dropdown_title.insert(-2, anchor_node) # before the two markers + sd_summary_text.append(anchor_node) def setup(app): diff --git a/pyproject.toml b/pyproject.toml index e253dfe311487..a143495bf363f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ docs = [ "sphinxext-opengraph>=0.9.1", "plotly>=5.14.0", "polars>=0.20.23", - "sphinx-design>=0.5.0", + "sphinx-design>=0.6.0", "sphinxcontrib-sass>=0.3.4", "pydata-sphinx-theme>=0.15.3", "sphinx-remove-toctrees>=1.0.0.post1", diff --git a/sklearn/_min_dependencies.py b/sklearn/_min_dependencies.py index 9c108791b45bc..2003ccdea1abc 100644 --- a/sklearn/_min_dependencies.py +++ b/sklearn/_min_dependencies.py @@ -46,7 +46,7 @@ "plotly": ("5.14.0", "docs, examples"), "sphinxcontrib-sass": ("0.3.4", "docs"), "sphinx-remove-toctrees": ("1.0.0.post1", "docs"), - "sphinx-design": ("0.5.0", "docs"), + "sphinx-design": ("0.6.0", "docs"), "pydata-sphinx-theme": ("0.15.3", "docs"), # XXX: Pin conda-lock to the latest released version (needs manual update # from time to time)