Merge pull request #3 from ssec-jhu/scarliles/regression-benchmark

upstream changes
neurodata · Apr 24, 2024 · ffc6328
2 parents a7f5e92 + cf285c1
commit ffc6328
Show file tree

Hide file tree

Showing 237 changed files with 1,345 additions and 926 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,6 @@
+.* export-ignore
+asv_benchmarks export-ignore
+azure-pipelines.yml export-ignore
+benchmarks export-ignore
+build_tools export-ignore
+maint_tools export-ignore
diff --git a/.github/scripts/label_title_regex.py b/.github/scripts/label_title_regex.py
@@ -1,5 +1,6 @@
 """Labels PRs based on title. Must be run in a github action with the
 pull_request_target event."""
+
 import json
 import os
 import re

diff --git a/.github/workflows/check-manifest.yml → .github/workflows/check-sdist.yml b/.github/workflows/check-manifest.yml → .github/workflows/check-sdist.yml
@@ -1,11 +1,11 @@
-name: "Check Manifest"
+name: "Check sdist"
 
 on:
   schedule:
     - cron: '0 0 * * *'
 
 jobs:
-  check-manifest:
+  check-sdist:
     # Don't run on forks
     if: github.repository == 'scikit-learn/scikit-learn'
 
@@ -19,15 +19,15 @@ jobs:
         # scipy and cython are required to build sdist
         run: |
           python -m pip install --upgrade pip
-          pip install check-manifest scipy cython
+          pip install check-sdist
       - run: |
-          check-manifest -v
+          check-sdist --inject-junk
 
   update-tracker:
     uses: ./.github/workflows/update_tracking_issue.yml
     if: ${{ always() }}
-    needs: [check-manifest]
+    needs: [check-sdist]
     with:
-      job_status: ${{ needs.check-manifest.result }}
+      job_status: ${{ needs.check-sdist.result }}
     secrets:
       BOT_GITHUB_TOKEN: ${{ secrets.BOT_GITHUB_TOKEN }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -12,11 +12,11 @@ repos:
     -   id: ruff
         args: ["--fix", "--output-format=full"]
 -   repo: https://github.com/psf/black
-    rev: 23.3.0
+    rev: 24.3.0
     hooks:
     -   id: black
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.3.0
+    rev: v1.9.0
     hooks:
      -  id: mypy
         files: sklearn/

diff --git a/COPYING b/COPYING
@@ -1,6 +1,6 @@
 BSD 3-Clause License
 
-Copyright (c) 2007-2023 The scikit-learn developers.
+Copyright (c) 2007-2024 The scikit-learn developers.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without

diff --git a/SECURITY.md b/SECURITY.md
@@ -4,8 +4,8 @@
 
 | Version       | Supported          |
 | ------------- | ------------------ |
-| 1.4.1.post1   | :white_check_mark: |
-| < 1.4.1.post1 | :x:                |
+| 1.4.2         | :white_check_mark: |
+| < 1.4.2       | :x:                |
 
 ## Reporting a Vulnerability
 

diff --git a/asv_benchmarks/asv.conf.json b/asv_benchmarks/asv.conf.json
@@ -78,7 +78,7 @@
     "matrix": {
         "numpy": ["1.25.2"],
         "scipy": ["1.11.2"],
-        "cython": ["3.0.9"],
+        "cython": ["3.0.10"],
         "joblib": ["1.3.2"],
         "threadpoolctl": ["3.2.0"],
         "pandas": ["2.1.0"]

diff --git a/asv_benchmarks/benchmarks/ensemble.py b/asv_benchmarks/benchmarks/ensemble.py
@@ -2,15 +2,58 @@
     GradientBoostingClassifier,
     HistGradientBoostingClassifier,
     RandomForestClassifier,
+    RandomForestRegressor,
 )
 
 from .common import Benchmark, Estimator, Predictor
 from .datasets import (
     _20newsgroups_highdim_dataset,
     _20newsgroups_lowdim_dataset,
     _synth_classification_dataset,
+    _synth_regression_dataset,
+    _synth_regression_sparse_dataset,
 )
-from .utils import make_gen_classif_scorers
+from .utils import make_gen_classif_scorers, make_gen_reg_scorers
+
+
+class RandomForestRegressorBenchmark(Predictor, Estimator, Benchmark):
+    """
+    Benchmarks for RandomForestRegressor.
+    """
+
+    param_names = ["representation", "n_jobs"]
+    params = (["dense", "sparse"], Benchmark.n_jobs_vals)
+
+    def setup_cache(self):
+        super().setup_cache()
+
+    def make_data(self, params):
+        representation, n_jobs = params
+
+        if representation == "sparse":
+            data = _synth_regression_sparse_dataset()
+        else:
+            data = _synth_regression_dataset()
+
+        return data
+
+    def make_estimator(self, params):
+        representation, n_jobs = params
+
+        n_estimators = 500 if Benchmark.data_size == "large" else 100
+
+        estimator = RandomForestRegressor(
+            n_estimators=n_estimators,
+            min_samples_split=10,
+            max_features="log2",
+            n_jobs=n_jobs,
+            random_state=0,
+        )
+
+        return estimator
+
+    def make_scorers(self):
+        make_gen_reg_scorers(self)
 
 
 class RandomForestClassifierBenchmark(Predictor, Estimator, Benchmark):

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -124,9 +124,9 @@ jobs:
     vmImage: ubuntu-22.04
   variables:
     # Need to match Python version and Emscripten version for the correct
-    # Pyodide version. For example, for Pyodide version 0.25.0, see
-    # https://github.com/pyodide/pyodide/blob/0.25.0/Makefile.envs
-    PYODIDE_VERSION: '0.25.0'
+    # Pyodide version. For example, for Pyodide version 0.25.1, see
+    # https://github.com/pyodide/pyodide/blob/0.25.1/Makefile.envs
+    PYODIDE_VERSION: '0.25.1'
     EMSCRIPTEN_VERSION: '3.1.46'
     PYTHON_VERSION: '3.11.3'
 

diff --git a/benchmarks/bench_glm.py b/benchmarks/bench_glm.py
@@ -4,6 +4,7 @@
 Data comes from a random square matrix.
 
 """
+
 from datetime import datetime
 
 import numpy as np

diff --git a/benchmarks/bench_glmnet.py b/benchmarks/bench_glmnet.py
@@ -16,6 +16,7 @@
 
 In both cases, only 10% of the features are informative.
 """
+
 import gc
 from time import time
 

diff --git a/benchmarks/bench_isotonic.py b/benchmarks/bench_isotonic.py
@@ -10,6 +10,7 @@
 This allows the scaling of the algorithm with the problem size to be
 visualized and understood.
 """
+
 import argparse
 import gc
 from datetime import datetime

diff --git a/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py b/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py
@@ -35,6 +35,7 @@
 You can also set `arpack_all=True` to activate arpack solver for large number
 of components (this takes more time).
 """
+
 # Authors: Sylvain MARIE, Schneider Electric
 
 import time

diff --git a/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py b/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py
@@ -37,6 +37,7 @@
 Solvers comparison benchmark: time vs n_components", where this time the number
 of examples is fixed, and the desired number of components varies.
 """
+
 # Author: Sylvain MARIE, Schneider Electric
 
 import time

diff --git a/benchmarks/bench_lasso.py b/benchmarks/bench_lasso.py
@@ -11,6 +11,7 @@
 
 In both cases, only 10% of the features are informative.
 """
+
 import gc
 from time import time
 

diff --git a/benchmarks/bench_plot_lasso_path.py b/benchmarks/bench_plot_lasso_path.py
@@ -2,6 +2,7 @@
 
 The input data is mostly low rank but is a fat infinite tail.
 """
+
 import gc
 import sys
 from collections import defaultdict

diff --git a/benchmarks/bench_plot_neighbors.py b/benchmarks/bench_plot_neighbors.py
@@ -1,6 +1,7 @@
 """
 Plot the scaling of the nearest neighbors algorithms with k, D, and N
 """
+
 from time import time
 
 import matplotlib.pyplot as plt

diff --git a/benchmarks/bench_plot_nmf.py b/benchmarks/bench_plot_nmf.py
@@ -1,6 +1,7 @@
 """
 Benchmarks of Non-Negative Matrix Factorization
 """
+
 # Authors: Tom Dupre la Tour (benchmark)
 #          Chih-Jen Linn (original projected gradient NMF implementation)
 #          Anthony Di Franco (projected gradient, Python and NumPy port)
@@ -258,8 +259,7 @@ def _fit_transform(self, X, y=None, W=None, H=None, update_H=True):
         if not isinstance(self.max_iter, numbers.Integral) or self.max_iter < 0:
             raise ValueError(
                 "Maximum number of iterations must be a positive "
-                "integer; got (max_iter=%r)"
-                % self.max_iter
+                "integer; got (max_iter=%r)" % self.max_iter
             )
         if not isinstance(self.tol, numbers.Number) or self.tol < 0:
             raise ValueError(
@@ -305,8 +305,7 @@ def _fit_transform(self, X, y=None, W=None, H=None, update_H=True):
         if n_iter == self.max_iter and self.tol > 0:
             warnings.warn(
                 "Maximum number of iteration %d reached. Increase it"
-                " to improve convergence."
-                % self.max_iter,
+                " to improve convergence." % self.max_iter,
                 ConvergenceWarning,
             )
 

diff --git a/benchmarks/bench_plot_omp_lars.py b/benchmarks/bench_plot_omp_lars.py
@@ -3,6 +3,7 @@
 
 The input data is mostly low rank but is a fat infinite tail.
 """
+
 import gc
 import sys
 from time import time

diff --git a/benchmarks/bench_plot_polynomial_kernel_approximation.py b/benchmarks/bench_plot_polynomial_kernel_approximation.py
@@ -38,6 +38,7 @@
 (https://people.cs.rutgers.edu/~farach/pubs/FrequentStream.pdf)
 
 """
+
 # Author: Daniel Lopez-Sanchez <lope@usal.es>
 # License: BSD 3 clause
 

diff --git a/benchmarks/bench_plot_svd.py b/benchmarks/bench_plot_svd.py
@@ -2,6 +2,7 @@
 
 The data is mostly low rank but is a fat infinite tail.
 """
+
 import gc
 from collections import defaultdict
 from time import time

diff --git a/benchmarks/bench_random_projections.py b/benchmarks/bench_random_projections.py
@@ -6,6 +6,7 @@
 Benchmarks for random projections.
 
 """
+
 import collections
 import gc
 import optparse

diff --git a/benchmarks/bench_saga.py b/benchmarks/bench_saga.py
@@ -3,6 +3,7 @@
 Benchmarks of sklearn SAGA vs lightning SAGA vs Liblinear. Shows the gain
 in using multinomial logistic regression in term of learning time.
 """
+
 import json
 import os
 import time
@@ -118,9 +119,7 @@ def fit_single(
                 # Lightning predict_proba is not implemented for n_classes > 2
                 y_pred = _predict_proba(lr, X)
             score = log_loss(y, y_pred, normalize=False) / n_samples
-            score += 0.5 * alpha * np.sum(lr.coef_**2) + beta * np.sum(
-                np.abs(lr.coef_)
-            )
+            score += 0.5 * alpha * np.sum(lr.coef_**2) + beta * np.sum(np.abs(lr.coef_))
             scores.append(score)
         train_score, test_score = tuple(scores)
 

diff --git a/benchmarks/bench_sample_without_replacement.py b/benchmarks/bench_sample_without_replacement.py
@@ -2,6 +2,7 @@
 Benchmarks for sampling without replacement of integer.
 
 """
+
 import gc
 import operator
 import optparse

diff --git a/benchmarks/bench_text_vectorizers.py b/benchmarks/bench_text_vectorizers.py
@@ -8,6 +8,7 @@
  * psutil (optional, but recommended)
 
 """
+
 import itertools
 import timeit
 

diff --git a/benchmarks/bench_tree.py b/benchmarks/bench_tree.py
@@ -13,6 +13,7 @@
 training set, classify a sample and plot the time taken as a function
 of the number of dimensions.
 """
+
 import gc
 from datetime import datetime
 

diff --git a/benchmarks/bench_tsne_mnist.py b/benchmarks/bench_tsne_mnist.py
@@ -130,7 +130,8 @@ def sanitize(filename):
         try:
             from bhtsne.bhtsne import run_bh_tsne
         except ImportError as e:
-            raise ImportError("""\
+            raise ImportError(
+                """\
 If you want comparison with the reference implementation, build the
 binary from source (https://github.com/lvdmaaten/bhtsne) in the folder
 benchmarks/bhtsne and add an empty `__init__.py` file in the folder:
@@ -140,7 +141,8 @@ def sanitize(filename):
 $ g++ sptree.cpp tsne.cpp tsne_main.cpp -o bh_tsne -O2
 $ touch __init__.py
 $ cd ..
-""") from e
+"""
+            ) from e
 
         def bhtsne(X):
             """Wrapper for the reference lvdmaaten/bhtsne implementation."""

diff --git a/build_tools/azure/debian_atlas_32bit_lock.txt b/build_tools/azure/debian_atlas_32bit_lock.txt
@@ -8,7 +8,7 @@ attrs==23.2.0
     # via pytest
 coverage==7.4.4
     # via pytest-cov
-cython==3.0.9
+cython==3.0.10
     # via -r build_tools/azure/debian_atlas_32bit_requirements.txt
 iniconfig==2.0.0
     # via pytest

diff --git a/build_tools/azure/debian_atlas_32bit_requirements.txt b/build_tools/azure/debian_atlas_32bit_requirements.txt
@@ -1,7 +1,7 @@
 # DO NOT EDIT: this file is generated from the specification found in the
 # following script to centralize the configuration for CI builds:
 # build_tools/update_environments_and_lock_files.py
-cython==3.0.9  # min
+cython==3.0.10  # min
 joblib==1.2.0  # min
 threadpoolctl==2.2.0
 pytest==7.1.2  # min
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,6 +4,7 @@ @@
     Data comes from a random square matrix.
     """
     from datetime import datetime
     import numpy as np
@@ Expand Down @@