Skip to content

Commit

Permalink
Merge pull request #3 from ssec-jhu/scarliles/regression-benchmark
Browse files Browse the repository at this point in the history
upstream changes
  • Loading branch information
SamuelCarliles3 authored Apr 24, 2024
2 parents a7f5e92 + cf285c1 commit ffc6328
Show file tree
Hide file tree
Showing 237 changed files with 1,345 additions and 926 deletions.
6 changes: 6 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
.* export-ignore
asv_benchmarks export-ignore
azure-pipelines.yml export-ignore
benchmarks export-ignore
build_tools export-ignore
maint_tools export-ignore
1 change: 1 addition & 0 deletions .github/scripts/label_title_regex.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Labels PRs based on title. Must be run in a github action with the
pull_request_target event."""

import json
import os
import re
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
name: "Check Manifest"
name: "Check sdist"

on:
schedule:
- cron: '0 0 * * *'

jobs:
check-manifest:
check-sdist:
# Don't run on forks
if: github.repository == 'scikit-learn/scikit-learn'

Expand All @@ -19,15 +19,15 @@ jobs:
# scipy and cython are required to build sdist
run: |
python -m pip install --upgrade pip
pip install check-manifest scipy cython
pip install check-sdist
- run: |
check-manifest -v
check-sdist --inject-junk
update-tracker:
uses: ./.github/workflows/update_tracking_issue.yml
if: ${{ always() }}
needs: [check-manifest]
needs: [check-sdist]
with:
job_status: ${{ needs.check-manifest.result }}
job_status: ${{ needs.check-sdist.result }}
secrets:
BOT_GITHUB_TOKEN: ${{ secrets.BOT_GITHUB_TOKEN }}
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ repos:
- id: ruff
args: ["--fix", "--output-format=full"]
- repo: https://github.com/psf/black
rev: 23.3.0
rev: 24.3.0
hooks:
- id: black
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.3.0
rev: v1.9.0
hooks:
- id: mypy
files: sklearn/
Expand Down
2 changes: 1 addition & 1 deletion COPYING
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
BSD 3-Clause License

Copyright (c) 2007-2023 The scikit-learn developers.
Copyright (c) 2007-2024 The scikit-learn developers.
All rights reserved.

Redistribution and use in source and binary forms, with or without
Expand Down
4 changes: 2 additions & 2 deletions SECURITY.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

| Version | Supported |
| ------------- | ------------------ |
| 1.4.1.post1 | :white_check_mark: |
| < 1.4.1.post1 | :x: |
| 1.4.2 | :white_check_mark: |
| < 1.4.2 | :x: |

## Reporting a Vulnerability

Expand Down
2 changes: 1 addition & 1 deletion asv_benchmarks/asv.conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
"matrix": {
"numpy": ["1.25.2"],
"scipy": ["1.11.2"],
"cython": ["3.0.9"],
"cython": ["3.0.10"],
"joblib": ["1.3.2"],
"threadpoolctl": ["3.2.0"],
"pandas": ["2.1.0"]
Expand Down
45 changes: 44 additions & 1 deletion asv_benchmarks/benchmarks/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,58 @@
GradientBoostingClassifier,
HistGradientBoostingClassifier,
RandomForestClassifier,
RandomForestRegressor,
)

from .common import Benchmark, Estimator, Predictor
from .datasets import (
_20newsgroups_highdim_dataset,
_20newsgroups_lowdim_dataset,
_synth_classification_dataset,
_synth_regression_dataset,
_synth_regression_sparse_dataset,
)
from .utils import make_gen_classif_scorers
from .utils import make_gen_classif_scorers, make_gen_reg_scorers


class RandomForestRegressorBenchmark(Predictor, Estimator, Benchmark):
    """Benchmark suite for ``RandomForestRegressor``.

    The suite is parametrized over the data representation
    (``"dense"`` or ``"sparse"``) and over the ``n_jobs`` values listed
    in ``Benchmark.n_jobs_vals``.
    """

    param_names = ["representation", "n_jobs"]
    params = (["dense", "sparse"], Benchmark.n_jobs_vals)

    def setup_cache(self):
        """Delegate cache setup to the parent classes."""
        # NOTE(review): presumably overridden explicitly so the benchmark
        # runner finds ``setup_cache`` on this class -- confirm.
        super().setup_cache()

    def make_data(self, params):
        """Return the synthetic regression dataset for ``params``.

        ``params`` is unpacked as ``(representation, n_jobs)``; only the
        representation selects between the sparse and the dense dataset
        helper.
        """
        representation, _ = params

        if representation == "sparse":
            return _synth_regression_sparse_dataset()
        return _synth_regression_dataset()

    def make_estimator(self, params):
        """Build the ``RandomForestRegressor`` to benchmark.

        ``params`` is unpacked as ``(representation, n_jobs)``; only
        ``n_jobs`` is forwarded to the estimator.
        """
        _, n_jobs = params

        # Grow a larger forest only when the "large" data size is selected.
        if Benchmark.data_size == "large":
            forest_size = 500
        else:
            forest_size = 100

        return RandomForestRegressor(
            n_estimators=forest_size,
            min_samples_split=10,
            max_features="log2",
            n_jobs=n_jobs,
            random_state=0,
        )

    def make_scorers(self):
        """Set up scorers by calling ``make_gen_reg_scorers`` on this benchmark."""
        make_gen_reg_scorers(self)


class RandomForestClassifierBenchmark(Predictor, Estimator, Benchmark):
Expand Down
6 changes: 3 additions & 3 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,9 @@ jobs:
vmImage: ubuntu-22.04
variables:
# Need to match Python version and Emscripten version for the correct
# Pyodide version. For example, for Pyodide version 0.25.0, see
# https://github.com/pyodide/pyodide/blob/0.25.0/Makefile.envs
PYODIDE_VERSION: '0.25.0'
# Pyodide version. For example, for Pyodide version 0.25.1, see
# https://github.com/pyodide/pyodide/blob/0.25.1/Makefile.envs
PYODIDE_VERSION: '0.25.1'
EMSCRIPTEN_VERSION: '3.1.46'
PYTHON_VERSION: '3.11.3'

Expand Down
1 change: 1 addition & 0 deletions benchmarks/bench_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
Data comes from a random square matrix.
"""

from datetime import datetime

import numpy as np
Expand Down
1 change: 1 addition & 0 deletions benchmarks/bench_glmnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
In both cases, only 10% of the features are informative.
"""

import gc
from time import time

Expand Down
1 change: 1 addition & 0 deletions benchmarks/bench_isotonic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
This allows the scaling of the algorithm with the problem size to be
visualized and understood.
"""

import argparse
import gc
from datetime import datetime
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
You can also set `arpack_all=True` to activate arpack solver for large number
of components (this takes more time).
"""

# Authors: Sylvain MARIE, Schneider Electric

import time
Expand Down
1 change: 1 addition & 0 deletions benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
Solvers comparison benchmark: time vs n_components", where this time the number
of examples is fixed, and the desired number of components varies.
"""

# Author: Sylvain MARIE, Schneider Electric

import time
Expand Down
1 change: 1 addition & 0 deletions benchmarks/bench_lasso.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
In both cases, only 10% of the features are informative.
"""

import gc
from time import time

Expand Down
1 change: 1 addition & 0 deletions benchmarks/bench_plot_lasso_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
The input data is mostly low rank but is a fat infinite tail.
"""

import gc
import sys
from collections import defaultdict
Expand Down
1 change: 1 addition & 0 deletions benchmarks/bench_plot_neighbors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Plot the scaling of the nearest neighbors algorithms with k, D, and N
"""

from time import time

import matplotlib.pyplot as plt
Expand Down
7 changes: 3 additions & 4 deletions benchmarks/bench_plot_nmf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Benchmarks of Non-Negative Matrix Factorization
"""

# Authors: Tom Dupre la Tour (benchmark)
# Chih-Jen Linn (original projected gradient NMF implementation)
# Anthony Di Franco (projected gradient, Python and NumPy port)
Expand Down Expand Up @@ -258,8 +259,7 @@ def _fit_transform(self, X, y=None, W=None, H=None, update_H=True):
if not isinstance(self.max_iter, numbers.Integral) or self.max_iter < 0:
raise ValueError(
"Maximum number of iterations must be a positive "
"integer; got (max_iter=%r)"
% self.max_iter
"integer; got (max_iter=%r)" % self.max_iter
)
if not isinstance(self.tol, numbers.Number) or self.tol < 0:
raise ValueError(
Expand Down Expand Up @@ -305,8 +305,7 @@ def _fit_transform(self, X, y=None, W=None, H=None, update_H=True):
if n_iter == self.max_iter and self.tol > 0:
warnings.warn(
"Maximum number of iteration %d reached. Increase it"
" to improve convergence."
% self.max_iter,
" to improve convergence." % self.max_iter,
ConvergenceWarning,
)

Expand Down
1 change: 1 addition & 0 deletions benchmarks/bench_plot_omp_lars.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
The input data is mostly low rank but is a fat infinite tail.
"""

import gc
import sys
from time import time
Expand Down
1 change: 1 addition & 0 deletions benchmarks/bench_plot_polynomial_kernel_approximation.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
(https://people.cs.rutgers.edu/~farach/pubs/FrequentStream.pdf)
"""

# Author: Daniel Lopez-Sanchez <lope@usal.es>
# License: BSD 3 clause

Expand Down
1 change: 1 addition & 0 deletions benchmarks/bench_plot_svd.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
The data is mostly low rank but is a fat infinite tail.
"""

import gc
from collections import defaultdict
from time import time
Expand Down
1 change: 1 addition & 0 deletions benchmarks/bench_random_projections.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
Benchmarks for random projections.
"""

import collections
import gc
import optparse
Expand Down
5 changes: 2 additions & 3 deletions benchmarks/bench_saga.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
Benchmarks of sklearn SAGA vs lightning SAGA vs Liblinear. Shows the gain
in using multinomial logistic regression in term of learning time.
"""

import json
import os
import time
Expand Down Expand Up @@ -118,9 +119,7 @@ def fit_single(
# Lightning predict_proba is not implemented for n_classes > 2
y_pred = _predict_proba(lr, X)
score = log_loss(y, y_pred, normalize=False) / n_samples
score += 0.5 * alpha * np.sum(lr.coef_**2) + beta * np.sum(
np.abs(lr.coef_)
)
score += 0.5 * alpha * np.sum(lr.coef_**2) + beta * np.sum(np.abs(lr.coef_))
scores.append(score)
train_score, test_score = tuple(scores)

Expand Down
1 change: 1 addition & 0 deletions benchmarks/bench_sample_without_replacement.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Benchmarks for sampling without replacement of integer.
"""

import gc
import operator
import optparse
Expand Down
1 change: 1 addition & 0 deletions benchmarks/bench_text_vectorizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
* psutil (optional, but recommended)
"""

import itertools
import timeit

Expand Down
1 change: 1 addition & 0 deletions benchmarks/bench_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
training set, classify a sample and plot the time taken as a function
of the number of dimensions.
"""

import gc
from datetime import datetime

Expand Down
6 changes: 4 additions & 2 deletions benchmarks/bench_tsne_mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ def sanitize(filename):
try:
from bhtsne.bhtsne import run_bh_tsne
except ImportError as e:
raise ImportError("""\
raise ImportError(
"""\
If you want comparison with the reference implementation, build the
binary from source (https://github.com/lvdmaaten/bhtsne) in the folder
benchmarks/bhtsne and add an empty `__init__.py` file in the folder:
Expand All @@ -140,7 +141,8 @@ def sanitize(filename):
$ g++ sptree.cpp tsne.cpp tsne_main.cpp -o bh_tsne -O2
$ touch __init__.py
$ cd ..
""") from e
"""
) from e

def bhtsne(X):
"""Wrapper for the reference lvdmaaten/bhtsne implementation."""
Expand Down
2 changes: 1 addition & 1 deletion build_tools/azure/debian_atlas_32bit_lock.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ attrs==23.2.0
# via pytest
coverage==7.4.4
# via pytest-cov
cython==3.0.9
cython==3.0.10
# via -r build_tools/azure/debian_atlas_32bit_requirements.txt
iniconfig==2.0.0
# via pytest
Expand Down
2 changes: 1 addition & 1 deletion build_tools/azure/debian_atlas_32bit_requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# DO NOT EDIT: this file is generated from the specification found in the
# following script to centralize the configuration for CI builds:
# build_tools/update_environments_and_lock_files.py
cython==3.0.9 # min
cython==3.0.10 # min
joblib==1.2.0 # min
threadpoolctl==2.2.0
pytest==7.1.2 # min
Expand Down
Loading

0 comments on commit ffc6328

Please sign in to comment.