From eb946d44c83405edb21ba068c95e1e19eeff4aa9 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 11 Oct 2023 22:07:48 -0400 Subject: [PATCH] [ENH] Cython axis-aligned multi-view splitter (#129) * Cython multiview --------- Signed-off-by: Adam Li --- .github/workflows/build_wheels.yml | 11 + .github/workflows/main.yml | 46 -- .github/workflows/release.yml | 61 ++ DEVELOPING.md | 4 + doc/api.rst | 2 + doc/whats_new/v0.3.rst | 1 + examples/README.txt | 2 +- examples/hypothesis_testing/README.txt | 6 + ...t_MI_gigantic_hypothesis_testing_forest.py | 0 .../plot_might_auc.py | 0 examples/outlier_detection/README.txt | 6 + .../plot_extended_isolation_forest.py | 0 examples/plot_extra_oblique_random_forest.py | 4 + examples/plot_extra_orf_sample_size.py | 1 + examples/plot_oblique_random_forest.py | 3 + examples/splitters/README.txt | 6 + .../plot_multiview_axis_aligned_splitter.py | 124 +++ .../plot_projection_matrices.py | 48 +- .../plot_sparse_projection_matrix.py | 129 ++++ sktree/__init__.py | 3 +- sktree/cv.py | 55 ++ sktree/ensemble/__init__.py | 1 + sktree/ensemble/_multiview.py | 308 ++++++++ sktree/ensemble/meson.build | 3 +- sktree/stats/forestht.py | 3 - sktree/stats/tests/test_forestht.py | 29 +- sktree/tests/meson.build | 3 +- sktree/tests/test_multiview_forest.py | 89 +++ sktree/tests/test_supervised_forest.py | 9 +- sktree/tree/__init__.py | 2 + sktree/tree/_classes.py | 9 + sktree/tree/_multiview.py | 403 ++++++++++ sktree/tree/_oblique_splitter.pxd | 73 ++ sktree/tree/_oblique_splitter.pyx | 717 ++++++++++++++---- sktree/tree/manifold/_morf_splitter.pxd | 4 +- sktree/tree/manifold/_morf_splitter.pyx | 8 +- sktree/tree/meson.build | 1 + sktree/tree/tests/meson.build | 1 + sktree/tree/tests/test_multiview.py | 86 +++ sktree/tree/tests/test_tree.py | 12 +- sktree/tree/tests/test_utils.py | 3 + sktree/tree/unsupervised/_unsup_criterion.pxd | 7 - sktree/tree/unsupervised/_unsup_criterion.pyx | 5 - 43 files changed, 2031 insertions(+), 257 deletions(-) create mode 100644 .github/workflows/release.yml create mode 100644 examples/hypothesis_testing/README.txt rename examples/{ => hypothesis_testing}/plot_MI_gigantic_hypothesis_testing_forest.py (100%) rename examples/{ => hypothesis_testing}/plot_might_auc.py (100%) create mode 100644 examples/outlier_detection/README.txt rename examples/{ => outlier_detection}/plot_extended_isolation_forest.py (100%) create mode 100644 examples/splitters/README.txt create mode 100644 examples/splitters/plot_multiview_axis_aligned_splitter.py rename examples/{ => splitters}/plot_projection_matrices.py (88%) create mode 100644 examples/splitters/plot_sparse_projection_matrix.py create mode 100644 sktree/cv.py create mode 100644 sktree/ensemble/_multiview.py create mode 100644 sktree/tests/test_multiview_forest.py create mode 100644 sktree/tree/_multiview.py create mode 100644 sktree/tree/tests/test_multiview.py diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 12c35fb54..b2c415c49 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -19,6 +19,17 @@ concurrency: cancel-in-progress: true jobs: + create_artifacts_folder: + name: Create Artifacts Folder + runs-on: ubuntu-latest + outputs: + artifacts_folder: ${{ steps.set_folder.outputs.folder_name }} + + steps: + - name: Set Artifacts Folder + id: set_folder + run: echo "folder_name=artifacts/${{ github.run_id }}" >> $GITHUB_ENV + build_wheels: name: Build wheels on ${{ matrix.os[1] }} - ${{ matrix.os[2] }} with Python ${{ matrix.python[0] }} runs-on: ${{ matrix.os[0] }} diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a943e3d13..32298fc7f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -249,52 +249,6 @@ jobs: name: sktree-build path: $PWD/build - # release is ran when a release is made on Github - release: - name: Release - runs-on: ubuntu-latest - needs: [build_and_test_slow] - if: startsWith(github.ref, 'refs/tags/') - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v4.6.1 - with: - python-version: 3.9 - architecture: "x64" - - name: Install dependencies - run: | - python -m pip install --progress-bar off --upgrade pip setuptools wheel - python -m pip install --progress-bar off build twine - - name: Prepare environment - run: | - echo "RELEASE_VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV - echo "TAG=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV - - name: Download package distribution files - uses: actions/download-artifact@v3 - with: - name: package - path: dist - # TODO: refactor scripts to generate release notes from `whats_new.rst` file instead - # - name: Generate release notes - # run: | - # python scripts/release_notes.py > ${{ github.workspace }}-RELEASE_NOTES.md - - name: Publish package to PyPI - run: | - twine upload -u ${{ secrets.PYPI_USERNAME }} -p ${{ secrets.PYPI_PASSWORD }} dist/* - - name: Publish GitHub release - uses: softprops/action-gh-release@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - # body_path: ${{ github.workspace }}-RELEASE_NOTES.md - prerelease: ${{ contains(env.TAG, 'rc') }} - files: | - dist/* - - - # build-windows: # name: Meson build Windows # runs-on: windows-2019 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 000000000..c137735f6 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,61 @@ +name: "Release to PyPI" + +concurrency: + group: ${{ github.workflow }}-${{ github.event.number }}-${{ github.event.type }} + cancel-in-progress: true + +on: + workflow_run: + workflows: ["build_and_test_slow"] + types: + - completed + workflow_dispatch: + +env: + INSTALLDIR: "build-install" + CCACHE_DIR: "${{ github.workspace }}/.ccache" + +jobs: + # release is ran when a release is made on Github + release: + name: Release + runs-on: ubuntu-latest + needs: [build_and_test_slow] + if: startsWith(github.ref, 'refs/tags/') + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v4.6.1 + with: + python-version: 3.9 + architecture: "x64" + - name: Install dependencies + run: | + python -m pip install --progress-bar off --upgrade pip setuptools wheel + python -m pip install --progress-bar off build twine + - name: Prepare environment + run: | + echo "RELEASE_VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV + echo "TAG=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV + - name: Download package distribution files + uses: actions/download-artifact@v3 + with: + name: package + path: dist + # TODO: refactor scripts to generate release notes from `whats_new.rst` file instead + # - name: Generate release notes + # run: | + # python scripts/release_notes.py > ${{ github.workspace }}-RELEASE_NOTES.md + - name: Publish package to PyPI + run: | + twine upload -u ${{ secrets.PYPI_USERNAME }} -p ${{ secrets.PYPI_PASSWORD }} dist/* + - name: Publish GitHub release + uses: softprops/action-gh-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + # body_path: ${{ github.workspace }}-RELEASE_NOTES.md + prerelease: ${{ contains(env.TAG, 'rc') }} + files: | + dist/* diff --git a/DEVELOPING.md b/DEVELOPING.md index a9d64457c..200789d7c 100644 --- a/DEVELOPING.md +++ b/DEVELOPING.md @@ -76,4 +76,8 @@ Verify that installations work as expected on your machine. twine upload dist/* +or if you have two-factor authentication enabled: https://pypi.org/help/#apitoken + + twine upload dist/* --repository scikit-tree + 4. Update version number on ``meson.build`` and ``_version.py`` to the relevant version. \ No newline at end of file diff --git a/doc/api.rst b/doc/api.rst index 2a74a7a01..59238d77a 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -65,6 +65,7 @@ how scikit-learn builds trees. PatchObliqueRandomForestClassifier PatchObliqueRandomForestRegressor HonestForestClassifier + MultiViewRandomForestClassifier .. currentmodule:: sktree.tree .. autosummary:: @@ -75,6 +76,7 @@ how scikit-learn builds trees. PatchObliqueDecisionTreeClassifier PatchObliqueDecisionTreeRegressor HonestTreeClassifier + MultiViewDecisionTreeClassifier Unsupervised ------------ diff --git a/doc/whats_new/v0.3.rst b/doc/whats_new/v0.3.rst index ebae29995..fec97bb01 100644 --- a/doc/whats_new/v0.3.rst +++ b/doc/whats_new/v0.3.rst @@ -14,6 +14,7 @@ Changelog --------- - |Fix| Fixes a bug in consistency of train/test samples when ``random_state`` is not set in FeatureImportanceForestClassifier and FeatureImportanceForestRegressor, by `Adam Li`_ (:pr:`135`) - |Fix| Fixes a bug where covariate indices were not shuffled by default when running FeatureImportanceForestClassifier and FeatureImportanceForestRegressor test methods, by `Sambit Panda`_ (:pr:`140`) +- |Enhancement| Add multi-view splitter for axis-aligned decision trees, by `Adam Li`_ (:pr:`129`) Code and Documentation Contributors ----------------------------------- diff --git a/examples/README.txt b/examples/README.txt index f9f595033..e13d3e890 100644 --- a/examples/README.txt +++ b/examples/README.txt @@ -1,4 +1,4 @@ Examples --------- +======== Examples demonstrating how to use scikit-tree algorithms. diff --git a/examples/hypothesis_testing/README.txt b/examples/hypothesis_testing/README.txt new file mode 100644 index 000000000..9a60917ca --- /dev/null +++ b/examples/hypothesis_testing/README.txt @@ -0,0 +1,6 @@ +.. _hyppo_examples: + +Hypothesis testing with decision trees +-------------------------------------- + +Examples demonstrating how to use decision-trees for statistical hypothesis testing. diff --git a/examples/plot_MI_gigantic_hypothesis_testing_forest.py b/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py similarity index 100% rename from examples/plot_MI_gigantic_hypothesis_testing_forest.py rename to examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py diff --git a/examples/plot_might_auc.py b/examples/hypothesis_testing/plot_might_auc.py similarity index 100% rename from examples/plot_might_auc.py rename to examples/hypothesis_testing/plot_might_auc.py diff --git a/examples/outlier_detection/README.txt b/examples/outlier_detection/README.txt new file mode 100644 index 000000000..f7f13ea8e --- /dev/null +++ b/examples/outlier_detection/README.txt @@ -0,0 +1,6 @@ +.. _outlier_examples: + +Outlier-detection +----------------- + +Examples concerning how to do outlier detection with decision trees. diff --git a/examples/plot_extended_isolation_forest.py b/examples/outlier_detection/plot_extended_isolation_forest.py similarity index 100% rename from examples/plot_extended_isolation_forest.py rename to examples/outlier_detection/plot_extended_isolation_forest.py diff --git a/examples/plot_extra_oblique_random_forest.py b/examples/plot_extra_oblique_random_forest.py index 497d1d98c..586edd833 100644 --- a/examples/plot_extra_oblique_random_forest.py +++ b/examples/plot_extra_oblique_random_forest.py @@ -58,6 +58,9 @@ are not sorted and the split is determined by randomly drawing a threshold from the feature's range, hence the complexity is `O(n)`. This makes the algorithm more suitable for large datasets. +To see how sample-sizes affect the performance of Extra Oblique Trees vs regular Oblique Trees, +see :ref:`sphx_glr_auto_examples_plot_extra_orf_sample_size.py` + References ---------- .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees", Machine Learning, 63(1), @@ -149,6 +152,7 @@ def get_scores(X, y, d_name, n_cv=5, n_repeats=1, **kwargs): "random_state": random_state, "n_cv": 10, "n_repeats": 1, + "n_jobs": -1, } for data_id in data_ids: diff --git a/examples/plot_extra_orf_sample_size.py b/examples/plot_extra_orf_sample_size.py index 78c29643f..5651d27ec 100644 --- a/examples/plot_extra_orf_sample_size.py +++ b/examples/plot_extra_orf_sample_size.py @@ -122,6 +122,7 @@ def get_scores(X, y, d_name, n_cv=5, n_repeats=1, **kwargs): "random_state": random_state, "n_cv": 10, "n_repeats": 1, + "n_jobs": -1, } for data_id in data_ids: diff --git a/examples/plot_oblique_random_forest.py b/examples/plot_oblique_random_forest.py index 9a7a0acb3..34cb8c68a 100644 --- a/examples/plot_oblique_random_forest.py +++ b/examples/plot_oblique_random_forest.py @@ -16,6 +16,9 @@ will notice, of these three datasets, the oblique forest outperforms axis-aligned random forest on cnae-9 utilizing sparse random projection mechanism. All datasets are subsampled due to computational constraints. + +For an example of using extra-oblique trees/forests in practice on data, see the following +example :ref:`sphx_glr_auto_examples_plot_extra_oblique_random_forest.py`. """ from datetime import datetime diff --git a/examples/splitters/README.txt b/examples/splitters/README.txt new file mode 100644 index 000000000..40ea1e232 --- /dev/null +++ b/examples/splitters/README.txt @@ -0,0 +1,6 @@ +.. _splitter_examples: + +Decision-tree splitters +----------------------- + +Examples demonstrating different node-splitting strategies for decision trees. diff --git a/examples/splitters/plot_multiview_axis_aligned_splitter.py b/examples/splitters/plot_multiview_axis_aligned_splitter.py new file mode 100644 index 000000000..8fc5b32c2 --- /dev/null +++ b/examples/splitters/plot_multiview_axis_aligned_splitter.py @@ -0,0 +1,124 @@ +""" +================================================================================= +Demonstrate and visualize a multi-view projection matrix for an axis-aligned tree +================================================================================= + +This example shows how multi-view projection matrices are generated for a decision tree, +specifically the :class:`sktree.tree.MultiViewDecisionTreeClassifier`. + +Multi-view projection matrices operate under the assumption that the input ``X`` array +consists of multiple feature-sets that are groups of features important for predicting +``y``. + +For details on how to use the hyperparameters related to the multi-view, see +:class:`sktree.tree.MultiViewDecisionTreeClassifier`. +""" + +# import modules +# .. note:: We use a private Cython module here to demonstrate what the patches +# look like. This is not part of the public API. The Cython module used +# is just a Python wrapper for the underlying Cython code and is not the +# same as the Cython splitter used in the actual implementation. +# To use the actual splitter, one should use the public API for the +# relevant tree/forests class. + +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.cm import ScalarMappable +from matplotlib.colors import ListedColormap + +from sktree._lib.sklearn.tree._criterion import Gini +from sktree.tree._oblique_splitter import MultiViewSplitterTester + +criterion = Gini(1, np.array((0, 1))) +max_features = 5 +min_samples_leaf = 1 +min_weight_leaf = 0.0 +random_state = np.random.RandomState(10) + +# we "simulate" three feature sets, with 3, 2 and 4 features respectively +feature_set_ends = np.array([3, 5, 9], dtype=np.intp) +n_feature_sets = len(feature_set_ends) + +feature_combinations = 1 +monotonic_cst = None +missing_value_feature_mask = None + +# initialize some dummy data +X = np.repeat(np.arange(feature_set_ends[-1]).astype(np.float32), 5).reshape(5, -1) +y = np.array([0, 0, 0, 1, 1]).reshape(-1, 1).astype(np.float64) +sample_weight = np.ones(5) + +print("The shape of our dataset is: ", X.shape, y.shape, sample_weight.shape) + +# %% +# Initialize the multi-view splitter +# ---------------------------------- +# The multi-view splitter is a Cython class that is initialized internally +# in scikit-tree. However, we expose a Python tester object to demonstrate +# how the splitter works in practice. +# +# .. warning:: Do not use this interface directly in practice. + +splitter = MultiViewSplitterTester( + criterion, + max_features, + min_samples_leaf, + min_weight_leaf, + random_state, + monotonic_cst, + feature_combinations, + feature_set_ends, + n_feature_sets, +) +splitter.init_test(X, y, sample_weight, missing_value_feature_mask) + +# %% +# Sample the projection matrix +# ---------------------------- +# The projection matrix is sampled by the splitter. The projection +# matrix is a (max_features, n_features) matrix that selects which features of ``X`` +# to define candidate split dimensions. The multi-view +# splitter's projection matrix though samples from multiple feature sets, +# which are aligned contiguously over the columns of ``X``. + +projection_matrix = splitter.sample_projection_matrix_py() +print(projection_matrix) + +cmap = ListedColormap(["white", "green"][:n_feature_sets]) + +# Create a heatmap to visualize the indices +fig, ax = plt.subplots(figsize=(6, 6)) + +ax.imshow( + projection_matrix, cmap=cmap, aspect=feature_set_ends[-1] / max_features, interpolation="none" +) +ax.axvline(feature_set_ends[0] - 0.5, color="black", linewidth=1, label="Feature Sets") +for iend in feature_set_ends[1:]: + ax.axvline(iend - 0.5, color="black", linewidth=1) + +ax.set(title="Sampled Projection Matrix", xlabel="Feature Index", ylabel="Projection Vector Index") +ax.set_xticks(np.arange(feature_set_ends[-1])) +ax.set_yticks(np.arange(max_features)) +ax.set_yticklabels(np.arange(max_features, dtype=int) + 1) +ax.set_xticklabels(np.arange(feature_set_ends[-1], dtype=int) + 1) +ax.legend() + +# Create a mappable object +sm = ScalarMappable(cmap=cmap) +sm.set_array([]) # You can set an empty array or values here + +# Create a color bar with labels for each feature set +colorbar = fig.colorbar(sm, ax=ax, ticks=[0.25, 0.75], format="%d") +colorbar.set_label("Projection Weight (I.e. Sampled Feature From a Feature Set)") +colorbar.ax.set_yticklabels(["0", "1"]) + +plt.show() + +# %% +# Discussion +# ---------- +# As we can see, the multi-view splitter samples split candidates uniformly across the feature sets. +# In contrast, the normal splitter in :class:`sklearn.tree.DecisionTreeClassifier` samples +# randomly across all ``n_features`` features because it is not aware of the multi-view structure. +# This is the key difference between the two splitters. diff --git a/examples/plot_projection_matrices.py b/examples/splitters/plot_projection_matrices.py similarity index 88% rename from examples/plot_projection_matrices.py rename to examples/splitters/plot_projection_matrices.py index 7e4b83b38..9e1cbaef1 100644 --- a/examples/plot_projection_matrices.py +++ b/examples/splitters/plot_projection_matrices.py @@ -1,7 +1,7 @@ """ -=============================================== -Plot the projection matrices of an oblique tree -=============================================== +=================================================================================== +Plot the projection matrices of an oblique tree for sampling images, or time-series +=================================================================================== This example shows how projection matrices are generated for an oblique tree, specifically the :class:`sktree.tree.PatchObliqueDecisionTreeClassifier`. @@ -61,6 +61,7 @@ boundary = None feature_weight = None +monotonic_cst = None missing_value_feature_mask = None # initialize some dummy data @@ -76,18 +77,25 @@ # Now that we have th patch splitter initialized, we can generate some patches # and visualize how they appear on the data. We will make the patch 1D, which # samples multiple rows contiguously. This is a 1D patch of size 3. +# +# .. warning:: Do not use this interface directly in practice. + min_patch_dims = np.array((1, 1)) max_patch_dims = np.array((3, 1)) dim_contiguous = np.array((True, True)) data_dims = np.array((5, 5)) +# Note: not used, but passed for API compatibility +feature_combinations = 1.5 + splitter = BestPatchSplitterTester( criterion, max_features, min_samples_leaf, min_weight_leaf, random_state, - missing_value_feature_mask, + monotonic_cst, + feature_combinations, min_patch_dims, max_patch_dims, dim_contiguous, @@ -95,10 +103,10 @@ boundary, feature_weight, ) -splitter.init_test(X, y, sample_weight, None) +splitter.init_test(X, y, sample_weight, missing_value_feature_mask) # sample the projection matrix that consists of 1D patches -proj_mat = splitter.sample_projection_matrix() +proj_mat = splitter.sample_projection_matrix_py() print(proj_mat.shape) # Visualize 1D patches @@ -132,7 +140,8 @@ min_samples_leaf, min_weight_leaf, random_state, - missing_value_feature_mask, + monotonic_cst, + feature_combinations, min_patch_dims, max_patch_dims, dim_contiguous, @@ -140,10 +149,10 @@ boundary, feature_weight, ) -splitter.init_test(X, y, sample_weight) +splitter.init_test(X, y, sample_weight, missing_value_feature_mask) # sample the projection matrix that consists of 1D patches -proj_mat = splitter.sample_projection_matrix() +proj_mat = splitter.sample_projection_matrix_py() # Visualize 2D patches fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(12, 8), sharex=True, sharey=True, squeeze=True) @@ -181,7 +190,8 @@ min_samples_leaf, min_weight_leaf, random_state, - missing_value_feature_mask, + monotonic_cst, + feature_combinations, min_patch_dims, max_patch_dims, dim_contiguous, @@ -189,10 +199,10 @@ boundary, feature_weight, ) -splitter.init_test(X, y, sample_weight) +splitter.init_test(X, y, sample_weight, missing_value_feature_mask) # sample the projection matrix that consists of 1D patches -proj_mat = splitter.sample_projection_matrix() +proj_mat = splitter.sample_projection_matrix_py() print(proj_mat.shape) fig = plt.figure() @@ -244,7 +254,8 @@ min_samples_leaf, min_weight_leaf, random_state, - missing_value_feature_mask, + monotonic_cst, + feature_combinations, min_patch_dims, max_patch_dims, dim_contiguous, @@ -252,10 +263,10 @@ boundary, feature_weight, ) -splitter.init_test(X, y, sample_weight) +splitter.init_test(X, y, sample_weight, missing_value_feature_mask) # sample the projection matrix that consists of 1D patches -proj_mat = splitter.sample_projection_matrix() +proj_mat = splitter.sample_projection_matrix_py() # Visualize 2D patches fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(12, 8), sharex=True, sharey=True, squeeze=True) @@ -282,7 +293,8 @@ min_samples_leaf, min_weight_leaf, random_state, - missing_value_feature_mask, + monotonic_cst, + feature_combinations, min_patch_dims, max_patch_dims, dim_contiguous, @@ -290,10 +302,10 @@ boundary, feature_weight, ) -splitter.init_test(X, y, sample_weight) +splitter.init_test(X, y, sample_weight, missing_value_feature_mask) # sample the projection matrix that consists of 1D patches -proj_mat = splitter.sample_projection_matrix() +proj_mat = splitter.sample_projection_matrix_py() # Visualize 2D patches fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(12, 8), sharex=True, sharey=True, squeeze=True) diff --git a/examples/splitters/plot_sparse_projection_matrix.py b/examples/splitters/plot_sparse_projection_matrix.py new file mode 100644 index 000000000..6eb53e0ca --- /dev/null +++ b/examples/splitters/plot_sparse_projection_matrix.py @@ -0,0 +1,129 @@ +""" +====================================================== +Plot the sparse projection matrices of an oblique tree +====================================================== + +This example shows how projection matrices are generated for an oblique tree, +specifically the :class:`sktree.tree.ObliqueDecisionTreeClassifier`. + +The projection matrix here samples a subset of features from the input ``X`` +controlled by the parameter ``feature_combinations``. The projection matrix +is sparse when ``feature_combinations`` is small, meaning that it is mostly zero. +The non-zero elements of the projection matrix are the features that are sampled +from the input ``X`` and linearly combined to form candidate split dimensions. + +For details on how to use the hyperparameters related to the patches, see +:class:`sktree.tree.ObliqueDecisionTreeClassifier`. +""" +import matplotlib.pyplot as plt + +# import modules +# .. note:: We use a private Cython module here to demonstrate what the patches +# look like. This is not part of the public API. The Cython module used +# is just a Python wrapper for the underlying Cython code and is not the +# same as the Cython splitter used in the actual implementation. +# To use the actual splitter, one should use the public API for the +# relevant tree/forests class. +import numpy as np +from matplotlib.cm import ScalarMappable +from matplotlib.colors import ListedColormap + +from sktree._lib.sklearn.tree._criterion import Gini +from sktree.tree._oblique_splitter import BestObliqueSplitterTester + +# %% +# Initialize patch splitter +# ------------------------- +# The patch splitter is used to generate patches for the projection matrices. +# We will initialize the patch with some dummy values for the sake of this +# example. + +criterion = Gini(1, np.array((0, 1))) +max_features = 6 +min_samples_leaf = 1 +min_weight_leaf = 0.0 +random_state = np.random.RandomState(1) + +feature_combinations = 1.5 +monotonic_cst = None +missing_value_feature_mask = None + +n_features = 10 +n_samples = 5 + +# initialize some dummy data +X = np.ones((n_samples, n_features), dtype=np.float32) +y = np.array([0, 0, 0, 1, 1]).reshape(-1, 1).astype(np.float64) +sample_weight = np.ones(5) + +print("The shape of our dataset is: ", X.shape, y.shape, sample_weight.shape) + +# %% +# Initialize the splitter +# ----------------------- +# The splitter is initialized in the decision-tree classes, but we expose +# a testing interface here to demonstrate how the projection matrices are +# sampled internally. +# +# .. warning:: Do not use this interface directly in practice. + +splitter = BestObliqueSplitterTester( + criterion, + max_features, + min_samples_leaf, + min_weight_leaf, + random_state, + monotonic_cst, + feature_combinations, +) +splitter.init_test(X, y, sample_weight, missing_value_feature_mask) + +# %% +# Generate projection matrix +# -------------------------- +# Sample the projection matrix that consists of randomly sampled features +# with an average of ``feature_combinations * max_features`` non-zeros +# in the ``(max_features, n_features)`` matrix. + +projection_matrix = splitter.sample_projection_matrix_py() +print(projection_matrix.shape) + +# Visualize the projection matrix +cmap = ListedColormap(["orange", "white", "green"]) + +# Create a heatmap to visualize the indices +fig, ax = plt.subplots(figsize=(6, 6)) + +ax.imshow(projection_matrix, cmap=cmap, aspect=n_features / max_features, interpolation="none") + +ax.set(title="Sampled Projection Matrix", xlabel="Feature Index", ylabel="Projection Vector Index") +ax.set_xticks(np.arange(n_features)) +ax.set_yticks(np.arange(max_features)) +ax.set_yticklabels(np.arange(max_features, dtype=int) + 1) +ax.set_xticklabels(np.arange(n_features, dtype=int) + 1) + +# Create a mappable object +sm = ScalarMappable(cmap=cmap) +sm.set_array([]) # You can set an empty array or values here + +# Create a color bar with labels for each feature set +colorbar = fig.colorbar(sm, ax=ax, ticks=[0, 0.5, 1], format="%d") +colorbar.set_label("Projection Weight") +colorbar.ax.set_yticklabels(["-1", "0", "1"]) + +plt.show() + +# %% +# Discussion +# ---------- +# As we can see, the (sparse) oblique splitter samples random features to +# linearly combine to form candidate split dimensions. +# +# In contrast, the normal splitter in :class:`sklearn.tree.DecisionTreeClassifier` samples +# randomly across all ``n_features`` features. +# +# For an example of using oblique trees/forests in practice on data, see the following +# examples: +# +# - :ref:`sphx_glr_auto_examples_plot_oblique_forests_iris.py` +# - :ref:`sphx_glr_auto_examples_plot_oblique_random_forest.py` diff --git a/sktree/__init__.py b/sktree/__init__.py index 1aa58557f..1ca249fc0 100644 --- a/sktree/__init__.py +++ b/sktree/__init__.py @@ -44,7 +44,7 @@ ExtraTreesRegressor, ) from .neighbors import NearestNeighborsMetaEstimator - from .ensemble import ExtendedIsolationForest + from .ensemble import ExtendedIsolationForest, MultiViewRandomForestClassifier from .ensemble._unsupervised_forest import ( UnsupervisedRandomForest, UnsupervisedObliqueRandomForest, @@ -85,4 +85,5 @@ "ExtraTreesClassifier", "ExtraTreesRegressor", "ExtendedIsolationForest", + "MultiViewRandomForestClassifier", ] diff --git a/sktree/cv.py b/sktree/cv.py new file mode 100644 index 000000000..cd38e7793 --- /dev/null +++ b/sktree/cv.py @@ -0,0 +1,55 @@ +from typing import Any, List, Tuple + +from numpy.typing import ArrayLike +from sklearn.base import clone +from sklearn.model_selection import BaseCrossValidator, StratifiedKFold + +from sktree._lib.sklearn.ensemble._forest import BaseForest + + +def cross_fit_forest( + forest: BaseForest, + X: ArrayLike, + y: ArrayLike, + cv: BaseCrossValidator = None, + n_splits: int = 5, + random_state=None, +) -> Tuple[List[Any], List[ArrayLike]]: + """Perform cross-fitting of a forest estimator. + + Parameters + ---------- + forest : BaseForest + Forest. + X : ArrayLike of shape (n_samples, n_features) + Input data. + y : ArrayLike of shape (n_samples, [n_outputs]) + Target data. + cv : BaseCrossValidator, optional + Cross validation object, by default None, which defaults to + :class:`sklearn.model_selection.StratifiedKFold`. + n_splits : int, optional + Number of folds to generate, by default 5. + random_state : int, optional + Random seed. + + Returns + ------- + fitted_forests : List[BaseForest] + List of fitted forests. + test_indices : List[ArrayLike] + List of test indices over ``n_samples``. + """ + if cv is None: + cv = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True) + + fitted_forests = [] + test_indices = [] + for train_index, test_index in cv.split(X, y): + new_forest = clone(forest) + new_forest.fit(X[train_index], y[train_index]) + + fitted_forests.append(new_forest) + test_indices.append(test_index) + + return fitted_forests, test_indices diff --git a/sktree/ensemble/__init__.py b/sktree/ensemble/__init__.py index 1345565c0..aa97d0215 100644 --- a/sktree/ensemble/__init__.py +++ b/sktree/ensemble/__init__.py @@ -1,5 +1,6 @@ from ._eiforest import ExtendedIsolationForest from ._honest_forest import HonestForestClassifier +from ._multiview import MultiViewRandomForestClassifier from ._supervised_forest import ( ExtraObliqueRandomForestClassifier, ExtraObliqueRandomForestRegressor, diff --git a/sktree/ensemble/_multiview.py b/sktree/ensemble/_multiview.py new file mode 100644 index 000000000..a33ec6047 --- /dev/null +++ b/sktree/ensemble/_multiview.py @@ -0,0 +1,308 @@ +from sklearn.utils._param_validation import StrOptions + +from .._lib.sklearn.ensemble._forest import ForestClassifier +from ..tree import MultiViewDecisionTreeClassifier +from ..tree._neighbors import SimMatrixMixin + + +class MultiViewRandomForestClassifier(SimMatrixMixin, ForestClassifier): + """ + A multi-view axis-aligned random forest classifier. + + A multi-view random forest is a meta estimator similar to a random + forest that fits a number of multi-view decision tree classifiers + on various sub-samples of the dataset and uses averaging to + improve the predictive accuracy and control over-fitting. + + Parameters + ---------- + n_estimators : int, default=100 + The number of trees in the forest. + + criterion : {"gini", "entropy"}, default="gini" + The function to measure the quality of a split. Supported criteria are + "gini" for the Gini impurity and "entropy" for the information gain. + Note: this parameter is tree-specific. + + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_features : {"sqrt", "log2", None}, int or float, default="sqrt" + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `round(max_features * n_features)` features are considered at each + split. + - If "auto", then `max_features=sqrt(n_features)`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + bootstrap : bool, default=True + Whether bootstrap samples are used when building trees. If False, the + whole dataset is used to build each tree. + + oob_score : bool, default=False + Whether to use out-of-bag samples to estimate the generalization score. + Only available if bootstrap=True. + + n_jobs : int, default=None + The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, + :meth:`decision_path` and :meth:`apply` are all parallelized over the + trees. ``None`` means 1 unless in a `joblib.parallel_backend` + context. ``-1`` means using all processors. See :term:`Glossary + ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls both the randomness of the bootstrapping of the samples used + when building trees (if ``bootstrap=True``) and the sampling of the + features to consider when looking for the best split at each node + (if ``max_features < n_features``). + See :term:`Glossary ` for details. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`the Glossary `. + + class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, \ + default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. + + Note that for multioutput (including multilabel) weights should be + defined for each class of every column in its own dict. For example, + for four-class multilabel classification weights should be + [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of + [{1:1}, {2:5}, {3:1}, {4:1}]. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + + The "balanced_subsample" mode is the same as "balanced" except that + weights are computed based on the bootstrap sample for every tree + grown. + + For multi-output, the weights of each column of y will be multiplied. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + + max_samples : int or float, default=None + If bootstrap is True, the number of samples to draw from X + to train each base estimator. + + - If None (default), then draw `X.shape[0]` samples. + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. Thus, + `max_samples` should be in the interval `(0.0, 1.0]`. + + feature_combinations : float, default=None + The number of features to combine on average at each split + of the decision trees. If ``None``, then will default to the minimum of + ``(1.5, n_features)``. This controls the number of non-zeros is the + projection matrix. Setting the value to 1.0 is equivalent to a + traditional decision-tree. ``feature_combinations * max_features`` + gives the number of expected non-zeros in the projection matrix of shape + ``(max_features, n_features)``. Thus this value must always be less than + ``n_features`` in order to be valid. + + feature_set_ends : array-like of int of shape (n_feature_sets,), default=None + The indices of the end of each feature set. For example, if the first + feature set is the first 10 features, and the second feature set is the + next 20 features, then ``feature_set_ends = [10, 30]``. If ``None``, + then this will assume that there is only one feature set. + + Attributes + ---------- + base_estimator_ : sktree.tree.ObliqueDecisionTreeClassifier + The child estimator template used to create the collection of fitted + sub-estimators. + + estimators_ : list of sktree.tree.ObliqueDecisionTreeClassifier + The collection of fitted sub-estimators. + + classes_ : ndarray of shape (n_classes,) or a list of such arrays + The classes labels (single output problem), or a list of arrays of + class labels (multi-output problem). + + n_classes_ : int or list + The number of classes (single output problem), or a list containing the + number of classes for each output (multi-output problem). + + n_features_ : int + The number of features when ``fit`` is performed. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + n_outputs_ : int + The number of outputs when ``fit`` is performed. + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + oob_score_ : float + Score of the training dataset obtained using an out-of-bag estimate. + This attribute exists only when ``oob_score`` is True. + + oob_decision_function_ : ndarray of shape (n_samples, n_classes) or \ + (n_samples, n_classes, n_outputs) + Decision function computed with out-of-bag estimate on the training + set. If n_estimators is small it might be possible that a data point + was never left out during the bootstrap. In this case, + `oob_decision_function_` might contain NaN. This attribute exists + only when ``oob_score`` is True. + + See Also + -------- + sktree.tree.ObliqueDecisionTreeClassifier : An oblique decision + tree classifier. + sklearn.ensemble.RandomForestClassifier : An axis-aligned decision + forest classifier. + """ + + tree_type = "oblique" + _parameter_constraints: dict = { + **ForestClassifier._parameter_constraints, + **MultiViewDecisionTreeClassifier._parameter_constraints, + "class_weight": [ + StrOptions({"balanced_subsample", "balanced"}), + dict, + list, + None, + ], + } + _parameter_constraints.pop("splitter") + + def __init__( + self, + n_estimators=100, + *, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + max_samples=None, + feature_combinations=None, + feature_set_ends=None, + ): + super().__init__( + estimator=MultiViewDecisionTreeClassifier(), + n_estimators=n_estimators, + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "feature_combinations", + "feature_set_ends", + ), + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + class_weight=class_weight, + max_samples=max_samples, + ) + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.max_features = max_features + self.feature_combinations = feature_combinations + self.feature_set_ends = feature_set_ends + + # unused by oblique forests + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease diff --git a/sktree/ensemble/meson.build b/sktree/ensemble/meson.build index a27293dd9..a8df74f57 100644 --- a/sktree/ensemble/meson.build +++ b/sktree/ensemble/meson.build @@ -3,7 +3,8 @@ python_sources = [ '_supervised_forest.py', '_unsupervised_forest.py', '_honest_forest.py', - '_eiforest.py' + '_eiforest.py', + '_multiview.py', ] py3.install_sources( diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index aa9e81e9b..4d6dc7b77 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -332,9 +332,6 @@ def statistic( if self._type_of_target_ is None: self._type_of_target_ = type_of_target(y) - # if self.sample_dataset_per_tree and not self.permute_per_tree: - # raise ValueError("sample_dataset_per_tree is only valid when permute_per_tree=True") - if covariate_index is None: self.estimator_ = self._get_estimator() estimator = self.estimator_ diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index c16eecbe2..cecf34b8c 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -291,8 +291,10 @@ def test_pickle(tmpdir): assert_array_equal(getattr(clf, attr), getattr(clf_pickle, attr)) -@pytest.mark.parametrize("permute_per_tree", [True, False]) -@pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) +@pytest.mark.parametrize("permute_per_tree", [True, False], ids=["permute_per_tree", "no_permute"]) +@pytest.mark.parametrize( + "sample_dataset_per_tree", [True, False], ids=["sample_dataset_per_tree", "no_sample_dataset"] +) def test_sample_size_consistency_of_estimator_indices_(permute_per_tree, sample_dataset_per_tree): """Test that the test-sample indices are what is expected.""" clf = FeatureImportanceForestClassifier( @@ -313,10 +315,25 @@ def test_sample_size_consistency_of_estimator_indices_(permute_per_tree, sample_ X, y, covariate_index=None, return_posteriors=True, metric="mi" ) if sample_dataset_per_tree: - assert_array_equal( - sorted(np.unique(samples)), - sorted(np.unique(np.concatenate([x[1] for x in clf.train_test_samples_]).flatten())), + # check the non-nans + non_nan_idx = _non_nan_samples(posteriors) + assert clf.n_samples_test_ == n_samples, f"{clf.n_samples_test_} != {n_samples}" + + sorted_sample_idx = sorted(np.unique(samples)) + sorted_est_samples_idx = sorted( + np.unique(np.concatenate([x[1] for x in clf.train_test_samples_]).flatten()) ) + assert_array_equal(sorted_sample_idx, non_nan_idx) + + # the sample indices are equal to the output of the train/test indices_ + # only if there are no nans in the posteriors over all the samples + if np.sum(non_nan_idx) == n_samples: + assert_array_equal( + sorted_sample_idx, + sorted_est_samples_idx, + f"{set(sorted_sample_idx) - set(sorted_est_samples_idx)} and " + f"{set(sorted_est_samples_idx) - set(sorted_sample_idx)}", + ) else: assert_array_equal(samples, sorted(clf.train_test_samples_[0][1])) assert len(_non_nan_samples(posteriors)) == len(samples) @@ -430,7 +447,7 @@ def test_small_dataset_dependent(seed): clf = FeatureImportanceForestClassifier( estimator=HonestForestClassifier( - n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.5 + n_estimators=50, random_state=seed, n_jobs=1, honest_fraction=0.5 ), test_size=0.2, permute_per_tree=False, diff --git a/sktree/tests/meson.build b/sktree/tests/meson.build index 6ccd37305..6fc67af5a 100644 --- a/sktree/tests/meson.build +++ b/sktree/tests/meson.build @@ -4,7 +4,8 @@ python_sources = [ 'test_unsupervised_forest.py', 'test_neighbors.py', 'test_honest_forest.py', - 'test_eiforest.py' + 'test_eiforest.py', + 'test_multiview_forest.py', ] py3.install_sources( diff --git a/sktree/tests/test_multiview_forest.py b/sktree/tests/test_multiview_forest.py new file mode 100644 index 000000000..dad4a7a14 --- /dev/null +++ b/sktree/tests/test_multiview_forest.py @@ -0,0 +1,89 @@ +import numpy as np +import pytest +from sklearn.datasets import make_blobs +from sklearn.metrics import accuracy_score +from sklearn.model_selection import cross_val_score +from sklearn.utils.estimator_checks import parametrize_with_checks + +from sktree import MultiViewRandomForestClassifier, RandomForestClassifier + +seed = 12345 + + +@parametrize_with_checks( + [ + MultiViewRandomForestClassifier(random_state=12345, n_estimators=10), + ] +) +def test_sklearn_compatible_estimator(estimator, check): + check(estimator) + + +@pytest.mark.parametrize("baseline_est", [MultiViewRandomForestClassifier, RandomForestClassifier]) +def test_multiview_classification(baseline_est): + """Test that explicit knowledge of multi-view structure improves classification accuracy. + + In very high-dimensional noise setting across two views, when the max_depth and max_features + are constrained, the multi-view random forests will still obtain perfect accuracy, while + the single-view random forest will not. + """ + rng = np.random.default_rng(seed=seed) + + n_samples = 100 + n_estimators = 20 + n_features_1 = 5 + n_features_2 = 1000 + cluster_std = 2.0 + + # Create a high-dimensional multiview dataset with a low-dimensional informative + # subspace in one view of the dataset. + X0_first, y0 = make_blobs( + n_samples=n_samples, + cluster_std=cluster_std, + n_features=n_features_1, + random_state=rng.integers(1, 10000), + centers=1, + ) + + X1_first, y1 = make_blobs( + n_samples=n_samples, + cluster_std=cluster_std, + n_features=n_features_1, + random_state=rng.integers(1, 10000), + centers=1, + ) + y1[:] = 1 + X0 = np.concatenate([X0_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1) + X1 = np.concatenate([X1_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1) + X = np.vstack((X0, X1)) + y = np.hstack((y0, y1)).T + + # Compare multiview decision tree vs single-view decision tree + clf = MultiViewRandomForestClassifier( + random_state=seed, + feature_set_ends=[n_features_1, X.shape[1]], + max_features="sqrt", + max_depth=5, + n_estimators=n_estimators, + ) + clf.fit(X, y) + assert ( + accuracy_score(y, clf.predict(X)) == 1.0 + ), f"Accuracy score: {accuracy_score(y, clf.predict(X))}" + assert ( + cross_val_score(clf, X, y, cv=5).mean() == 1.0 + ), f"CV score: {cross_val_score(clf, X, y, cv=5).mean()}" + + clf = baseline_est( + random_state=seed, + max_depth=5, + max_features="sqrt", + n_estimators=n_estimators, + ) + clf.fit(X, y) + assert ( + accuracy_score(y, clf.predict(X)) == 1.0 + ), f"Accuracy score: {accuracy_score(y, clf.predict(X))}" + assert ( + cross_val_score(clf, X, y, cv=5).mean() <= 0.95 + ), f"CV score: {cross_val_score(clf, X, y, cv=5).mean()}" diff --git a/sktree/tests/test_supervised_forest.py b/sktree/tests/test_supervised_forest.py index 045653490..291d8a2e9 100644 --- a/sktree/tests/test_supervised_forest.py +++ b/sktree/tests/test_supervised_forest.py @@ -248,7 +248,8 @@ def test_oblique_forest_orthant(): random_state=0, ) - rc_clf = ObliqueRandomForestClassifier(max_features=None, random_state=0) + n_features = X.shape[1] + rc_clf = ObliqueRandomForestClassifier(max_features=int(n_features * 1.5), random_state=0) rc_clf.fit(X_train, y_train) y_hat = rc_clf.predict(X_test) rc_accuracy = accuracy_score(y_test, y_hat) @@ -258,7 +259,7 @@ def test_oblique_forest_orthant(): y_hat = ri_clf.predict(X_test) ri_accuracy = accuracy_score(y_test, y_hat) - assert rc_accuracy >= ri_accuracy + assert rc_accuracy >= ri_accuracy, f"{rc_accuracy} < {ri_accuracy}" assert ri_accuracy > 0.84 assert rc_accuracy > 0.85 @@ -442,4 +443,6 @@ def test_regression(forest, criterion, dtype): y = y_large_reg.astype(dtype, copy=False) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test, random_state=123) estimator.fit(X_train, y_train) - assert estimator.score(X_test, y_test) > 0.88, f"Failed for {estimator} and {criterion}" + + test_score = estimator.score(X_test, y_test) + assert test_score > 0.85, f"Failed for {estimator} and {criterion}: {test_score}" diff --git a/sktree/tree/__init__.py b/sktree/tree/__init__.py index 1ed18f3f0..797338ac3 100644 --- a/sktree/tree/__init__.py +++ b/sktree/tree/__init__.py @@ -15,6 +15,7 @@ UnsupervisedObliqueDecisionTree, ) from ._honest_tree import HonestTreeClassifier +from ._multiview import MultiViewDecisionTreeClassifier from ._neighbors import compute_forest_similarity_matrix __all__ = [ @@ -32,4 +33,5 @@ "DecisionTreeRegressor", "ExtraTreeClassifier", "ExtraTreeRegressor", + "MultiViewDecisionTreeClassifier", ] diff --git a/sktree/tree/_classes.py b/sktree/tree/_classes.py index a9e00c6d0..4763bb53d 100644 --- a/sktree/tree/_classes.py +++ b/sktree/tree/_classes.py @@ -53,6 +53,7 @@ OBLIQUE_DENSE_SPLITTERS = { "best": _oblique_splitter.BestObliqueSplitter, "random": _oblique_splitter.RandomObliqueSplitter, + "best-multiview": _oblique_splitter.MultiViewSplitter, } PATCH_DENSE_SPLITTERS = { @@ -1708,6 +1709,9 @@ def _build_tree( random_state : int, RandomState instance or None, default=None Controls the randomness of the estimator. """ + # Not used. Here for API compatibility + self.feature_combinations_ = 1 + if self.feature_weight is not None: self.feature_weight = self._validate_data( self.feature_weight, ensure_2d=True, dtype=DTYPE @@ -1797,6 +1801,7 @@ def _build_tree( min_weight_leaf, random_state, monotonic_cst, + self.feature_combinations_, self.min_patch_dims_, self.max_patch_dims_, self.dim_contiguous_, @@ -2170,6 +2175,9 @@ def _build_tree( random_state : int, RandomState instance or None, default=None Controls the randomness of the estimator. """ + # Not used. Here for API compatibility + self.feature_combinations_ = 1 + if self.feature_weight is not None: self.feature_weight = self._validate_data( self.feature_weight, ensure_2d=True, dtype=DTYPE @@ -2260,6 +2268,7 @@ def _build_tree( min_weight_leaf, random_state, monotonic_cst, + self.feature_combinations_, self.min_patch_dims_, self.max_patch_dims_, self.dim_contiguous_, diff --git a/sktree/tree/_multiview.py b/sktree/tree/_multiview.py new file mode 100644 index 000000000..58b370db0 --- /dev/null +++ b/sktree/tree/_multiview.py @@ -0,0 +1,403 @@ +import copy +from numbers import Real + +import numpy as np +from scipy.sparse import issparse +from sklearn.utils._param_validation import Interval + +from .._lib.sklearn.tree import DecisionTreeClassifier, _criterion +from .._lib.sklearn.tree import _tree as _sklearn_tree +from .._lib.sklearn.tree._criterion import BaseCriterion +from .._lib.sklearn.tree._tree import BestFirstTreeBuilder, DepthFirstTreeBuilder +from . import _oblique_splitter +from ._neighbors import SimMatrixMixin +from ._oblique_splitter import ObliqueSplitter +from ._oblique_tree import ObliqueTree + +DTYPE = _sklearn_tree.DTYPE +DOUBLE = _sklearn_tree.DOUBLE + +CRITERIA_CLF = { + "gini": _criterion.Gini, + "log_loss": _criterion.Entropy, + "entropy": _criterion.Entropy, +} +CRITERIA_REG = { + "squared_error": _criterion.MSE, + "friedman_mse": _criterion.FriedmanMSE, + "absolute_error": _criterion.MAE, + "poisson": _criterion.Poisson, +} + +DENSE_SPLITTERS = { + "best": _oblique_splitter.MultiViewSplitter, +} + + +class MultiViewDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier): + """A multi-view axis-aligned decision tree classifier. + + This is an experimental feature that applies an oblique decision tree to + multiple feature-sets concatenated across columns in ``X``. + + Parameters + ---------- + criterion : {"gini", "entropy"}, default="gini" + The function to measure the quality of a split. Supported criteria are + "gini" for the Gini impurity and "entropy" for the information gain. + + splitter : {"best"}, default="best" + The strategy used to choose the split at each node. + + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_features : int, float or {"auto", "sqrt", "log2"}, default=None + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `int(max_features * n_features)` features are considered at each + split. + - If "auto", then `max_features=sqrt(n_features)`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + Note: Compared to axis-aligned Random Forests, one can set + max_features to a number greater then ``n_features``. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the estimator. The features are always + randomly permuted at each split, even if ``splitter`` is set to + ``"best"``. When ``max_features < n_features``, the algorithm will + select ``max_features`` at random at each split before finding the best + split among them. But the best found split may vary across different + runs, even if ``max_features=n_features``. That is the case, if the + improvement of the criterion is identical for several splits and one + split has to be selected at random. To obtain a deterministic behaviour + during fitting, ``random_state`` has to be fixed to an integer. + See :term:`Glossary ` for details. + + max_leaf_nodes : int, default=None + Grow a tree with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + class_weight : dict, list of dict or "balanced", default=None + Weights associated with classes in the form ``{class_label: weight}``. + If None, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. + + Note that for multioutput (including multilabel) weights should be + defined for each class of every column in its own dict. For example, + for four-class multilabel classification weights should be + [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of + [{1:1}, {2:5}, {3:1}, {4:1}]. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + + For multi-output, the weights of each column of y will be multiplied. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + + feature_combinations : float, default=None + Not used. + + ccp_alpha : non-negative float, default=0.0 + Not used. + + store_leaf_values : bool, default=False + Whether to store the leaf values. + + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + Not used. + + feature_set_ends : array-like of int of shape (n_feature_sets,), default=None + The indices of the end of each feature set. For example, if the first + feature set is the first 10 features, and the second feature set is the + next 20 features, then ``feature_set_ends = [10, 30]``. If ``None``, + then this will assume that there is only one feature set. + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) or list of ndarray + The classes labels (single output problem), + or a list of arrays of class labels (multi-output problem). + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance [4]_. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + max_features_ : int + The inferred value of max_features. + + n_classes_ : int or list of int + The number of classes (for single output problems), + or a list containing the number of classes for each + output (for multi-output problems). + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + n_outputs_ : int + The number of outputs when ``fit`` is performed. + + tree_ : Tree instance + The underlying Tree object. Please refer to + ``help(sklearn.tree._tree.Tree)`` for + attributes of Tree object. + + feature_combinations_ : float + The number of feature combinations on average taken to fit the tree. + + feature_set_ends_ : array-like of int of shape (n_feature_sets,) + The indices of the end of each feature set. + + n_feature_sets_ : int + The number of feature sets. + + See Also + -------- + sklearn.tree.DecisionTreeClassifier : An axis-aligned decision tree classifier. + """ + + tree_type = "oblique" + + _parameter_constraints = { + **DecisionTreeClassifier._parameter_constraints, + "feature_combinations": [ + Interval(Real, 1.0, None, closed="left"), + None, + ], + } + + def __init__( + self, + *, + criterion="gini", + splitter="best", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=None, + random_state=None, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + class_weight=None, + feature_combinations=None, + ccp_alpha=0.0, + store_leaf_values=False, + monotonic_cst=None, + feature_set_ends=None, + ): + super().__init__( + criterion=criterion, + splitter=splitter, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, + max_features=max_features, + max_leaf_nodes=max_leaf_nodes, + class_weight=class_weight, + random_state=random_state, + min_impurity_decrease=min_impurity_decrease, + ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, + monotonic_cst=monotonic_cst, + ) + + self.feature_combinations = feature_combinations + self.feature_set_ends = feature_set_ends + + def _build_tree( + self, + X, + y, + sample_weight, + missing_values_in_feature_mask, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ): + """Build the actual tree. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + + min_samples_leaf : int or float + The minimum number of samples required to be at a leaf node. + + min_weight_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights. + + max_leaf_nodes : int, default=None + Grow a tree with ``max_leaf_nodes`` in best-first fashion. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node. + + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the estimator. + """ + monotonic_cst = None + _, n_features = X.shape + + self.feature_combinations_ = 1 + + # Build tree + criterion = self.criterion + if not isinstance(criterion, BaseCriterion): + criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_) + else: + # Make a deepcopy in case the criterion has mutable attributes that + # might be shared and modified concurrently during parallel fitting + criterion = copy.deepcopy(criterion) + + if self.feature_set_ends is None: + self.feature_set_ends_ = np.asarray([n_features], dtype=np.intp) + else: + self.feature_set_ends_ = np.atleast_1d(self.feature_set_ends).astype(np.intp) + self.n_feature_sets_ = len(self.feature_set_ends_) + + splitter = self.splitter + if issparse(X): + raise ValueError( + "Sparse input is not supported for oblique trees. " + "Please convert your data to a dense array." + ) + else: + SPLITTERS = DENSE_SPLITTERS + + if not isinstance(self.splitter, ObliqueSplitter): + splitter = SPLITTERS[self.splitter]( + criterion, + self.max_features_, + min_samples_leaf, + min_weight_leaf, + random_state, + monotonic_cst, + self.feature_combinations_, + self.feature_set_ends_, + self.n_feature_sets_, + ) + + self.tree_ = ObliqueTree(self.n_features_in_, self.n_classes_, self.n_outputs_) + + # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise + if max_leaf_nodes < 0: + self.builder_ = DepthFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + self.min_impurity_decrease, + ) + else: + self.builder_ = BestFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + max_leaf_nodes, + self.min_impurity_decrease, + ) + + self.builder_.build(self.tree_, X, y, sample_weight, None) + + if self.n_outputs_ == 1: + self.n_classes_ = self.n_classes_[0] + self.classes_ = self.classes_[0] diff --git a/sktree/tree/_oblique_splitter.pxd b/sktree/tree/_oblique_splitter.pxd index c9c833d90..3f7b84e43 100644 --- a/sktree/tree/_oblique_splitter.pxd +++ b/sktree/tree/_oblique_splitter.pxd @@ -20,6 +20,7 @@ from .._lib.sklearn.tree._tree cimport DTYPE_t # Type of X from .._lib.sklearn.tree._tree cimport INT32_t # Signed 32 bit integer from .._lib.sklearn.tree._tree cimport SIZE_t # Type for indices and counters from .._lib.sklearn.tree._tree cimport UINT32_t # Unsigned 32 bit integer +from .._lib.sklearn.utils._typedefs cimport intp_t from ._sklearn_splitter cimport sort @@ -88,6 +89,12 @@ cdef class BaseObliqueSplitter(Splitter): double upper_bound, ) except -1 nogil + cdef inline void fisher_yates_shuffle_memview( + self, + SIZE_t[::1] indices_to_sample, + SIZE_t grid_size, + UINT32_t* random_state + ) noexcept nogil cdef class ObliqueSplitter(BaseObliqueSplitter): # The splitter searches in the input space for a linear combination of features and a threshold @@ -106,3 +113,69 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): vector[vector[DTYPE_t]]& proj_mat_weights, vector[vector[SIZE_t]]& proj_mat_indices ) noexcept nogil + + +cdef class BestObliqueSplitter(ObliqueSplitter): + cdef int node_split( + self, + double impurity, # Impurity of the node + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound, + ) except -1 nogil + + +cdef class RandomObliqueSplitter(ObliqueSplitter): + cdef void find_min_max( + self, + DTYPE_t[::1] feature_values, + DTYPE_t* min_feature_value_out, + DTYPE_t* max_feature_value_out, + ) noexcept nogil + + cdef SIZE_t partition_samples( + self, + double current_threshold + ) noexcept nogil + + cdef int node_split( + self, + double impurity, # Impurity of the node + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound, + ) except -1 nogil + + +# XXX: This splitter is experimental. Expect changes frequently. +cdef class MultiViewSplitter(BestObliqueSplitter): + cdef const intp_t[:] feature_set_ends # an array indicating the column indices of the end of each feature set + cdef intp_t n_feature_sets # the number of feature sets is the length of feature_set_ends + 1 + + cdef vector[vector[SIZE_t]] multi_indices_to_sample + + cdef void sample_proj_mat( + self, + vector[vector[DTYPE_t]]& proj_mat_weights, + vector[vector[SIZE_t]]& proj_mat_indices + ) noexcept nogil + + +# XXX: This splitter is experimental. Expect changes frequently. +cdef class MultiViewObliqueSplitter(BestObliqueSplitter): + cdef const intp_t[:] feature_set_ends # an array indicating the column indices of the end of each feature set + cdef intp_t n_feature_sets # the number of feature sets is the length of feature_set_ends + 1 + + # whether or not to uniformly sample feature-sets into each projection vector + # if True, then sample from each feature set for each projection vector + cdef bint uniform_sampling + + cdef vector[vector[SIZE_t]] multi_indices_to_sample + + cdef void sample_proj_mat( + self, + vector[vector[DTYPE_t]]& proj_mat_weights, + vector[vector[SIZE_t]]& proj_mat_indices + ) noexcept nogil diff --git a/sktree/tree/_oblique_splitter.pyx b/sktree/tree/_oblique_splitter.pyx index bc29cafa7..92aa612a7 100644 --- a/sktree/tree/_oblique_splitter.pyx +++ b/sktree/tree/_oblique_splitter.pyx @@ -124,150 +124,19 @@ cdef class BaseObliqueSplitter(Splitter): feature_values[idx] = 0.0 feature_values[idx] += self.X[samples[idx], col_idx] * col_weight - cdef int node_split( + cdef inline void fisher_yates_shuffle_memview( self, - double impurity, - SplitRecord* split, - SIZE_t* n_constant_features, - double lower_bound, - double upper_bound, - ) except -1 nogil: - """Find the best_split split on node samples[start:end] - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - """ - # typecast the pointer to an ObliqueSplitRecord - cdef ObliqueSplitRecord* oblique_split = (split) - - # Draw random splits and pick the best_split - cdef SIZE_t[::1] samples = self.samples - cdef SIZE_t start = self.start - cdef SIZE_t end = self.end - - # pointer array to store feature values to split on - cdef DTYPE_t[::1] feature_values = self.feature_values - cdef SIZE_t max_features = self.max_features - cdef SIZE_t min_samples_leaf = self.min_samples_leaf - - # keep track of split record for current_split node and the best_split split - # found among the sampled projection vectors - cdef ObliqueSplitRecord best_split, current_split - cdef double current_proxy_improvement = -INFINITY - cdef double best_proxy_improvement = -INFINITY - - cdef SIZE_t feat_i, p # index over computed features and start/end - cdef SIZE_t partition_end - cdef DTYPE_t temp_d # to compute a projection feature value - - # instantiate the split records - _init_split(&best_split, end) - - # Sample the projection matrix - self.sample_proj_mat(self.proj_mat_weights, self.proj_mat_indices) - - # For every vector in the projection matrix - for feat_i in range(max_features): - # Projection vector has no nonzeros - if self.proj_mat_weights[feat_i].empty(): - continue - - # XXX: 'feature' is not actually used in oblique split records - # Just indicates which split was sampled - current_split.feature = feat_i - current_split.proj_vec_weights = &self.proj_mat_weights[feat_i] - current_split.proj_vec_indices = &self.proj_mat_indices[feat_i] - - # Compute linear combination of features and then - # sort samples according to the feature values. - self.compute_features_over_samples( - start, - end, - samples, - feature_values, - &self.proj_mat_weights[feat_i], - &self.proj_mat_indices[feat_i] - ) - - # Sort the samples - sort(&feature_values[start], &samples[start], end - start) - - # Evaluate all splits - self.criterion.reset() - p = start - while p < end: - while (p + 1 < end and feature_values[p + 1] <= feature_values[p] + FEATURE_THRESHOLD): - p += 1 - - p += 1 - - if p < end: - current_split.pos = p - - # Reject if min_samples_leaf is not guaranteed - if (((current_split.pos - start) < min_samples_leaf) or - ((end - current_split.pos) < min_samples_leaf)): - continue - - self.criterion.update(current_split.pos) - # Reject if min_weight_leaf is not satisfied - if self.check_postsplit_conditions() == 1: - continue - - current_proxy_improvement = \ - self.criterion.proxy_impurity_improvement() - - if current_proxy_improvement > best_proxy_improvement: - best_proxy_improvement = current_proxy_improvement - # sum of halves is used to avoid infinite value - current_split.threshold = feature_values[p - 1] / 2.0 + feature_values[p] / 2.0 - - if ( - (current_split.threshold == feature_values[p]) or - (current_split.threshold == INFINITY) or - (current_split.threshold == -INFINITY) - ): - current_split.threshold = feature_values[p - 1] - - best_split = current_split # copy - - # Reorganize into samples[start:best_split.pos] + samples[best_split.pos:end] - if best_split.pos < end: - partition_end = end - p = start - - while p < partition_end: - # Account for projection vector - temp_d = 0.0 - for j in range(best_split.proj_vec_indices.size()): - temp_d += self.X[samples[p], deref(best_split.proj_vec_indices)[j]] *\ - deref(best_split.proj_vec_weights)[j] - - if temp_d <= best_split.threshold: - p += 1 - - else: - partition_end -= 1 - samples[p], samples[partition_end] = \ - samples[partition_end], samples[p] - - self.criterion.reset() - self.criterion.update(best_split.pos) - self.criterion.children_impurity(&best_split.impurity_left, - &best_split.impurity_right) - best_split.improvement = self.criterion.impurity_improvement( - impurity, best_split.impurity_left, best_split.impurity_right) + SIZE_t[::1] indices_to_sample, + SIZE_t grid_size, + UINT32_t* random_state, + ) noexcept nogil: + cdef int i, j - # Return values - deref(oblique_split).proj_vec_indices = best_split.proj_vec_indices - deref(oblique_split).proj_vec_weights = best_split.proj_vec_weights - deref(oblique_split).feature = best_split.feature - deref(oblique_split).pos = best_split.pos - deref(oblique_split).threshold = best_split.threshold - deref(oblique_split).improvement = best_split.improvement - deref(oblique_split).impurity_left = best_split.impurity_left - deref(oblique_split).impurity_right = best_split.impurity_right - return 0 + # XXX: should this be `i` or `i+1`? for valid Fisher-Yates? + for i in range(0, grid_size - 1): + j = rand_int(i, grid_size, random_state) + indices_to_sample[j], indices_to_sample[i] = \ + indices_to_sample[i], indices_to_sample[j] cdef class ObliqueSplitter(BaseObliqueSplitter): def __cinit__( @@ -330,12 +199,6 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): # or max w/ 1... self.n_non_zeros = max(int(self.max_features * self.feature_combinations), 1) - def __getstate__(self): - return {} - - def __setstate__(self, d): - pass - cdef int init( self, object X, @@ -369,9 +232,9 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): Parameters ---------- - proj_mat_weights : vector of vectors reference + proj_mat_weights : vector of vectors reference of shape (mtry, n_features) The memory address of projection matrix non-zero weights. - proj_mat_indices : vector of vectors reference + proj_mat_indices : vector of vectors reference of shape (mtry, n_features) The memory address of projection matrix non-zero indices. Notes @@ -380,7 +243,6 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): it is assumed ``feature_combinations`` is forced to be smaller than ``n_features`` before instantiating an oblique splitter. """ - cdef SIZE_t n_features = self.n_features cdef SIZE_t n_non_zeros = self.n_non_zeros cdef UINT32_t* random_state = &self.rand_r_state @@ -393,10 +255,7 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): cdef SIZE_t grid_size = self.max_features * self.n_features # shuffle indices over the 2D grid to sample using Fisher-Yates - for i in range(0, grid_size): - j = rand_int(0, grid_size - i, random_state) - indices_to_sample[j], indices_to_sample[i] = \ - indices_to_sample[i], indices_to_sample[j] + self.fisher_yates_shuffle_memview(indices_to_sample, grid_size, random_state) # sample 'n_non_zeros' in a mtry X n_features projection matrix # which consists of +/- 1's chosen at a 1/2s rate @@ -404,7 +263,8 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): # get the next index from the shuffled index array rand_vec_index = indices_to_sample[i] - # get the projection index and feature index + # get the projection index (i.e. row of the projection matrix) and + # feature index (i.e. column of the projection matrix) proj_i = rand_vec_index // n_features feat_i = rand_vec_index % n_features @@ -428,6 +288,151 @@ cdef class BestObliqueSplitter(ObliqueSplitter): self.feature_combinations, ), self.__getstate__()) + cdef int node_split( + self, + double impurity, + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound, + ) except -1 nogil: + """Find the best_split split on node samples[start:end] + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + # typecast the pointer to an ObliqueSplitRecord + cdef ObliqueSplitRecord* oblique_split = (split) + + # Draw random splits and pick the best_split + cdef SIZE_t[::1] samples = self.samples + cdef SIZE_t start = self.start + cdef SIZE_t end = self.end + + # pointer array to store feature values to split on + cdef DTYPE_t[::1] feature_values = self.feature_values + cdef SIZE_t max_features = self.max_features + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + + # keep track of split record for current_split node and the best_split split + # found among the sampled projection vectors + cdef ObliqueSplitRecord best_split, current_split + cdef double current_proxy_improvement = -INFINITY + cdef double best_proxy_improvement = -INFINITY + + cdef SIZE_t feat_i, p # index over computed features and start/end + cdef SIZE_t partition_end + cdef DTYPE_t temp_d # to compute a projection feature value + + # instantiate the split records + _init_split(&best_split, end) + + # Sample the projection matrix + self.sample_proj_mat(self.proj_mat_weights, self.proj_mat_indices) + + # For every vector in the projection matrix + for feat_i in range(max_features): + # Projection vector has no nonzeros + if self.proj_mat_weights[feat_i].empty(): + continue + + # XXX: 'feature' is not actually used in oblique split records + # Just indicates which split was sampled + current_split.feature = feat_i + current_split.proj_vec_weights = &self.proj_mat_weights[feat_i] + current_split.proj_vec_indices = &self.proj_mat_indices[feat_i] + + # Compute linear combination of features and then + # sort samples according to the feature values. + self.compute_features_over_samples( + start, + end, + samples, + feature_values, + &self.proj_mat_weights[feat_i], + &self.proj_mat_indices[feat_i] + ) + + # Sort the samples + sort(&feature_values[start], &samples[start], end - start) + + # Evaluate all splits + self.criterion.reset() + p = start + while p < end: + while (p + 1 < end and feature_values[p + 1] <= feature_values[p] + FEATURE_THRESHOLD): + p += 1 + + p += 1 + + if p < end: + current_split.pos = p + + # Reject if min_samples_leaf is not guaranteed + if (((current_split.pos - start) < min_samples_leaf) or + ((end - current_split.pos) < min_samples_leaf)): + continue + + self.criterion.update(current_split.pos) + # Reject if min_weight_leaf is not satisfied + if self.check_postsplit_conditions() == 1: + continue + + current_proxy_improvement = \ + self.criterion.proxy_impurity_improvement() + + if current_proxy_improvement > best_proxy_improvement: + best_proxy_improvement = current_proxy_improvement + # sum of halves is used to avoid infinite value + current_split.threshold = feature_values[p - 1] / 2.0 + feature_values[p] / 2.0 + + if ( + (current_split.threshold == feature_values[p]) or + (current_split.threshold == INFINITY) or + (current_split.threshold == -INFINITY) + ): + current_split.threshold = feature_values[p - 1] + + best_split = current_split # copy + + # Reorganize into samples[start:best_split.pos] + samples[best_split.pos:end] + if best_split.pos < end: + partition_end = end + p = start + + while p < partition_end: + # Account for projection vector + temp_d = 0.0 + for j in range(best_split.proj_vec_indices.size()): + temp_d += self.X[samples[p], deref(best_split.proj_vec_indices)[j]] *\ + deref(best_split.proj_vec_weights)[j] + + if temp_d <= best_split.threshold: + p += 1 + + else: + partition_end -= 1 + samples[p], samples[partition_end] = \ + samples[partition_end], samples[p] + + self.criterion.reset() + self.criterion.update(best_split.pos) + self.criterion.children_impurity(&best_split.impurity_left, + &best_split.impurity_right) + best_split.improvement = self.criterion.impurity_improvement( + impurity, best_split.impurity_left, best_split.impurity_right) + + # Return values + deref(oblique_split).proj_vec_indices = best_split.proj_vec_indices + deref(oblique_split).proj_vec_weights = best_split.proj_vec_weights + deref(oblique_split).feature = best_split.feature + deref(oblique_split).pos = best_split.pos + deref(oblique_split).threshold = best_split.threshold + deref(oblique_split).improvement = best_split.improvement + deref(oblique_split).impurity_left = best_split.impurity_left + deref(oblique_split).impurity_right = best_split.impurity_right + return 0 + cdef class RandomObliqueSplitter(ObliqueSplitter): def __reduce__(self): """Enable pickling the splitter.""" @@ -652,3 +657,403 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): deref(oblique_split).impurity_left = best_split.impurity_left deref(oblique_split).impurity_right = best_split.impurity_right return 0 + + +cdef class MultiViewSplitter(BestObliqueSplitter): + def __cinit__( + self, + Criterion criterion, + SIZE_t max_features, + SIZE_t min_samples_leaf, + double min_weight_leaf, + object random_state, + const cnp.int8_t[:] monotonic_cst, + double feature_combinations, + const intp_t[:] feature_set_ends, + intp_t n_feature_sets, + *argv + ): + self.feature_set_ends = feature_set_ends + + # infer the number of feature sets + self.n_feature_sets = n_feature_sets + + def __getstate__(self): + return {} + + def __setstate__(self, d): + pass + + def __reduce__(self): + """Enable pickling the splitter.""" + return (type(self), + ( + self.criterion, + self.max_features, + self.min_samples_leaf, + self.min_weight_leaf, + self.random_state, + self.monotonic_cst.base if self.monotonic_cst is not None else None, + self.feature_combinations, + self.feature_set_ends.base if self.feature_set_ends is not None else None, + self.n_feature_sets, + ), self.__getstate__()) + + cdef int init( + self, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + const unsigned char[::1] missing_values_in_feature_mask, + ) except -1: + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) + + self.X = X + + # create a helper array for allowing efficient Fisher-Yates + self.multi_indices_to_sample = vector[vector[SIZE_t]](self.n_feature_sets) + + cdef SIZE_t i_feature = 0 + cdef SIZE_t feature_set_begin = 0 + cdef SIZE_t size_of_feature_set + cdef intp_t ifeat = 0 + for i_feature in range(self.n_feature_sets): + size_of_feature_set = self.feature_set_ends[i_feature] - feature_set_begin + for ifeat in range(size_of_feature_set): + self.multi_indices_to_sample[i_feature].push_back(ifeat + feature_set_begin) + + feature_set_begin = self.feature_set_ends[i_feature] + return 0 + + cdef void sample_proj_mat( + self, + vector[vector[DTYPE_t]]& proj_mat_weights, + vector[vector[SIZE_t]]& proj_mat_indices + ) noexcept nogil: + """Sample projection matrix accounting for multi-views. + + This proceeds as a normal sampling projection matrix, + but now also uniformly samples features from each feature set. + """ + cdef UINT32_t* random_state = &self.rand_r_state + cdef int feat_i, proj_i + cdef DTYPE_t weight + + # keep track of the beginning and ending indices of each feature set + cdef int idx + cdef int ifeature = 0 + cdef int grid_size + + # 03: Algorithm samples features from each set with equal probability + proj_i = 0 + cdef bint finished_feature_set = False + + cdef int i, j + while proj_i < self.max_features and not finished_feature_set: + # sample from a feature set + for idx in range(self.n_feature_sets): + # indices_to_sample = self.multi_indices_to_sample[idx] + grid_size = self.multi_indices_to_sample[idx].size() + + # Note: a temporary variable must occur + if proj_i == 0: + for i in range(0, self.multi_indices_to_sample[idx].size() - 1): + j = rand_int(i + 1, grid_size, random_state) + self.multi_indices_to_sample[idx][i], self.multi_indices_to_sample[idx][j] = self.multi_indices_to_sample[idx][j], self.multi_indices_to_sample[idx][i] + + if ifeature >= grid_size: + finished_feature_set = True + continue + + # sample random feature in this set + feat_i = self.multi_indices_to_sample[idx][ifeature] + + # here, axis-aligned splits are entirely weights of 1 + weight = 1 # if (rand_int(0, 2, random_state) == 1) else -1 + + proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero + proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero + + proj_i += 1 + if proj_i >= self.max_features: + break + + finished_feature_set = False + + ifeature += 1 + +# XXX: not used right now +cdef class MultiViewObliqueSplitter(BestObliqueSplitter): + def __cinit__( + self, + Criterion criterion, + SIZE_t max_features, + SIZE_t min_samples_leaf, + double min_weight_leaf, + object random_state, + const cnp.int8_t[:] monotonic_cst, + double feature_combinations, + const intp_t[:] feature_set_ends, + intp_t n_feature_sets, + bint uniform_sampling, + *argv + ): + self.feature_set_ends = feature_set_ends + self.uniform_sampling = uniform_sampling + + # infer the number of feature sets + self.n_feature_sets = n_feature_sets + + def __reduce__(self): + """Enable pickling the splitter.""" + return (type(self), + ( + self.criterion, + self.max_features, + self.min_samples_leaf, + self.min_weight_leaf, + self.random_state, + self.monotonic_cst.base if self.monotonic_cst is not None else None, + self.feature_combinations, + self.feature_set_ends, + self.n_feature_sets, + self.uniform_sampling, + ), self.__getstate__()) + + cdef int init( + self, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + const unsigned char[::1] missing_values_in_feature_mask, + ) except -1: + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) + + self.X = X + + # create a helper array for allowing efficient Fisher-Yates + self.multi_indices_to_sample = vector[vector[SIZE_t]](self.n_feature_sets) + + cdef SIZE_t i_feature = 0 + cdef SIZE_t feature_set_begin = 0 + cdef SIZE_t size_of_feature_set + cdef intp_t ifeat = 0 + cdef intp_t iproj = 0 + while iproj < self.max_features: + for i_feature in range(self.n_feature_sets): + size_of_feature_set = self.feature_set_ends[i_feature] - feature_set_begin + + for ifeat in range(size_of_feature_set): + self.multi_indices_to_sample[i_feature].push_back(ifeat + feature_set_begin + (iproj * self.n_features)) + iproj += 1 + if iproj >= self.max_features: + break + if iproj >= self.max_features: + break + + feature_set_begin = self.feature_set_ends[i_feature] + return 0 + + cdef void sample_proj_mat( + self, + vector[vector[DTYPE_t]]& proj_mat_weights, + vector[vector[SIZE_t]]& proj_mat_indices + ) noexcept nogil: + """Sample projection matrix accounting for multi-views. + + This proceeds as a normal sampling projection matrix, + but now also uniformly samples features from each feature set. + """ + cdef SIZE_t n_features = self.n_features + cdef SIZE_t n_non_zeros = self.n_non_zeros + cdef UINT32_t* random_state = &self.rand_r_state + + cdef int i, j, feat_i, proj_i, rand_vec_index + cdef DTYPE_t weight + + # construct an array to sample from mTry x n_features set of indices + cdef vector[SIZE_t] indices_to_sample + cdef SIZE_t grid_size + + # compute the number of features in each feature set + cdef SIZE_t n_features_in_set + + # keep track of the beginning and ending indices of each feature set + cdef int feature_set_begin, feature_set_end, idx + feature_set_begin = 0 + + # keep track of number of features sampled relative to n_non_zeros + cdef int ifeature = 0 + + if self.uniform_sampling: + # 01: This algorithm samples features from each feature set uniformly and combines them + # into one sparse projection vector. + while ifeature < n_non_zeros: + for idx in range(self.n_feature_sets): + feature_set_end = self.feature_set_ends[idx] + n_features_in_set = feature_set_end - feature_set_begin + indices_to_sample = self.multi_indices_to_sample[idx] + grid_size = indices_to_sample.size() + + # shuffle indices over the 2D grid for this feature set to sample using Fisher-Yates + for i in range(0, grid_size): + j = rand_int(0, grid_size, random_state) + indices_to_sample[j], indices_to_sample[i] = \ + indices_to_sample[i], indices_to_sample[j] + + # sample a n_non_zeros matrix for each feature set, which proceeds by: + # - sample 'n_non_zeros' in a mtry X n_features projection matrix + # - which consists of +/- 1's chosen at a 1/2s rate + # for i in range(0, n_non_zeros_per_set): + # get the next index from the shuffled index array + rand_vec_index = indices_to_sample[0] + + # get the projection index (i.e. row of the projection matrix) and + # feature index (i.e. column of the projection matrix) + proj_i = rand_vec_index // n_features + feat_i = rand_vec_index % n_features + + # sample a random weight + weight = 1 if (rand_int(0, 2, random_state) == 1) else -1 + + proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero + proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero + + # the new beginning is the previous end + feature_set_begin = feature_set_end + + ifeature += 1 + else: + # 02: Algorithm samples feature combinations from each feature set uniformly and evaluates + # them independently. + feature_set_begin = 0 + + # sample from a feature set + for idx in range(self.n_feature_sets): + feature_set_end = self.feature_set_ends[idx] + n_features_in_set = feature_set_end - feature_set_begin + + # indices to sample is a 1D-index array of size (max_features * n_features_in_set) + # which is Fisher-Yates shuffled to sample random features in each feature set + indices_to_sample = self.multi_indices_to_sample[idx] + grid_size = indices_to_sample.size() + + # shuffle indices over the 2D grid for this feature set to sample using Fisher-Yates + for i in range(0, grid_size): + j = rand_int(0, grid_size, random_state) + indices_to_sample[j], indices_to_sample[i] = \ + indices_to_sample[i], indices_to_sample[j] + + for i in range(0, n_non_zeros): + # get the next index from the shuffled index array + rand_vec_index = indices_to_sample[i] + + # get the projection index (i.e. row of the projection matrix) and + # feature index (i.e. column of the projection matrix) + proj_i = rand_vec_index // n_features + feat_i = rand_vec_index % n_features + + # sample a random weight + weight = 1 if (rand_int(0, 2, random_state) == 1) else -1 + + proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero + proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero + + # the new beginning is the previous end + feature_set_begin = feature_set_end + + +cdef class MultiViewSplitterTester(MultiViewSplitter): + """A class to expose a Python interface for testing.""" + + cpdef sample_projection_matrix_py(self): + """Sample projection matrix using a patch. + + Used for testing purposes. + + Returns projection matrix of shape (max_features, n_features). + """ + cdef vector[vector[DTYPE_t]] proj_mat_weights = vector[vector[DTYPE_t]](self.max_features) + cdef vector[vector[SIZE_t]] proj_mat_indices = vector[vector[SIZE_t]](self.max_features) + cdef SIZE_t i, j + + # sample projection matrix in C/C++ + self.sample_proj_mat(proj_mat_weights, proj_mat_indices) + + # convert the projection matrix to something that can be used in Python + proj_vecs = np.zeros((self.max_features, self.n_features), dtype=np.float32) + for i in range(0, self.max_features): + for j in range(0, proj_mat_weights[i].size()): + weight = proj_mat_weights[i][j] + feat = proj_mat_indices[i][j] + + proj_vecs[i, feat] = weight + + return proj_vecs + + cpdef init_test(self, X, y, sample_weight, missing_values_in_feature_mask=None): + """Initializes the state of the splitter. + + Used for testing purposes. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The input samples. + y : array-like, shape (n_samples,) + The target values (class labels in classification, real numbers in + regression). + sample_weight : array-like, shape (n_samples,) + Sample weights. + missing_values_in_feature_mask : array-like, shape (n_features,) + Whether or not a feature has missing values. + """ + self.init(X, y, sample_weight, missing_values_in_feature_mask) + + +cdef class BestObliqueSplitterTester(BestObliqueSplitter): + """A class to expose a Python interface for testing.""" + + cpdef sample_projection_matrix_py(self): + """Sample projection matrix using a patch. + + Used for testing purposes. + + Returns projection matrix of shape (max_features, n_features). + """ + cdef vector[vector[DTYPE_t]] proj_mat_weights = vector[vector[DTYPE_t]](self.max_features) + cdef vector[vector[SIZE_t]] proj_mat_indices = vector[vector[SIZE_t]](self.max_features) + cdef SIZE_t i, j + + # sample projection matrix in C/C++ + self.sample_proj_mat(proj_mat_weights, proj_mat_indices) + + # convert the projection matrix to something that can be used in Python + proj_vecs = np.zeros((self.max_features, self.n_features), dtype=np.float32) + for i in range(0, self.max_features): + for j in range(0, proj_mat_weights[i].size()): + weight = proj_mat_weights[i][j] + feat = proj_mat_indices[i][j] + + proj_vecs[i, feat] = weight + + return proj_vecs + + cpdef init_test(self, X, y, sample_weight, missing_values_in_feature_mask=None): + """Initializes the state of the splitter. + + Used for testing purposes. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The input samples. + y : array-like, shape (n_samples,) + The target values (class labels in classification, real numbers in + regression). + sample_weight : array-like, shape (n_samples,) + Sample weights. + missing_values_in_feature_mask : array-like, shape (n_features,) + Whether or not a feature has missing values. + """ + self.init(X, y, sample_weight, missing_values_in_feature_mask) diff --git a/sktree/tree/manifold/_morf_splitter.pxd b/sktree/tree/manifold/_morf_splitter.pxd index 9387d6797..a05bb50e8 100644 --- a/sktree/tree/manifold/_morf_splitter.pxd +++ b/sktree/tree/manifold/_morf_splitter.pxd @@ -19,7 +19,7 @@ from ..._lib.sklearn.tree._tree cimport DTYPE_t # Type of X from ..._lib.sklearn.tree._tree cimport INT32_t # Signed 32 bit integer from ..._lib.sklearn.tree._tree cimport SIZE_t # Type for indices and counters from ..._lib.sklearn.tree._tree cimport UINT32_t # Unsigned 32 bit integer -from .._oblique_splitter cimport BaseObliqueSplitter, ObliqueSplitRecord +from .._oblique_splitter cimport BestObliqueSplitter, ObliqueSplitRecord # https://github.com/cython/cython/blob/master/Cython/Includes/libcpp/algorithm.pxd # shows how to include standard library functions in Cython @@ -38,7 +38,7 @@ cdef extern from "" namespace "std" nogil: void swap[T](T& a, T& b) except + # array overload also works -cdef class PatchSplitter(BaseObliqueSplitter): +cdef class PatchSplitter(BestObliqueSplitter): # The PatchSplitter creates candidate feature values by sampling 2D patches from # an input data vector. The input data is vectorized, so `data_height` and # `data_width` are used to determine the vectorized indices corresponding to diff --git a/sktree/tree/manifold/_morf_splitter.pyx b/sktree/tree/manifold/_morf_splitter.pyx index 039d58008..95c37a2da 100644 --- a/sktree/tree/manifold/_morf_splitter.pyx +++ b/sktree/tree/manifold/_morf_splitter.pyx @@ -19,7 +19,7 @@ from ..._lib.sklearn.tree._criterion cimport Criterion from .._utils cimport ravel_multi_index_cython, unravel_index_cython -cdef class PatchSplitter(BaseObliqueSplitter): +cdef class PatchSplitter(BestObliqueSplitter): """Patch splitter. A convolutional 2D patch splitter. @@ -37,7 +37,7 @@ cdef class PatchSplitter(BaseObliqueSplitter): const DOUBLE_t[:] sample_weight, const unsigned char[::1] missing_values_in_feature_mask, ) except -1: - BaseObliqueSplitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) + BestObliqueSplitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) return 0 @@ -125,6 +125,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): double min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, + double feature_combinations, const SIZE_t[:] min_patch_dims, const SIZE_t[:] max_patch_dims, const cnp.uint8_t[:] dim_contiguous, @@ -186,6 +187,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): self.min_weight_leaf, self.random_state, self.monotonic_cst.base if self.monotonic_cst is not None else None, + self.feature_combinations, self.min_patch_dims.base if self.min_patch_dims is not None else None, self.max_patch_dims.base if self.max_patch_dims is not None else None, self.dim_contiguous.base if self.dim_contiguous is not None else None, @@ -480,7 +482,7 @@ cdef class BestPatchSplitterTester(BestPatchSplitter): proj_vecs[i, feat] = weight return proj_vecs - cpdef sample_projection_matrix(self): + cpdef sample_projection_matrix_py(self): """Sample projection matrix using a patch. Used for testing purposes. diff --git a/sktree/tree/meson.build b/sktree/tree/meson.build index 09ac76013..cc13bc1b4 100644 --- a/sktree/tree/meson.build +++ b/sktree/tree/meson.build @@ -19,6 +19,7 @@ endforeach python_sources = [ '__init__.py', '_classes.py', + '_multiview.py', '_neighbors.py', '_honest_tree.py', '_marginalize.py', diff --git a/sktree/tree/tests/meson.build b/sktree/tree/tests/meson.build index e3d54ab85..c88eeda7c 100644 --- a/sktree/tree/tests/meson.build +++ b/sktree/tree/tests/meson.build @@ -6,6 +6,7 @@ python_sources = [ 'test_marginal.py', 'test_all_trees.py', 'test_unsupervised_tree.py', + 'test_multiview.py', ] py3.install_sources( diff --git a/sktree/tree/tests/test_multiview.py b/sktree/tree/tests/test_multiview.py new file mode 100644 index 000000000..cddfc661c --- /dev/null +++ b/sktree/tree/tests/test_multiview.py @@ -0,0 +1,86 @@ +import numpy as np +import pytest +from sklearn.datasets import make_blobs +from sklearn.metrics import accuracy_score +from sklearn.model_selection import cross_val_score +from sklearn.utils.estimator_checks import parametrize_with_checks + +from sktree.tree import DecisionTreeClassifier, MultiViewDecisionTreeClassifier + +seed = 12345 + + +@parametrize_with_checks( + [ + MultiViewDecisionTreeClassifier(random_state=12), + ] +) +def test_sklearn_compatible_estimator(estimator, check): + check(estimator) + + +@pytest.mark.parametrize("baseline_est", [MultiViewDecisionTreeClassifier, DecisionTreeClassifier]) +def test_multiview_classification(baseline_est): + """Test that explicit knowledge of multi-view structure improves classification accuracy. + + In very high-dimensional noise setting across two views, when the max_depth and max_features + are constrained, the multi-view decision tree will still obtain perfect accuracy, while + the single-view decision tree will not. + """ + rng = np.random.default_rng(seed=seed) + + n_samples = 20 + n_features_1 = 5 + n_features_2 = 1000 + cluster_std = 2.0 + + # Create a high-dimensional multiview dataset with a low-dimensional informative + # subspace in one view of the dataset. + X0_first, y0 = make_blobs( + n_samples=n_samples, + cluster_std=cluster_std, + n_features=n_features_1, + random_state=rng.integers(1, 10000), + centers=1, + ) + + X1_first, y1 = make_blobs( + n_samples=n_samples, + cluster_std=cluster_std, + n_features=n_features_1, + random_state=rng.integers(1, 10000), + centers=1, + ) + y1[:] = 1 + X0 = np.concatenate([X0_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1) + X1 = np.concatenate([X1_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1) + X = np.vstack((X0, X1)) + y = np.hstack((y0, y1)).T + + # Compare multiview decision tree vs single-view decision tree + clf = MultiViewDecisionTreeClassifier( + random_state=seed, + feature_set_ends=[n_features_1, X.shape[1]], + max_features="sqrt", + max_depth=5, + ) + clf.fit(X, y) + assert ( + accuracy_score(y, clf.predict(X)) == 1.0 + ), f"Accuracy score: {accuracy_score(y, clf.predict(X))}" + assert ( + cross_val_score(clf, X, y, cv=5).mean() == 1.0 + ), f"CV score: {cross_val_score(clf, X, y, cv=5).mean()}" + + clf = baseline_est( + random_state=seed, + max_depth=5, + max_features="sqrt", + ) + clf.fit(X, y) + assert ( + accuracy_score(y, clf.predict(X)) == 1.0 + ), f"Accuracy score: {accuracy_score(y, clf.predict(X))}" + assert ( + cross_val_score(clf, X, y, cv=5).mean() <= 0.6 + ), f"CV score: {cross_val_score(clf, X, y, cv=5).mean()}" diff --git a/sktree/tree/tests/test_tree.py b/sktree/tree/tests/test_tree.py index 5bcd23c23..478aa6d5b 100644 --- a/sktree/tree/tests/test_tree.py +++ b/sktree/tree/tests/test_tree.py @@ -170,6 +170,7 @@ def test_pickle_splitters(): dim_contiguous = np.array((True, True)) data_dims = np.array((5, 5)) + feature_combinations = 1.5 splitter = BestObliqueSplitter( criterion, max_features, @@ -177,7 +178,7 @@ def test_pickle_splitters(): min_weight_leaf, random_state, monotonic_cst, - 1.5, + feature_combinations, ) with tempfile.TemporaryFile() as f: joblib.dump(splitter, f) @@ -189,7 +190,7 @@ def test_pickle_splitters(): min_weight_leaf, random_state, monotonic_cst, - 1.5, + feature_combinations, ) with tempfile.TemporaryFile() as f: joblib.dump(splitter, f) @@ -201,6 +202,7 @@ def test_pickle_splitters(): min_weight_leaf, random_state, monotonic_cst, + feature_combinations, min_patch_dims, max_patch_dims, dim_contiguous, @@ -471,7 +473,7 @@ def test_diabetes_overfit(name, Tree, criterion): "criterion, max_depth, metric, max_loss", [ ("squared_error", 15, mean_squared_error, 65), - ("absolute_error", 20, mean_squared_error, 60), + ("absolute_error", 25, mean_squared_error, 60), ("friedman_mse", 15, mean_squared_error, 65), ("poisson", 15, mean_poisson_deviance, 30), ], @@ -483,7 +485,9 @@ def test_diabetes_underfit(name, Tree, criterion, max_depth, metric, max_loss): reg = Tree(criterion=criterion, max_depth=max_depth, max_features=10, random_state=1234) reg.fit(diabetes.data, diabetes.target) loss = metric(diabetes.target, reg.predict(diabetes.data)) - assert 0.0 <= loss < max_loss + assert ( + 0.0 <= loss < max_loss + ), f"Failed with {name}, criterion = {criterion} and loss = {loss}, and max_loss: {max_loss}" def test_numerical_stability(): diff --git a/sktree/tree/tests/test_utils.py b/sktree/tree/tests/test_utils.py index 3ba69adbc..3cf6705bf 100644 --- a/sktree/tree/tests/test_utils.py +++ b/sktree/tree/tests/test_utils.py @@ -100,6 +100,8 @@ def test_best_patch_splitter_contiguous(): # monotonic constraints are not supported for oblique splits monotonic_cst = None + feature_combinations = 1.5 + splitter = BestPatchSplitterTester( criterion, max_features, @@ -107,6 +109,7 @@ def test_best_patch_splitter_contiguous(): min_weight_leaf, random_state, monotonic_cst, + feature_combinations, min_patch_dims, max_patch_dims, dim_contiguous, diff --git a/sktree/tree/unsupervised/_unsup_criterion.pxd b/sktree/tree/unsupervised/_unsup_criterion.pxd index 47967841a..bfbd7428a 100644 --- a/sktree/tree/unsupervised/_unsup_criterion.pxd +++ b/sktree/tree/unsupervised/_unsup_criterion.pxd @@ -2,8 +2,6 @@ # cython: wraparound=False # cython: language_level=3 -from libcpp.unordered_map cimport unordered_map - from ..._lib.sklearn.tree._criterion cimport BaseCriterion from ..._lib.sklearn.tree._tree cimport DOUBLE_t # Type of y, sample_weight from ..._lib.sklearn.tree._tree cimport DTYPE_t # Type of X @@ -47,11 +45,6 @@ cdef class UnsupervisedCriterion(BaseCriterion): cdef double sumsq_left # Same as above, but for the left side of the split cdef double sumsq_right # Same as above, but for the right side of the split - # use memoization to re-compute variance of any subsegment in O(1) - # cdef unordered_map[SIZE_t, DTYPE_t] cumsum_of_squares_map - # cdef unordered_map[SIZE_t, DTYPE_t] cumsum_map - # cdef unordered_map[SIZE_t, DTYPE_t] cumsum_weights_map - # Methods # ------- # The 'init' method is copied here with the almost the exact same signature diff --git a/sktree/tree/unsupervised/_unsup_criterion.pyx b/sktree/tree/unsupervised/_unsup_criterion.pyx index 87874b800..518fc2753 100644 --- a/sktree/tree/unsupervised/_unsup_criterion.pyx +++ b/sktree/tree/unsupervised/_unsup_criterion.pyx @@ -78,11 +78,6 @@ cdef class UnsupervisedCriterion(BaseCriterion): # and then update will never have to iterate through even cdef DOUBLE_t w = 1.0 - # cdef SIZE_t prev_s_idx = -1 - # self.cumsum_of_squares_map[prev_s_idx] = 0.0 - # self.cumsum_map[prev_s_idx] = 0.0 - # self.cumsum_weights_map[prev_s_idx] = 0.0 - for p_idx in range(self.start, self.end): s_idx = self.sample_indices[p_idx]