From ea8ce18f7f27600d9428fc08a7c0cedc3c4b3c74 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 11 Oct 2023 10:44:04 -0400 Subject: [PATCH 1/9] Adding multiview example Signed-off-by: Adam Li --- examples/plot_multiview_dtc.py | 127 +++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 examples/plot_multiview_dtc.py diff --git a/examples/plot_multiview_dtc.py b/examples/plot_multiview_dtc.py new file mode 100644 index 000000000..bd67940db --- /dev/null +++ b/examples/plot_multiview_dtc.py @@ -0,0 +1,127 @@ +""" +============================================================ +Analyze a multi-view dataset with a multi-view random forest +============================================================ + +An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric +multivariate hypothesis test, on simulated datasets. Here, we present a simulation +of how MIGHT is used to evaluate how a "feature set is important for predicting the target". + +We simulate a dataset with 1000 features, 500 samples, and a binary class target +variable. Within each feature set, there is 500 features associated with one feature +set, and another 500 features associated with another feature set. One could think of +these for example as different datasets collected on the same patient in a biomedical setting. +The first feature set (X) is strongly correlated with the target, and the second +feature set (W) is weakly correlated with the target (y). + +We then use MIGHT to calculate the partial AUC of these sets. +""" + +import numpy as np +from scipy.special import expit + +from sktree import HonestForestClassifier +from sktree.stats import FeatureImportanceForestClassifier +from sktree.tree import DecisionTreeClassifier + +seed = 12345 +rng = np.random.default_rng(seed) + +# %% +# Simulate data +# ------------- +# We simulate the two feature sets, and the target variable. We then combine them +# into a single dataset to perform hypothesis testing. + +n_samples = 1000 +n_features_set = 500 +mean = 1.0 +sigma = 2.0 +beta = 5.0 + +unimportant_mean = 0.0 +unimportant_sigma = 4.5 + +# first sample the informative features, and then the uniformative features +X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10)) +X_important = np.hstack( + [ + X_important, + rng.normal( + loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10) + ), + ] +) + +X_unimportant = rng.normal( + loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set) +) + +# simulate the binary target variable +y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples) + +# %% +# Use partial AUC as test statistic +# --------------------------------- +# You can specify the maximum specificity by modifying ``max_fpr`` in ``statistic``. 
+ +n_estimators = 125 +max_features = "sqrt" +metric = "auc" +test_size = 0.2 +n_jobs = -1 +honest_fraction = 0.7 +max_fpr = 0.1 + +est = FeatureImportanceForestClassifier( + estimator=HonestForestClassifier( + n_estimators=n_estimators, + max_features=max_features, + tree_estimator=DecisionTreeClassifier(), + random_state=seed, + honest_fraction=honest_fraction, + n_jobs=n_jobs, + ), + random_state=seed, + test_size=test_size, + permute_per_tree=True, + sample_dataset_per_tree=True, +) + +# we test for the first feature set, which is important and thus should return a higher AUC +stat, posterior_arr, samples = est.statistic( + X_important, + y, + metric=metric, + return_posteriors=True, +) + +print(f"ASH-90 / Partial AUC: {stat}") +print(f"Shape of Observed Samples: {samples.shape}") +print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}") + +# %% +# Repeat for the second feature set +# --------------------------------- +# This feature set has a smaller statistic, which is expected due to its weak correlation. + +stat, posterior_arr, samples = est.statistic( + X_unimportant, + y, + metric=metric, + return_posteriors=True, +) + +print(f"ASH-90 / Partial AUC: {stat}") +print(f"Shape of Observed Samples: {samples.shape}") +print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}") + +# %% +# All posteriors are saved within the model +# ----------------------------------------- +# Extract the results from the model variables anytime. You can save the model with ``pickle``. +# +# ASH-90 / Partial AUC: ``est.observe_stat_`` +# Observed Samples: ``est.observe_samples_`` +# Tree Posteriors for the positive class: ``est.observe_posteriors_`` (n_trees, n_samples_test, 1) +# True Labels: ``est.y_true_final_`` From 3ac096e89f0956e5eb646d93f972da21acddec6f Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 11 Oct 2023 12:18:40 -0400 Subject: [PATCH 2/9] Merge main Signed-off-by: Adam Li --- .../plot_multiview_axis_aligned_splitter.py | 127 ++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 examples/splitters/plot_multiview_axis_aligned_splitter.py diff --git a/examples/splitters/plot_multiview_axis_aligned_splitter.py b/examples/splitters/plot_multiview_axis_aligned_splitter.py new file mode 100644 index 000000000..6663f6fd6 --- /dev/null +++ b/examples/splitters/plot_multiview_axis_aligned_splitter.py @@ -0,0 +1,127 @@ +""" +================================================================================= +Demonstrate and visualize a multi-view projection matrix for an axis-aligned tree +================================================================================= + +This example shows how multi-view projection matrices are generated for an oblique tree, +specifically the :class:`sktree.tree.ObliqueDecisionTreeClassifier`. + +Multi-view projection matrices operate under the assumption that the input ``X`` array +consists of multiple feature-sets that are groups of features important for predicting +``y``. + +For details on how to use the hyperparameters related to the multi-view, see +:class:`sktree.tree.ObliqueDecisionTreeClassifier`. +""" + +# import modules +# .. note:: We use a private Cython module here to demonstrate what the patches +# look like. This is not part of the public API. The Cython module used +# is just a Python wrapper for the underlying Cython code and is not the +# same as the Cython splitter used in the actual implementation. 
+# To use the actual splitter, one should use the public API for the +# relevant tree/forests class. + +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.cm import ScalarMappable +from matplotlib.colors import ListedColormap + +from sktree._lib.sklearn.tree._criterion import Gini +from sktree.tree._oblique_splitter import MultiViewSplitterTester + +criterion = Gini(1, np.array((0, 1))) +max_features = 5 +min_samples_leaf = 1 +min_weight_leaf = 0.0 +random_state = np.random.RandomState(10) + +# we "simulate" three feature sets, with 3, 2 and 4 features respectively +feature_set_ends = np.array([3, 5, 9], dtype=np.intp) +n_feature_sets = len(feature_set_ends) + +feature_combinations = 1 +monotonic_cst = None +missing_value_feature_mask = None + +# initialize some dummy data +X = np.repeat(np.arange(feature_set_ends[-1]).astype(np.float32), 5).reshape(5, -1) +y = np.array([0, 0, 0, 1, 1]).reshape(-1, 1).astype(np.float64) +sample_weight = np.ones(5) + +print("The shape of our dataset is: ", X.shape, y.shape, sample_weight.shape) + +# %% +# Initialize the multi-view splitter +# ---------------------------------- +# The multi-view splitter is a Cython class that is initialized internally +# in scikit-tree. However, we expose a Python tester object to demonstrate +# how the splitter works in practice. +# +# .. warning:: Do not use this interface directly in practice. + +splitter = MultiViewSplitterTester( + criterion, + max_features, + min_samples_leaf, + min_weight_leaf, + random_state, + monotonic_cst, + feature_combinations, + feature_set_ends, + n_feature_sets, +) +splitter.init_test(X, y, sample_weight, missing_value_feature_mask) + +# %% +# Sample the projection matrix +# ---------------------------- +# The projection matrix is sampled by the splitter. The projection +# matrix is a (max_features, n_features) matrix that selects which features of ``X`` +# to define candidate split dimensions. The multi-view +# splitter's projection matrix though samples from multiple feature sets, +# which are aligned contiguously over the columns of ``X``. + +projection_matrix = splitter.sample_projection_matrix_py() +print(projection_matrix) + +cmap = ListedColormap(["white", "green"][:n_feature_sets]) + +# Create a heatmap to visualize the indices +fig, ax = plt.subplots(figsize=(6, 6)) + +ax.imshow( + projection_matrix, cmap=cmap, aspect=feature_set_ends[-1] / max_features, interpolation="none" +) +ax.axvline(feature_set_ends[0] - 0.5, color="black", linewidth=1, label="Feature Sets") +for iend in feature_set_ends[1:]: + ax.axvline(iend - 0.5, color="black", linewidth=1) + +ax.set(title="Sampled Projection Matrix", xlabel="Feature Index", ylabel="Projection Vector Index") +ax.set_xticks(np.arange(feature_set_ends[-1])) +ax.set_yticks(np.arange(max_features)) +ax.set_yticklabels(np.arange(max_features, dtype=int) + 1) +ax.set_xticklabels(np.arange(feature_set_ends[-1], dtype=int) + 1) +ax.legend() + +# Create a mappable object +sm = ScalarMappable(cmap=cmap) +sm.set_array([]) # You can set an empty array or values here + +# Create a color bar with labels for each feature set +colorbar = fig.colorbar(sm, ax=ax, ticks=[0.25, 0.75], format="%d") +colorbar.set_label("Projection Weight (I.e. Sampled Feature From a Feature Set)") +colorbar.ax.set_yticklabels(["0", "1"]) + +plt.show() + +# %% +# Discussion +# ---------- +# As we can see, the multi-view splitter samples split candidates uniformly across the feature sets. 
+# In contrast, the normal splitter in :class:`sklearn.tree.DecisionTreeClassifier` samples +# randomly across all ``n_features`` features because it is not aware of the multi-view structure. +# This is the key difference between the two splitters. +# +# For an example of using the multi-view splitter in practice on simulated data, see +# :ref:`sphx_glr_auto_examples_plot_multiview_dtc.py`. From 139219ec119cbc15f7b304b79be59b66adfb3620 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 12 Oct 2023 11:35:12 -0400 Subject: [PATCH 3/9] Temp wip Signed-off-by: Adam Li --- .spin/cmds.py | 2 +- ...gigantic_hypothesis_testing_forest copy.py | 138 ++++++++++++++++++ .../plot_MI_imbalanced_hyppo_testing.py | 138 ++++++++++++++++++ examples/multiview/README.txt | 6 + examples/multiview/plot_multiview_dtc.py | 124 ++++++++++++++++ 5 files changed, 407 insertions(+), 1 deletion(-) create mode 100644 examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest copy.py create mode 100644 examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py create mode 100644 examples/multiview/README.txt create mode 100644 examples/multiview/plot_multiview_dtc.py diff --git a/.spin/cmds.py b/.spin/cmds.py index 5071dc0a4..a71f863be 100644 --- a/.spin/cmds.py +++ b/.spin/cmds.py @@ -54,7 +54,7 @@ def setup_submodule(forcesubmodule=False): This will update the submodule, which then must be commited so that git knows the submodule needs to be at a certain commit hash. """ - commit_fpath = "./sktree/_lib/sklearn_fork/commit.txt" + commit_fpath = "./sktree/_lib/commit.txt" submodule = "./sktree/_lib/sklearn_fork" commit = "" current_hash = "" diff --git a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest copy.py b/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest copy.py new file mode 100644 index 000000000..423bc63dc --- /dev/null +++ b/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest copy.py @@ -0,0 +1,138 @@ +""" +=========================================================== +Mutual Information for Gigantic Hypothesis Testing (MIGHT) +=========================================================== + +An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric +multivariate hypothesis test, on simulated datasets. Here, we present a simulation +of how MIGHT is used to test the hypothesis that a "feature set is important for +predicting the target". This is a generalization of the framework presented in +:footcite:`coleman2022scalable`. + +We simulate a dataset with 1000 features, 500 samples, and a binary class target +variable. Within each feature set, there is 500 features associated with one feature +set, and another 500 features associated with another feature set. One could think of +these for example as different datasets collected on the same patient in a biomedical setting. +The first feature set (X) is strongly correlated with the target, and the second +feature set (W) is weakly correlated with the target (y). Here, we are testing the +null hypothesis: + +- ``H0: I(X; y) - I(X, W; y) = 0`` +- ``HA: I(X; y) - I(X, W; y) < 0`` indicating that there is more mutual information with + respect to ``y`` + +where ``I`` is mutual information. For example, this could be true in the following settings, +where X is our informative feature set and W is our uninformative feature set. + +- ``W X -> y``: here ``W`` is completely disconnected from X and y. +- ``W -> X -> y``: here ``W`` is d-separated from y given X. 
+- ``W <- X -> y``: here ``W`` is d-separated from y given X. + +We then use MIGHT to test the hypothesis that the first feature set is important for +predicting the target, and the second feature set is not important for predicting the +target. We use :class:`~sktree.stats.FeatureImportanceForestClassifier`. +""" + +import numpy as np +from scipy.special import expit + +from sktree import HonestForestClassifier +from sktree.stats import FeatureImportanceForestClassifier +from sktree.tree import DecisionTreeClassifier + +seed = 12345 +rng = np.random.default_rng(seed) + +# %% +# Simulate data +# ------------- +# We simulate the two feature sets, and the target variable. We then combine them +# into a single dataset to perform hypothesis testing. + +n_samples = 1000 +n_features_set = 500 +mean = 1.0 +sigma = 2.0 +beta = 5.0 + +unimportant_mean = 0.0 +unimportant_sigma = 4.5 + +# first sample the informative features, and then the uniformative features +X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10)) +X_important = np.hstack( + [ + X_important, + rng.normal( + loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10) + ), + ] +) + +X_unimportant = rng.normal( + loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set) +) +X = np.hstack([X_important, X_unimportant]) + +# simulate the binary target variable +y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples) + +# %% +# Perform hypothesis testing using Mutual Information +# --------------------------------------------------- +# Here, we use :class:`~sktree.stats.FeatureImportanceForestClassifier` to perform the hypothesis +# test. The test statistic is computed by comparing the metric (i.e. mutual information) estimated +# between two forests. One forest is trained on the original dataset, and one forest is trained +# on a permuted dataset, where the rows of the ``covariate_index`` columns are shuffled randomly. +# +# The null distribution is then estimated in an efficient manner using the framework of +# :footcite:`coleman2022scalable`. The sample evaluations of each forest (i.e. the posteriors) +# are sampled randomly ``n_repeats`` times to generate a null distribution. The pvalue is then +# computed as the proportion of samples in the null distribution that are less than the +# observed test statistic. 
+ +n_estimators = 200 +max_features = "sqrt" +test_size = 0.2 +n_repeats = 1000 +n_jobs = -1 + +est = FeatureImportanceForestClassifier( + estimator=HonestForestClassifier( + n_estimators=n_estimators, + max_features=max_features, + tree_estimator=DecisionTreeClassifier(), + random_state=seed, + honest_fraction=0.7, + n_jobs=n_jobs, + ), + random_state=seed, + test_size=test_size, + permute_per_tree=True, + sample_dataset_per_tree=False, +) + +print( + f"Permutation per tree: {est.permute_per_tree} and sampling dataset per tree: " + f"{est.sample_dataset_per_tree}" +) +# we test for the first feature set, which is important and thus should return a pvalue < 0.05 +stat, pvalue = est.test( + X, y, covariate_index=np.arange(n_features_set, dtype=int), metric="mi", n_repeats=n_repeats +) +print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") + +# we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05 +stat, pvalue = est.test( + X, + y, + covariate_index=np.arange(n_features_set, dtype=int) + n_features_set, + metric="mi", + n_repeats=n_repeats, +) +print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") + +# %% +# References +# ---------- +# .. footbibliography:: diff --git a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py new file mode 100644 index 000000000..14c260725 --- /dev/null +++ b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py @@ -0,0 +1,138 @@ +""" +=============================================================================== +Mutual Information for Gigantic Hypothesis Testing (MIGHT) with Imbalanced Data +=============================================================================== + +An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric +multivariate hypothesis test, on simulated datasets. Here, we present a simulation +of how MIGHT is used to test the hypothesis that a "feature set is important for +predicting the target". This is a generalization of the framework presented in +:footcite:`coleman2022scalable`. + +We simulate a dataset with 1000 features, 500 samples, and a binary class target +variable. Within each feature set, there is 500 features associated with one feature +set, and another 500 features associated with another feature set. One could think of +these for example as different datasets collected on the same patient in a biomedical setting. +The first feature set (X) is strongly correlated with the target, and the second +feature set (W) is weakly correlated with the target (y). Here, we are testing the +null hypothesis: + +- ``H0: I(X; y) - I(X, W; y) = 0`` +- ``HA: I(X; y) - I(X, W; y) < 0`` indicating that there is more mutual information with + respect to ``y`` + +where ``I`` is mutual information. For example, this could be true in the following settings, +where X is our informative feature set and W is our uninformative feature set. + +- ``W X -> y``: here ``W`` is completely disconnected from X and y. +- ``W -> X -> y``: here ``W`` is d-separated from y given X. +- ``W <- X -> y``: here ``W`` is d-separated from y given X. + +We then use MIGHT to test the hypothesis that the first feature set is important for +predicting the target, and the second feature set is not important for predicting the +target. We use :class:`~sktree.stats.FeatureImportanceForestClassifier`. 
+""" + +import numpy as np +from scipy.special import expit + +from sktree import HonestForestClassifier +from sktree.stats import FeatureImportanceForestClassifier +from sktree.tree import DecisionTreeClassifier + +seed = 12345 +rng = np.random.default_rng(seed) + +# %% +# Simulate data +# ------------- +# We simulate the two feature sets, and the target variable. We then combine them +# into a single dataset to perform hypothesis testing. + +n_samples = 1000 +n_features_set = 500 +mean = 1.0 +sigma = 2.0 +beta = 5.0 + +unimportant_mean = 0.0 +unimportant_sigma = 4.5 + +# first sample the informative features, and then the uniformative features +X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10)) +X_important = np.hstack( + [ + X_important, + rng.normal( + loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10) + ), + ] +) + +X_unimportant = rng.normal( + loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set) +) +X = np.hstack([X_important, X_unimportant]) + +# simulate the binary target variable +y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples) + +# %% +# Perform hypothesis testing using Mutual Information +# --------------------------------------------------- +# Here, we use :class:`~sktree.stats.FeatureImportanceForestClassifier` to perform the hypothesis +# test. The test statistic is computed by comparing the metric (i.e. mutual information) estimated +# between two forests. One forest is trained on the original dataset, and one forest is trained +# on a permuted dataset, where the rows of the ``covariate_index`` columns are shuffled randomly. +# +# The null distribution is then estimated in an efficient manner using the framework of +# :footcite:`coleman2022scalable`. The sample evaluations of each forest (i.e. the posteriors) +# are sampled randomly ``n_repeats`` times to generate a null distribution. The pvalue is then +# computed as the proportion of samples in the null distribution that are less than the +# observed test statistic. + +n_estimators = 200 +max_features = "sqrt" +test_size = 0.2 +n_repeats = 1000 +n_jobs = -1 + +est = FeatureImportanceForestClassifier( + estimator=HonestForestClassifier( + n_estimators=n_estimators, + max_features=max_features, + tree_estimator=DecisionTreeClassifier(), + random_state=seed, + honest_fraction=0.7, + n_jobs=n_jobs, + ), + random_state=seed, + test_size=test_size, + permute_per_tree=True, + sample_dataset_per_tree=False, +) + +print( + f"Permutation per tree: {est.permute_per_tree} and sampling dataset per tree: " + f"{est.sample_dataset_per_tree}" +) +# we test for the first feature set, which is important and thus should return a pvalue < 0.05 +stat, pvalue = est.test( + X, y, covariate_index=np.arange(n_features_set, dtype=int), metric="mi", n_repeats=n_repeats +) +print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") + +# we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05 +stat, pvalue = est.test( + X, + y, + covariate_index=np.arange(n_features_set, dtype=int) + n_features_set, + metric="mi", + n_repeats=n_repeats, +) +print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") + +# %% +# References +# ---------- +# .. footbibliography:: diff --git a/examples/multiview/README.txt b/examples/multiview/README.txt new file mode 100644 index 000000000..127ad6e57 --- /dev/null +++ b/examples/multiview/README.txt @@ -0,0 +1,6 @@ +.. 
_multiview_examples: + +Multi-view learning with Decision-trees +--------------------------------------- + +Examples demonstrating multi-view learning using random forest variants. diff --git a/examples/multiview/plot_multiview_dtc.py b/examples/multiview/plot_multiview_dtc.py new file mode 100644 index 000000000..84089547e --- /dev/null +++ b/examples/multiview/plot_multiview_dtc.py @@ -0,0 +1,124 @@ +""" +============================================================ +Analyze a multi-view dataset with a multi-view random forest +============================================================ + +An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric +multivariate hypothesis test, on simulated datasets. Here, we present a simulation +of how MIGHT is used to evaluate how a "feature set is important for predicting the target". + +We simulate a dataset with 1000 features, 500 samples, and a binary class target +variable. Within each feature set, there is 500 features associated with one feature +set, and another 500 features associated with another feature set. One could think of +these for example as different datasets collected on the same patient in a biomedical setting. +The first feature set (X) is strongly correlated with the target, and the second +feature set (W) is weakly correlated with the target (y). + +We then use MIGHT to calculate the partial AUC of these sets. +""" + +import numpy as np +from sklearn.datasets import make_blobs +from sklearn.model_selection import cross_val_score +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +from collections import defaultdict +from sktree import MultiViewRandomForestClassifier, RandomForestClassifier + +seed = 12345 +rng = np.random.default_rng(seed) + + +def make_multiview_classification(n_samples=100, n_features_1=5, n_features_2=1000, cluster_std=2.0, seed=None): + rng = np.random.default_rng(seed=seed) + + # Create a high-dimensional multiview dataset with a low-dimensional informative + # subspace in one view of the dataset. + X0_first, y0 = make_blobs( + n_samples=n_samples, + cluster_std=cluster_std, + n_features=n_features_1, + random_state=rng.integers(1, 10000), + centers=1, + ) + + X1_first, y1 = make_blobs( + n_samples=n_samples, + cluster_std=cluster_std, + n_features=n_features_1, + random_state=rng.integers(1, 10000), + centers=1, + ) + y1[:] = 1 + X0 = np.concatenate([X0_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1) + X1 = np.concatenate([X1_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1) + X = np.vstack((X0, X1)) + y = np.hstack((y0, y1)).T + + return X, y + +# %% +# Simulate data +# ------------- +# We simulate a 2-view dataset with both views containing informative low-dimensional features. +# The first view has five dimensions, while the second view will vary from five to a thousand +# dimensions. The sample-size will be kept fixed, so we can compare the performance of +# regular Random forests with Multi-view Random Forests. + +n_samples = 1000 +n_features_views = np.linspace(5, 1000, 2).astype(int) + +datasets = [] +for idx, n_features in enumerate(n_features_views): + X, y = make_multiview_classification(n_samples=n_samples, n_features_1=5, n_features_2=n_features, cluster_std=2.0, seed=seed + idx) + datasets.append((X, y)) + +# %% +# Fit Random Forest and Multi-view Random Forest +# ---------------------------------------------- +# Here, we fit both forests over all the datasets. 
+ +n_estimators = 50 +n_jobs = -1 + +scores = defaultdict(list) + +for ((X, y), n_features) in zip(datasets, n_features_views): + feature_set_ends = np.array([5, n_features + 5]) + + rf = RandomForestClassifier( + n_estimators=n_estimators, + n_jobs=n_jobs, + ) + + mvrf = MultiViewRandomForestClassifier( + n_estimators=n_estimators, + n_jobs=n_jobs, + feature_set_ends=n_features_views, + ) + + # obtain the cross-validation score + rf_score = cross_val_score(rf, X, y, cv=2).mean() + mvrf_score = cross_val_score(mvrf, X, y, cv=2).mean() + + scores['rf'].append(rf_score) + scores['mvrf'].append(mvrf_score) + +# %% +# Visualize scores and compare performance +# ---------------------------------------- +# Now, we can compare the performance from the cross-validation experiment. + +scores['n_features'] = n_features_views +df = pd.DataFrame(scores) + +# melt the dataframe, to make it easier to plot +df = pd.melt(df, id_vars=['n_features'], var_name='model', value_name='score') + +fig, ax = plt.subplots() +sns.lineplot(data=df, x='n_features', y='score', hue='model', label='CV Score', ax=ax) +ax.set_ylabel('CV Score') +ax.set_xlabel('Number of features in second view') +ax.set_title('Random Forest vs Multi-view Random Forest') +plt.show() From fdd174f17ed6a682c5daf332513f6139c897e5b8 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 12 Oct 2023 15:36:34 -0400 Subject: [PATCH 4/9] Merging main Signed-off-by: Adam Li --- .github/workflows/build_wheels.yml | 2 +- .github/workflows/main.yml | 2 +- build_requirements.txt | 4 +- pyproject.toml | 2 +- requirements.txt | 2 +- sktree/__init__.py | 1 + sktree/_lib/meson.build | 34 +- sktree/_lib/sklearn_fork | 2 +- sktree/tree/_marginal.pxd | 11 +- sktree/tree/_marginal.pyx | 56 ++-- sktree/tree/_oblique_splitter.pxd | 122 ++++---- sktree/tree/_oblique_splitter.pyx | 296 +++++++++--------- sktree/tree/_oblique_tree.pxd | 24 +- sktree/tree/_oblique_tree.pyx | 68 ++-- sktree/tree/_sklearn_splitter.pxd | 8 +- sktree/tree/_sklearn_splitter.pyx | 122 ++++---- sktree/tree/_utils.pxd | 19 +- sktree/tree/_utils.pyx | 52 +-- sktree/tree/manifold/_morf_splitter.pxd | 51 ++- sktree/tree/manifold/_morf_splitter.pyx | 152 ++++----- sktree/tree/unsupervised/_unsup_criterion.pxd | 34 +- sktree/tree/unsupervised/_unsup_criterion.pyx | 130 ++++---- .../unsupervised/_unsup_oblique_splitter.pxd | 71 ++--- .../unsupervised/_unsup_oblique_splitter.pyx | 120 +++---- .../tree/unsupervised/_unsup_oblique_tree.pxd | 26 +- .../tree/unsupervised/_unsup_oblique_tree.pyx | 64 ++-- sktree/tree/unsupervised/_unsup_splitter.pxd | 39 ++- sktree/tree/unsupervised/_unsup_splitter.pyx | 106 +++---- sktree/tree/unsupervised/_unsup_tree.pxd | 36 +-- sktree/tree/unsupervised/_unsup_tree.pyx | 190 +++++------ 30 files changed, 920 insertions(+), 926 deletions(-) diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index b2c415c49..63d8436b5 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -44,7 +44,7 @@ jobs: # - [macos-12, macosx_*, arm64] - [windows-2019, win, AMD64] - python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11.0-alpha - 3.11.0"]] + python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"]] # python[0] is used to specify the python versions made by cibuildwheel env: diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 32298fc7f..70db81873 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -100,7 +100,7 @@ jobs: - name: Setup build and install 
scikit-tree run: | - ./spin build -j 2 --forcesubmodule + ./spin build -j 2 - name: Ccache performance shell: bash -l {0} diff --git a/build_requirements.txt b/build_requirements.txt index 360c66711..3341505a4 100644 --- a/build_requirements.txt +++ b/build_requirements.txt @@ -1,9 +1,9 @@ meson meson-python -cython>=3.0 +cython==0.29.36 ninja numpy -scikit-learn>=1.3 +scikit-learn>=1.3.1 click rich-click doit diff --git a/pyproject.toml b/pyproject.toml index d9d901e32..98f6591d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ include = [ dependencies = [ 'numpy', 'scipy>=1.5.0', - 'scikit-learn>=1.3' + 'scikit-learn>=1.3.1' ] diff --git a/requirements.txt b/requirements.txt index 92f3a6b2b..dc42ea7e7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ numpy>=1.25 scipy -scikit-learn>=1.3 +scikit-learn>=1.3.1 diff --git a/sktree/__init__.py b/sktree/__init__.py index 1ca249fc0..b6e906702 100644 --- a/sktree/__init__.py +++ b/sktree/__init__.py @@ -59,6 +59,7 @@ ) from .ensemble._honest_forest import HonestForestClassifier except ImportError as e: + print(e.msg) msg = """Error importing scikit-tree: you cannot import scikit-tree while being in scikit-tree source directory; please exit the scikit-tree source tree first and relaunch your Python interpreter.""" diff --git a/sktree/_lib/meson.build b/sktree/_lib/meson.build index ffc70d72c..10dade643 100644 --- a/sktree/_lib/meson.build +++ b/sktree/_lib/meson.build @@ -42,4 +42,36 @@ foreach py_source: python_sources './sklearn/ensemble/' + py_source, subdir: 'sktree/_lib/sklearn/ensemble' ) -endforeach \ No newline at end of file +endforeach + +# TODO: Can remove if included in scikit-learn eventually +# install tree/ submodule +extensions = [ + '_quad_tree', +] + +foreach ext: extensions + py3.extension_module(ext, + cython_gen_cpp.process('./sklearn/neighbors/' + ext + '.pyx'), + c_args: cython_c_args, + include_directories: [incdir_numpy,], + install: true, + subdir: 'sktree/_lib/sklearn/neighbors/', + ) +endforeach + +# install tree/ submodule +extensions = [ + '_typedefs', + '_random', +] + +foreach ext: extensions + py3.extension_module(ext, + cython_gen_cpp.process('./sklearn/utils/' + ext + '.pyx'), + c_args: cython_c_args, + include_directories: [incdir_numpy,], + install: true, + subdir: 'sktree/_lib/sklearn/utils/', + ) +endforeach diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork index 01d26303a..6c7a5f44e 160000 --- a/sktree/_lib/sklearn_fork +++ b/sktree/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit 01d26303ae77ddb8d25cef14feb4be7cd03111f6 +Subproject commit 6c7a5f44eb4ec3bea5dd6a9e4d5db748d12b209e diff --git a/sktree/tree/_marginal.pxd b/sktree/tree/_marginal.pxd index 1b65d60b0..cddeed633 100644 --- a/sktree/tree/_marginal.pxd +++ b/sktree/tree/_marginal.pxd @@ -2,19 +2,16 @@ import numpy as np cimport numpy as cnp -from .._lib.sklearn.tree._tree cimport DOUBLE_t # Type of y, sample_weight -from .._lib.sklearn.tree._tree cimport DTYPE_t # Type of X -from .._lib.sklearn.tree._tree cimport INT32_t # Signed 32 bit integer -from .._lib.sklearn.tree._tree cimport SIZE_t # Type for indices and counters -from .._lib.sklearn.tree._tree cimport UINT32_t # Unsigned 32 bit integer from .._lib.sklearn.tree._tree cimport BaseTree, Node +from .._lib.sklearn.tree._utils cimport UINT32_t +from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t cpdef apply_marginal_tree( BaseTree tree, object X, - const SIZE_t[:] marginal_indices, - int traversal_method, + const intp_t[:] 
marginal_indices, + intp_t traversal_method, unsigned char use_sample_weight, object random_state ) diff --git a/sktree/tree/_marginal.pyx b/sktree/tree/_marginal.pyx index 04f167b8d..043ee3263 100644 --- a/sktree/tree/_marginal.pyx +++ b/sktree/tree/_marginal.pyx @@ -18,15 +18,15 @@ from numpy import float32 as DTYPE TREE_LEAF = -1 TREE_UNDEFINED = -2 -cdef SIZE_t _TREE_LEAF = TREE_LEAF -cdef SIZE_t _TREE_UNDEFINED = TREE_UNDEFINED +cdef intp_t _TREE_LEAF = TREE_LEAF +cdef intp_t _TREE_UNDEFINED = TREE_UNDEFINED cpdef apply_marginal_tree( BaseTree tree, object X, - const SIZE_t[:] marginal_indices, - int traversal_method, + const intp_t[:] marginal_indices, + intp_t traversal_method, unsigned char use_sample_weight, object random_state ): @@ -41,7 +41,7 @@ cpdef apply_marginal_tree( marginal_indices : ndarray of shape (n_marginals,) The indices of the features to marginalize, which are columns in ``X``. - traversal_method : int + traversal_method : intp_t The traversal method to use. 0 for 'random', 1 for 'weighted'. use_sample_weight : unsigned char @@ -62,13 +62,13 @@ cpdef apply_marginal_tree( if X.dtype != DTYPE: raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) - cdef SIZE_t n_marginals = marginal_indices.shape[0] + cdef intp_t n_marginals = marginal_indices.shape[0] # sklearn_rand_r random number state cdef UINT32_t rand_r_state = random_state.randint(0, RAND_R_MAX) # define a set of all marginal indices - cdef unordered_set[SIZE_t] marginal_indices_map + cdef unordered_set[intp_t] marginal_indices_map # check all marginal indices are valid, and also convert to an unordered map for i in range(n_marginals): @@ -94,19 +94,19 @@ cpdef apply_marginal_tree( cdef void _resample_split_node( BaseTree tree, Node* node, - unordered_set[SIZE_t] marginal_indices_map, - const DTYPE_t[:, :] X, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, + unordered_set[intp_t] marginal_indices_map, + const float32_t[:, :] X, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight, ) noexcept nogil: pass cdef inline cnp.ndarray _apply_dense_marginal( BaseTree tree, - const DTYPE_t[:, :] X, - unordered_set[SIZE_t] marginal_indices_map, - int traversal_method, + const float32_t[:, :] X, + unordered_set[intp_t] marginal_indices_map, + intp_t traversal_method, unsigned char use_sample_weight, UINT32_t* rand_r_state ): @@ -122,10 +122,10 @@ cdef inline cnp.ndarray _apply_dense_marginal( The tree to apply. X : const ndarray of shape (n_samples, n_features) The data matrix. - marginal_indices_map : unordered_set[SIZE_t] + marginal_indices_map : unordered_set[intp_t] The indices of the features to marginalize, which are columns in ``X``. - traversal_method : int + traversal_method : intp_t The traversal method to use. 0 for 'random', 1 for 'weighted'. use_sample_weight : unsigned char @@ -135,20 +135,20 @@ cdef inline cnp.ndarray _apply_dense_marginal( The random number state. 
""" # Extract input - cdef const DTYPE_t[:, :] X_ndarray = X - cdef SIZE_t n_samples = X.shape[0] - cdef DTYPE_t X_i_node_feature + cdef const float32_t[:, :] X_ndarray = X + cdef intp_t n_samples = X.shape[0] + cdef float32_t X_i_node_feature - cdef DTYPE_t n_node_samples, n_right_samples, n_left_samples - cdef double p_left - cdef int is_left + cdef float32_t n_node_samples, n_right_samples, n_left_samples + cdef float64_t p_left + cdef intp_t is_left # Initialize output - cdef SIZE_t[:] out = np.zeros(n_samples, dtype=np.intp) + cdef intp_t[:] out = np.zeros(n_samples, dtype=np.intp) # Initialize auxiliary data-structure cdef Node* node = NULL - cdef SIZE_t i = 0 + cdef intp_t i = 0 with nogil: for i in prange(n_samples): @@ -172,7 +172,7 @@ cdef inline cnp.ndarray _apply_dense_marginal( n_right_samples = tree.nodes[node.right_child].n_node_samples # compute the probabilies for going left and right - p_left = (n_left_samples / n_node_samples) + p_left = (n_left_samples / n_node_samples) # randomly sample a direction is_left = rand_weighted_binary(p_left, rand_r_state) @@ -202,14 +202,14 @@ cdef inline cnp.ndarray _apply_dense_marginal( else: node = &tree.nodes[node.right_child] - out[i] = (node - tree.nodes) # node offset + out[i] = (node - tree.nodes) # node offset return np.asarray(out) -cdef inline int is_element_present(unordered_set[SIZE_t]& my_set, SIZE_t element) noexcept nogil: +cdef inline intp_t is_element_present(unordered_set[intp_t]& my_set, intp_t element) noexcept nogil: """Helper function to check presence of element in set.""" - cdef unordered_set[SIZE_t].iterator it = my_set.find(element) + cdef unordered_set[intp_t].iterator it = my_set.find(element) if it != my_set.end(): return 1 diff --git a/sktree/tree/_oblique_splitter.pxd b/sktree/tree/_oblique_splitter.pxd index 3f7b84e43..6da9ac2a7 100644 --- a/sktree/tree/_oblique_splitter.pxd +++ b/sktree/tree/_oblique_splitter.pxd @@ -15,28 +15,24 @@ from libcpp.vector cimport vector from .._lib.sklearn.tree._criterion cimport Criterion from .._lib.sklearn.tree._splitter cimport SplitRecord, Splitter -from .._lib.sklearn.tree._tree cimport DOUBLE_t # Type of y, sample_weight -from .._lib.sklearn.tree._tree cimport DTYPE_t # Type of X -from .._lib.sklearn.tree._tree cimport INT32_t # Signed 32 bit integer -from .._lib.sklearn.tree._tree cimport SIZE_t # Type for indices and counters -from .._lib.sklearn.tree._tree cimport UINT32_t # Unsigned 32 bit integer -from .._lib.sklearn.utils._typedefs cimport intp_t +from .._lib.sklearn.tree._utils cimport UINT32_t +from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t from ._sklearn_splitter cimport sort cdef struct ObliqueSplitRecord: # Data to track sample split - SIZE_t feature # Which feature to split on. - SIZE_t pos # Split samples array at the given position, + intp_t feature # Which feature to split on. + intp_t pos # Split samples array at the given position, # # i.e. count of samples below threshold for feature. # # pos is >= end if the node is a leaf. - double threshold # Threshold to split at. - double improvement # Impurity improvement given parent node. - double impurity_left # Impurity of the left split. - double impurity_right # Impurity of the right split. + float64_t threshold # Threshold to split at. + float64_t improvement # Impurity improvement given parent node. + float64_t impurity_left # Impurity of the left split. + float64_t impurity_right # Impurity of the right split. 
- vector[DTYPE_t]* proj_vec_weights # weights of the vector (max_features,) - vector[SIZE_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t]* proj_vec_weights # weights of the vector (max_features,) + vector[intp_t]* proj_vec_indices # indices of the features (max_features,) cdef class BaseObliqueSplitter(Splitter): @@ -44,55 +40,55 @@ cdef class BaseObliqueSplitter(Splitter): # # Oblique Splitting extra parameters (mtry, n_dims) matrix - cdef vector[vector[DTYPE_t]] proj_mat_weights # nonzero weights of sparse proj_mat matrix - cdef vector[vector[SIZE_t]] proj_mat_indices # nonzero indices of sparse proj_mat matrix + cdef vector[vector[float32_t]] proj_mat_weights # nonzero weights of sparse proj_mat matrix + cdef vector[vector[intp_t]] proj_mat_indices # nonzero indices of sparse proj_mat matrix # TODO: assumes all oblique splitters only work with dense data - cdef const DTYPE_t[:, :] X + cdef const float32_t[:, :] X # feature weights across (n_dims,) - cdef DTYPE_t[:] feature_weights + cdef float32_t[:] feature_weights # All oblique splitters (i.e. non-axis aligned splitters) require a # function to sample a projection matrix that is applied to the feature matrix # to quickly obtain the sampled projections for candidate splits. cdef void sample_proj_mat( self, - vector[vector[DTYPE_t]]& proj_mat_weights, - vector[vector[SIZE_t]]& proj_mat_indices + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices ) noexcept nogil # Redefined here since the new logic requires calling sample_proj_mat - cdef int node_reset( + cdef intp_t node_reset( self, - SIZE_t start, - SIZE_t end, - double* weighted_n_node_samples + intp_t start, + intp_t end, + float64_t* weighted_n_node_samples ) except -1 nogil cdef void compute_features_over_samples( self, - SIZE_t start, - SIZE_t end, - const SIZE_t[:] samples, - DTYPE_t[:] feature_values, - vector[DTYPE_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[SIZE_t]* proj_vec_indices # indices of the features (max_features,) + intp_t start, + intp_t end, + const intp_t[:] samples, + float32_t[:] feature_values, + vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) + vector[intp_t]* proj_vec_indices # indices of the features (max_features,) ) noexcept nogil - cdef int node_split( + cdef intp_t node_split( self, - double impurity, # Impurity of the node + float64_t impurity, # Impurity of the node SplitRecord* split, - SIZE_t* n_constant_features, - double lower_bound, - double upper_bound, + intp_t* n_constant_features, + float64_t lower_bound, + float64_t upper_bound, ) except -1 nogil cdef inline void fisher_yates_shuffle_memview( self, - SIZE_t[::1] indices_to_sample, - SIZE_t grid_size, + intp_t[::1] indices_to_sample, + intp_t grid_size, UINT32_t* random_state ) noexcept nogil @@ -101,51 +97,51 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): # to split the samples samples[start:end]. # Oblique Splitting extra parameters - cdef public double feature_combinations # Number of features to combine - cdef SIZE_t n_non_zeros # Number of non-zero features - cdef SIZE_t[::1] indices_to_sample # an array of indices to sample of size mtry X n_features + cdef public float64_t feature_combinations # Number of features to combine + cdef intp_t n_non_zeros # Number of non-zero features + cdef intp_t[::1] indices_to_sample # an array of indices to sample of size mtry X n_features # All oblique splitters (i.e. 
non-axis aligned splitters) require a # function to sample a projection matrix that is applied to the feature matrix # to quickly obtain the sampled projections for candidate splits. cdef void sample_proj_mat( self, - vector[vector[DTYPE_t]]& proj_mat_weights, - vector[vector[SIZE_t]]& proj_mat_indices + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices ) noexcept nogil cdef class BestObliqueSplitter(ObliqueSplitter): - cdef int node_split( + cdef intp_t node_split( self, - double impurity, # Impurity of the node + float64_t impurity, # Impurity of the node SplitRecord* split, - SIZE_t* n_constant_features, - double lower_bound, - double upper_bound, + intp_t* n_constant_features, + float64_t lower_bound, + float64_t upper_bound, ) except -1 nogil cdef class RandomObliqueSplitter(ObliqueSplitter): cdef void find_min_max( self, - DTYPE_t[::1] feature_values, - DTYPE_t* min_feature_value_out, - DTYPE_t* max_feature_value_out, + float32_t[::1] feature_values, + float32_t* min_feature_value_out, + float32_t* max_feature_value_out, ) noexcept nogil - cdef SIZE_t partition_samples( + cdef intp_t partition_samples( self, - double current_threshold + float64_t current_threshold ) noexcept nogil - cdef int node_split( + cdef intp_t node_split( self, - double impurity, # Impurity of the node + float64_t impurity, # Impurity of the node SplitRecord* split, - SIZE_t* n_constant_features, - double lower_bound, - double upper_bound, + intp_t* n_constant_features, + float64_t lower_bound, + float64_t upper_bound, ) except -1 nogil @@ -154,12 +150,12 @@ cdef class MultiViewSplitter(BestObliqueSplitter): cdef const intp_t[:] feature_set_ends # an array indicating the column indices of the end of each feature set cdef intp_t n_feature_sets # the number of feature sets is the length of feature_set_ends + 1 - cdef vector[vector[SIZE_t]] multi_indices_to_sample + cdef vector[vector[intp_t]] multi_indices_to_sample cdef void sample_proj_mat( self, - vector[vector[DTYPE_t]]& proj_mat_weights, - vector[vector[SIZE_t]]& proj_mat_indices + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices ) noexcept nogil @@ -172,10 +168,10 @@ cdef class MultiViewObliqueSplitter(BestObliqueSplitter): # if True, then sample from each feature set for each projection vector cdef bint uniform_sampling - cdef vector[vector[SIZE_t]] multi_indices_to_sample + cdef vector[vector[intp_t]] multi_indices_to_sample cdef void sample_proj_mat( self, - vector[vector[DTYPE_t]]& proj_mat_weights, - vector[vector[SIZE_t]]& proj_mat_indices + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices ) noexcept nogil diff --git a/sktree/tree/_oblique_splitter.pyx b/sktree/tree/_oblique_splitter.pyx index 92aa612a7..1dfb256c3 100644 --- a/sktree/tree/_oblique_splitter.pyx +++ b/sktree/tree/_oblique_splitter.pyx @@ -13,17 +13,17 @@ from sklearn.tree._utils cimport rand_int, rand_uniform from .._lib.sklearn.tree._criterion cimport Criterion -cdef double INFINITY = np.inf +cdef float64_t INFINITY = np.inf # Mitigate precision differences between 32 bit and 64 bit -cdef DTYPE_t FEATURE_THRESHOLD = 1e-7 +cdef float32_t FEATURE_THRESHOLD = 1e-7 # Constant to switch between algorithm non zero value extract algorithm # in SparseSplitter -cdef DTYPE_t EXTRACT_NNZ_SWITCH = 0.1 +cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 -cdef inline void _init_split(ObliqueSplitRecord* self, SIZE_t start_pos) noexcept nogil: +cdef inline void _init_split(ObliqueSplitRecord* 
self, intp_t start_pos) noexcept nogil: self.impurity_left = INFINITY self.impurity_right = INFINITY self.pos = start_pos @@ -44,8 +44,8 @@ cdef class BaseObliqueSplitter(Splitter): def __setstate__(self, d): pass - cdef int node_reset(self, SIZE_t start, SIZE_t end, - double* weighted_n_node_samples) except -1 nogil: + cdef intp_t node_reset(self, intp_t start, intp_t end, + float64_t* weighted_n_node_samples) except -1 nogil: """Reset splitter on node samples[start:end]. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -53,11 +53,11 @@ cdef class BaseObliqueSplitter(Splitter): Parameters ---------- - start : SIZE_t + start : intp_t The index of the first sample to consider - end : SIZE_t + end : intp_t The index of the last sample to consider - weighted_n_node_samples : ndarray, dtype=double pointer + weighted_n_node_samples : ndarray, dtype=float64_t pointer The total weight of those samples """ @@ -80,8 +80,8 @@ cdef class BaseObliqueSplitter(Splitter): cdef void sample_proj_mat( self, - vector[vector[DTYPE_t]]& proj_mat_weights, - vector[vector[SIZE_t]]& proj_mat_indices + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices ) noexcept nogil: """ Sample the projection vector. @@ -89,28 +89,28 @@ cdef class BaseObliqueSplitter(Splitter): """ pass - cdef int pointer_size(self) noexcept nogil: + cdef intp_t pointer_size(self) noexcept nogil: """Get size of a pointer to record for ObliqueSplitter.""" return sizeof(ObliqueSplitRecord) cdef inline void compute_features_over_samples( self, - SIZE_t start, - SIZE_t end, - const SIZE_t[:] samples, - DTYPE_t[:] feature_values, - vector[DTYPE_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[SIZE_t]* proj_vec_indices # indices of the features (max_features,) + intp_t start, + intp_t end, + const intp_t[:] samples, + float32_t[:] feature_values, + vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) + vector[intp_t]* proj_vec_indices # indices of the features (max_features,) ) noexcept nogil: """Compute the feature values for the samples[start:end] range. Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ - cdef SIZE_t idx, jdx - cdef SIZE_t col_idx - cdef DTYPE_t col_weight + cdef intp_t idx, jdx + cdef intp_t col_idx + cdef float32_t col_weight # Compute linear combination of features and then # sort samples according to the feature values. @@ -126,11 +126,11 @@ cdef class BaseObliqueSplitter(Splitter): cdef inline void fisher_yates_shuffle_memview( self, - SIZE_t[::1] indices_to_sample, - SIZE_t grid_size, + intp_t[::1] indices_to_sample, + intp_t grid_size, UINT32_t* random_state, ) noexcept nogil: - cdef int i, j + cdef intp_t i, j # XXX: should this be `i` or `i+1`? for valid Fisher-Yates? for i in range(0, grid_size - 1): @@ -142,12 +142,12 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): def __cinit__( self, Criterion criterion, - SIZE_t max_features, - SIZE_t min_samples_leaf, - double min_weight_leaf, + intp_t max_features, + intp_t min_samples_leaf, + float64_t min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, - double feature_combinations, + float64_t feature_combinations, *argv ): """ @@ -156,20 +156,20 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): criterion : Criterion The criterion to measure the quality of a split. - max_features : SIZE_t + max_features : intp_t The maximal number of randomly selected features which can be considered for a split. 
- min_samples_leaf : SIZE_t + min_samples_leaf : intp_t The minimal number of samples each leaf can have, where splits which would result in having less samples in a leaf are not considered. - min_weight_leaf : double + min_weight_leaf : float64_t The minimal weight each leaf can have, where the weight is the sum of the weights of each sample in it. - feature_combinations : double + feature_combinations : float64_t The average number of features to combine in an oblique split. Each feature is independently included with probability ``feature_combination`` / ``n_features``. @@ -190,20 +190,20 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): self.monotonic_cst = monotonic_cst # Sparse max_features x n_features projection matrix - self.proj_mat_weights = vector[vector[DTYPE_t]](self.max_features) - self.proj_mat_indices = vector[vector[SIZE_t]](self.max_features) + self.proj_mat_weights = vector[vector[float32_t]](self.max_features) + self.proj_mat_indices = vector[vector[intp_t]](self.max_features) # Oblique tree parameters self.feature_combinations = feature_combinations # or max w/ 1... - self.n_non_zeros = max(int(self.max_features * self.feature_combinations), 1) + self.n_non_zeros = max((self.max_features * self.feature_combinations), 1) - cdef int init( + cdef intp_t init( self, object X, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight, const unsigned char[::1] missing_values_in_feature_mask, ) except -1: Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) @@ -215,13 +215,13 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): dtype=np.intp) # XXX: Just to initialize stuff - # self.feature_weights = np.ones((self.n_features,), dtype=DTYPE_t) / self.n_features + # self.feature_weights = np.ones((self.n_features,), dtype=float32_t) / self.n_features return 0 cdef void sample_proj_mat( self, - vector[vector[DTYPE_t]]& proj_mat_weights, - vector[vector[SIZE_t]]& proj_mat_indices + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices ) noexcept nogil: """Sample oblique projection matrix. @@ -243,16 +243,16 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): it is assumed ``feature_combinations`` is forced to be smaller than ``n_features`` before instantiating an oblique splitter. 
""" - cdef SIZE_t n_features = self.n_features - cdef SIZE_t n_non_zeros = self.n_non_zeros + cdef intp_t n_features = self.n_features + cdef intp_t n_non_zeros = self.n_non_zeros cdef UINT32_t* random_state = &self.rand_r_state - cdef int i, feat_i, proj_i, rand_vec_index - cdef DTYPE_t weight + cdef intp_t i, feat_i, proj_i, rand_vec_index + cdef float32_t weight # construct an array to sample from mTry x n_features set of indices - cdef SIZE_t[::1] indices_to_sample = self.indices_to_sample - cdef SIZE_t grid_size = self.max_features * self.n_features + cdef intp_t[::1] indices_to_sample = self.indices_to_sample + cdef intp_t grid_size = self.max_features * self.n_features # shuffle indices over the 2D grid to sample using Fisher-Yates self.fisher_yates_shuffle_memview(indices_to_sample, grid_size, random_state) @@ -288,13 +288,13 @@ cdef class BestObliqueSplitter(ObliqueSplitter): self.feature_combinations, ), self.__getstate__()) - cdef int node_split( + cdef intp_t node_split( self, - double impurity, + float64_t impurity, SplitRecord* split, - SIZE_t* n_constant_features, - double lower_bound, - double upper_bound, + intp_t* n_constant_features, + float64_t lower_bound, + float64_t upper_bound, ) except -1 nogil: """Find the best_split split on node samples[start:end] @@ -305,24 +305,24 @@ cdef class BestObliqueSplitter(ObliqueSplitter): cdef ObliqueSplitRecord* oblique_split = (split) # Draw random splits and pick the best_split - cdef SIZE_t[::1] samples = self.samples - cdef SIZE_t start = self.start - cdef SIZE_t end = self.end + cdef intp_t[::1] samples = self.samples + cdef intp_t start = self.start + cdef intp_t end = self.end # pointer array to store feature values to split on - cdef DTYPE_t[::1] feature_values = self.feature_values - cdef SIZE_t max_features = self.max_features - cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef float32_t[::1] feature_values = self.feature_values + cdef intp_t max_features = self.max_features + cdef intp_t min_samples_leaf = self.min_samples_leaf # keep track of split record for current_split node and the best_split split # found among the sampled projection vectors cdef ObliqueSplitRecord best_split, current_split - cdef double current_proxy_improvement = -INFINITY - cdef double best_proxy_improvement = -INFINITY + cdef float64_t current_proxy_improvement = -INFINITY + cdef float64_t best_proxy_improvement = -INFINITY - cdef SIZE_t feat_i, p # index over computed features and start/end - cdef SIZE_t partition_end - cdef DTYPE_t temp_d # to compute a projection feature value + cdef intp_t feat_i, p # index over computed features and start/end + cdef intp_t partition_end + cdef float32_t temp_d # to compute a projection feature value # instantiate the split records _init_split(&best_split, end) @@ -449,18 +449,18 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): cdef inline void find_min_max( self, - DTYPE_t[::1] feature_values, - DTYPE_t* min_feature_value_out, - DTYPE_t* max_feature_value_out, + float32_t[::1] feature_values, + float32_t* min_feature_value_out, + float32_t* max_feature_value_out, ) noexcept nogil: """Find the minimum and maximum value for current_feature.""" cdef: - DTYPE_t current_feature_value - DTYPE_t min_feature_value = INFINITY - DTYPE_t max_feature_value = -INFINITY - SIZE_t start = self.start - SIZE_t end = self.end - SIZE_t p + float32_t current_feature_value + float32_t min_feature_value = INFINITY + float32_t max_feature_value = -INFINITY + intp_t start = self.start + intp_t end = self.end + intp_t 
p for p in range(start, end): current_feature_value = feature_values[p] @@ -472,13 +472,13 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): min_feature_value_out[0] = min_feature_value max_feature_value_out[0] = max_feature_value - cdef inline SIZE_t partition_samples(self, double current_threshold) noexcept nogil: + cdef inline intp_t partition_samples(self, float64_t current_threshold) noexcept nogil: """Partition samples for feature_values at the current_threshold.""" cdef: - SIZE_t p = self.start - SIZE_t partition_end = self.end - SIZE_t[::1] samples = self.samples - DTYPE_t[::1] feature_values = self.feature_values + intp_t p = self.start + intp_t partition_end = self.end + intp_t[::1] samples = self.samples + float32_t[::1] feature_values = self.feature_values while p < partition_end: if feature_values[p] <= current_threshold: @@ -494,13 +494,13 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): return partition_end # overwrite the node_split method with random threshold selection - cdef int node_split( + cdef intp_t node_split( self, - double impurity, + float64_t impurity, SplitRecord* split, - SIZE_t* n_constant_features, - double lower_bound, - double upper_bound, + intp_t* n_constant_features, + float64_t lower_bound, + float64_t upper_bound, ) except -1 nogil: """Find the best_split split on node samples[start:end] @@ -511,36 +511,36 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): cdef ObliqueSplitRecord* oblique_split = (split) # Draw random splits and pick the best_split - cdef SIZE_t[::1] samples = self.samples - cdef SIZE_t start = self.start - cdef SIZE_t end = self.end + cdef intp_t[::1] samples = self.samples + cdef intp_t start = self.start + cdef intp_t end = self.end cdef UINT32_t* random_state = &self.rand_r_state # pointer array to store feature values to split on - cdef DTYPE_t[::1] feature_values = self.feature_values - cdef SIZE_t max_features = self.max_features - cdef SIZE_t min_samples_leaf = self.min_samples_leaf - cdef double min_weight_leaf = self.min_weight_leaf + cdef float32_t[::1] feature_values = self.feature_values + cdef intp_t max_features = self.max_features + cdef intp_t min_samples_leaf = self.min_samples_leaf + cdef float64_t min_weight_leaf = self.min_weight_leaf # keep track of split record for current_split node and the best_split split # found among the sampled projection vectors cdef ObliqueSplitRecord best_split, current_split - cdef double current_proxy_improvement = -INFINITY - cdef double best_proxy_improvement = -INFINITY + cdef float64_t current_proxy_improvement = -INFINITY + cdef float64_t best_proxy_improvement = -INFINITY - cdef SIZE_t p - cdef SIZE_t feat_i - cdef SIZE_t partition_end - cdef DTYPE_t temp_d # to compute a projection feature value - cdef DTYPE_t min_feature_value - cdef DTYPE_t max_feature_value + cdef intp_t p + cdef intp_t feat_i + cdef intp_t partition_end + cdef float32_t temp_d # to compute a projection feature value + cdef float32_t min_feature_value + cdef float32_t max_feature_value # Number of features discovered to be constant during the split search - # cdef SIZE_t n_found_constants = 0 - # cdef SIZE_t n_known_constants = n_constant_features[0] + # cdef intp_t n_found_constants = 0 + # cdef intp_t n_known_constants = n_constant_features[0] # n_total_constants = n_known_constants + n_found_constants - # cdef SIZE_t n_total_constants = n_known_constants - cdef SIZE_t n_visited_features = 0 + # cdef intp_t n_total_constants = n_known_constants + cdef intp_t n_visited_features = 0 # 
instantiate the split records _init_split(&best_split, end) @@ -663,12 +663,12 @@ cdef class MultiViewSplitter(BestObliqueSplitter): def __cinit__( self, Criterion criterion, - SIZE_t max_features, - SIZE_t min_samples_leaf, - double min_weight_leaf, + intp_t max_features, + intp_t min_samples_leaf, + float64_t min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, - double feature_combinations, + float64_t feature_combinations, const intp_t[:] feature_set_ends, intp_t n_feature_sets, *argv @@ -699,11 +699,11 @@ cdef class MultiViewSplitter(BestObliqueSplitter): self.n_feature_sets, ), self.__getstate__()) - cdef int init( + cdef intp_t init( self, object X, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight, const unsigned char[::1] missing_values_in_feature_mask, ) except -1: Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) @@ -711,11 +711,11 @@ cdef class MultiViewSplitter(BestObliqueSplitter): self.X = X # create a helper array for allowing efficient Fisher-Yates - self.multi_indices_to_sample = vector[vector[SIZE_t]](self.n_feature_sets) + self.multi_indices_to_sample = vector[vector[intp_t]](self.n_feature_sets) - cdef SIZE_t i_feature = 0 - cdef SIZE_t feature_set_begin = 0 - cdef SIZE_t size_of_feature_set + cdef intp_t i_feature = 0 + cdef intp_t feature_set_begin = 0 + cdef intp_t size_of_feature_set cdef intp_t ifeat = 0 for i_feature in range(self.n_feature_sets): size_of_feature_set = self.feature_set_ends[i_feature] - feature_set_begin @@ -727,8 +727,8 @@ cdef class MultiViewSplitter(BestObliqueSplitter): cdef void sample_proj_mat( self, - vector[vector[DTYPE_t]]& proj_mat_weights, - vector[vector[SIZE_t]]& proj_mat_indices + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices ) noexcept nogil: """Sample projection matrix accounting for multi-views. @@ -736,19 +736,19 @@ cdef class MultiViewSplitter(BestObliqueSplitter): but now also uniformly samples features from each feature set. 
""" cdef UINT32_t* random_state = &self.rand_r_state - cdef int feat_i, proj_i - cdef DTYPE_t weight + cdef intp_t feat_i, proj_i + cdef float32_t weight # keep track of the beginning and ending indices of each feature set - cdef int idx - cdef int ifeature = 0 - cdef int grid_size + cdef intp_t idx + cdef intp_t ifeature = 0 + cdef intp_t grid_size # 03: Algorithm samples features from each set with equal probability proj_i = 0 cdef bint finished_feature_set = False - cdef int i, j + cdef intp_t i, j while proj_i < self.max_features and not finished_feature_set: # sample from a feature set for idx in range(self.n_feature_sets): @@ -787,12 +787,12 @@ cdef class MultiViewObliqueSplitter(BestObliqueSplitter): def __cinit__( self, Criterion criterion, - SIZE_t max_features, - SIZE_t min_samples_leaf, - double min_weight_leaf, + intp_t max_features, + intp_t min_samples_leaf, + float64_t min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, - double feature_combinations, + float64_t feature_combinations, const intp_t[:] feature_set_ends, intp_t n_feature_sets, bint uniform_sampling, @@ -820,11 +820,11 @@ cdef class MultiViewObliqueSplitter(BestObliqueSplitter): self.uniform_sampling, ), self.__getstate__()) - cdef int init( + cdef intp_t init( self, object X, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight, const unsigned char[::1] missing_values_in_feature_mask, ) except -1: Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) @@ -832,11 +832,11 @@ cdef class MultiViewObliqueSplitter(BestObliqueSplitter): self.X = X # create a helper array for allowing efficient Fisher-Yates - self.multi_indices_to_sample = vector[vector[SIZE_t]](self.n_feature_sets) + self.multi_indices_to_sample = vector[vector[intp_t]](self.n_feature_sets) - cdef SIZE_t i_feature = 0 - cdef SIZE_t feature_set_begin = 0 - cdef SIZE_t size_of_feature_set + cdef intp_t i_feature = 0 + cdef intp_t feature_set_begin = 0 + cdef intp_t size_of_feature_set cdef intp_t ifeat = 0 cdef intp_t iproj = 0 while iproj < self.max_features: @@ -856,34 +856,34 @@ cdef class MultiViewObliqueSplitter(BestObliqueSplitter): cdef void sample_proj_mat( self, - vector[vector[DTYPE_t]]& proj_mat_weights, - vector[vector[SIZE_t]]& proj_mat_indices + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices ) noexcept nogil: """Sample projection matrix accounting for multi-views. This proceeds as a normal sampling projection matrix, but now also uniformly samples features from each feature set. 
""" - cdef SIZE_t n_features = self.n_features - cdef SIZE_t n_non_zeros = self.n_non_zeros + cdef intp_t n_features = self.n_features + cdef intp_t n_non_zeros = self.n_non_zeros cdef UINT32_t* random_state = &self.rand_r_state - cdef int i, j, feat_i, proj_i, rand_vec_index - cdef DTYPE_t weight + cdef intp_t i, j, feat_i, proj_i, rand_vec_index + cdef float32_t weight # construct an array to sample from mTry x n_features set of indices - cdef vector[SIZE_t] indices_to_sample - cdef SIZE_t grid_size + cdef vector[intp_t] indices_to_sample + cdef intp_t grid_size # compute the number of features in each feature set - cdef SIZE_t n_features_in_set + cdef intp_t n_features_in_set # keep track of the beginning and ending indices of each feature set - cdef int feature_set_begin, feature_set_end, idx + cdef intp_t feature_set_begin, feature_set_end, idx feature_set_begin = 0 # keep track of number of features sampled relative to n_non_zeros - cdef int ifeature = 0 + cdef intp_t ifeature = 0 if self.uniform_sampling: # 01: This algorithm samples features from each feature set uniformly and combines them @@ -973,9 +973,9 @@ cdef class MultiViewSplitterTester(MultiViewSplitter): Returns projection matrix of shape (max_features, n_features). """ - cdef vector[vector[DTYPE_t]] proj_mat_weights = vector[vector[DTYPE_t]](self.max_features) - cdef vector[vector[SIZE_t]] proj_mat_indices = vector[vector[SIZE_t]](self.max_features) - cdef SIZE_t i, j + cdef vector[vector[float32_t]] proj_mat_weights = vector[vector[float32_t]](self.max_features) + cdef vector[vector[intp_t]] proj_mat_indices = vector[vector[intp_t]](self.max_features) + cdef intp_t i, j # sample projection matrix in C/C++ self.sample_proj_mat(proj_mat_weights, proj_mat_indices) @@ -1021,9 +1021,9 @@ cdef class BestObliqueSplitterTester(BestObliqueSplitter): Returns projection matrix of shape (max_features, n_features). 
""" - cdef vector[vector[DTYPE_t]] proj_mat_weights = vector[vector[DTYPE_t]](self.max_features) - cdef vector[vector[SIZE_t]] proj_mat_indices = vector[vector[SIZE_t]](self.max_features) - cdef SIZE_t i, j + cdef vector[vector[float32_t]] proj_mat_weights = vector[vector[float32_t]](self.max_features) + cdef vector[vector[intp_t]] proj_mat_indices = vector[vector[intp_t]](self.max_features) + cdef intp_t i, j # sample projection matrix in C/C++ self.sample_proj_mat(proj_mat_weights, proj_mat_indices) diff --git a/sktree/tree/_oblique_tree.pxd b/sktree/tree/_oblique_tree.pxd index 9f4906a4d..db7813946 100644 --- a/sktree/tree/_oblique_tree.pxd +++ b/sktree/tree/_oblique_tree.pxd @@ -14,34 +14,30 @@ cimport numpy as cnp from libcpp.vector cimport vector from .._lib.sklearn.tree._splitter cimport SplitRecord -from .._lib.sklearn.tree._tree cimport DOUBLE_t # Type of y, sample_weight -from .._lib.sklearn.tree._tree cimport DTYPE_t # Type of X -from .._lib.sklearn.tree._tree cimport INT32_t # Signed 32 bit integer -from .._lib.sklearn.tree._tree cimport SIZE_t # Type for indices and counters -from .._lib.sklearn.tree._tree cimport UINT32_t # Unsigned 32 bit integer from .._lib.sklearn.tree._tree cimport Node, Tree, TreeBuilder +from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t from ._oblique_splitter cimport ObliqueSplitRecord cdef class ObliqueTree(Tree): - cdef vector[vector[DTYPE_t]] proj_vec_weights # (capacity, n_features) array of projection vectors - cdef vector[vector[SIZE_t]] proj_vec_indices # (capacity, n_features) array of projection vectors + cdef vector[vector[float32_t]] proj_vec_weights # (capacity, n_features) array of projection vectors + cdef vector[vector[intp_t]] proj_vec_indices # (capacity, n_features) array of projection vectors # overridden methods - cdef int _resize_c( + cdef intp_t _resize_c( self, - SIZE_t capacity=* + intp_t capacity=* ) except -1 nogil - cdef int _set_split_node( + cdef intp_t _set_split_node( self, SplitRecord* split_node, Node *node, - SIZE_t node_id + intp_t node_id ) nogil except -1 - cdef DTYPE_t _compute_feature( + cdef float32_t _compute_feature( self, - const DTYPE_t[:, :] X_ndarray, - SIZE_t sample_index, + const float32_t[:, :] X_ndarray, + intp_t sample_index, Node *node ) noexcept nogil cdef void _compute_feature_importances( diff --git a/sktree/tree/_oblique_tree.pyx b/sktree/tree/_oblique_tree.pyx index f59374236..99a3d6fc0 100644 --- a/sktree/tree/_oblique_tree.pyx +++ b/sktree/tree/_oblique_tree.pyx @@ -29,7 +29,7 @@ cdef Node dummy NODE_DTYPE = np.asarray((&dummy)).dtype # Mitigate precision differences between 32 bit and 64 bit -cdef DTYPE_t FEATURE_THRESHOLD = 1e-7 +cdef float32_t FEATURE_THRESHOLD = 1e-7 # ============================================================================= # ObliqueTree @@ -47,53 +47,53 @@ cdef class ObliqueTree(Tree): Attributes ---------- - node_count : int + node_count : intp_t The number of nodes (internal nodes + leaves) in the tree. - capacity : int + capacity : intp_t The current capacity (i.e., size) of the arrays, which is at least as great as `node_count`. - max_depth : int + max_depth : intp_t The depth of the tree, i.e. the maximum depth of its leaves. - children_left : array of int, shape [node_count] + children_left : array of intp_t, shape [node_count] children_left[i] holds the node id of the left child of node i. For leaves, children_left[i] == TREE_LEAF. Otherwise, children_left[i] > i. This child handles the case where X[:, feature[i]] <= threshold[i]. 
- children_right : array of int, shape [node_count] + children_right : array of intp_t, shape [node_count] children_right[i] holds the node id of the right child of node i. For leaves, children_right[i] == TREE_LEAF. Otherwise, children_right[i] > i. This child handles the case where X[:, feature[i]] > threshold[i]. - feature : array of int, shape [node_count] + feature : array of intp_t, shape [node_count] feature[i] holds the feature to split on, for the internal node i. - threshold : array of double, shape [node_count] + threshold : array of float64_t, shape [node_count] threshold[i] holds the threshold for the internal node i. - value : array of double, shape [node_count, n_outputs, max_n_classes] + value : array of float64_t, shape [node_count, n_outputs, max_n_classes] Contains the constant prediction value of each node. - impurity : array of double, shape [node_count] + impurity : array of float64_t, shape [node_count] impurity[i] holds the impurity (i.e., the value of the splitting criterion) at node i. - n_node_samples : array of int, shape [node_count] + n_node_samples : array of intp_t, shape [node_count] n_node_samples[i] holds the number of training samples reaching node i. - weighted_n_node_samples : array of int, shape [node_count] + weighted_n_node_samples : array of intp_t, shape [node_count] weighted_n_node_samples[i] holds the weighted number of training samples reaching node i. """ def __cinit__( self, - int n_features, - cnp.ndarray[SIZE_t, ndim=1] n_classes, - int n_outputs + intp_t n_features, + cnp.ndarray[intp_t, ndim=1] n_classes, + intp_t n_outputs ): """Constructor.""" # Input/Output layout @@ -105,7 +105,7 @@ cdef class ObliqueTree(Tree): self.max_n_classes = np.max(n_classes) self.value_stride = n_outputs * self.max_n_classes - cdef SIZE_t k + cdef intp_t k for k in range(n_outputs): self.n_classes[k] = n_classes[k] @@ -116,8 +116,8 @@ cdef class ObliqueTree(Tree): self.value = NULL self.nodes = NULL - self.proj_vec_weights = vector[vector[DTYPE_t]](self.capacity) - self.proj_vec_indices = vector[vector[SIZE_t]](self.capacity) + self.proj_vec_weights = vector[vector[float32_t]](self.capacity) + self.proj_vec_indices = vector[vector[intp_t]](self.capacity) def __reduce__(self): """Reduce re-implementation, for pickling.""" @@ -179,7 +179,7 @@ cdef class ObliqueTree(Tree): memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), self.capacity * sizeof(Node)) memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(double)) + self.capacity * self.value_stride * sizeof(float64_t)) cpdef cnp.ndarray get_projection_matrix(self): """Get the projection matrix of shape (node_count, n_features).""" @@ -191,7 +191,7 @@ cdef class ObliqueTree(Tree): proj_vecs[i, feat] = weight return proj_vecs - cdef int _resize_c(self, SIZE_t capacity=INTPTR_MAX) except -1 nogil: + cdef intp_t _resize_c(self, intp_t capacity=INTPTR_MAX) except -1 nogil: """Guts of _resize. Additionally resizes the projection indices and weights. 
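# A minimal NumPy sketch of the densification that ``get_projection_matrix`` performs
# above: every node stores its oblique split as a sparse pair of
# (proj_vec_indices, proj_vec_weights), and the dense matrix simply scatters each
# weight into the node's row. The helper name and the example inputs below are
# hypothetical and only meant to illustrate the layout, not the package API.
import numpy as np

def densify_projections(proj_vec_indices, proj_vec_weights, n_features):
    """Scatter per-node sparse projections into a (node_count, n_features) array."""
    proj_vecs = np.zeros((len(proj_vec_indices), n_features), dtype=np.float32)
    for i, (indices, weights) in enumerate(zip(proj_vec_indices, proj_vec_weights)):
        for feat, weight in zip(indices, weights):
            proj_vecs[i, feat] = weight
    return proj_vecs

# e.g. a single node splitting on 0.5 * X[:, 1] + 0.5 * X[:, 3] out of 5 features:
# densify_projections([[1, 3]], [[0.5, 0.5]], n_features=5)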
@@ -220,7 +220,7 @@ cdef class ObliqueTree(Tree): # value memory is initialised to 0 to enable classifier argmax if capacity > self.capacity: memset((self.value + self.capacity * self.value_stride), 0, - (capacity - self.capacity) * self.value_stride * sizeof(double)) + (capacity - self.capacity) * self.value_stride * sizeof(float64_t)) # if capacity smaller than node_count, adjust the counter if capacity < self.node_count: @@ -229,7 +229,7 @@ cdef class ObliqueTree(Tree): self.capacity = capacity return 0 - cdef int _set_split_node(self, SplitRecord* split_node, Node *node, SIZE_t node_id) except -1 nogil: + cdef intp_t _set_split_node(self, SplitRecord* split_node, Node *node, intp_t node_id) except -1 nogil: """Set node data. """ # Cython type cast split record into its inherited split record @@ -250,10 +250,10 @@ cdef class ObliqueTree(Tree): ) return 1 - cdef DTYPE_t _compute_feature( + cdef float32_t _compute_feature( self, - const DTYPE_t[:, :] X_ndarray, - SIZE_t sample_index, + const float32_t[:, :] X_ndarray, + intp_t sample_index, Node *node ) noexcept nogil: """Compute feature from a given data matrix, X. @@ -261,15 +261,15 @@ cdef class ObliqueTree(Tree): In oblique-aligned trees, this is the projection of X. In this case, we take a simple linear combination of some columns of X. """ - cdef DTYPE_t proj_feat = 0.0 - cdef DTYPE_t weight = 0.0 - cdef int j = 0 - cdef SIZE_t feature_index + cdef float32_t proj_feat = 0.0 + cdef float32_t weight = 0.0 + cdef intp_t j = 0 + cdef intp_t feature_index # get the index of the node - cdef SIZE_t node_id = node - self.nodes + cdef intp_t node_id = node - self.nodes - # cdef SIZE_t n_projections = proj_vec_indices.size() + # cdef intp_t n_projections = proj_vec_indices.size() # compute projection of the data based on trained tree # proj_vec_weights = self.proj_vec_weights[node_id] # proj_vec_indices = self.proj_vec_indices[node_id] @@ -299,13 +299,13 @@ cdef class ObliqueTree(Tree): cdef Node* right # get the index of the node - cdef SIZE_t node_id = node - self.nodes + cdef intp_t node_id = node - self.nodes left = &nodes[node.left_child] right = &nodes[node.right_child] - cdef int i, feature_index - cdef DTYPE_t weight + cdef intp_t i, feature_index + cdef float32_t weight for i in range(0, self.proj_vec_indices[node_id].size()): feature_index = self.proj_vec_indices[node_id][i] weight = self.proj_vec_weights[node_id][i] diff --git a/sktree/tree/_sklearn_splitter.pxd b/sktree/tree/_sklearn_splitter.pxd index e5db44c4a..6a659998b 100644 --- a/sktree/tree/_sklearn_splitter.pxd +++ b/sktree/tree/_sklearn_splitter.pxd @@ -1,11 +1,7 @@ -from .._lib.sklearn.tree._tree cimport DOUBLE_t # Type of y, sample_weight -from .._lib.sklearn.tree._tree cimport DTYPE_t # Type of X -from .._lib.sklearn.tree._tree cimport INT32_t # Signed 32 bit integer -from .._lib.sklearn.tree._tree cimport SIZE_t # Type for indices and counters -from .._lib.sklearn.tree._tree cimport UINT32_t # Unsigned 32 bit integer +from .._lib.sklearn.utils._typedefs cimport float32_t, int32_t, intp_t # This defines c-importable functions for other cython files # TODO: remove these files when sklearn merges refactor defining these in pxd files # https://github.com/scikit-learn/scikit-learn/pull/25606 -cdef void sort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) noexcept nogil +cdef void sort(float32_t* Xf, intp_t* samples, intp_t n) noexcept nogil diff --git a/sktree/tree/_sklearn_splitter.pyx b/sktree/tree/_sklearn_splitter.pyx index 529abde12..92c298353 100644 --- 
a/sktree/tree/_sklearn_splitter.pyx +++ b/sktree/tree/_sklearn_splitter.pyx @@ -9,7 +9,7 @@ from sklearn.tree._utils cimport log # Sort n-element arrays pointed to by Xf and samples, simultaneously, # by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997). -cdef inline void sort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) noexcept nogil: +cdef inline void sort(float32_t* Xf, intp_t* samples, intp_t n) noexcept nogil: if n == 0: return cdef int maxd = 2 * log(n) @@ -17,20 +17,20 @@ cdef inline void sort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) noexcept nogil: cdef inline void swap( - DTYPE_t* Xf, - SIZE_t* samples, - SIZE_t i, - SIZE_t j + float32_t* Xf, + intp_t* samples, + intp_t i, + intp_t j ) noexcept nogil: # Helper for sort Xf[i], Xf[j] = Xf[j], Xf[i] samples[i], samples[j] = samples[j], samples[i] -cdef inline DTYPE_t median3(DTYPE_t* Xf, SIZE_t n) noexcept nogil: +cdef inline float32_t median3(float32_t* Xf, intp_t n) noexcept nogil: # Median of three pivot selection, after Bentley and McIlroy (1993). # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. - cdef DTYPE_t a = Xf[0], b = Xf[n // 2], c = Xf[n - 1] + cdef float32_t a = Xf[0], b = Xf[n // 2], c = Xf[n - 1] if a < b: if b < c: return b @@ -49,10 +49,10 @@ cdef inline DTYPE_t median3(DTYPE_t* Xf, SIZE_t n) noexcept nogil: # Introsort with median of 3 pivot selection and 3-way partition function # (robust to repeated elements, e.g. lots of zero features). -cdef void introsort(DTYPE_t* Xf, SIZE_t *samples, - SIZE_t n, int maxd) noexcept nogil: - cdef DTYPE_t pivot - cdef SIZE_t i, l, r +cdef void introsort(float32_t* Xf, intp_t *samples, + intp_t n, int maxd) noexcept nogil: + cdef float32_t pivot + cdef intp_t i, l, r while n > 1: if maxd <= 0: # max depth limit exceeded ("gone quadratic") @@ -82,10 +82,10 @@ cdef void introsort(DTYPE_t* Xf, SIZE_t *samples, n -= r -cdef inline void sift_down(DTYPE_t* Xf, SIZE_t* samples, - SIZE_t start, SIZE_t end) noexcept nogil: +cdef inline void sift_down(float32_t* Xf, intp_t* samples, + intp_t start, intp_t end) noexcept nogil: # Restore heap order in Xf[start:end] by moving the max element to start. - cdef SIZE_t child, maxind, root + cdef intp_t child, maxind, root root = start while True: @@ -105,8 +105,8 @@ cdef inline void sift_down(DTYPE_t* Xf, SIZE_t* samples, root = maxind -cdef void heapsort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) noexcept nogil: - cdef SIZE_t start, end +cdef void heapsort(float32_t* Xf, intp_t* samples, intp_t n) noexcept nogil: + cdef intp_t start, end # heapify start = (n - 2) // 2 @@ -125,20 +125,20 @@ cdef void heapsort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) noexcept nogil: end = end - 1 -cdef int compare_SIZE_t(const void* a, const void* b) noexcept nogil: +cdef int compare_intp_t(const void* a, const void* b) noexcept nogil: """Comparison function for sort.""" - return ((a)[0] - (b)[0]) + return ((a)[0] - (b)[0]) -cdef inline void binary_search(INT32_t[::1] sorted_array, - INT32_t start, INT32_t end, - SIZE_t value, SIZE_t* index, - INT32_t* new_start) noexcept nogil: +cdef inline void binary_search(int32_t[::1] sorted_array, + int32_t start, int32_t end, + intp_t value, intp_t* index, + int32_t* new_start) noexcept nogil: """Return the index of value in the sorted array. If not found, return -1. 
new_start is the last pivot + 1 """ - cdef INT32_t pivot + cdef int32_t pivot index[0] = -1 while start < end: pivot = start + (end - start) // 2 @@ -155,25 +155,25 @@ cdef inline void binary_search(INT32_t[::1] sorted_array, new_start[0] = start -cdef inline void extract_nnz_index_to_samples(INT32_t[::1] X_indices, - DTYPE_t[::1] X_data, - INT32_t indptr_start, - INT32_t indptr_end, - SIZE_t[::1] samples, - SIZE_t start, - SIZE_t end, - SIZE_t[::1] index_to_samples, - DTYPE_t[::1] Xf, - SIZE_t* end_negative, - SIZE_t* start_positive) noexcept nogil: +cdef inline void extract_nnz_index_to_samples(int32_t[::1] X_indices, + float32_t[::1] X_data, + int32_t indptr_start, + int32_t indptr_end, + intp_t[::1] samples, + intp_t start, + intp_t end, + intp_t[::1] index_to_samples, + float32_t[::1] Xf, + intp_t* end_negative, + intp_t* start_positive) noexcept nogil: """Extract and partition values for a feature using index_to_samples. Complexity is O(indptr_end - indptr_start). """ - cdef INT32_t k - cdef SIZE_t index - cdef SIZE_t end_negative_ = start - cdef SIZE_t start_positive_ = end + cdef int32_t k + cdef intp_t index + cdef intp_t end_negative_ = start + cdef intp_t start_positive_ = end for k in range(indptr_start, indptr_end): if start <= index_to_samples[X_indices[k]] < end: @@ -194,18 +194,18 @@ cdef inline void extract_nnz_index_to_samples(INT32_t[::1] X_indices, start_positive[0] = start_positive_ -cdef inline void extract_nnz_binary_search(INT32_t[::1] X_indices, - DTYPE_t[::1] X_data, - INT32_t indptr_start, - INT32_t indptr_end, - SIZE_t[::1] samples, - SIZE_t start, - SIZE_t end, - SIZE_t[::1] index_to_samples, - DTYPE_t[::1] Xf, - SIZE_t* end_negative, - SIZE_t* start_positive, - SIZE_t[::1] sorted_samples, +cdef inline void extract_nnz_binary_search(int32_t[::1] X_indices, + float32_t[::1] X_data, + int32_t indptr_start, + int32_t indptr_end, + intp_t[::1] samples, + intp_t start, + intp_t end, + intp_t[::1] index_to_samples, + float32_t[::1] Xf, + intp_t* end_negative, + intp_t* start_positive, + intp_t[::1] sorted_samples, bint* is_samples_sorted) noexcept nogil: """Extract and partition values for a given feature using binary search. @@ -215,14 +215,14 @@ cdef inline void extract_nnz_binary_search(INT32_t[::1] X_indices, O((1 - is_samples_sorted[0]) * n_samples * log(n_samples) + n_samples * log(n_indices)). 
""" - cdef SIZE_t n_samples + cdef intp_t n_samples if not is_samples_sorted[0]: n_samples = end - start memcpy(&sorted_samples[start], &samples[start], - n_samples * sizeof(SIZE_t)) - qsort(&sorted_samples[start], n_samples, sizeof(SIZE_t), - compare_SIZE_t) + n_samples * sizeof(intp_t)) + qsort(&sorted_samples[start], n_samples, sizeof(intp_t), + compare_intp_t) is_samples_sorted[0] = 1 while (indptr_start < indptr_end and @@ -233,11 +233,11 @@ cdef inline void extract_nnz_binary_search(INT32_t[::1] X_indices, sorted_samples[end - 1] < X_indices[indptr_end - 1]): indptr_end -= 1 - cdef SIZE_t p = start - cdef SIZE_t index - cdef SIZE_t k - cdef SIZE_t end_negative_ = start - cdef SIZE_t start_positive_ = end + cdef intp_t p = start + cdef intp_t index + cdef intp_t k + cdef intp_t end_negative_ = start + cdef intp_t start_positive_ = end while (p < end and indptr_start < indptr_end): # Find index of sorted_samples[p] in X_indices @@ -264,8 +264,8 @@ cdef inline void extract_nnz_binary_search(INT32_t[::1] X_indices, start_positive[0] = start_positive_ -cdef inline void sparse_swap(SIZE_t[::1] index_to_samples, SIZE_t[::1] samples, - SIZE_t pos_1, SIZE_t pos_2) noexcept nogil: +cdef inline void sparse_swap(intp_t[::1] index_to_samples, intp_t[::1] samples, + intp_t pos_1, intp_t pos_2) noexcept nogil: """Swap sample pos_1 and pos_2 preserving sparse invariant.""" samples[pos_1], samples[pos_2] = samples[pos_2], samples[pos_1] index_to_samples[samples[pos_1]] = pos_1 diff --git a/sktree/tree/_utils.pxd b/sktree/tree/_utils.pxd index fc93163b2..c44454914 100644 --- a/sktree/tree/_utils.pxd +++ b/sktree/tree/_utils.pxd @@ -4,22 +4,19 @@ cimport numpy as cnp cnp.import_array() -from sktree._lib.sklearn.tree._splitter cimport SplitRecord -from sktree._lib.sklearn.tree._tree cimport DOUBLE_t # Type of y, sample_weight -from sktree._lib.sklearn.tree._tree cimport DTYPE_t # Type of X -from sktree._lib.sklearn.tree._tree cimport INT32_t # Signed 32 bit integer -from sktree._lib.sklearn.tree._tree cimport SIZE_t # Type for indices and counters -from sktree._lib.sklearn.tree._tree cimport UINT32_t # Unsigned 32 bit integer +from .._lib.sklearn.tree._splitter cimport SplitRecord +from .._lib.sklearn.tree._utils cimport UINT32_t +from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t -cdef int rand_weighted_binary(double p0, UINT32_t* random_state) noexcept nogil +cdef int rand_weighted_binary(float64_t p0, UINT32_t* random_state) noexcept nogil cpdef unravel_index( - SIZE_t index, cnp.ndarray[SIZE_t, ndim=1] shape + intp_t index, cnp.ndarray[intp_t, ndim=1] shape ) -cpdef ravel_multi_index(SIZE_t[:] coords, const SIZE_t[:] shape) +cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape) -cdef void unravel_index_cython(SIZE_t index, const SIZE_t[:] shape, SIZE_t[:] coords) noexcept nogil +cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil -cdef SIZE_t ravel_multi_index_cython(SIZE_t[:] coords, const SIZE_t[:] shape) noexcept nogil +cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil diff --git a/sktree/tree/_utils.pyx b/sktree/tree/_utils.pyx index 1b7b6c293..84dc2de34 100644 --- a/sktree/tree/_utils.pyx +++ b/sktree/tree/_utils.pyx @@ -11,20 +11,20 @@ cimport numpy as cnp cnp.import_array() -from sktree._lib.sklearn.tree._utils cimport rand_uniform +from .._lib.sklearn.tree._utils cimport rand_uniform -cdef inline int rand_weighted_binary(double p0, UINT32_t* random_state) 
noexcept nogil: +cdef inline int rand_weighted_binary(float64_t p0, UINT32_t* random_state) noexcept nogil: """Sample from integers 0 and 1 with different probabilities. Parameters ---------- - p0 : double + p0 : float64_t The probability of sampling 0. random_state : UINT32_t* The random state. """ - cdef double random_value = rand_uniform(0.0, 1.0, random_state) + cdef float64_t random_value = rand_uniform(0.0, 1.0, random_state) if random_value < p0: return 0 @@ -32,8 +32,8 @@ cdef inline int rand_weighted_binary(double p0, UINT32_t* random_state) noexcept return 1 cpdef unravel_index( - SIZE_t index, - cnp.ndarray[SIZE_t, ndim=1] shape + intp_t index, + cnp.ndarray[intp_t, ndim=1] shape ): """Converts a flat index or array of flat indices into a tuple of coordinate arrays. @@ -41,14 +41,14 @@ cpdef unravel_index( Parameters ---------- - index : SIZE_t + index : intp_t A flat index. - shape : numpy.ndarray[SIZE_t, ndim=1] + shape : numpy.ndarray[intp_t, ndim=1] The shape of the array into which the flat indices should be converted. Returns ------- - numpy.ndarray[SIZE_t, ndim=1] + numpy.ndarray[intp_t, ndim=1] A coordinate array having the same shape as the input `shape`. """ index = np.intp(index) @@ -58,21 +58,21 @@ cpdef unravel_index( return coords -cpdef ravel_multi_index(SIZE_t[:] coords, const SIZE_t[:] shape): +cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape): """Converts a tuple of coordinate arrays into a flat index. Purely used for testing purposes. Parameters ---------- - coords : numpy.ndarray[SIZE_t, ndim=1] + coords : numpy.ndarray[intp_t, ndim=1] An array of coordinate arrays to be converted. - shape : numpy.ndarray[SIZE_t, ndim=1] + shape : numpy.ndarray[intp_t, ndim=1] The shape of the array into which the coordinates should be converted. Returns ------- - SIZE_t + intp_t The resulting flat index. Raises @@ -83,25 +83,25 @@ cpdef ravel_multi_index(SIZE_t[:] coords, const SIZE_t[:] shape): return ravel_multi_index_cython(coords, shape) -cdef void unravel_index_cython(SIZE_t index, const SIZE_t[:] shape, SIZE_t[:] coords) noexcept nogil: +cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil: """Converts a flat index into a tuple of coordinate arrays. Parameters ---------- - index : SIZE_t + index : intp_t The flat index to be converted. - shape : numpy.ndarray[SIZE_t, ndim=1] + shape : numpy.ndarray[intp_t, ndim=1] The shape of the array into which the flat index should be converted. - coords : numpy.ndarray[SIZE_t, ndim=1] + coords : numpy.ndarray[intp_t, ndim=1] A preinitialized memoryview array of coordinate arrays to be converted. Returns ------- - numpy.ndarray[SIZE_t, ndim=1] + numpy.ndarray[intp_t, ndim=1] An array of coordinate arrays, with each coordinate array having the same shape as the input `shape`. """ - cdef SIZE_t ndim = shape.shape[0] - cdef SIZE_t j, size + cdef intp_t ndim = shape.shape[0] + cdef intp_t j, size for j in range(ndim - 1, -1, -1): size = shape[j] @@ -109,19 +109,19 @@ cdef void unravel_index_cython(SIZE_t index, const SIZE_t[:] shape, SIZE_t[:] co index //= size -cdef SIZE_t ravel_multi_index_cython(SIZE_t[:] coords, const SIZE_t[:] shape) noexcept nogil: +cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil: """Converts a tuple of coordinate arrays into a flat index. Parameters ---------- - coords : numpy.ndarray[SIZE_t, ndim=1] + coords : numpy.ndarray[intp_t, ndim=1] An array of coordinate arrays to be converted. 
- shape : numpy.ndarray[SIZE_t, ndim=1] + shape : numpy.ndarray[intp_t, ndim=1] The shape of the array into which the coordinates should be converted. Returns ------- - SIZE_t + intp_t The resulting flat index. Raises @@ -129,8 +129,8 @@ cdef SIZE_t ravel_multi_index_cython(SIZE_t[:] coords, const SIZE_t[:] shape) no ValueError If the input `coords` have invalid indices. """ - cdef SIZE_t i, ndim - cdef SIZE_t flat_index, index + cdef intp_t i, ndim + cdef intp_t flat_index, index ndim = len(shape) diff --git a/sktree/tree/manifold/_morf_splitter.pxd b/sktree/tree/manifold/_morf_splitter.pxd index a05bb50e8..2bdb7ab56 100644 --- a/sktree/tree/manifold/_morf_splitter.pxd +++ b/sktree/tree/manifold/_morf_splitter.pxd @@ -14,11 +14,8 @@ cimport numpy as cnp from libcpp.vector cimport vector from ..._lib.sklearn.tree._splitter cimport SplitRecord -from ..._lib.sklearn.tree._tree cimport DOUBLE_t # Type of y, sample_weight -from ..._lib.sklearn.tree._tree cimport DTYPE_t # Type of X -from ..._lib.sklearn.tree._tree cimport INT32_t # Signed 32 bit integer -from ..._lib.sklearn.tree._tree cimport SIZE_t # Type for indices and counters -from ..._lib.sklearn.tree._tree cimport UINT32_t # Unsigned 32 bit integer +from ..._lib.sklearn.tree._utils cimport UINT32_t +from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t from .._oblique_splitter cimport BestObliqueSplitter, ObliqueSplitRecord # https://github.com/cython/cython/blob/master/Cython/Includes/libcpp/algorithm.pxd @@ -44,49 +41,49 @@ cdef class PatchSplitter(BestObliqueSplitter): # `data_width` are used to determine the vectorized indices corresponding to # (x,y) coordinates in the original un-vectorized data. - cdef public SIZE_t max_patch_height # Maximum height of the patch to sample - cdef public SIZE_t max_patch_width # Maximum width of the patch to sample - cdef public SIZE_t min_patch_height # Minimum height of the patch to sample - cdef public SIZE_t min_patch_width # Minimum width of the patch to sample - cdef public SIZE_t data_height # Height of the input data - cdef public SIZE_t data_width # Width of the input data + cdef public intp_t max_patch_height # Maximum height of the patch to sample + cdef public intp_t max_patch_width # Maximum width of the patch to sample + cdef public intp_t min_patch_height # Minimum height of the patch to sample + cdef public intp_t min_patch_width # Minimum width of the patch to sample + cdef public intp_t data_height # Height of the input data + cdef public intp_t data_width # Width of the input data - cdef public SIZE_t ndim # The number of dimensions of the input data + cdef public intp_t ndim # The number of dimensions of the input data - cdef const SIZE_t[:] data_dims # The dimensions of the input data - cdef const SIZE_t[:] min_patch_dims # The minimum size of the patch to sample in each dimension - cdef const SIZE_t[:] max_patch_dims # The maximum size of the patch to sample in each dimension + cdef const intp_t[:] data_dims # The dimensions of the input data + cdef const intp_t[:] min_patch_dims # The minimum size of the patch to sample in each dimension + cdef const intp_t[:] max_patch_dims # The maximum size of the patch to sample in each dimension cdef const cnp.uint8_t[:] dim_contiguous # A boolean array indicating whether each dimension is contiguous # TODO: check if this works and is necessary for discontiguous data - # cdef SIZE_t[:] stride_offsets # The stride offsets for each dimension + # cdef intp_t[:] stride_offsets # The stride offsets for each dimension cdef 
bint _discontiguous cdef bytes boundary # how to sample the patch with boundary in mind - cdef const DTYPE_t[:, :] feature_weight # Whether or not to normalize each column of X when adding in a patch + cdef const float32_t[:, :] feature_weight # Whether or not to normalize each column of X when adding in a patch - cdef SIZE_t[::1] _index_data_buffer - cdef SIZE_t[::1] _index_patch_buffer - cdef SIZE_t[:] patch_dims_buff # A buffer to store the dimensions of the sampled patch - cdef SIZE_t[:] unraveled_patch_point # A buffer to store the unraveled patch point + cdef intp_t[::1] _index_data_buffer + cdef intp_t[::1] _index_patch_buffer + cdef intp_t[:] patch_dims_buff # A buffer to store the dimensions of the sampled patch + cdef intp_t[:] unraveled_patch_point # A buffer to store the unraveled patch point # All oblique splitters (i.e. non-axis aligned splitters) require a # function to sample a projection matrix that is applied to the feature matrix # to quickly obtain the sampled projections for candidate splits. - cdef (SIZE_t, SIZE_t) sample_top_left_seed( + cdef (intp_t, intp_t) sample_top_left_seed( self ) noexcept nogil cdef void sample_proj_mat( self, - vector[vector[DTYPE_t]]& proj_mat_weights, - vector[vector[SIZE_t]]& proj_mat_indices + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices ) noexcept nogil # cdef class UserKernelSplitter(PatchSplitter): # """A class to hold user-specified kernels.""" -# cdef vector[DTYPE_t[:, ::1]] kernel_dictionary # A list of C-contiguous 2D kernels +# cdef vector[float32_t[:, ::1]] kernel_dictionary # A list of C-contiguous 2D kernels cdef class GaussianKernelSplitter(PatchSplitter): @@ -99,6 +96,6 @@ cdef class GaussianKernelSplitter(PatchSplitter): cdef void sample_proj_mat( self, - vector[vector[DTYPE_t]]& proj_mat_weights, - vector[vector[SIZE_t]]& proj_mat_indices + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices ) noexcept nogil diff --git a/sktree/tree/manifold/_morf_splitter.pyx b/sktree/tree/manifold/_morf_splitter.pyx index 95c37a2da..79be42a47 100644 --- a/sktree/tree/manifold/_morf_splitter.pyx +++ b/sktree/tree/manifold/_morf_splitter.pyx @@ -30,22 +30,22 @@ cdef class PatchSplitter(BestObliqueSplitter): def __setstate__(self, d): pass - cdef int init( + cdef intp_t init( self, object X, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight, const unsigned char[::1] missing_values_in_feature_mask, ) except -1: BestObliqueSplitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) return 0 - cdef int node_reset( + cdef intp_t node_reset( self, - SIZE_t start, - SIZE_t end, - double* weighted_n_node_samples + intp_t start, + intp_t end, + float64_t* weighted_n_node_samples ) except -1 nogil: """Reset splitter on node samples[start:end]. 
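# A rough Python sketch of what ``sample_top_left_seed`` above does conceptually:
# draw a patch length per dimension between the declared min/max patch dims, pick a
# top-left coordinate so the patch fits inside ``data_dims``, and return the raveled
# top-left index plus the patch volume. This ignores the ``boundary`` option and the
# discontiguous-dimension handling, so treat it as an approximation of the Cython
# logic rather than a transcription; the function name and signature are assumed.
import numpy as np

def sample_top_left_seed(min_patch_dims, max_patch_dims, data_dims, rng):
    # one random patch length per dimension, inclusive of the max
    patch_dims = [rng.integers(lo, hi + 1) for lo, hi in zip(min_patch_dims, max_patch_dims)]
    # top-left coordinate chosen so the whole patch stays inside the data
    top_left = [
        rng.integers(0, data_dim - patch_dim + 1)
        for data_dim, patch_dim in zip(data_dims, patch_dims)
    ]
    top_left_seed = int(np.ravel_multi_index(top_left, data_dims))
    patch_size = int(np.prod(patch_dims))
    return top_left_seed, patch_size, patch_dims

# rng = np.random.default_rng(0)
# sample_top_left_seed((1, 1), (3, 3), (28, 28), rng)  # e.g. patches over a 2D image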
@@ -54,11 +54,11 @@ cdef class PatchSplitter(BestObliqueSplitter): Parameters ---------- - start : SIZE_t + start : intp_t The index of the first sample to consider - end : SIZE_t + end : intp_t The index of the last sample to consider - weighted_n_node_samples : ndarray, dtype=double pointer + weighted_n_node_samples : ndarray, dtype=float64_t pointer The total weight of those samples """ @@ -81,8 +81,8 @@ cdef class PatchSplitter(BestObliqueSplitter): cdef void sample_proj_mat( self, - vector[vector[DTYPE_t]]& proj_mat_weights, - vector[vector[SIZE_t]]& proj_mat_indices + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices ) noexcept nogil: """ Sample the projection vector. @@ -91,18 +91,18 @@ cdef class PatchSplitter(BestObliqueSplitter): """ pass - cdef (SIZE_t, SIZE_t) sample_top_left_seed(self) noexcept nogil: + cdef (intp_t, intp_t) sample_top_left_seed(self) noexcept nogil: pass cdef class BaseDensePatchSplitter(PatchSplitter): - cdef int init( + cdef intp_t init( self, object X, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight, const unsigned char[::1] missing_values_in_feature_mask, - # const INT32_t[:] n_categories + # const int32_t[:] n_categories ) except -1: """Initialize the splitter @@ -120,18 +120,18 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): def __cinit__( self, Criterion criterion, - SIZE_t max_features, - SIZE_t min_samples_leaf, - double min_weight_leaf, + intp_t max_features, + intp_t min_samples_leaf, + float64_t min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, - double feature_combinations, - const SIZE_t[:] min_patch_dims, - const SIZE_t[:] max_patch_dims, + float64_t feature_combinations, + const intp_t[:] min_patch_dims, + const intp_t[:] max_patch_dims, const cnp.uint8_t[:] dim_contiguous, - const SIZE_t[:] data_dims, + const intp_t[:] data_dims, bytes boundary, - const DTYPE_t[:, :] feature_weight, + const float32_t[:, :] feature_weight, *argv ): self.criterion = criterion @@ -147,8 +147,8 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): self.monotonic_cst = monotonic_cst # Sparse max_features x n_features projection matrix - self.proj_mat_weights = vector[vector[DTYPE_t]](self.max_features) - self.proj_mat_indices = vector[vector[SIZE_t]](self.max_features) + self.proj_mat_weights = vector[vector[float32_t]](self.max_features) + self.proj_mat_indices = vector[vector[intp_t]](self.max_features) # initialize state to allow generalization to higher-dimensional tensors self.ndim = data_dims.shape[0] @@ -196,31 +196,31 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): self.feature_weight.base if self.feature_weight is not None else None, ), self.__getstate__()) - cdef (SIZE_t, SIZE_t) sample_top_left_seed(self) noexcept nogil: + cdef (intp_t, intp_t) sample_top_left_seed(self) noexcept nogil: """Sample the top-left seed for the n-dim patch. Returns ------- - top_left_seed : SIZE_t + top_left_seed : intp_t The top-left seed vectorized (i.e. raveled) for the n-dim patch. - patch_size : SIZE_t + patch_size : intp_t The total size of the n-dim patch (i.e. the volume). 
""" # now get the top-left seed that is used to then determine the top-left # position in patch # compute top-left seed for the multi-dimensional patch - cdef SIZE_t top_left_patch_seed - cdef SIZE_t patch_size = 1 + cdef intp_t top_left_patch_seed + cdef intp_t patch_size = 1 cdef UINT32_t* random_state = &self.rand_r_state # define parameters for the random patch - cdef SIZE_t patch_dim - cdef SIZE_t delta_patch_dim + cdef intp_t patch_dim + cdef intp_t delta_patch_dim - cdef SIZE_t dim + cdef intp_t dim - cdef SIZE_t idx + cdef intp_t idx for idx in range(self.ndim): # compute random patch width and height @@ -272,23 +272,23 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): cdef void sample_proj_mat( self, - vector[vector[DTYPE_t]]& proj_mat_weights, - vector[vector[SIZE_t]]& proj_mat_indices + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices ) noexcept nogil: """Sample projection matrix using a contiguous patch. Randomly sample patches with weight of 1. """ - cdef SIZE_t max_features = self.max_features - cdef int proj_i + cdef intp_t max_features = self.max_features + cdef intp_t proj_i # define parameters for vectorized points in the original data shape # and top-left seed - cdef SIZE_t top_left_patch_seed + cdef intp_t top_left_patch_seed # size of the sampled patch, which is just the size of the n-dim patch # (\prod_i self.patch_dims_buff[i]) - cdef SIZE_t patch_size + cdef intp_t patch_size for proj_i in range(0, max_features): # now get the top-left seed that is used to then determine the top-left @@ -308,34 +308,34 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): cdef void sample_proj_vec( self, - vector[vector[DTYPE_t]]& proj_mat_weights, - vector[vector[SIZE_t]]& proj_mat_indices, - SIZE_t proj_i, - SIZE_t patch_size, - SIZE_t top_left_patch_seed, - const SIZE_t[:] patch_dims, + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices, + intp_t proj_i, + intp_t patch_size, + intp_t top_left_patch_seed, + const intp_t[:] patch_dims, ) noexcept nogil: cdef UINT32_t* random_state = &self.rand_r_state # iterates over the size of the patch - cdef SIZE_t patch_idx + cdef intp_t patch_idx # stores how many patches we have iterated so far - cdef int vectorized_patch_offset - cdef SIZE_t vectorized_point_offset - cdef SIZE_t vectorized_point + cdef intp_t vectorized_patch_offset + cdef intp_t vectorized_point_offset + cdef intp_t vectorized_point - cdef SIZE_t dim_idx + cdef intp_t dim_idx # weights are default to 1 - cdef DTYPE_t weight = 1. + cdef float32_t weight = 1. # XXX: still unsure if it works yet # XXX: THIS ONLY WORKS FOR THE FIRST DIMENSION THAT IS DISCONTIGUOUS. 
- cdef SIZE_t other_dims_offset - cdef SIZE_t row_index + cdef intp_t other_dims_offset + cdef intp_t row_index - cdef SIZE_t i - cdef int num_rows = self.data_dims[0] + cdef intp_t i + cdef intp_t num_rows = self.data_dims[0] if self._discontiguous: # fill with values 0, 1, ..., dimension - 1 for i in range(0, self.data_dims[0]): @@ -407,22 +407,22 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): cdef void compute_features_over_samples( self, - SIZE_t start, - SIZE_t end, - const SIZE_t[:] samples, - DTYPE_t[:] feature_values, - vector[DTYPE_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[SIZE_t]* proj_vec_indices # indices of the features (max_features,) + intp_t start, + intp_t end, + const intp_t[:] samples, + float32_t[:] feature_values, + vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) + vector[intp_t]* proj_vec_indices # indices of the features (max_features,) ) noexcept nogil: """Compute the feature values for the samples[start:end] range. Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ - cdef SIZE_t idx, jdx + cdef intp_t idx, jdx # initialize feature weight to normalize across patch - cdef DTYPE_t patch_weight + cdef float32_t patch_weight # Compute linear combination of features and then # sort samples according to the feature values. @@ -454,14 +454,14 @@ cdef class BestPatchSplitterTester(BestPatchSplitter): cpdef sample_projection_vector( self, - SIZE_t proj_i, - SIZE_t patch_size, - SIZE_t top_left_patch_seed, - SIZE_t[:] patch_dims, + intp_t proj_i, + intp_t patch_size, + intp_t top_left_patch_seed, + intp_t[:] patch_dims, ): - cdef vector[vector[DTYPE_t]] proj_mat_weights = vector[vector[DTYPE_t]](self.max_features) - cdef vector[vector[SIZE_t]] proj_mat_indices = vector[vector[SIZE_t]](self.max_features) - cdef SIZE_t i, j + cdef vector[vector[float32_t]] proj_mat_weights = vector[vector[float32_t]](self.max_features) + cdef vector[vector[intp_t]] proj_mat_indices = vector[vector[intp_t]](self.max_features) + cdef intp_t i, j # sample projection matrix in C/C++ self.sample_proj_vec( @@ -489,9 +489,9 @@ cdef class BestPatchSplitterTester(BestPatchSplitter): Randomly sample patches with weight of 1. 
""" - cdef vector[vector[DTYPE_t]] proj_mat_weights = vector[vector[DTYPE_t]](self.max_features) - cdef vector[vector[SIZE_t]] proj_mat_indices = vector[vector[SIZE_t]](self.max_features) - cdef SIZE_t i, j + cdef vector[vector[float32_t]] proj_mat_weights = vector[vector[float32_t]](self.max_features) + cdef vector[vector[intp_t]] proj_mat_indices = vector[vector[intp_t]](self.max_features) + cdef intp_t i, j # sample projection matrix in C/C++ self.sample_proj_mat(proj_mat_weights, proj_mat_indices) diff --git a/sktree/tree/unsupervised/_unsup_criterion.pxd b/sktree/tree/unsupervised/_unsup_criterion.pxd index bfbd7428a..8ef633875 100644 --- a/sktree/tree/unsupervised/_unsup_criterion.pxd +++ b/sktree/tree/unsupervised/_unsup_criterion.pxd @@ -3,11 +3,7 @@ # cython: language_level=3 from ..._lib.sklearn.tree._criterion cimport BaseCriterion -from ..._lib.sklearn.tree._tree cimport DOUBLE_t # Type of y, sample_weight -from ..._lib.sklearn.tree._tree cimport DTYPE_t # Type of X -from ..._lib.sklearn.tree._tree cimport INT32_t # Signed 32 bit integer -from ..._lib.sklearn.tree._tree cimport SIZE_t # Type for indices and counters -from ..._lib.sklearn.tree._tree cimport UINT32_t # Unsigned 32 bit integer +from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t # Note: This class is an exact copy of scikit-learn's Criterion # class, with the exception of the type of the internal structure. @@ -31,31 +27,31 @@ cdef class UnsupervisedCriterion(BaseCriterion): # impurity of a split on that node. It also computes the output statistics. # Internal structures - cdef const DTYPE_t[:] feature_values # 1D memview for the feature vector to compute criterion on + cdef const float32_t[:] feature_values # 1D memview for the feature vector to compute criterion on # Keep running total of Xf[samples[start:end]] and the corresponding sum in # the left and right node. For example, this can then efficiently compute the # mean of the node, and left/right child by subtracting relevant Xf elements # and then dividing by the total number of samples in the node and left/right child. - cdef double sum_total # The sum of the weighted count of each feature. - cdef double sum_left # Same as above, but for the left side of the split - cdef double sum_right # Same as above, but for the right side of the split + cdef float64_t sum_total # The sum of the weighted count of each feature. + cdef float64_t sum_left # Same as above, but for the left side of the split + cdef float64_t sum_right # Same as above, but for the right side of the split - cdef double sumsq_total # The sum of the weighted count of each feature. - cdef double sumsq_left # Same as above, but for the left side of the split - cdef double sumsq_right # Same as above, but for the right side of the split + cdef float64_t sumsq_total # The sum of the weighted count of each feature. + cdef float64_t sumsq_left # Same as above, but for the left side of the split + cdef float64_t sumsq_right # Same as above, but for the right side of the split # Methods # ------- # The 'init' method is copied here with the almost the exact same signature # as that of supervised learning criterion in scikit-learn to ensure that # Unsupervised criterion can be used with splitter and tree methods. 
- cdef int init( + cdef intp_t init( self, - const DTYPE_t[:] feature_values, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] samples, + const float32_t[:] feature_values, + const float64_t[:] sample_weight, + float64_t weighted_n_samples, + const intp_t[:] samples, ) except -1 nogil cdef void init_feature_vec( @@ -64,6 +60,6 @@ cdef class UnsupervisedCriterion(BaseCriterion): cdef void set_sample_pointers( self, - SIZE_t start, - SIZE_t end + intp_t start, + intp_t end ) noexcept nogil diff --git a/sktree/tree/unsupervised/_unsup_criterion.pyx b/sktree/tree/unsupervised/_unsup_criterion.pyx index 518fc2753..58f21f348 100644 --- a/sktree/tree/unsupervised/_unsup_criterion.pyx +++ b/sktree/tree/unsupervised/_unsup_criterion.pyx @@ -11,7 +11,7 @@ from libc.math cimport log cnp.import_array() -cdef DTYPE_t PI = np.pi +cdef float32_t PI = np.pi cdef class UnsupervisedCriterion(BaseCriterion): @@ -64,19 +64,19 @@ cdef class UnsupervisedCriterion(BaseCriterion): Parameters ---------- - Xf : array-like, dtype=DTYPE_t + Xf : array-like, dtype=float32_t The read-only memoryview 1D feature vector with (n_samples,) shape. """ # also compute the sum total self.sum_total = 0.0 self.sumsq_total = 0.0 self.weighted_n_node_samples = 0.0 - cdef SIZE_t s_idx - cdef SIZE_t p_idx + cdef intp_t s_idx + cdef intp_t p_idx # XXX: this can be further optimized by computing a cumulative sum hash map of the sum_total and sumsq_total # and then update will never have to iterate through even - cdef DOUBLE_t w = 1.0 + cdef float64_t w = 1.0 for p_idx in range(self.start, self.end): s_idx = self.sample_indices[p_idx] @@ -93,12 +93,12 @@ cdef class UnsupervisedCriterion(BaseCriterion): # Reset to pos=start self.reset() - cdef int init( + cdef intp_t init( self, - const DTYPE_t[:] feature_values, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, + const float32_t[:] feature_values, + const float64_t[:] sample_weight, + float64_t weighted_n_samples, + const intp_t[:] sample_indices, ) except -1 nogil: """Initialize the unsuperivsed criterion. @@ -107,13 +107,13 @@ cdef class UnsupervisedCriterion(BaseCriterion): Parameters ---------- - feature_values : array-like, dtype=DTYPE_t + feature_values : array-like, dtype=float32_t The memoryview 1D feature vector with (n_samples,) shape. - sample_weight : array-like, dtype=DOUBLE_t + sample_weight : array-like, dtype=float64_t The weight of each sample (i.e. row of X). - weighted_n_samples : double + weighted_n_samples : float64_t The total weight of all sample_indices. - sample_indices : array-like, dtype=SIZE_t + sample_indices : array-like, dtype=intp_t A mask on the sample_indices, showing which ones we want to use """ self.feature_values = feature_values @@ -123,7 +123,7 @@ cdef class UnsupervisedCriterion(BaseCriterion): return 0 - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the criterion at pos=start. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -140,7 +140,7 @@ cdef class UnsupervisedCriterion(BaseCriterion): self.sumsq_right = self.sumsq_total return 0 - cdef int reverse_reset(self) except -1 nogil: + cdef intp_t reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. 
Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -157,9 +157,9 @@ cdef class UnsupervisedCriterion(BaseCriterion): self.sumsq_left = self.sumsq_total return 0 - cdef int update( + cdef intp_t update( self, - SIZE_t new_pos + intp_t new_pos ) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. @@ -168,19 +168,19 @@ cdef class UnsupervisedCriterion(BaseCriterion): Parameters ---------- - new_pos : SIZE_t + new_pos : intp_t The new ending position for which to move sample_indices from the right child to the left child. """ - cdef SIZE_t pos = self.pos - cdef SIZE_t end = self.end + cdef intp_t pos = self.pos + cdef intp_t end = self.end - cdef const SIZE_t[:] sample_indices = self.sample_indices - cdef const DOUBLE_t[:] sample_weight = self.sample_weight + cdef const intp_t[:] sample_indices = self.sample_indices + cdef const float64_t[:] sample_weight = self.sample_weight - cdef SIZE_t i - cdef SIZE_t p - cdef DOUBLE_t w = 1.0 + cdef intp_t i + cdef intp_t p + cdef float64_t w = 1.0 # Update statistics up to new_pos # @@ -225,13 +225,13 @@ cdef class UnsupervisedCriterion(BaseCriterion): cdef void node_value( self, - double* dest + float64_t* dest ) noexcept nogil: """Set the node value with sum_total and save it into dest. Parameters ---------- - dest : double pointer + dest : float64_t pointer The memory address which we will save the node value into. """ # set values at the address pointer is pointing to with the total value @@ -239,8 +239,8 @@ cdef class UnsupervisedCriterion(BaseCriterion): cdef void set_sample_pointers( self, - SIZE_t start, - SIZE_t end + intp_t start, + intp_t end ) noexcept nogil: """Set sample pointers in the criterion. @@ -248,9 +248,9 @@ cdef class UnsupervisedCriterion(BaseCriterion): Parameters ---------- - start : SIZE_t + start : intp_t The start sample pointer. - end : SIZE_t + end : intp_t The end sample pointer. """ self.n_node_samples = end - start @@ -307,7 +307,7 @@ cdef class TwoMeans(UnsupervisedCriterion): pair minimizes the splitting criteria described in the following section """ - cdef double node_impurity( + cdef float64_t node_impurity( self ) noexcept nogil: """Evaluate the impurity of the current node. @@ -316,7 +316,7 @@ cdef class TwoMeans(UnsupervisedCriterion): i.e. the variance of Xf[sample_indices[start:end]]. The smaller the impurity the better. """ - cdef double impurity + cdef float64_t impurity # then compute the impurity as the variance impurity = self.fast_variance(self.weighted_n_node_samples, self.sumsq_total, self.sum_total) @@ -324,8 +324,8 @@ cdef class TwoMeans(UnsupervisedCriterion): cdef void children_impurity( self, - double* impurity_left, - double* impurity_right + float64_t* impurity_left, + float64_t* impurity_right ) noexcept nogil: """Evaluate the impurity in children nodes. 
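# ``fast_variance`` above is the running-sums identity Var(x) = E[x^2] - E[x]^2,
# evaluated from the cached weighted sums so each candidate split is scored in O(1).
# A small sanity check against NumPy for the unweighted case; the function here is
# copied for illustration only, not imported from the package:
import numpy as np

def fast_variance(weighted_n, sumsq_total, sum_total):
    return (1.0 / weighted_n) * (sumsq_total - (1.0 / weighted_n) * sum_total * sum_total)

x = np.array([1.0, 2.0, 4.0, 7.0])
assert np.isclose(fast_variance(len(x), np.sum(x ** 2), np.sum(x)), np.var(x))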
@@ -340,9 +340,9 @@ cdef class TwoMeans(UnsupervisedCriterion): Parameters ---------- - impurity_left : double pointer + impurity_left : float64_t pointer The memory address to save the impurity of the left node - impurity_right : double pointer + impurity_right : float64_t pointer The memory address to save the impurity of the right node """ # set values at the address pointer is pointing to with the variance @@ -350,11 +350,11 @@ cdef class TwoMeans(UnsupervisedCriterion): impurity_left[0] = self.fast_variance(self.weighted_n_left, self.sumsq_left, self.sum_left) impurity_right[0] = self.fast_variance(self.weighted_n_right, self.sumsq_right, self.sum_right) - cdef inline double fast_variance( + cdef inline float64_t fast_variance( self, - double weighted_n_node_samples, - double sumsq_total, - double sum_total + float64_t weighted_n_node_samples, + float64_t sumsq_total, + float64_t sum_total ) noexcept nogil: return (1. / weighted_n_node_samples) * \ ((sumsq_total) - (1. / weighted_n_node_samples) * (sum_total * sum_total)) @@ -391,18 +391,18 @@ cdef class FastBIC(TwoMeans): Reference: https://arxiv.org/abs/1907.02844 """ - cdef inline double bic_cluster( + cdef inline float64_t bic_cluster( self, - SIZE_t n_samples, - double variance + intp_t n_samples, + float64_t variance ) noexcept nogil: """Help compute the BIC from assigning to a specific cluster. Parameters ---------- - n_samples : SIZE_t + n_samples : intp_t The number of samples assigned cluster. - variance : double + variance : float64_t The plug-in variance for assigning to specific cluster. Notes @@ -421,13 +421,13 @@ cdef class FastBIC(TwoMeans): """ # chances of choosing the cluster based on how many samples are hard-assigned to cluster # i.e. the prior - # cast to double, so we do not round to integers - cdef double w_cluster = (n_samples + 0.0) / self.n_node_samples + # cast to float64_t, so we do not round to integers + cdef float64_t w_cluster = (n_samples + 0.0) / self.n_node_samples # add to prevent taking log of 0 when there is a degenerate cluster (i.e. single sample, or no variance) return -2. * (n_samples * log(w_cluster) + 0.5 * n_samples * log(2. * PI * variance + 1.e-7)) - cdef double node_impurity( + cdef float64_t node_impurity( self ) noexcept nogil: """Evaluate the impurity of the current node. @@ -437,8 +437,8 @@ cdef class FastBIC(TwoMeans): Namely, this is the maximum likelihood of Xf[sample_indices[start:end]]. The smaller the impurity the better. """ - cdef double variance - cdef double impurity + cdef float64_t variance + cdef float64_t impurity # then compute the variance of the cluster variance = self.fast_variance(self.weighted_n_node_samples, self.sumsq_total, self.sum_total) @@ -451,8 +451,8 @@ cdef class FastBIC(TwoMeans): cdef void children_impurity( self, - double* impurity_left, - double* impurity_right + float64_t* impurity_left, + float64_t* impurity_right ) noexcept nogil: """Evaluate the impurity in children nodes. 
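# ``bic_cluster`` above scores the hard assignment of ``n_samples`` of the node's
# points to a Gaussian cluster with the given plug-in variance: the first term is the
# log mixing proportion and the second the Gaussian normalization, with a 1e-7 jitter
# guarding against log(0) on degenerate clusters. A direct Python transcription, with
# ``n_node_samples`` passed explicitly instead of read from the criterion state:
from math import log, pi

def bic_cluster(n_samples, variance, n_node_samples):
    w_cluster = n_samples / n_node_samples  # prior: fraction of node samples in this cluster
    return -2.0 * (n_samples * log(w_cluster) + 0.5 * n_samples * log(2.0 * pi * variance + 1e-7))

# e.g. a 60-sample cluster with variance 2.5 inside a 100-sample node:
# bic_cluster(60, 2.5, n_node_samples=100)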
@@ -461,20 +461,20 @@ cdef class FastBIC(TwoMeans): Parameters ---------- - impurity_left : double pointer + impurity_left : float64_t pointer The memory address to save the impurity of the left node - impurity_right : double pointer + impurity_right : float64_t pointer The memory address to save the impurity of the right node """ - cdef SIZE_t pos = self.pos - cdef SIZE_t start = self.start - cdef SIZE_t end = self.end - cdef SIZE_t n_samples_left, n_samples_right - - cdef double variance_left, variance_right, variance_comb - cdef double BIC_diff_var_left, BIC_diff_var_right - cdef double BIC_same_var_left, BIC_same_var_right - cdef double BIC_same_var, BIC_diff_var + cdef intp_t pos = self.pos + cdef intp_t start = self.start + cdef intp_t end = self.end + cdef intp_t n_samples_left, n_samples_right + + cdef float64_t variance_left, variance_right, variance_comb + cdef float64_t BIC_diff_var_left, BIC_diff_var_right + cdef float64_t BIC_same_var_left, BIC_same_var_right + cdef float64_t BIC_same_var, BIC_diff_var # number of samples of left and right n_samples_left = pos - start diff --git a/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd b/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd index 07a7049bb..5b7b4a2d4 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd +++ b/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd @@ -4,27 +4,24 @@ cimport numpy as cnp from libcpp.vector cimport vector from ..._lib.sklearn.tree._splitter cimport SplitRecord -from ..._lib.sklearn.tree._tree cimport DOUBLE_t # Type of y, sample_weight -from ..._lib.sklearn.tree._tree cimport DTYPE_t # Type of X -from ..._lib.sklearn.tree._tree cimport INT32_t # Signed 32 bit integer -from ..._lib.sklearn.tree._tree cimport SIZE_t # Type for indices and counters -from ..._lib.sklearn.tree._tree cimport UINT32_t # Unsigned 32 bit integer +from ..._lib.sklearn.tree._utils cimport UINT32_t +from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t from ._unsup_splitter cimport UnsupervisedSplitter cdef struct ObliqueSplitRecord: # Data to track sample split - SIZE_t feature # Which feature to split on. - SIZE_t pos # Split samples array at the given position, + intp_t feature # Which feature to split on. + intp_t pos # Split samples array at the given position, # # i.e. count of samples below threshold for feature. # # pos is >= end if the node is a leaf. - double threshold # Threshold to split at. - double improvement # Impurity improvement given parent node. - double impurity_left # Impurity of the left split. - double impurity_right # Impurity of the right split. + float64_t threshold # Threshold to split at. + float64_t improvement # Impurity improvement given parent node. + float64_t impurity_left # Impurity of the left split. + float64_t impurity_right # Impurity of the right split. 
- vector[DTYPE_t]* proj_vec_weights # weights of the vector (max_features,) - vector[SIZE_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t]* proj_vec_weights # weights of the vector (max_features,) + vector[intp_t]* proj_vec_indices # indices of the features (max_features,) cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): @@ -38,44 +35,44 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): """ # Oblique Splitting extra parameters - cdef public double feature_combinations # Number of features to combine - cdef SIZE_t n_non_zeros # Number of non-zero features - cdef vector[vector[DTYPE_t]] proj_mat_weights # nonzero weights of sparse proj_mat matrix - cdef vector[vector[SIZE_t]] proj_mat_indices # nonzero indices of sparse proj_mat matrix - cdef SIZE_t[::1] indices_to_sample # an array of indices to sample of size mtry X n_features + cdef public float64_t feature_combinations # Number of features to combine + cdef intp_t n_non_zeros # Number of non-zero features + cdef vector[vector[float32_t]] proj_mat_weights # nonzero weights of sparse proj_mat matrix + cdef vector[vector[intp_t]] proj_mat_indices # nonzero indices of sparse proj_mat matrix + cdef intp_t[::1] indices_to_sample # an array of indices to sample of size mtry X n_features # All oblique splitters (i.e. non-axis aligned splitters) require a # function to sample a projection matrix that is applied to the feature matrix # to quickly obtain the sampled projections for candidate splits. cdef void sample_proj_mat(self, - vector[vector[DTYPE_t]]& proj_mat_weights, - vector[vector[SIZE_t]]& proj_mat_indices) noexcept nogil + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices) noexcept nogil # Redefined here since the new logic requires calling sample_proj_mat - cdef int node_reset(self, SIZE_t start, SIZE_t end, - double* weighted_n_node_samples) except -1 nogil + cdef intp_t node_reset(self, intp_t start, intp_t end, + float64_t* weighted_n_node_samples) except -1 nogil - cdef int node_split( + cdef intp_t node_split( self, - double impurity, # Impurity of the node + float64_t impurity, # Impurity of the node SplitRecord* split, - SIZE_t* n_constant_features, - double lower_bound, - double upper_bound + intp_t* n_constant_features, + float64_t lower_bound, + float64_t upper_bound ) except -1 nogil - cdef int init( + cdef intp_t init( self, - const DTYPE_t[:, :] X, - const DOUBLE_t[:] sample_weight + const float32_t[:, :] X, + const float64_t[:] sample_weight ) except -1 - cdef int pointer_size(self) noexcept nogil + cdef intp_t pointer_size(self) noexcept nogil cdef void compute_features_over_samples( self, - SIZE_t start, - SIZE_t end, - const SIZE_t[:] samples, - DTYPE_t[:] feature_values, - vector[DTYPE_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[SIZE_t]* proj_vec_indices # indices of the features (max_features,) + intp_t start, + intp_t end, + const intp_t[:] samples, + float32_t[:] feature_values, + vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) + vector[intp_t]* proj_vec_indices # indices of the features (max_features,) ) noexcept nogil diff --git a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx index e65612095..7cd51d3e6 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx @@ -18,17 +18,17 @@ from .._sklearn_splitter cimport sort from 
._unsup_criterion cimport UnsupervisedCriterion -cdef double INFINITY = np.inf +cdef float64_t INFINITY = np.inf # Mitigate precision differences between 32 bit and 64 bit -cdef DTYPE_t FEATURE_THRESHOLD = 1e-7 +cdef float32_t FEATURE_THRESHOLD = 1e-7 # Constant to switch between algorithm non zero value extract algorithm # in SparseSplitter -cdef DTYPE_t EXTRACT_NNZ_SWITCH = 0.1 +cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 -cdef inline void _init_split(ObliqueSplitRecord* self, SIZE_t start_pos) noexcept nogil: +cdef inline void _init_split(ObliqueSplitRecord* self, intp_t start_pos) noexcept nogil: self.impurity_left = INFINITY self.impurity_right = INFINITY self.pos = start_pos @@ -47,11 +47,11 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): def __cinit__( self, UnsupervisedCriterion criterion, - SIZE_t max_features, - SIZE_t min_samples_leaf, - double min_weight_leaf, + intp_t max_features, + intp_t min_samples_leaf, + float64_t min_weight_leaf, object random_state, - double feature_combinations, + float64_t feature_combinations, *argv ): """ @@ -60,20 +60,20 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): criterion : Criterion The criterion to measure the quality of a split. - max_features : SIZE_t + max_features : intp_t The maximal number of randomly selected features which can be considered for a split. - min_samples_leaf : SIZE_t + min_samples_leaf : intp_t The minimal number of samples each leaf can have, where splits which would result in having less samples in a leaf are not considered. - min_weight_leaf : double + min_weight_leaf : float64_t The minimal weight each leaf can have, where the weight is the sum of the weights of each sample in it. - feature_combinations : double + feature_combinations : float64_t The average number of features to combine in an oblique split. Each feature is independently included with probability ``feature_combination`` / ``n_features``. @@ -96,11 +96,11 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): self.feature_combinations = feature_combinations # Sparse max_features x n_features projection matrix - self.proj_mat_weights = vector[vector[DTYPE_t]](self.max_features) - self.proj_mat_indices = vector[vector[SIZE_t]](self.max_features) + self.proj_mat_weights = vector[vector[float32_t]](self.max_features) + self.proj_mat_indices = vector[vector[intp_t]](self.max_features) # or max w/ 1... - self.n_non_zeros = max(int(self.max_features * self.feature_combinations), 1) + self.n_non_zeros = max((self.max_features * self.feature_combinations), 1) def __getstate__(self): return {} @@ -116,10 +116,10 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): self.random_state, self.feature_combinations), self.__getstate__()) - cdef int init( + cdef intp_t init( self, - const DTYPE_t[:, :] X, - const DOUBLE_t[:] sample_weight + const float32_t[:, :] X, + const float64_t[:] sample_weight ) except -1: UnsupervisedSplitter.init(self, X, sample_weight) @@ -128,8 +128,8 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): dtype=np.intp) return 0 - cdef int node_reset(self, SIZE_t start, SIZE_t end, - double* weighted_n_node_samples) except -1 nogil: + cdef intp_t node_reset(self, intp_t start, intp_t end, + float64_t* weighted_n_node_samples) except -1 nogil: """Reset splitter on node samples[start:end]. 
Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -137,11 +137,11 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): Parameters ---------- - start : SIZE_t + start : intp_t The index of the first sample to consider - end : SIZE_t + end : intp_t The index of the last sample to consider - weighted_n_node_samples : ndarray, dtype=double pointer + weighted_n_node_samples : ndarray, dtype=float64_t pointer The total weight of those samples """ # call parent reset @@ -154,35 +154,35 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): return 0 cdef void sample_proj_mat(self, - vector[vector[DTYPE_t]]& proj_mat_weights, - vector[vector[SIZE_t]]& proj_mat_indices) noexcept nogil: + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices) noexcept nogil: """ Sample the projection vector. This is a placeholder method. """ pass - cdef int pointer_size(self) noexcept nogil: + cdef intp_t pointer_size(self) noexcept nogil: """Get size of a pointer to record for ObliqueSplitter.""" return sizeof(ObliqueSplitRecord) cdef inline void compute_features_over_samples( self, - SIZE_t start, - SIZE_t end, - const SIZE_t[:] samples, - DTYPE_t[:] feature_values, - vector[DTYPE_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[SIZE_t]* proj_vec_indices # indices of the features (max_features,) + intp_t start, + intp_t end, + const intp_t[:] samples, + float32_t[:] feature_values, + vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) + vector[intp_t]* proj_vec_indices # indices of the features (max_features,) ) noexcept nogil: """Compute the feature values for the samples[start:end] range. Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ - cdef SIZE_t idx, jdx - cdef SIZE_t col_idx - cdef DTYPE_t col_weight + cdef intp_t idx, jdx + cdef intp_t col_idx + cdef float32_t col_weight # Compute linear combination of features and then # sort samples according to the feature values. @@ -201,24 +201,24 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): # NOTE: vectors are passed by value, so & is needed to pass by reference cdef void sample_proj_mat( self, - vector[vector[DTYPE_t]]& proj_mat_weights, - vector[vector[SIZE_t]]& proj_mat_indices + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices ) noexcept nogil: """ Sparse Oblique Projection matrix. 
Randomly sample features to put in randomly sampled projection vectors weight = 1 or -1 with probability 0.5 """ - cdef SIZE_t n_features = self.n_features - cdef SIZE_t n_non_zeros = self.n_non_zeros + cdef intp_t n_features = self.n_features + cdef intp_t n_non_zeros = self.n_non_zeros cdef UINT32_t* random_state = &self.rand_r_state - cdef int i, feat_i, proj_i, rand_vec_index - cdef DTYPE_t weight + cdef intp_t i, feat_i, proj_i, rand_vec_index + cdef float32_t weight # construct an array to sample from mTry x n_features set of indices - cdef SIZE_t[::1] indices_to_sample = self.indices_to_sample - cdef SIZE_t grid_size = self.max_features * self.n_features + cdef intp_t[::1] indices_to_sample = self.indices_to_sample + cdef intp_t grid_size = self.max_features * self.n_features # shuffle indices over the 2D grid to sample using Fisher-Yates for i in range(0, grid_size): @@ -242,13 +242,13 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero - cdef int node_split( + cdef intp_t node_split( self, - double impurity, + float64_t impurity, SplitRecord* split, - SIZE_t* n_constant_features, - double lower_bound, - double upper_bound, + intp_t* n_constant_features, + float64_t lower_bound, + float64_t upper_bound, ) except -1 nogil: """Find the best_split split on node samples[start:end] @@ -259,25 +259,25 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): cdef ObliqueSplitRecord* oblique_split = (split) # Draw random splits and pick the best_split - cdef SIZE_t[::1] samples = self.samples - cdef SIZE_t start = self.start - cdef SIZE_t end = self.end + cdef intp_t[::1] samples = self.samples + cdef intp_t start = self.start + cdef intp_t end = self.end # pointer array to store feature values to split on - cdef DTYPE_t[::1] feature_values = self.feature_values - cdef SIZE_t max_features = self.max_features - cdef SIZE_t min_samples_leaf = self.min_samples_leaf - cdef double min_weight_leaf = self.min_weight_leaf + cdef float32_t[::1] feature_values = self.feature_values + cdef intp_t max_features = self.max_features + cdef intp_t min_samples_leaf = self.min_samples_leaf + cdef float64_t min_weight_leaf = self.min_weight_leaf # keep track of split record for current_split node and the best_split split # found among the sampled projection vectors cdef ObliqueSplitRecord best_split, current_split - cdef double current_proxy_improvement = -INFINITY - cdef double best_proxy_improvement = -INFINITY + cdef float64_t current_proxy_improvement = -INFINITY + cdef float64_t best_proxy_improvement = -INFINITY - cdef SIZE_t feat_i, p # index over computed features and start/end - cdef SIZE_t partition_end - cdef DTYPE_t temp_d # to compute a projection feature value + cdef intp_t feat_i, p # index over computed features and start/end + cdef intp_t partition_end + cdef float32_t temp_d # to compute a projection feature value # instantiate the split records _init_split(&best_split, end) diff --git a/sktree/tree/unsupervised/_unsup_oblique_tree.pxd b/sktree/tree/unsupervised/_unsup_oblique_tree.pxd index 6aa891cf5..b1277b869 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_tree.pxd +++ b/sktree/tree/unsupervised/_unsup_oblique_tree.pxd @@ -12,40 +12,36 @@ cimport numpy as cnp from libcpp.vector cimport vector from ..._lib.sklearn.tree._splitter cimport SplitRecord -from ..._lib.sklearn.tree._tree cimport DOUBLE_t # Type of 
y, sample_weight -from ..._lib.sklearn.tree._tree cimport DTYPE_t # Type of X -from ..._lib.sklearn.tree._tree cimport INT32_t # Signed 32 bit integer -from ..._lib.sklearn.tree._tree cimport SIZE_t # Type for indices and counters -from ..._lib.sklearn.tree._tree cimport UINT32_t # Unsigned 32 bit integer from ..._lib.sklearn.tree._tree cimport Node +from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t from .._oblique_splitter cimport ObliqueSplitRecord from ._unsup_tree cimport UnsupervisedTree cdef class UnsupervisedObliqueTree(UnsupervisedTree): - cdef vector[vector[DTYPE_t]] proj_vec_weights # (capacity, n_features) array of projection vectors - cdef vector[vector[SIZE_t]] proj_vec_indices # (capacity, n_features) array of projection vectors + cdef vector[vector[float32_t]] proj_vec_weights # (capacity, n_features) array of projection vectors + cdef vector[vector[intp_t]] proj_vec_indices # (capacity, n_features) array of projection vectors # overridden methods - cdef int _resize_c( + cdef intp_t _resize_c( self, - SIZE_t capacity=* + intp_t capacity=* ) except -1 nogil - cdef int _set_split_node( + cdef intp_t _set_split_node( self, SplitRecord* split_node, Node *node, - SIZE_t node_id, + intp_t node_id, ) nogil except -1 - cdef DTYPE_t _compute_feature( + cdef float32_t _compute_feature( self, - const DTYPE_t[:, :] X_ndarray, - SIZE_t sample_index, + const float32_t[:, :] X_ndarray, + intp_t sample_index, Node *node ) noexcept nogil cdef void _compute_feature_importances( self, - cnp.float64_t[:] importances, + float64_t[:] importances, Node* node ) noexcept nogil diff --git a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx index 0ea70a6b6..51f6e6202 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx @@ -27,7 +27,7 @@ cdef Node dummy NODE_DTYPE = np.asarray((&dummy)).dtype # Mitigate precision differences between 32 bit and 64 bit -cdef DTYPE_t FEATURE_THRESHOLD = 1e-7 +cdef float32_t FEATURE_THRESHOLD = 1e-7 # ============================================================================= # ObliqueTree @@ -45,49 +45,49 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): Attributes ---------- - node_count : int + node_count : intp_t The number of nodes (internal nodes + leaves) in the tree. - capacity : int + capacity : intp_t The current capacity (i.e., size) of the arrays, which is at least as great as `node_count`. - max_depth : int + max_depth : intp_t The depth of the tree, i.e. the maximum depth of its leaves. - children_left : array of int, shape [node_count] + children_left : array of intp_t, shape [node_count] children_left[i] holds the node id of the left child of node i. For leaves, children_left[i] == TREE_LEAF. Otherwise, children_left[i] > i. This child handles the case where X[:, feature[i]] <= threshold[i]. - children_right : array of int, shape [node_count] + children_right : array of intp_t, shape [node_count] children_right[i] holds the node id of the right child of node i. For leaves, children_right[i] == TREE_LEAF. Otherwise, children_right[i] > i. This child handles the case where X[:, feature[i]] > threshold[i]. - feature : array of int, shape [node_count] + feature : array of intp_t, shape [node_count] feature[i] holds the feature to split on, for the internal node i. 
- threshold : array of double, shape [node_count] + threshold : array of float64_t, shape [node_count] threshold[i] holds the threshold for the internal node i. - value : array of double, shape [node_count, n_outputs, max_n_classes] + value : array of float64_t, shape [node_count, n_outputs, max_n_classes] Contains the constant prediction value of each node. - impurity : array of double, shape [node_count] + impurity : array of float64_t, shape [node_count] impurity[i] holds the impurity (i.e., the value of the splitting criterion) at node i. - n_node_samples : array of int, shape [node_count] + n_node_samples : array of intp_t, shape [node_count] n_node_samples[i] holds the number of training samples reaching node i. - weighted_n_node_samples : array of int, shape [node_count] + weighted_n_node_samples : array of intp_t, shape [node_count] weighted_n_node_samples[i] holds the weighted number of training samples reaching node i. """ - def __cinit__(self, int n_features): + def __cinit__(self, intp_t n_features): """Constructor.""" # Input/Output layout self.n_features = n_features @@ -100,8 +100,8 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): self.value = NULL self.nodes = NULL - self.proj_vec_weights = vector[vector[DTYPE_t]](self.capacity) - self.proj_vec_indices = vector[vector[SIZE_t]](self.capacity) + self.proj_vec_weights = vector[vector[float32_t]](self.capacity) + self.proj_vec_indices = vector[vector[intp_t]](self.capacity) def __reduce__(self): """Reduce re-implementation, for pickling.""" @@ -159,7 +159,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), self.capacity * sizeof(Node)) memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(double)) + self.capacity * self.value_stride * sizeof(float64_t)) cpdef cnp.ndarray get_projection_matrix(self): """Get the projection matrix of shape (node_count, n_features).""" @@ -171,7 +171,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): proj_vecs[i, feat] = weight return proj_vecs - cdef int _resize_c(self, SIZE_t capacity=SIZE_MAX) except -1 nogil: + cdef intp_t _resize_c(self, intp_t capacity=SIZE_MAX) except -1 nogil: """Guts of _resize. Additionally resizes the projection indices and weights. @@ -200,7 +200,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): # value memory is initialised to 0 to enable classifier argmax if capacity > self.capacity: memset((self.value + self.capacity * self.value_stride), 0, - (capacity - self.capacity) * self.value_stride * sizeof(double)) + (capacity - self.capacity) * self.value_stride * sizeof(float64_t)) # if capacity smaller than node_count, adjust the counter if capacity < self.node_count: @@ -209,7 +209,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): self.capacity = capacity return 0 - cdef int _set_split_node(self, SplitRecord* split_node, Node *node, SIZE_t node_id) except -1 nogil: + cdef intp_t _set_split_node(self, SplitRecord* split_node, Node *node, intp_t node_id) except -1 nogil: """Set node data. """ # Cython type cast split record into its inherited split record @@ -230,10 +230,10 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): ) return 1 - cdef DTYPE_t _compute_feature( + cdef float32_t _compute_feature( self, - const DTYPE_t[:, :] X_ndarray, - SIZE_t sample_index, + const float32_t[:, :] X_ndarray, + intp_t sample_index, Node *node ) noexcept nogil: """Compute feature from a given data matrix, X. 
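[Editor's note, not part of the patch] As the docstring above says, ``_compute_feature`` for an oblique node evaluates a sparse linear combination of the sample's columns, using the per-node ``proj_vec_indices`` / ``proj_vec_weights`` pair that ``sample_proj_mat`` fills with +1/-1 weights. A minimal Python sketch of that evaluation, illustrative only and not the Cython implementation:

import numpy as np

X_row = np.array([0.5, -1.2, 3.0, 0.7], dtype=np.float32)  # one sample
proj_vec_indices = [0, 2]       # columns participating in this node's split
proj_vec_weights = [1.0, -1.0]  # weights drawn as +1 or -1 with equal probability

proj_feat = 0.0
for j, w in zip(proj_vec_indices, proj_vec_weights):
    proj_feat += w * X_row[j]

# proj_feat (here 0.5 - 3.0 = -2.5) is the value compared against the node threshold.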
@@ -241,15 +241,15 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): In oblique-aligned trees, this is the projection of X. In this case, we take a simple linear combination of some columns of X. """ - cdef DTYPE_t proj_feat = 0.0 - cdef DTYPE_t weight = 0.0 - cdef SIZE_t j = 0 - cdef SIZE_t feature_index + cdef float32_t proj_feat = 0.0 + cdef float32_t weight = 0.0 + cdef intp_t j = 0 + cdef intp_t feature_index # get the index of the node - cdef SIZE_t node_id = node - self.nodes + cdef intp_t node_id = node - self.nodes - # cdef SIZE_t n_projections = proj_vec_indices.size() + # cdef intp_t n_projections = proj_vec_indices.size() # compute projection of the data based on trained tree # proj_vec_weights = self.proj_vec_weights[node_id] # proj_vec_indices = self.proj_vec_indices[node_id] @@ -266,7 +266,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): cdef void _compute_feature_importances( self, - cnp.float64_t[:] importances, + float64_t[:] importances, Node* node ) noexcept nogil: """Compute feature importances from a Node in the Tree. @@ -279,13 +279,13 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): cdef Node* right # get the index of the node - cdef SIZE_t node_id = node - self.nodes + cdef intp_t node_id = node - self.nodes left = &nodes[node.left_child] right = &nodes[node.right_child] - cdef int i, feature_index - cdef DTYPE_t weight + cdef intp_t i, feature_index + cdef float32_t weight for i in range(0, self.proj_vec_indices[node_id].size()): feature_index = self.proj_vec_indices[node_id][i] weight = self.proj_vec_weights[node_id][i] diff --git a/sktree/tree/unsupervised/_unsup_splitter.pxd b/sktree/tree/unsupervised/_unsup_splitter.pxd index 324d87445..702a69459 100644 --- a/sktree/tree/unsupervised/_unsup_splitter.pxd +++ b/sktree/tree/unsupervised/_unsup_splitter.pxd @@ -1,9 +1,6 @@ from ..._lib.sklearn.tree._splitter cimport BaseSplitter, SplitRecord -from ..._lib.sklearn.tree._tree cimport DOUBLE_t # Type of y, sample_weight -from ..._lib.sklearn.tree._tree cimport DTYPE_t # Type of X -from ..._lib.sklearn.tree._tree cimport INT32_t # Signed 32 bit integer -from ..._lib.sklearn.tree._tree cimport SIZE_t # Type for indices and counters -from ..._lib.sklearn.tree._tree cimport UINT32_t # Unsigned 32 bit integer +from ..._lib.sklearn.tree._utils cimport UINT32_t +from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t from ._unsup_criterion cimport UnsupervisedCriterion @@ -23,35 +20,35 @@ cdef class UnsupervisedSplitter(BaseSplitter): # XXX: requires BaseSplitter to not define "criterion" cdef public UnsupervisedCriterion criterion # criterion computer - cdef const DTYPE_t[:, :] X # feature matrix - cdef SIZE_t n_total_samples # store the total number of samples + cdef const float32_t[:, :] X # feature matrix + cdef intp_t n_total_samples # store the total number of samples # Initialization method for unsupervised splitters - cdef int init( + cdef intp_t init( self, - const DTYPE_t[:, :] X, - const DOUBLE_t[:] sample_weight + const float32_t[:, :] X, + const float64_t[:] sample_weight ) except -1 # Overridden Methods from base class - cdef int node_reset( + cdef intp_t node_reset( self, - SIZE_t start, - SIZE_t end, - double* weighted_n_node_samples + intp_t start, + intp_t end, + float64_t* weighted_n_node_samples ) except -1 nogil - cdef int node_split( + cdef intp_t node_split( self, - double impurity, # Impurity of the node + float64_t impurity, # Impurity of the node SplitRecord* split, - SIZE_t* n_constant_features, - double 
lower_bound, - double upper_bound + intp_t* n_constant_features, + float64_t lower_bound, + float64_t upper_bound ) except -1 nogil cdef void node_value( self, - double* dest + float64_t* dest ) noexcept nogil - cdef double node_impurity( + cdef float64_t node_impurity( self ) noexcept nogil diff --git a/sktree/tree/unsupervised/_unsup_splitter.pyx b/sktree/tree/unsupervised/_unsup_splitter.pyx index be97ea246..df48394a0 100644 --- a/sktree/tree/unsupervised/_unsup_splitter.pyx +++ b/sktree/tree/unsupervised/_unsup_splitter.pyx @@ -16,16 +16,16 @@ from sklearn.tree._utils cimport RAND_R_MAX, rand_int from .._sklearn_splitter cimport sort -cdef double INFINITY = np.inf +cdef float64_t INFINITY = np.inf # Mitigate precision differences between 32 bit and 64 bit -cdef DTYPE_t FEATURE_THRESHOLD = 1e-7 +cdef float32_t FEATURE_THRESHOLD = 1e-7 # Constant to switch between algorithm non zero value extract algorithm # in SparseSplitter -cdef DTYPE_t EXTRACT_NNZ_SWITCH = 0.1 +cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 -cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) noexcept nogil: +cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil: self.impurity_left = INFINITY self.impurity_right = INFINITY self.pos = start_pos @@ -37,8 +37,8 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) noexcept nogil cdef class UnsupervisedSplitter(BaseSplitter): """Base class for unsupervised splitters.""" - def __cinit__(self, UnsupervisedCriterion criterion, SIZE_t max_features, - SIZE_t min_samples_leaf, double min_weight_leaf, + def __cinit__(self, UnsupervisedCriterion criterion, intp_t max_features, + intp_t min_samples_leaf, float64_t min_weight_leaf, object random_state, *argv): """ Parameters @@ -46,16 +46,16 @@ cdef class UnsupervisedSplitter(BaseSplitter): criterion : Criterion The criterion to measure the quality of a split. - max_features : SIZE_t + max_features : intp_t The maximal number of randomly selected features which can be considered for a split. - min_samples_leaf : SIZE_t + min_samples_leaf : intp_t The minimal number of samples each leaf can have, where splits which would result in having less samples in a leaf are not considered. - min_weight_leaf : double + min_weight_leaf : float64_t The minimal weight each leaf can have, where the weight is the sum of the weights of each sample in it. 
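[Editor's note, not part of the patch] The ``min_samples_leaf`` and ``min_weight_leaf`` parameters documented above gate which split positions are admissible. The sketch below is a hedged illustration of that check, not the library's API; the function name, signature, and the assumption that the weight array is already ordered to match ``samples[start:end]`` are all mine.

def split_is_allowed(pos, start, end, node_weights, min_samples_leaf, min_weight_leaf):
    # samples[start:pos] go to the left child, samples[pos:end] to the right child
    n_left, n_right = pos - start, end - pos
    w_left = sum(node_weights[start:pos])
    w_right = sum(node_weights[pos:end])
    return (
        n_left >= min_samples_leaf
        and n_right >= min_samples_leaf
        and w_left >= min_weight_leaf
        and w_right >= min_weight_leaf
    )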
@@ -79,21 +79,21 @@ cdef class UnsupervisedSplitter(BaseSplitter): self.min_weight_leaf, self.random_state), self.__getstate__()) - cdef int init( + cdef intp_t init( self, - const DTYPE_t[:, :] X, - const DOUBLE_t[:] sample_weight + const float32_t[:, :] X, + const float64_t[:] sample_weight ) except -1: self.rand_r_state = self.random_state.randint(0, RAND_R_MAX) - cdef SIZE_t n_samples = X.shape[0] + cdef intp_t n_samples = X.shape[0] # Create a new array which will be used to store nonzero # samples from the feature of interest self.samples = np.empty(n_samples, dtype=np.intp) - cdef SIZE_t[::1] samples = self.samples + cdef intp_t[::1] samples = self.samples - cdef SIZE_t i, j - cdef double weighted_n_samples = 0.0 + cdef intp_t i, j + cdef float64_t weighted_n_samples = 0.0 j = 0 for i in range(n_samples): @@ -111,7 +111,7 @@ cdef class UnsupervisedSplitter(BaseSplitter): self.n_samples = j self.weighted_n_samples = weighted_n_samples - cdef SIZE_t n_features = X.shape[1] + cdef intp_t n_features = X.shape[1] self.features = np.arange(n_features, dtype=np.intp) self.n_features = n_features @@ -136,8 +136,8 @@ cdef class UnsupervisedSplitter(BaseSplitter): ) return 0 - cdef int node_reset(self, SIZE_t start, SIZE_t end, - double* weighted_n_node_samples) except -1 nogil: + cdef intp_t node_reset(self, intp_t start, intp_t end, + float64_t* weighted_n_node_samples) except -1 nogil: """Reset splitter on node samples[start:end]. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -145,11 +145,11 @@ cdef class UnsupervisedSplitter(BaseSplitter): Parameters ---------- - start : SIZE_t + start : intp_t The index of the first sample to consider - end : SIZE_t + end : intp_t The index of the last sample to consider - weighted_n_node_samples : ndarray, dtype=double pointer + weighted_n_node_samples : ndarray, dtype=float64_t pointer The total weight of those samples """ @@ -161,12 +161,12 @@ cdef class UnsupervisedSplitter(BaseSplitter): weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples return 0 - cdef void node_value(self, double* dest) noexcept nogil: + cdef void node_value(self, float64_t* dest) noexcept nogil: """Copy the value of node samples[start:end] into dest.""" self.criterion.node_value(dest) - cdef double node_impurity(self) noexcept nogil: + cdef float64_t node_impurity(self) noexcept nogil: """Return the impurity of the current_split node.""" return self.criterion.node_impurity() @@ -174,13 +174,13 @@ cdef class UnsupervisedSplitter(BaseSplitter): cdef class BestUnsupervisedSplitter(UnsupervisedSplitter): """Split in a best_split-first fashion. """ - cdef int node_split( + cdef intp_t node_split( self, - double impurity, + float64_t impurity, SplitRecord* split, - SIZE_t* n_constant_features, - double lower_bound, - double upper_bound + intp_t* n_constant_features, + float64_t lower_bound, + float64_t upper_bound ) except -1 nogil: """Find the best_split split on node samples[start:end]. @@ -195,40 +195,40 @@ cdef class BestUnsupervisedSplitter(UnsupervisedSplitter): be cimportable. 
""" # Find the best_split split - cdef SIZE_t[::1] samples = self.samples - cdef SIZE_t start = self.start - cdef SIZE_t end = self.end + cdef intp_t[::1] samples = self.samples + cdef intp_t start = self.start + cdef intp_t end = self.end - cdef SIZE_t[::1] features = self.features - cdef SIZE_t[::1] constant_features = self.constant_features - cdef SIZE_t n_features = self.n_features + cdef intp_t[::1] features = self.features + cdef intp_t[::1] constant_features = self.constant_features + cdef intp_t n_features = self.n_features - cdef DTYPE_t[::1] feature_values = self.feature_values - cdef SIZE_t max_features = self.max_features - cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef float32_t[::1] feature_values = self.feature_values + cdef intp_t max_features = self.max_features + cdef intp_t min_samples_leaf = self.min_samples_leaf cdef UINT32_t* random_state = &self.rand_r_state # XXX: maybe need to rename to something else - cdef double min_weight_leaf = self.min_weight_leaf + cdef float64_t min_weight_leaf = self.min_weight_leaf cdef SplitRecord best_split, current_split - cdef double current_proxy_improvement = -INFINITY - cdef double best_proxy_improvement = -INFINITY + cdef float64_t current_proxy_improvement = -INFINITY + cdef float64_t best_proxy_improvement = -INFINITY - cdef SIZE_t f_i = n_features - cdef SIZE_t f_j - cdef SIZE_t p - cdef SIZE_t i + cdef intp_t f_i = n_features + cdef intp_t f_j + cdef intp_t p + cdef intp_t i - cdef SIZE_t n_visited_features = 0 + cdef intp_t n_visited_features = 0 # Number of features discovered to be constant during the split search - cdef SIZE_t n_found_constants = 0 + cdef intp_t n_found_constants = 0 # Number of features known to be constant and drawn without replacement - cdef SIZE_t n_drawn_constants = 0 - cdef SIZE_t n_known_constants = n_constant_features[0] + cdef intp_t n_drawn_constants = 0 + cdef intp_t n_known_constants = n_constant_features[0] # n_total_constants = n_known_constants + n_found_constants - cdef SIZE_t n_total_constants = n_known_constants - cdef SIZE_t partition_end + cdef intp_t n_total_constants = n_known_constants + cdef intp_t partition_end _init_split(&best_split, end) @@ -372,12 +372,12 @@ cdef class BestUnsupervisedSplitter(UnsupervisedSplitter): # Respect invariant for constant features: the original order of # element in features[:n_known_constants] must be preserved for sibling # and child nodes - memcpy(&features[0], &constant_features[0], sizeof(SIZE_t) * n_known_constants) + memcpy(&features[0], &constant_features[0], sizeof(intp_t) * n_known_constants) # Copy newly found constant features memcpy(&constant_features[n_known_constants], &features[n_known_constants], - sizeof(SIZE_t) * n_found_constants) + sizeof(intp_t) * n_found_constants) # Return values split[0] = best_split diff --git a/sktree/tree/unsupervised/_unsup_tree.pxd b/sktree/tree/unsupervised/_unsup_tree.pxd index 329766812..fe4a4630c 100644 --- a/sktree/tree/unsupervised/_unsup_tree.pxd +++ b/sktree/tree/unsupervised/_unsup_tree.pxd @@ -11,12 +11,8 @@ import numpy as np cimport numpy as cnp from ..._lib.sklearn.tree._splitter cimport SplitRecord -from ..._lib.sklearn.tree._tree cimport DOUBLE_t # Type of y, sample_weight -from ..._lib.sklearn.tree._tree cimport DTYPE_t # Type of X -from ..._lib.sklearn.tree._tree cimport INT32_t # Signed 32 bit integer -from ..._lib.sklearn.tree._tree cimport SIZE_t # Type for indices and counters -from ..._lib.sklearn.tree._tree cimport UINT32_t # Unsigned 32 bit integer from 
..._lib.sklearn.tree._tree cimport BaseTree, Node +from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t from ._unsup_splitter cimport UnsupervisedSplitter @@ -28,27 +24,27 @@ cdef class UnsupervisedTree(BaseTree): # # Inner structures: values are stored separately from node structure, # since size is determined at runtime. - # cdef double* value # (capacity) array of values - # cdef SIZE_t value_stride # = 1 + # cdef float64_t* value # (capacity) array of values + # cdef intp_t value_stride # = 1 # Input/Output layout - cdef public SIZE_t n_features # Number of features in X + cdef public intp_t n_features # Number of features in X # Methods cdef cnp.ndarray _get_value_ndarray(self) cdef cnp.ndarray _get_node_ndarray(self) # Overridden Methods - cdef int _set_split_node( + cdef intp_t _set_split_node( self, SplitRecord* split_node, Node* node, - SIZE_t node_id + intp_t node_id ) except -1 nogil - cdef DTYPE_t _compute_feature( + cdef float32_t _compute_feature( self, - const DTYPE_t[:, :] X_ndarray, - SIZE_t sample_index, + const float32_t[:, :] X_ndarray, + intp_t sample_index, Node *node ) noexcept nogil cdef void _compute_feature_importances( @@ -71,20 +67,20 @@ cdef class UnsupervisedTreeBuilder: cdef UnsupervisedSplitter splitter # Splitting algorithm - cdef SIZE_t min_samples_split # Minimum number of samples in an internal node - cdef SIZE_t min_samples_leaf # Minimum number of samples in a leaf - cdef double min_weight_leaf # Minimum weight in a leaf - cdef SIZE_t max_depth # Maximal tree depth - cdef double min_impurity_decrease # Impurity threshold for early stopping + cdef intp_t min_samples_split # Minimum number of samples in an internal node + cdef intp_t min_samples_leaf # Minimum number of samples in a leaf + cdef float64_t min_weight_leaf # Minimum weight in a leaf + cdef intp_t max_depth # Maximal tree depth + cdef float64_t min_impurity_decrease # Impurity threshold for early stopping cpdef build( self, UnsupervisedTree tree, object X, - const DOUBLE_t[:] sample_weight=* + const float64_t[:] sample_weight=* ) cdef _check_input( self, object X, - const DOUBLE_t[:] sample_weight + const float64_t[:] sample_weight ) diff --git a/sktree/tree/unsupervised/_unsup_tree.pyx b/sktree/tree/unsupervised/_unsup_tree.pyx index df4217ce7..491a2f920 100644 --- a/sktree/tree/unsupervised/_unsup_tree.pyx +++ b/sktree/tree/unsupervised/_unsup_tree.pyx @@ -29,10 +29,10 @@ cnp.import_array() cdef extern from "numpy/arrayobject.h": object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr, - int nd, cnp.npy_intp* dims, + intp_t nd, cnp.npy_intp* dims, cnp.npy_intp* strides, - void* data, int flags, object obj) - int PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj) + void* data, intp_t flags, object obj) + intp_t PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj) cdef extern from "" namespace "std" nogil: cdef cppclass stack[T]: @@ -51,19 +51,19 @@ from numpy import float32 as DTYPE from numpy import float64 as DOUBLE -cdef double INFINITY = np.inf -cdef double EPSILON = np.finfo('double').eps +cdef float64_t INFINITY = np.inf +cdef float64_t EPSILON = np.finfo('double').eps # Some handy constants (BestFirstTreeBuilder) -cdef int IS_FIRST = 1 -cdef int IS_NOT_FIRST = 0 -cdef int IS_LEFT = 1 -cdef int IS_NOT_LEFT = 0 +cdef intp_t IS_FIRST = 1 +cdef intp_t IS_NOT_FIRST = 0 +cdef intp_t IS_LEFT = 1 +cdef intp_t IS_NOT_LEFT = 0 TREE_LEAF = -1 TREE_UNDEFINED = -2 -cdef SIZE_t _TREE_LEAF = TREE_LEAF -cdef SIZE_t _TREE_UNDEFINED = TREE_UNDEFINED +cdef intp_t 
_TREE_LEAF = TREE_LEAF +cdef intp_t _TREE_UNDEFINED = TREE_UNDEFINED # Build the corresponding numpy dtype for Node. # This works by casting `dummy` to an array of Node of length 1, which numpy @@ -83,7 +83,7 @@ cdef class UnsupervisedTreeBuilder: self, UnsupervisedTree tree, object X, - const DOUBLE_t[:] sample_weight=None + const float64_t[:] sample_weight=None ): """Build a decision tree from the training set X.""" pass @@ -91,7 +91,7 @@ cdef class UnsupervisedTreeBuilder: cdef inline _check_input( self, object X, - const DOUBLE_t[:] sample_weight, + const float64_t[:] sample_weight, ): """Check input dtype, layout and format""" if issparse(X): @@ -121,27 +121,27 @@ cdef struct FrontierRecord: # Record of information of a Node, the frontier for a split. Those records are # maintained in a heap to access the Node with the best improvement in impurity, # allowing growing trees greedily on this improvement. - SIZE_t node_id - SIZE_t start - SIZE_t end - SIZE_t pos - SIZE_t depth + intp_t node_id + intp_t start + intp_t end + intp_t pos + intp_t depth bint is_leaf - double impurity - double impurity_left - double impurity_right - double improvement + float64_t impurity + float64_t impurity_left + float64_t impurity_right + float64_t improvement # Depth first builder --------------------------------------------------------- # A record on the stack for depth-first tree growing cdef struct StackRecord: - SIZE_t start - SIZE_t end - SIZE_t depth - SIZE_t parent + intp_t start + intp_t end + intp_t depth + intp_t parent bint is_left - double impurity - SIZE_t n_constant_features + float64_t impurity + intp_t n_constant_features cdef inline bool _compare_records( const FrontierRecord& left, @@ -164,17 +164,17 @@ cdef class UnsupervisedBestFirstTreeBuilder(UnsupervisedTreeBuilder): The best node to expand is given by the node at the frontier that has the highest impurity improvement. 
""" - cdef SIZE_t max_leaf_nodes + cdef intp_t max_leaf_nodes def __cinit__( self, UnsupervisedSplitter splitter, - SIZE_t min_samples_split, - SIZE_t min_samples_leaf, - double min_weight_leaf, - SIZE_t max_depth, - SIZE_t max_leaf_nodes, - double min_impurity_decrease + intp_t min_samples_split, + intp_t min_samples_leaf, + float64_t min_weight_leaf, + intp_t max_depth, + intp_t max_leaf_nodes, + float64_t min_impurity_decrease ): self.splitter = splitter self.min_samples_split = min_samples_split @@ -188,7 +188,7 @@ cdef class UnsupervisedBestFirstTreeBuilder(UnsupervisedTreeBuilder): self, UnsupervisedTree tree, object X, - const DOUBLE_t[:] sample_weight=None + const float64_t[:] sample_weight=None ): """Build a decision tree from the training set X.""" # check input @@ -196,7 +196,7 @@ cdef class UnsupervisedBestFirstTreeBuilder(UnsupervisedTreeBuilder): # Parameters cdef UnsupervisedSplitter splitter = self.splitter - cdef SIZE_t max_leaf_nodes = self.max_leaf_nodes + cdef intp_t max_leaf_nodes = self.max_leaf_nodes # Recursive partition (without actual recursion) splitter.init(X, sample_weight) @@ -206,15 +206,15 @@ cdef class UnsupervisedBestFirstTreeBuilder(UnsupervisedTreeBuilder): cdef FrontierRecord split_node_left cdef FrontierRecord split_node_right - cdef SIZE_t n_node_samples = splitter.n_samples - cdef SIZE_t max_split_nodes = max_leaf_nodes - 1 + cdef intp_t n_node_samples = splitter.n_samples + cdef intp_t max_split_nodes = max_leaf_nodes - 1 cdef bint is_leaf - cdef SIZE_t max_depth_seen = -1 - cdef int rc = 0 + cdef intp_t max_depth_seen = -1 + cdef intp_t rc = 0 cdef Node* node # Initial capacity - cdef SIZE_t init_capacity = max_split_nodes + max_leaf_nodes + cdef intp_t init_capacity = max_split_nodes + max_leaf_nodes tree._resize(init_capacity) with nogil: @@ -285,17 +285,17 @@ cdef class UnsupervisedBestFirstTreeBuilder(UnsupervisedTreeBuilder): if rc == -1: raise MemoryError() - cdef inline int _add_split_node( + cdef inline intp_t _add_split_node( self, UnsupervisedSplitter splitter, UnsupervisedTree tree, - SIZE_t start, - SIZE_t end, - double impurity, + intp_t start, + intp_t end, + float64_t impurity, bint is_first, bint is_left, Node* parent, - SIZE_t depth, + intp_t depth, FrontierRecord* res ) except -1 nogil: """Adds node w/ partition ``[start, end)`` to the frontier. 
""" @@ -305,11 +305,11 @@ cdef class UnsupervisedBestFirstTreeBuilder(UnsupervisedTreeBuilder): cdef SplitRecord split cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) - cdef SIZE_t node_id - cdef SIZE_t n_node_samples - cdef SIZE_t n_constant_features = 0 - cdef double min_impurity_decrease = self.min_impurity_decrease - cdef double weighted_n_node_samples + cdef intp_t node_id + cdef intp_t n_node_samples + cdef intp_t n_constant_features = 0 + cdef float64_t min_impurity_decrease = self.min_impurity_decrease + cdef float64_t weighted_n_node_samples cdef bint is_leaf splitter.node_reset(start, end, &weighted_n_node_samples) @@ -382,11 +382,11 @@ cdef class UnsupervisedDepthFirstTreeBuilder(UnsupervisedTreeBuilder): def __cinit__( self, UnsupervisedSplitter splitter, - SIZE_t min_samples_split, - SIZE_t min_samples_leaf, - double min_weight_leaf, - SIZE_t max_depth, - double min_impurity_decrease + intp_t min_samples_split, + intp_t min_samples_leaf, + float64_t min_weight_leaf, + intp_t max_depth, + float64_t min_impurity_decrease ): self.splitter = splitter self.min_samples_split = min_samples_split @@ -399,7 +399,7 @@ cdef class UnsupervisedDepthFirstTreeBuilder(UnsupervisedTreeBuilder): self, UnsupervisedTree tree, object X, - const DOUBLE_t[:] sample_weight=None + const float64_t[:] sample_weight=None ): """Build a decision tree from the training set (X, y).""" @@ -407,10 +407,10 @@ cdef class UnsupervisedDepthFirstTreeBuilder(UnsupervisedTreeBuilder): X, sample_weight = self._check_input(X, sample_weight) # Initial capacity - cdef int init_capacity + cdef intp_t init_capacity if tree.max_depth <= 10: - init_capacity = (2 ** (tree.max_depth + 1)) - 1 + init_capacity = (2 ** (tree.max_depth + 1)) - 1 else: init_capacity = 2047 @@ -418,23 +418,23 @@ cdef class UnsupervisedDepthFirstTreeBuilder(UnsupervisedTreeBuilder): # Parameters cdef UnsupervisedSplitter splitter = self.splitter - cdef SIZE_t max_depth = self.max_depth - cdef SIZE_t min_samples_leaf = self.min_samples_leaf - cdef double min_weight_leaf = self.min_weight_leaf - cdef SIZE_t min_samples_split = self.min_samples_split - cdef double min_impurity_decrease = self.min_impurity_decrease + cdef intp_t max_depth = self.max_depth + cdef intp_t min_samples_leaf = self.min_samples_leaf + cdef float64_t min_weight_leaf = self.min_weight_leaf + cdef intp_t min_samples_split = self.min_samples_split + cdef float64_t min_impurity_decrease = self.min_impurity_decrease # Recursive partition (without actual recursion) splitter.init(X, sample_weight) - cdef SIZE_t start - cdef SIZE_t end - cdef SIZE_t depth - cdef SIZE_t parent + cdef intp_t start + cdef intp_t end + cdef intp_t depth + cdef intp_t parent cdef bint is_left - cdef SIZE_t n_node_samples = splitter.n_samples - cdef double weighted_n_node_samples - cdef SIZE_t node_id + cdef intp_t n_node_samples = splitter.n_samples + cdef float64_t weighted_n_node_samples + cdef intp_t node_id # initialize record to keep track of split node data and a pointer to the # memory address containing the split node @@ -442,12 +442,12 @@ cdef class UnsupervisedDepthFirstTreeBuilder(UnsupervisedTreeBuilder): cdef SplitRecord split cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) - cdef double impurity = INFINITY - cdef SIZE_t n_constant_features + cdef float64_t impurity = INFINITY + cdef intp_t n_constant_features cdef bint is_leaf cdef bint first = 1 - cdef SIZE_t max_depth_seen = -1 - cdef int rc = 0 + cdef intp_t max_depth_seen = -1 + cdef intp_t rc = 0 cdef 
stack[StackRecord] builder_stack cdef StackRecord stack_record @@ -572,45 +572,45 @@ cdef class UnsupervisedTree(BaseTree): Attributes ---------- - node_count : int + node_count : intp_t The number of nodes (internal nodes + leaves) in the tree. - capacity : int + capacity : intp_t The current capacity (i.e., size) of the arrays, which is at least as great as `node_count`. - max_depth : int + max_depth : intp_t The depth of the tree, i.e. the maximum depth of its leaves. - children_left : array of int, shape [node_count] + children_left : array of intp_t, shape [node_count] children_left[i] holds the node id of the left child of node i. For leaves, children_left[i] == TREE_LEAF. Otherwise, children_left[i] > i. This child handles the case where X[:, feature[i]] <= threshold[i]. - children_right : array of int, shape [node_count] + children_right : array of intp_t, shape [node_count] children_right[i] holds the node id of the right child of node i. For leaves, children_right[i] == TREE_LEAF. Otherwise, children_right[i] > i. This child handles the case where X[:, feature[i]] > threshold[i]. - feature : array of int, shape [node_count] + feature : array of intp_t, shape [node_count] feature[i] holds the feature to split on, for the internal node i. - threshold : array of double, shape [node_count] + threshold : array of float64_t, shape [node_count] threshold[i] holds the threshold for the internal node i. - value : array of double, shape [node_count] + value : array of float64_t, shape [node_count] Contains the constant prediction value of each node. - impurity : array of double, shape [node_count] + impurity : array of float64_t, shape [node_count] impurity[i] holds the impurity (i.e., the value of the splitting criterion) at node i. - n_node_samples : array of int, shape [node_count] + n_node_samples : array of intp_t, shape [node_count] n_node_samples[i] holds the number of training samples reaching node i. - weighted_n_node_samples : array of double, shape [node_count] + weighted_n_node_samples : array of float64_t, shape [node_count] weighted_n_node_samples[i] holds the weighted number of training samples reaching node i. """ @@ -655,7 +655,7 @@ cdef class UnsupervisedTree(BaseTree): def value(self): return self._get_value_ndarray()[:self.node_count] - def __cinit__(self, int n_features): + def __cinit__(self, intp_t n_features): """Constructor.""" # Input/Output layout self.n_features = n_features @@ -718,7 +718,7 @@ cdef class UnsupervisedTree(BaseTree): memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), self.capacity * sizeof(Node)) memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(double)) + self.capacity * self.value_stride * sizeof(float64_t)) cdef cnp.ndarray _get_value_ndarray(self): """Wraps value as a 3-d NumPy array. 
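[Editor's note, not part of the patch] The attribute documentation above (``children_left``, ``children_right``, ``feature``, ``threshold``) describes the flat arrays that encode a fitted tree. A small illustrative traversal for the axis-aligned case follows; oblique nodes would replace ``x[feature[node]]`` with the projected value produced by ``_compute_feature``.

TREE_LEAF = -1

def apply_one(x, children_left, children_right, feature, threshold):
    node = 0  # start at the root
    while children_left[node] != TREE_LEAF:
        if x[feature[node]] <= threshold[node]:
            node = children_left[node]
        else:
            node = children_right[node]
    return node  # id of the leaf that sample x reaches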
@@ -784,12 +784,12 @@ def _check_value_ndarray(value_ndarray, expected_dtype, expected_shape): ) -def _dtype_to_dict(dtype): +def _float32_to_dict(dtype): return {name: dt.str for name, (dt, *rest) in dtype.fields.items()} def _dtype_dict_with_modified_bitness(dtype_dict): - # field names in Node struct with SIZE_t types (see sklearn/tree/_tree.pxd) + # field names in Node struct with intp_t types (see sklearn/tree/_tree.pxd) indexing_field_names = ["left_child", "right_child", "feature", "n_node_samples"] expected_dtype_size = str(struct.calcsize("P")) @@ -805,7 +805,7 @@ def _dtype_dict_with_modified_bitness(dtype_dict): def _all_compatible_dtype_dicts(dtype): - # The Cython code for decision trees uses platform-specific SIZE_t + # The Cython code for decision trees uses platform-specific intp_t # typed indexing fields that correspond to either i4 or i8 dtypes for # the matching fields in the numpy array depending on the bitness of # the platform (32 bit or 64 bit respectively). @@ -821,9 +821,9 @@ def _all_compatible_dtype_dicts(dtype): # saved can have a different endianness than the machine where the pickle # is loaded - dtype_dict = _dtype_to_dict(dtype) + dtype_dict = _float32_to_dict(dtype) dtype_dict_with_modified_bitness = _dtype_dict_with_modified_bitness(dtype_dict) - dtype_dict_with_modified_endianness = _dtype_to_dict(dtype.newbyteorder()) + dtype_dict_with_modified_endianness = _float32_to_dict(dtype.newbyteorder()) dtype_dict_with_modified_bitness_and_endianness = _dtype_dict_with_modified_bitness( dtype_dict_with_modified_endianness ) @@ -852,7 +852,7 @@ def _check_node_ndarray(node_ndarray, expected_dtype): if node_ndarray_dtype == expected_dtype: return node_ndarray - node_ndarray_dtype_dict = _dtype_to_dict(node_ndarray_dtype) + node_ndarray_dtype_dict = _float32_to_dict(node_ndarray_dtype) all_compatible_dtype_dicts = _all_compatible_dtype_dicts(expected_dtype) if node_ndarray_dtype_dict not in all_compatible_dtype_dicts: From ce36d7bae71222fc1c7368060fc23c99a636c862 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 12 Oct 2023 15:47:19 -0400 Subject: [PATCH 5/9] Adding multiview dtc example Signed-off-by: Adam Li --- examples/multiview/plot_multiview_dtc.py | 83 ++++++++++++------- .../plot_multiview_axis_aligned_splitter.py | 2 +- 2 files changed, 54 insertions(+), 31 deletions(-) diff --git a/examples/multiview/plot_multiview_dtc.py b/examples/multiview/plot_multiview_dtc.py index 84089547e..37629093c 100644 --- a/examples/multiview/plot_multiview_dtc.py +++ b/examples/multiview/plot_multiview_dtc.py @@ -3,34 +3,40 @@ Analyze a multi-view dataset with a multi-view random forest ============================================================ -An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric -multivariate hypothesis test, on simulated datasets. Here, we present a simulation -of how MIGHT is used to evaluate how a "feature set is important for predicting the target". - -We simulate a dataset with 1000 features, 500 samples, and a binary class target -variable. Within each feature set, there is 500 features associated with one feature -set, and another 500 features associated with another feature set. One could think of -these for example as different datasets collected on the same patient in a biomedical setting. -The first feature set (X) is strongly correlated with the target, and the second -feature set (W) is weakly correlated with the target (y). - -We then use MIGHT to calculate the partial AUC of these sets. 
+An example using :class:`~sktree.MultiViewRandomForestClassifier` for high-dimensional +classification when there are multiple feature sets that are correlated with the +target variable, ``y``. The multi-view random forest is a variant of the random forest +that samples from each feature set uniformly, instead of sampling from all features +uniformly. This is useful when there are multiple feature sets, and some feature sets +have vastly different dimensionality from others. + +In this case, ``X`` is a matrix of shape ``(n_samples, n_features)``, where +``n_features`` is the sum of the number of features in each feature set. If the multi-view +structure is known, then one can pass this to the multi-view random forest via the +``feature_set_ends`` parameter. + +For a visualization of how the multi-view splitter works, see +:ref:`sphx_glr_auto_examples_splitters_plot_multiview_axis_aligned_splitter.py`. """ +from collections import defaultdict + +import matplotlib.pyplot as plt import numpy as np -from sklearn.datasets import make_blobs -from sklearn.model_selection import cross_val_score import pandas as pd import seaborn as sns -import matplotlib.pyplot as plt -from collections import defaultdict +from sklearn.datasets import make_blobs +from sklearn.model_selection import cross_val_score + from sktree import MultiViewRandomForestClassifier, RandomForestClassifier seed = 12345 rng = np.random.default_rng(seed) -def make_multiview_classification(n_samples=100, n_features_1=5, n_features_2=1000, cluster_std=2.0, seed=None): +def make_multiview_classification( + n_samples=100, n_features_1=5, n_features_2=1000, cluster_std=2.0, seed=None +): rng = np.random.default_rng(seed=seed) # Create a high-dimensional multiview dataset with a low-dimensional informative @@ -56,8 +62,11 @@ def make_multiview_classification(n_samples=100, n_features_1=5, n_features_2=10 X = np.vstack((X0, X1)) y = np.hstack((y0, y1)).T + X = X + rng.standard_normal(size=X.shape) + return X, y + # %% # Simulate data # ------------- @@ -66,12 +75,18 @@ def make_multiview_classification(n_samples=100, n_features_1=5, n_features_2=10 # dimensions. The sample-size will be kept fixed, so we can compare the performance of # regular Random forests with Multi-view Random Forests. -n_samples = 1000 -n_features_views = np.linspace(5, 1000, 2).astype(int) +n_samples = 100 +n_features_views = np.linspace(5, 10000, 5).astype(int) datasets = [] for idx, n_features in enumerate(n_features_views): - X, y = make_multiview_classification(n_samples=n_samples, n_features_1=5, n_features_2=n_features, cluster_std=2.0, seed=seed + idx) + X, y = make_multiview_classification( + n_samples=n_samples, + n_features_1=5, + n_features_2=n_features, + cluster_std=2.0, + seed=seed + idx, + ) datasets.append((X, y)) # %% @@ -79,46 +94,54 @@ def make_multiview_classification(n_samples=100, n_features_1=5, n_features_2=10 # ---------------------------------------------- # Here, we fit both forests over all the datasets. 
-n_estimators = 50 +n_estimators = 100 n_jobs = -1 scores = defaultdict(list) -for ((X, y), n_features) in zip(datasets, n_features_views): +for idx, ((X, y), n_features) in enumerate(zip(datasets, n_features_views)): feature_set_ends = np.array([5, n_features + 5]) rf = RandomForestClassifier( n_estimators=n_estimators, n_jobs=n_jobs, + random_state=seed, ) mvrf = MultiViewRandomForestClassifier( n_estimators=n_estimators, n_jobs=n_jobs, feature_set_ends=n_features_views, + random_state=seed, ) # obtain the cross-validation score rf_score = cross_val_score(rf, X, y, cv=2).mean() mvrf_score = cross_val_score(mvrf, X, y, cv=2).mean() - scores['rf'].append(rf_score) - scores['mvrf'].append(mvrf_score) + scores["rf"].append(rf_score) + scores["mvrf"].append(mvrf_score) # %% # Visualize scores and compare performance # ---------------------------------------- # Now, we can compare the performance from the cross-validation experiment. -scores['n_features'] = n_features_views +scores["n_features"] = n_features_views df = pd.DataFrame(scores) # melt the dataframe, to make it easier to plot -df = pd.melt(df, id_vars=['n_features'], var_name='model', value_name='score') +df = pd.melt(df, id_vars=["n_features"], var_name="model", value_name="score") fig, ax = plt.subplots() -sns.lineplot(data=df, x='n_features', y='score', hue='model', label='CV Score', ax=ax) -ax.set_ylabel('CV Score') -ax.set_xlabel('Number of features in second view') -ax.set_title('Random Forest vs Multi-view Random Forest') +sns.lineplot(data=df, x="n_features", y="score", marker="o", hue="model", ax=ax) +ax.set_ylabel("CV Score") +ax.set_xlabel("Number of features in second view") +ax.set_title("Random Forest vs Multi-view Random Forest") plt.show() + +# %% +# As we can see, the multi-view random forest outperforms the regular random forest +# as the number of features in the second view increases. This is because the multi-view +# random forest samples from each feature-set uniformly, while the regular random forest +# samples from all features uniformly. This is a key difference between the two forests. diff --git a/examples/splitters/plot_multiview_axis_aligned_splitter.py b/examples/splitters/plot_multiview_axis_aligned_splitter.py index 1939d4599..7ceb200da 100644 --- a/examples/splitters/plot_multiview_axis_aligned_splitter.py +++ b/examples/splitters/plot_multiview_axis_aligned_splitter.py @@ -124,4 +124,4 @@ # This is the key difference between the two splitters. # # For an example of using the multi-view splitter in practice on simulated data, see -# :ref:`sphx_glr_auto_examples_plot_multiview_dtc.py`. +# :ref:`sphx_glr_auto_examples_multiview_plot_multiview_dtc.py`. 
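[Editor's note, not part of the patch] For readers following the example above: ``feature_set_ends`` encodes the multi-view structure as the cumulative column index at which each view ends, which is what the per-dataset array built inside the loop represents. A minimal sketch with illustrative values:

import numpy as np

n_features_1 = 5      # width of the first (informative) view
n_features_2 = 1000   # width of the second (high-dimensional) view

# columns [0, 5) belong to view 1, columns [5, 5 + n_features_2) to view 2
feature_set_ends = np.array([n_features_1, n_features_1 + n_features_2])

# this per-dataset array is the value to pass as
# MultiViewRandomForestClassifier(..., feature_set_ends=feature_set_ends)
# so that split candidates are sampled uniformly within each view.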
From c206463490b1f7c140379180c1b1526caac72dd7 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 12 Oct 2023 15:53:34 -0400 Subject: [PATCH 6/9] Organize examples Signed-off-by: Adam Li --- examples/calibration/README.txt | 6 + .../plot_overlapping_gaussians.py | 0 ...gigantic_hypothesis_testing_forest copy.py | 138 ------------------ examples/plot_multiview_dtc.py | 127 ---------------- examples/sklearn_vs_sktree/README.txt | 6 + .../{ => sklearn_vs_sktree}/plot_iris_dtc.py | 0 examples/sparse_oblique_trees/README.txt | 6 + .../plot_extra_oblique_random_forest.py | 0 .../plot_extra_orf_sample_size.py | 0 ...ique_axis_aligned_forests_sparse_parity.py | 0 .../plot_oblique_forests_iris.py | 0 .../plot_oblique_random_forest.py | 0 12 files changed, 18 insertions(+), 265 deletions(-) create mode 100644 examples/calibration/README.txt rename examples/{ => calibration}/plot_overlapping_gaussians.py (100%) delete mode 100644 examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest copy.py delete mode 100644 examples/plot_multiview_dtc.py create mode 100644 examples/sklearn_vs_sktree/README.txt rename examples/{ => sklearn_vs_sktree}/plot_iris_dtc.py (100%) create mode 100644 examples/sparse_oblique_trees/README.txt rename examples/{ => sparse_oblique_trees}/plot_extra_oblique_random_forest.py (100%) rename examples/{ => sparse_oblique_trees}/plot_extra_orf_sample_size.py (100%) rename examples/{ => sparse_oblique_trees}/plot_oblique_axis_aligned_forests_sparse_parity.py (100%) rename examples/{ => sparse_oblique_trees}/plot_oblique_forests_iris.py (100%) rename examples/{ => sparse_oblique_trees}/plot_oblique_random_forest.py (100%) diff --git a/examples/calibration/README.txt b/examples/calibration/README.txt new file mode 100644 index 000000000..84d5f5758 --- /dev/null +++ b/examples/calibration/README.txt @@ -0,0 +1,6 @@ +.. _calibration_examples: + +Calibrated decision trees via honesty +------------------------------------- + +Examples demonstrating the usage of honest decision trees to obtain calibrated predictions. diff --git a/examples/plot_overlapping_gaussians.py b/examples/calibration/plot_overlapping_gaussians.py similarity index 100% rename from examples/plot_overlapping_gaussians.py rename to examples/calibration/plot_overlapping_gaussians.py diff --git a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest copy.py b/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest copy.py deleted file mode 100644 index 423bc63dc..000000000 --- a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest copy.py +++ /dev/null @@ -1,138 +0,0 @@ -""" -=========================================================== -Mutual Information for Gigantic Hypothesis Testing (MIGHT) -=========================================================== - -An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric -multivariate hypothesis test, on simulated datasets. Here, we present a simulation -of how MIGHT is used to test the hypothesis that a "feature set is important for -predicting the target". This is a generalization of the framework presented in -:footcite:`coleman2022scalable`. - -We simulate a dataset with 1000 features, 500 samples, and a binary class target -variable. Within each feature set, there is 500 features associated with one feature -set, and another 500 features associated with another feature set. 
One could think of -these for example as different datasets collected on the same patient in a biomedical setting. -The first feature set (X) is strongly correlated with the target, and the second -feature set (W) is weakly correlated with the target (y). Here, we are testing the -null hypothesis: - -- ``H0: I(X; y) - I(X, W; y) = 0`` -- ``HA: I(X; y) - I(X, W; y) < 0`` indicating that there is more mutual information with - respect to ``y`` - -where ``I`` is mutual information. For example, this could be true in the following settings, -where X is our informative feature set and W is our uninformative feature set. - -- ``W X -> y``: here ``W`` is completely disconnected from X and y. -- ``W -> X -> y``: here ``W`` is d-separated from y given X. -- ``W <- X -> y``: here ``W`` is d-separated from y given X. - -We then use MIGHT to test the hypothesis that the first feature set is important for -predicting the target, and the second feature set is not important for predicting the -target. We use :class:`~sktree.stats.FeatureImportanceForestClassifier`. -""" - -import numpy as np -from scipy.special import expit - -from sktree import HonestForestClassifier -from sktree.stats import FeatureImportanceForestClassifier -from sktree.tree import DecisionTreeClassifier - -seed = 12345 -rng = np.random.default_rng(seed) - -# %% -# Simulate data -# ------------- -# We simulate the two feature sets, and the target variable. We then combine them -# into a single dataset to perform hypothesis testing. - -n_samples = 1000 -n_features_set = 500 -mean = 1.0 -sigma = 2.0 -beta = 5.0 - -unimportant_mean = 0.0 -unimportant_sigma = 4.5 - -# first sample the informative features, and then the uniformative features -X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10)) -X_important = np.hstack( - [ - X_important, - rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10) - ), - ] -) - -X_unimportant = rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set) -) -X = np.hstack([X_important, X_unimportant]) - -# simulate the binary target variable -y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples) - -# %% -# Perform hypothesis testing using Mutual Information -# --------------------------------------------------- -# Here, we use :class:`~sktree.stats.FeatureImportanceForestClassifier` to perform the hypothesis -# test. The test statistic is computed by comparing the metric (i.e. mutual information) estimated -# between two forests. One forest is trained on the original dataset, and one forest is trained -# on a permuted dataset, where the rows of the ``covariate_index`` columns are shuffled randomly. -# -# The null distribution is then estimated in an efficient manner using the framework of -# :footcite:`coleman2022scalable`. The sample evaluations of each forest (i.e. the posteriors) -# are sampled randomly ``n_repeats`` times to generate a null distribution. The pvalue is then -# computed as the proportion of samples in the null distribution that are less than the -# observed test statistic. 
- -n_estimators = 200 -max_features = "sqrt" -test_size = 0.2 -n_repeats = 1000 -n_jobs = -1 - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - random_state=seed, - honest_fraction=0.7, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, - permute_per_tree=True, - sample_dataset_per_tree=False, -) - -print( - f"Permutation per tree: {est.permute_per_tree} and sampling dataset per tree: " - f"{est.sample_dataset_per_tree}" -) -# we test for the first feature set, which is important and thus should return a pvalue < 0.05 -stat, pvalue = est.test( - X, y, covariate_index=np.arange(n_features_set, dtype=int), metric="mi", n_repeats=n_repeats -) -print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05 -stat, pvalue = est.test( - X, - y, - covariate_index=np.arange(n_features_set, dtype=int) + n_features_set, - metric="mi", - n_repeats=n_repeats, -) -print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") - -# %% -# References -# ---------- -# .. footbibliography:: diff --git a/examples/plot_multiview_dtc.py b/examples/plot_multiview_dtc.py deleted file mode 100644 index bd67940db..000000000 --- a/examples/plot_multiview_dtc.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -============================================================ -Analyze a multi-view dataset with a multi-view random forest -============================================================ - -An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric -multivariate hypothesis test, on simulated datasets. Here, we present a simulation -of how MIGHT is used to evaluate how a "feature set is important for predicting the target". - -We simulate a dataset with 1000 features, 500 samples, and a binary class target -variable. Within each feature set, there is 500 features associated with one feature -set, and another 500 features associated with another feature set. One could think of -these for example as different datasets collected on the same patient in a biomedical setting. -The first feature set (X) is strongly correlated with the target, and the second -feature set (W) is weakly correlated with the target (y). - -We then use MIGHT to calculate the partial AUC of these sets. -""" - -import numpy as np -from scipy.special import expit - -from sktree import HonestForestClassifier -from sktree.stats import FeatureImportanceForestClassifier -from sktree.tree import DecisionTreeClassifier - -seed = 12345 -rng = np.random.default_rng(seed) - -# %% -# Simulate data -# ------------- -# We simulate the two feature sets, and the target variable. We then combine them -# into a single dataset to perform hypothesis testing. 
- -n_samples = 1000 -n_features_set = 500 -mean = 1.0 -sigma = 2.0 -beta = 5.0 - -unimportant_mean = 0.0 -unimportant_sigma = 4.5 - -# first sample the informative features, and then the uniformative features -X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10)) -X_important = np.hstack( - [ - X_important, - rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10) - ), - ] -) - -X_unimportant = rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set) -) - -# simulate the binary target variable -y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples) - -# %% -# Use partial AUC as test statistic -# --------------------------------- -# You can specify the maximum specificity by modifying ``max_fpr`` in ``statistic``. - -n_estimators = 125 -max_features = "sqrt" -metric = "auc" -test_size = 0.2 -n_jobs = -1 -honest_fraction = 0.7 -max_fpr = 0.1 - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - random_state=seed, - honest_fraction=honest_fraction, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, - permute_per_tree=True, - sample_dataset_per_tree=True, -) - -# we test for the first feature set, which is important and thus should return a higher AUC -stat, posterior_arr, samples = est.statistic( - X_important, - y, - metric=metric, - return_posteriors=True, -) - -print(f"ASH-90 / Partial AUC: {stat}") -print(f"Shape of Observed Samples: {samples.shape}") -print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}") - -# %% -# Repeat for the second feature set -# --------------------------------- -# This feature set has a smaller statistic, which is expected due to its weak correlation. - -stat, posterior_arr, samples = est.statistic( - X_unimportant, - y, - metric=metric, - return_posteriors=True, -) - -print(f"ASH-90 / Partial AUC: {stat}") -print(f"Shape of Observed Samples: {samples.shape}") -print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}") - -# %% -# All posteriors are saved within the model -# ----------------------------------------- -# Extract the results from the model variables anytime. You can save the model with ``pickle``. -# -# ASH-90 / Partial AUC: ``est.observe_stat_`` -# Observed Samples: ``est.observe_samples_`` -# Tree Posteriors for the positive class: ``est.observe_posteriors_`` (n_trees, n_samples_test, 1) -# True Labels: ``est.y_true_final_`` diff --git a/examples/sklearn_vs_sktree/README.txt b/examples/sklearn_vs_sktree/README.txt new file mode 100644 index 000000000..d942d71f8 --- /dev/null +++ b/examples/sklearn_vs_sktree/README.txt @@ -0,0 +1,6 @@ +.. _sklearn_examples: + +Comparing sklearn and sktree decision trees +------------------------------------------- + +Examples demonstrating the difference between sklearn and sktree decision trees. diff --git a/examples/plot_iris_dtc.py b/examples/sklearn_vs_sktree/plot_iris_dtc.py similarity index 100% rename from examples/plot_iris_dtc.py rename to examples/sklearn_vs_sktree/plot_iris_dtc.py diff --git a/examples/sparse_oblique_trees/README.txt b/examples/sparse_oblique_trees/README.txt new file mode 100644 index 000000000..61c596af1 --- /dev/null +++ b/examples/sparse_oblique_trees/README.txt @@ -0,0 +1,6 @@ +.. 
_sporf_examples: + +Sparse oblique projections with oblique decision-trees +------------------------------------------------------ + +Examples demonstrating learning using oblique random forests. diff --git a/examples/plot_extra_oblique_random_forest.py b/examples/sparse_oblique_trees/plot_extra_oblique_random_forest.py similarity index 100% rename from examples/plot_extra_oblique_random_forest.py rename to examples/sparse_oblique_trees/plot_extra_oblique_random_forest.py diff --git a/examples/plot_extra_orf_sample_size.py b/examples/sparse_oblique_trees/plot_extra_orf_sample_size.py similarity index 100% rename from examples/plot_extra_orf_sample_size.py rename to examples/sparse_oblique_trees/plot_extra_orf_sample_size.py diff --git a/examples/plot_oblique_axis_aligned_forests_sparse_parity.py b/examples/sparse_oblique_trees/plot_oblique_axis_aligned_forests_sparse_parity.py similarity index 100% rename from examples/plot_oblique_axis_aligned_forests_sparse_parity.py rename to examples/sparse_oblique_trees/plot_oblique_axis_aligned_forests_sparse_parity.py diff --git a/examples/plot_oblique_forests_iris.py b/examples/sparse_oblique_trees/plot_oblique_forests_iris.py similarity index 100% rename from examples/plot_oblique_forests_iris.py rename to examples/sparse_oblique_trees/plot_oblique_forests_iris.py diff --git a/examples/plot_oblique_random_forest.py b/examples/sparse_oblique_trees/plot_oblique_random_forest.py similarity index 100% rename from examples/plot_oblique_random_forest.py rename to examples/sparse_oblique_trees/plot_oblique_random_forest.py From c34f20649c3de46dbb930b920dbfbe28525253bc Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 12 Oct 2023 16:11:49 -0400 Subject: [PATCH 7/9] Fix docs Signed-off-by: Adam Li --- .gitignore | 1 + doc/modules/ensemble.rst | 4 ++-- .../sparse_oblique_trees/plot_extra_oblique_random_forest.py | 2 +- examples/sparse_oblique_trees/plot_oblique_random_forest.py | 2 +- examples/splitters/plot_sparse_projection_matrix.py | 4 ++-- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 39a3a651a..db44ed79b 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ doc/coverages doc/samples cover examples/*.jpg +examples/**/*.jpg env/ html/ diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 1f76b888f..38685efad 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -22,8 +22,8 @@ more information and intuition, see .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_plot_oblique_random_forest.py` - * :ref:`sphx_glr_auto_examples_plot_oblique_axis_aligned_forests_sparse_parity.py` + * :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_oblique_random_forest.py` + * :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_oblique_axis_aligned_forests_sparse_parity.py` .. topic:: References diff --git a/examples/sparse_oblique_trees/plot_extra_oblique_random_forest.py b/examples/sparse_oblique_trees/plot_extra_oblique_random_forest.py index 586edd833..fd920cc16 100644 --- a/examples/sparse_oblique_trees/plot_extra_oblique_random_forest.py +++ b/examples/sparse_oblique_trees/plot_extra_oblique_random_forest.py @@ -59,7 +59,7 @@ range, hence the complexity is `O(n)`. This makes the algorithm more suitable for large datasets. 
To see how sample-sizes affect the performance of Extra Oblique Trees vs regular Oblique Trees, -see :ref:`sphx_glr_auto_examples_plot_extra_orf_sample_size.py` +see :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_extra_orf_sample_size.py` References ---------- diff --git a/examples/sparse_oblique_trees/plot_oblique_random_forest.py b/examples/sparse_oblique_trees/plot_oblique_random_forest.py index 34cb8c68a..dca3f19da 100644 --- a/examples/sparse_oblique_trees/plot_oblique_random_forest.py +++ b/examples/sparse_oblique_trees/plot_oblique_random_forest.py @@ -18,7 +18,7 @@ are subsampled due to computational constraints. For an example of using extra-oblique trees/forests in practice on data, see the following -example :ref:`sphx_glr_auto_examples_plot_extra_oblique_random_forest.py`. +example :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_extra_oblique_random_forest.py`. """ from datetime import datetime diff --git a/examples/splitters/plot_sparse_projection_matrix.py b/examples/splitters/plot_sparse_projection_matrix.py index 6eb53e0ca..c77c128a0 100644 --- a/examples/splitters/plot_sparse_projection_matrix.py +++ b/examples/splitters/plot_sparse_projection_matrix.py @@ -125,5 +125,5 @@ # For an example of using oblique trees/forests in practice on data, see the following # examples: # -# - :ref:`sphx_glr_auto_examples_plot_oblique_forests_iris.py` -# - :ref:`sphx_glr_auto_examples_plot_oblique_random_forest.py` +# - :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_oblique_forests_iris.py` +# - :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_oblique_random_forest.py` From 92a79e8013fe5280b162a493e5dced7bbed69c7a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 12 Oct 2023 16:11:58 -0400 Subject: [PATCH 8/9] Fix docs Signed-off-by: Adam Li --- doc/modules/supervised_tree.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/supervised_tree.rst b/doc/modules/supervised_tree.rst index 91824e04c..1aa0a602a 100644 --- a/doc/modules/supervised_tree.rst +++ b/doc/modules/supervised_tree.rst @@ -41,7 +41,7 @@ as follows: >>> clf = tree.ObliqueDecisionTreeClassifier() >>> clf = clf.fit(X, y) -.. figure:: ../auto_examples/images/sphx_glr_plot_iris_dtc_002.png +.. figure:: ../auto_examples/sklearn_vs_sktree/images/sphx_glr_plot_iris_dtc_002.png :target: ../auto_examples/plot_iris_dtc.html :align: center From 993c30fee8367411592baaa930099b03b422d06b Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 13 Oct 2023 10:07:24 -0400 Subject: [PATCH 9/9] Adding imbalanced MIGHT example Signed-off-by: Adam Li --- .../plot_MI_imbalanced_hyppo_testing.py | 221 +++++++++++++----- 1 file changed, 160 insertions(+), 61 deletions(-) diff --git a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py index 14c260725..882f80c3d 100644 --- a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py +++ b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py @@ -3,42 +3,34 @@ Mutual Information for Gigantic Hypothesis Testing (MIGHT) with Imbalanced Data =============================================================================== -An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric -multivariate hypothesis test, on simulated datasets. Here, we present a simulation -of how MIGHT is used to test the hypothesis that a "feature set is important for -predicting the target". 
This is a generalization of the framework presented in -:footcite:`coleman2022scalable`. - -We simulate a dataset with 1000 features, 500 samples, and a binary class target -variable. Within each feature set, there is 500 features associated with one feature -set, and another 500 features associated with another feature set. One could think of -these for example as different datasets collected on the same patient in a biomedical setting. -The first feature set (X) is strongly correlated with the target, and the second -feature set (W) is weakly correlated with the target (y). Here, we are testing the -null hypothesis: - -- ``H0: I(X; y) - I(X, W; y) = 0`` -- ``HA: I(X; y) - I(X, W; y) < 0`` indicating that there is more mutual information with - respect to ``y`` - -where ``I`` is mutual information. For example, this could be true in the following settings, -where X is our informative feature set and W is our uninformative feature set. - -- ``W X -> y``: here ``W`` is completely disconnected from X and y. -- ``W -> X -> y``: here ``W`` is d-separated from y given X. -- ``W <- X -> y``: here ``W`` is d-separated from y given X. - -We then use MIGHT to test the hypothesis that the first feature set is important for -predicting the target, and the second feature set is not important for predicting the -target. We use :class:`~sktree.stats.FeatureImportanceForestClassifier`. +Here, we demonstrate how to do hypothesis testing on highly imbalanced data +in terms of their feature-set dimensionalities. +using mutual information as a test statistic. We use the framework of +:footcite:`coleman2022scalable` to estimate pvalues efficiently. + +Here, we simulate two feature sets, one of which is important for the target, +but significantly smaller in dimensionality than the other feature set, which +is unimportant for the target. We then use the MIGHT framework to test for +the importance of each feature set. Instead of leveraging a normal honest random +forest to estimate the posteriors, here we leverage a multi-view honest random +forest, with knowledge of the multi-view structure of the ``X`` data. + +For other examples of hypothesis testing, see the following: + +- :ref:`sphx_glr_auto_examples_hypothesis_testing_plot_MI_gigantic_hypothesis_testing_forest.py` +- :ref:`sphx_glr_auto_examples_hypothesis_testing_plot_might_auc.py` + +For more information on the multi-view decision-tree, see +:ref:`sphx_glr_auto_examples_multiview_plot_multiview_dtc.py`. """ +import matplotlib.pyplot as plt import numpy as np -from scipy.special import expit +from sklearn.datasets import make_blobs from sktree import HonestForestClassifier from sktree.stats import FeatureImportanceForestClassifier -from sktree.tree import DecisionTreeClassifier +from sktree.tree import DecisionTreeClassifier, MultiViewDecisionTreeClassifier seed = 12345 rng = np.random.default_rng(seed) @@ -48,35 +40,67 @@ # ------------- # We simulate the two feature sets, and the target variable. We then combine them # into a single dataset to perform hypothesis testing. 
+seed = 12345 +rng = np.random.default_rng(seed) -n_samples = 1000 -n_features_set = 500 -mean = 1.0 -sigma = 2.0 -beta = 5.0 - -unimportant_mean = 0.0 -unimportant_sigma = 4.5 - -# first sample the informative features, and then the uniformative features -X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10)) -X_important = np.hstack( - [ - X_important, - rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10) - ), - ] -) -X_unimportant = rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set) +def make_multiview_classification( + n_samples=100, n_features_1=10, n_features_2=1000, cluster_std=2.0, seed=None +): + rng = np.random.default_rng(seed=seed) + + # Create a high-dimensional multiview dataset with a low-dimensional informative + # subspace in one view of the dataset. + X0_first, y0 = make_blobs( + n_samples=n_samples, + cluster_std=cluster_std, + n_features=n_features_1 // 2, + random_state=rng.integers(1, 10000), + centers=1, + ) + + X1_first, y1 = make_blobs( + n_samples=n_samples, + cluster_std=cluster_std, + n_features=n_features_1 // 2, + random_state=rng.integers(1, 10000), + centers=1, + ) + + # create the first views for y=0 and y=1 + X0_first = np.concatenate( + (X0_first, rng.standard_normal(size=(n_samples, n_features_1 // 2))), axis=1 + ) + X1_first = np.concatenate( + (X1_first, rng.standard_normal(size=(n_samples, n_features_1 // 2))), axis=1 + ) + y1[:] = 1 + + # add the second view for y=0 and y=1, which is completely noise + X0 = np.concatenate([X0_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1) + X1 = np.concatenate([X1_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1) + + # combine the views and targets + X = np.vstack((X0, X1)) + y = np.hstack((y0, y1)).T + + # add noise to the data + X = X + rng.standard_normal(size=X.shape) + + return X, y + + +n_samples = 100 +n_features = 10000 +n_features_views = [10, n_features] + +X, y = make_multiview_classification( + n_samples=n_samples, + n_features_1=10, + n_features_2=n_features, + cluster_std=2.0, + seed=seed, ) -X = np.hstack([X_important, X_unimportant]) - -# simulate the binary target variable -y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples) - # %% # Perform hypothesis testing using Mutual Information # --------------------------------------------------- @@ -101,37 +125,112 @@ estimator=HonestForestClassifier( n_estimators=n_estimators, max_features=max_features, - tree_estimator=DecisionTreeClassifier(), + tree_estimator=MultiViewDecisionTreeClassifier(feature_set_ends=n_features_views), random_state=seed, - honest_fraction=0.7, + honest_fraction=0.5, n_jobs=n_jobs, ), random_state=seed, test_size=test_size, - permute_per_tree=True, + permute_per_tree=False, sample_dataset_per_tree=False, ) +mv_results = dict() + print( f"Permutation per tree: {est.permute_per_tree} and sampling dataset per tree: " f"{est.sample_dataset_per_tree}" ) # we test for the first feature set, which is important and thus should return a pvalue < 0.05 stat, pvalue = est.test( - X, y, covariate_index=np.arange(n_features_set, dtype=int), metric="mi", n_repeats=n_repeats + X, y, covariate_index=np.arange(10, dtype=int), metric="mi", n_repeats=n_repeats ) +mv_results["important_feature_stat"] = stat +mv_results["important_feature_pvalue"] = pvalue print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") # we test for the second feature set, which is 
unimportant and thus should return a pvalue > 0.05 stat, pvalue = est.test( X, y, - covariate_index=np.arange(n_features_set, dtype=int) + n_features_set, + covariate_index=np.arange(10, n_features, dtype=int), metric="mi", n_repeats=n_repeats, ) +mv_results["unimportant_feature_stat"] = stat +mv_results["unimportant_feature_pvalue"] = pvalue print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") +# %% +# Let's investigate what happens when we do not use a multi-view decision tree. +# All other parameters are kept the same. + +est = FeatureImportanceForestClassifier( + estimator=HonestForestClassifier( + n_estimators=n_estimators, + max_features=max_features, + tree_estimator=DecisionTreeClassifier(), + random_state=seed, + honest_fraction=0.5, + n_jobs=n_jobs, + ), + random_state=seed, + test_size=test_size, + permute_per_tree=False, + sample_dataset_per_tree=False, +) + +rf_results = dict() + +# we test for the first feature set, which is important and thus should return a pvalue < 0.05 +stat, pvalue = est.test( + X, y, covariate_index=np.arange(10, dtype=int), metric="mi", n_repeats=n_repeats +) +rf_results["important_feature_stat"] = stat +rf_results["important_feature_pvalue"] = pvalue +print(f"Estimated MI difference using regular decision-trees: {stat} with Pvalue: {pvalue}") + +# we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05 +stat, pvalue = est.test( + X, + y, + covariate_index=np.arange(10, n_features, dtype=int), + metric="mi", + n_repeats=n_repeats, +) +rf_results["unimportant_feature_stat"] = stat +rf_results["unimportant_feature_pvalue"] = pvalue +print(f"Estimated MI difference using regular decision-trees: {stat} with Pvalue: {pvalue}") + +fig, ax = plt.subplots(figsize=(5, 3)) + +# plot pvalues +ax.bar(0, rf_results["important_feature_pvalue"], label="Important Feature Set (RF)") +ax.bar(1, rf_results["unimportant_feature_pvalue"], label="Unimportant Feature Set (RF)") +ax.bar(2, mv_results["important_feature_pvalue"], label="Important Feature Set (MV)") +ax.bar(3, mv_results["unimportant_feature_pvalue"], label="Unimportant Feature Set (MV)") +ax.axhline(0.05, color="k", linestyle="--", label="alpha=0.05") +ax.set(ylabel="Log10(PValue)", xlim=[-0.5, 3.5], yscale="log") +ax.legend() + +fig.tight_layout() +plt.show() + +# %% +# Discussion +# ---------- +# We see that the multi-view decision tree is able to detect the important feature set, +# while the regular decision tree is not. This is because the regular decision tree +# is not aware of the multi-view structure of the data, and thus is challenged +# by the imbalanced dimensionality of the feature sets. I.e. it rarely splits on +# the first low-dimensional feature set, and thus is unable to detect its importance. +# +# Note both approaches still fail to reject the null hypothesis (for alpha of 0.05) +# when testing the unimportant feature set. The difference in the two approaches +# show the statistical power of the multi-view decision tree is higher than the +# regular decision tree in this simulation. + # %% # References # ----------