diff --git a/examples/calibration/README.txt b/examples/calibration/README.txt new file mode 100644 index 000000000..84d5f5758 --- /dev/null +++ b/examples/calibration/README.txt @@ -0,0 +1,6 @@ +.. _calibration_examples: + +Calibrated decision trees via honesty +------------------------------------- + +Examples demonstrating the usage of honest decision trees to obtain calibrated predictions. diff --git a/examples/plot_overlapping_gaussians.py b/examples/calibration/plot_overlapping_gaussians.py similarity index 100% rename from examples/plot_overlapping_gaussians.py rename to examples/calibration/plot_overlapping_gaussians.py diff --git a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest copy.py b/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest copy.py deleted file mode 100644 index 423bc63dc..000000000 --- a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest copy.py +++ /dev/null @@ -1,138 +0,0 @@ -""" -=========================================================== -Mutual Information for Gigantic Hypothesis Testing (MIGHT) -=========================================================== - -An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric -multivariate hypothesis test, on simulated datasets. Here, we present a simulation -of how MIGHT is used to test the hypothesis that a "feature set is important for -predicting the target". This is a generalization of the framework presented in -:footcite:`coleman2022scalable`. - -We simulate a dataset with 1000 features, 500 samples, and a binary class target -variable. Within each feature set, there is 500 features associated with one feature -set, and another 500 features associated with another feature set. One could think of -these for example as different datasets collected on the same patient in a biomedical setting. -The first feature set (X) is strongly correlated with the target, and the second -feature set (W) is weakly correlated with the target (y). Here, we are testing the -null hypothesis: - -- ``H0: I(X; y) - I(X, W; y) = 0`` -- ``HA: I(X; y) - I(X, W; y) < 0`` indicating that there is more mutual information with - respect to ``y`` - -where ``I`` is mutual information. For example, this could be true in the following settings, -where X is our informative feature set and W is our uninformative feature set. - -- ``W X -> y``: here ``W`` is completely disconnected from X and y. -- ``W -> X -> y``: here ``W`` is d-separated from y given X. -- ``W <- X -> y``: here ``W`` is d-separated from y given X. - -We then use MIGHT to test the hypothesis that the first feature set is important for -predicting the target, and the second feature set is not important for predicting the -target. We use :class:`~sktree.stats.FeatureImportanceForestClassifier`. -""" - -import numpy as np -from scipy.special import expit - -from sktree import HonestForestClassifier -from sktree.stats import FeatureImportanceForestClassifier -from sktree.tree import DecisionTreeClassifier - -seed = 12345 -rng = np.random.default_rng(seed) - -# %% -# Simulate data -# ------------- -# We simulate the two feature sets, and the target variable. We then combine them -# into a single dataset to perform hypothesis testing. - -n_samples = 1000 -n_features_set = 500 -mean = 1.0 -sigma = 2.0 -beta = 5.0 - -unimportant_mean = 0.0 -unimportant_sigma = 4.5 - -# first sample the informative features, and then the uniformative features -X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10)) -X_important = np.hstack( - [ - X_important, - rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10) - ), - ] -) - -X_unimportant = rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set) -) -X = np.hstack([X_important, X_unimportant]) - -# simulate the binary target variable -y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples) - -# %% -# Perform hypothesis testing using Mutual Information -# --------------------------------------------------- -# Here, we use :class:`~sktree.stats.FeatureImportanceForestClassifier` to perform the hypothesis -# test. The test statistic is computed by comparing the metric (i.e. mutual information) estimated -# between two forests. One forest is trained on the original dataset, and one forest is trained -# on a permuted dataset, where the rows of the ``covariate_index`` columns are shuffled randomly. -# -# The null distribution is then estimated in an efficient manner using the framework of -# :footcite:`coleman2022scalable`. The sample evaluations of each forest (i.e. the posteriors) -# are sampled randomly ``n_repeats`` times to generate a null distribution. The pvalue is then -# computed as the proportion of samples in the null distribution that are less than the -# observed test statistic. - -n_estimators = 200 -max_features = "sqrt" -test_size = 0.2 -n_repeats = 1000 -n_jobs = -1 - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - random_state=seed, - honest_fraction=0.7, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, - permute_per_tree=True, - sample_dataset_per_tree=False, -) - -print( - f"Permutation per tree: {est.permute_per_tree} and sampling dataset per tree: " - f"{est.sample_dataset_per_tree}" -) -# we test for the first feature set, which is important and thus should return a pvalue < 0.05 -stat, pvalue = est.test( - X, y, covariate_index=np.arange(n_features_set, dtype=int), metric="mi", n_repeats=n_repeats -) -print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05 -stat, pvalue = est.test( - X, - y, - covariate_index=np.arange(n_features_set, dtype=int) + n_features_set, - metric="mi", - n_repeats=n_repeats, -) -print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") - -# %% -# References -# ---------- -# .. footbibliography:: diff --git a/examples/plot_multiview_dtc.py b/examples/plot_multiview_dtc.py deleted file mode 100644 index bd67940db..000000000 --- a/examples/plot_multiview_dtc.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -============================================================ -Analyze a multi-view dataset with a multi-view random forest -============================================================ - -An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric -multivariate hypothesis test, on simulated datasets. Here, we present a simulation -of how MIGHT is used to evaluate how a "feature set is important for predicting the target". - -We simulate a dataset with 1000 features, 500 samples, and a binary class target -variable. Within each feature set, there is 500 features associated with one feature -set, and another 500 features associated with another feature set. One could think of -these for example as different datasets collected on the same patient in a biomedical setting. -The first feature set (X) is strongly correlated with the target, and the second -feature set (W) is weakly correlated with the target (y). - -We then use MIGHT to calculate the partial AUC of these sets. -""" - -import numpy as np -from scipy.special import expit - -from sktree import HonestForestClassifier -from sktree.stats import FeatureImportanceForestClassifier -from sktree.tree import DecisionTreeClassifier - -seed = 12345 -rng = np.random.default_rng(seed) - -# %% -# Simulate data -# ------------- -# We simulate the two feature sets, and the target variable. We then combine them -# into a single dataset to perform hypothesis testing. - -n_samples = 1000 -n_features_set = 500 -mean = 1.0 -sigma = 2.0 -beta = 5.0 - -unimportant_mean = 0.0 -unimportant_sigma = 4.5 - -# first sample the informative features, and then the uniformative features -X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10)) -X_important = np.hstack( - [ - X_important, - rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10) - ), - ] -) - -X_unimportant = rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set) -) - -# simulate the binary target variable -y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples) - -# %% -# Use partial AUC as test statistic -# --------------------------------- -# You can specify the maximum specificity by modifying ``max_fpr`` in ``statistic``. - -n_estimators = 125 -max_features = "sqrt" -metric = "auc" -test_size = 0.2 -n_jobs = -1 -honest_fraction = 0.7 -max_fpr = 0.1 - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - random_state=seed, - honest_fraction=honest_fraction, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, - permute_per_tree=True, - sample_dataset_per_tree=True, -) - -# we test for the first feature set, which is important and thus should return a higher AUC -stat, posterior_arr, samples = est.statistic( - X_important, - y, - metric=metric, - return_posteriors=True, -) - -print(f"ASH-90 / Partial AUC: {stat}") -print(f"Shape of Observed Samples: {samples.shape}") -print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}") - -# %% -# Repeat for the second feature set -# --------------------------------- -# This feature set has a smaller statistic, which is expected due to its weak correlation. - -stat, posterior_arr, samples = est.statistic( - X_unimportant, - y, - metric=metric, - return_posteriors=True, -) - -print(f"ASH-90 / Partial AUC: {stat}") -print(f"Shape of Observed Samples: {samples.shape}") -print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}") - -# %% -# All posteriors are saved within the model -# ----------------------------------------- -# Extract the results from the model variables anytime. You can save the model with ``pickle``. -# -# ASH-90 / Partial AUC: ``est.observe_stat_`` -# Observed Samples: ``est.observe_samples_`` -# Tree Posteriors for the positive class: ``est.observe_posteriors_`` (n_trees, n_samples_test, 1) -# True Labels: ``est.y_true_final_`` diff --git a/examples/sklearn_vs_sktree/README.txt b/examples/sklearn_vs_sktree/README.txt new file mode 100644 index 000000000..d942d71f8 --- /dev/null +++ b/examples/sklearn_vs_sktree/README.txt @@ -0,0 +1,6 @@ +.. _sklearn_examples: + +Comparing sklearn and sktree decision trees +------------------------------------------- + +Examples demonstrating the difference between sklearn and sktree decision trees. diff --git a/examples/plot_iris_dtc.py b/examples/sklearn_vs_sktree/plot_iris_dtc.py similarity index 100% rename from examples/plot_iris_dtc.py rename to examples/sklearn_vs_sktree/plot_iris_dtc.py diff --git a/examples/sparse_oblique_trees/README.txt b/examples/sparse_oblique_trees/README.txt new file mode 100644 index 000000000..61c596af1 --- /dev/null +++ b/examples/sparse_oblique_trees/README.txt @@ -0,0 +1,6 @@ +.. _sporf_examples: + +Sparse oblique projections with oblique decision-trees +------------------------------------------------------ + +Examples demonstrating learning using oblique random forests. diff --git a/examples/plot_extra_oblique_random_forest.py b/examples/sparse_oblique_trees/plot_extra_oblique_random_forest.py similarity index 100% rename from examples/plot_extra_oblique_random_forest.py rename to examples/sparse_oblique_trees/plot_extra_oblique_random_forest.py diff --git a/examples/plot_extra_orf_sample_size.py b/examples/sparse_oblique_trees/plot_extra_orf_sample_size.py similarity index 100% rename from examples/plot_extra_orf_sample_size.py rename to examples/sparse_oblique_trees/plot_extra_orf_sample_size.py diff --git a/examples/plot_oblique_axis_aligned_forests_sparse_parity.py b/examples/sparse_oblique_trees/plot_oblique_axis_aligned_forests_sparse_parity.py similarity index 100% rename from examples/plot_oblique_axis_aligned_forests_sparse_parity.py rename to examples/sparse_oblique_trees/plot_oblique_axis_aligned_forests_sparse_parity.py diff --git a/examples/plot_oblique_forests_iris.py b/examples/sparse_oblique_trees/plot_oblique_forests_iris.py similarity index 100% rename from examples/plot_oblique_forests_iris.py rename to examples/sparse_oblique_trees/plot_oblique_forests_iris.py diff --git a/examples/plot_oblique_random_forest.py b/examples/sparse_oblique_trees/plot_oblique_random_forest.py similarity index 100% rename from examples/plot_oblique_random_forest.py rename to examples/sparse_oblique_trees/plot_oblique_random_forest.py