diff --git a/.spin/cmds.py b/.spin/cmds.py
index 5071dc0a4..a71f863be 100644
--- a/.spin/cmds.py
+++ b/.spin/cmds.py
@@ -54,7 +54,7 @@ def setup_submodule(forcesubmodule=False):
    This will update the submodule, which then must be committed so that
    git knows the submodule needs to be at a certain commit hash.
    """
-    commit_fpath = "./sktree/_lib/sklearn_fork/commit.txt"
+    commit_fpath = "./sktree/_lib/commit.txt"
    submodule = "./sktree/_lib/sklearn_fork"
    commit = ""
    current_hash = ""
diff --git a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py
new file mode 100644
index 000000000..14c260725
--- /dev/null
+++ b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py
@@ -0,0 +1,138 @@
+"""
+===============================================================================
+Mutual Information for Gigantic Hypothesis Testing (MIGHT) with Imbalanced Data
+===============================================================================
+
+An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for a
+nonparametric multivariate hypothesis test on simulated datasets. Here, we present
+a simulation of how MIGHT is used to test the hypothesis that a "feature set is
+important for predicting the target". This is a generalization of the framework
+presented in :footcite:`coleman2022scalable`.
+
+We simulate a dataset with 1000 samples, 1000 features, and a binary class target
+variable. The features are split into two feature sets of 500 features each. One
+could think of these, for example, as different datasets collected on the same
+patient in a biomedical setting. The first feature set (X) is correlated with the
+target (y), while the second feature set (W) is uncorrelated with it. Here, we are
+testing the null hypothesis:
+
+- ``H0: I(X; y) - I(X, W; y) = 0``
+- ``HA: I(X; y) - I(X, W; y) < 0``, indicating that ``(X, W)`` carries more mutual
+  information about ``y`` than ``X`` alone
+
+where ``I`` is mutual information. For example, the null hypothesis could hold in
+the following settings, where X is our informative feature set and W is our
+uninformative feature set.
+
+- ``W X -> y``: here ``W`` is completely disconnected from X and y.
+- ``W -> X -> y``: here ``W`` is d-separated from y given X.
+- ``W <- X -> y``: here ``W`` is d-separated from y given X.
+
+We then use MIGHT to test the hypothesis that the first feature set is important for
+predicting the target, and the second feature set is not important for predicting the
+target. We use :class:`~sktree.stats.FeatureImportanceForestClassifier`.
+"""
+
+import numpy as np
+from scipy.special import expit
+
+from sktree import HonestForestClassifier
+from sktree.stats import FeatureImportanceForestClassifier
+from sktree.tree import DecisionTreeClassifier
+
+seed = 12345
+rng = np.random.default_rng(seed)
+
+# %%
+# Simulate data
+# -------------
+# We simulate the two feature sets and the target variable, then combine them
+# into a single dataset to perform hypothesis testing.
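+#
+# The target follows a logistic model on the first ten informative features:
+# ``y ~ Bernoulli(expit(beta * sum(X[:, :10])))``, so only the first ten columns
+# of the first feature set carry signal about ``y``; all remaining columns are
+# independent Gaussian noise.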
+
+n_samples = 1000
+n_features_set = 500
+mean = 1.0
+sigma = 2.0
+beta = 5.0
+
+unimportant_mean = 0.0
+unimportant_sigma = 4.5
+
+# first sample the informative features, and then the uninformative features
+X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10))
+X_important = np.hstack(
+    [
+        X_important,
+        rng.normal(
+            loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10)
+        ),
+    ]
+)
+
+X_unimportant = rng.normal(
+    loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set)
+)
+X = np.hstack([X_important, X_unimportant])
+
+# simulate the binary target variable
+y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples)
+
+# %%
+# Perform hypothesis testing using Mutual Information
+# ---------------------------------------------------
+# Here, we use :class:`~sktree.stats.FeatureImportanceForestClassifier` to perform the
+# hypothesis test. The test statistic is computed by comparing the metric (i.e. mutual
+# information) estimated between two forests. One forest is trained on the original
+# dataset, and one forest is trained on a permuted dataset, where the rows of the
+# ``covariate_index`` columns are shuffled randomly.
+#
+# The null distribution is then estimated in an efficient manner using the framework of
+# :footcite:`coleman2022scalable`. The sample evaluations of each forest (i.e. the
+# posteriors) are sampled randomly ``n_repeats`` times to generate a null distribution.
+# The p-value is then computed as the proportion of samples in the null distribution
+# that are less than the observed test statistic.
+
+n_estimators = 200
+max_features = "sqrt"
+test_size = 0.2
+n_repeats = 1000
+n_jobs = -1
+
+est = FeatureImportanceForestClassifier(
+    estimator=HonestForestClassifier(
+        n_estimators=n_estimators,
+        max_features=max_features,
+        tree_estimator=DecisionTreeClassifier(),
+        random_state=seed,
+        honest_fraction=0.7,
+        n_jobs=n_jobs,
+    ),
+    random_state=seed,
+    test_size=test_size,
+    permute_per_tree=True,
+    sample_dataset_per_tree=False,
+)
+
+print(
+    f"Permutation per tree: {est.permute_per_tree} and sampling dataset per tree: "
+    f"{est.sample_dataset_per_tree}"
+)
+
+# we test the first feature set, which is important and thus should yield a p-value < 0.05
+stat, pvalue = est.test(
+    X, y, covariate_index=np.arange(n_features_set, dtype=int), metric="mi", n_repeats=n_repeats
+)
+print(f"Estimated MI difference: {stat} with p-value: {pvalue}")
+
+# we test the second feature set, which is unimportant and thus should yield a p-value > 0.05
+stat, pvalue = est.test(
+    X,
+    y,
+    covariate_index=np.arange(n_features_set, dtype=int) + n_features_set,
+    metric="mi",
+    n_repeats=n_repeats,
+)
+print(f"Estimated MI difference: {stat} with p-value: {pvalue}")
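+
+# %%
+# A note on the p-value computation
+# ---------------------------------
+# ``est.test`` handles the permutation machinery internally. Purely as a schematic
+# sketch of the proportion-based p-value described above (not the estimator's actual
+# implementation), where ``null_dist`` and ``observed_stat`` are hypothetical
+# placeholders:
+null_dist = rng.normal(loc=0.0, scale=0.01, size=n_repeats)  # hypothetical null statistics
+observed_stat = -0.05  # hypothetical observed test statistic
+# proportion of null samples below the observed statistic, with a +1 correction
+# (a common convention to avoid zero p-values)
+pvalue_sketch = (1 + (null_dist <= observed_stat).sum()) / (1 + n_repeats)
+print(f"Schematic p-value: {pvalue_sketch}")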
+
+# %%
+# References
+# ----------
+# .. footbibliography::
diff --git a/examples/multiview/README.txt b/examples/multiview/README.txt
new file mode 100644
index 000000000..127ad6e57
--- /dev/null
+++ b/examples/multiview/README.txt
@@ -0,0 +1,6 @@
+.. _multiview_examples:

+Multi-view learning with Decision-trees
+---------------------------------------
+
+Examples demonstrating multi-view learning using random forest variants.
diff --git a/examples/multiview/plot_multiview_dtc.py b/examples/multiview/plot_multiview_dtc.py
new file mode 100644
index 000000000..84089547e
--- /dev/null
+++ b/examples/multiview/plot_multiview_dtc.py
@@ -0,0 +1,124 @@
+"""
+============================================================
+Analyze a multi-view dataset with a multi-view random forest
+============================================================
+
+An example comparing :class:`~sktree.MultiViewRandomForestClassifier` with
+:class:`~sktree.RandomForestClassifier` on simulated multi-view datasets.
+
+We simulate a two-view dataset where the first view contains a low-dimensional
+informative subspace and the second view contains only noise features. As the
+dimensionality of the noise view grows while the sample size stays fixed, a
+regular random forest increasingly splits on uninformative features, whereas the
+multi-view random forest is told where each view ends via its
+``feature_set_ends`` parameter. We compare the two models using cross-validated
+accuracy.
+"""
+
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from sklearn.datasets import make_blobs
+from sklearn.model_selection import cross_val_score
+
+from sktree import MultiViewRandomForestClassifier, RandomForestClassifier
+
+seed = 12345
+rng = np.random.default_rng(seed)
+
+
+def make_multiview_classification(
+    n_samples=100, n_features_1=5, n_features_2=1000, cluster_std=2.0, seed=None
+):
+    rng = np.random.default_rng(seed=seed)
+
+    # Create a high-dimensional multiview dataset with a low-dimensional informative
+    # subspace in one view of the dataset.
+    X0_first, y0 = make_blobs(
+        n_samples=n_samples,
+        cluster_std=cluster_std,
+        n_features=n_features_1,
+        random_state=rng.integers(1, 10000),
+        centers=1,
+    )
+
+    X1_first, y1 = make_blobs(
+        n_samples=n_samples,
+        cluster_std=cluster_std,
+        n_features=n_features_1,
+        random_state=rng.integers(1, 10000),
+        centers=1,
+    )
+    y1[:] = 1
+
+    # pad each class with isotropic noise features to form the second view
+    X0 = np.concatenate([X0_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1)
+    X1 = np.concatenate([X1_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1)
+    X = np.vstack((X0, X1))
+    y = np.hstack((y0, y1))
+
+    return X, y
+
+
+# %%
+# Simulate data
+# -------------
+# We simulate a 2-view dataset where the first view contains an informative
+# low-dimensional subspace of five dimensions, while the second view contains only
+# noise and varies from five to a thousand dimensions. The sample size is kept
+# fixed, so we can compare the performance of regular Random Forests with
+# Multi-view Random Forests.
+
+n_samples = 1000
+n_features_views = np.linspace(5, 1000, 2).astype(int)
+
+datasets = []
+for idx, n_features in enumerate(n_features_views):
+    X, y = make_multiview_classification(
+        n_samples=n_samples,
+        n_features_1=5,
+        n_features_2=n_features,
+        cluster_std=2.0,
+        seed=seed + idx,
+    )
+    datasets.append((X, y))
+
+# %%
+# Fit Random Forest and Multi-view Random Forest
+# ----------------------------------------------
+# Here, we fit both forests over all the datasets.
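+#
+# ``feature_set_ends`` tells the multi-view forest where each view stops: with five
+# informative features followed by ``n_features`` noise features, the column
+# boundaries are ``[5, n_features + 5]``, as computed in the loop below. For
+# instance, ``n_features = 1000`` gives ``feature_set_ends = [5, 1005]``: columns
+# 0-4 form view one and columns 5-1004 form view two.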
+
+n_estimators = 50
+n_jobs = -1
+
+scores = defaultdict(list)
+
+for (X, y), n_features in zip(datasets, n_features_views):
+    feature_set_ends = np.array([5, n_features + 5])
+
+    rf = RandomForestClassifier(
+        n_estimators=n_estimators,
+        n_jobs=n_jobs,
+    )
+
+    mvrf = MultiViewRandomForestClassifier(
+        n_estimators=n_estimators,
+        n_jobs=n_jobs,
+        feature_set_ends=feature_set_ends,
+    )
+
+    # obtain the cross-validation score for each model
+    rf_score = cross_val_score(rf, X, y, cv=2).mean()
+    mvrf_score = cross_val_score(mvrf, X, y, cv=2).mean()
+
+    scores['rf'].append(rf_score)
+    scores['mvrf'].append(mvrf_score)
+
+# %%
+# Visualize scores and compare performance
+# ----------------------------------------
+# Now, we can compare the performance from the cross-validation experiment.
+
+scores['n_features'] = n_features_views
+df = pd.DataFrame(scores)
+
+# melt the dataframe to make it easier to plot with seaborn
+df = pd.melt(df, id_vars=['n_features'], var_name='model', value_name='score')
+
+fig, ax = plt.subplots()
+sns.lineplot(data=df, x='n_features', y='score', hue='model', ax=ax)
+ax.set_ylabel('CV Score')
+ax.set_xlabel('Number of features in second view')
+ax.set_title('Random Forest vs Multi-view Random Forest')
+plt.show()
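+
+# %%
+# The score table can also be inspected directly. A small sketch, assuming the
+# melted ``df`` from above is in scope:
+print(df.pivot(index='n_features', columns='model', values='score'))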