diff --git a/.spin/cmds.py b/.spin/cmds.py
index 5071dc0a4..a71f863be 100644
--- a/.spin/cmds.py
+++ b/.spin/cmds.py
@@ -54,7 +54,7 @@ def setup_submodule(forcesubmodule=False):
    This will update the submodule, which then must be committed so that
    git knows the submodule needs to be at a certain commit hash.
    """
-    commit_fpath = "./sktree/_lib/sklearn_fork/commit.txt"
+    commit_fpath = "./sktree/_lib/commit.txt"
    submodule = "./sktree/_lib/sklearn_fork"
    commit = ""
    current_hash = ""
diff --git a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py
new file mode 100644
index 000000000..14c260725
--- /dev/null
+++ b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py
@@ -0,0 +1,138 @@
+"""
+===============================================================================
+Mutual Information for Gigantic Hypothesis Testing (MIGHT) with Imbalanced Data
+===============================================================================
+
+An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for a
+nonparametric multivariate hypothesis test on simulated datasets. Here, we present
+a simulation of how MIGHT is used to test the hypothesis that a "feature set is
+important for predicting the target". This is a generalization of the framework
+presented in :footcite:`coleman2022scalable`.
+
+We simulate a dataset with 1000 samples, 1000 features, and a binary class target
+variable. The features are split into two feature sets of 500 features each. One
+could think of these, for example, as different datasets collected on the same
+patient in a biomedical setting. The first feature set (X) is correlated with the
+target (y), while the second feature set (W) is uncorrelated with it. Here, we are
+testing the null hypothesis:
+
+- ``H0: I(X; y) - I(X, W; y) = 0``
+- ``HA: I(X; y) - I(X, W; y) < 0``, indicating that ``(X, W)`` carries more mutual
+  information about ``y`` than ``X`` alone
+
+where ``I`` is mutual information. For example, the null hypothesis could hold in
+the following settings, where X is our informative feature set and W is our
+uninformative feature set.
+
+- ``W X -> y``: here ``W`` is completely disconnected from X and y.
+- ``W -> X -> y``: here ``W`` is d-separated from y given X.
+- ``W <- X -> y``: here ``W`` is d-separated from y given X.
+
+We then use MIGHT to test the hypothesis that the first feature set is important for
+predicting the target, and the second feature set is not important for predicting the
+target. We use :class:`~sktree.stats.FeatureImportanceForestClassifier`.
+"""
+
+import numpy as np
+from scipy.special import expit
+
+from sktree import HonestForestClassifier
+from sktree.stats import FeatureImportanceForestClassifier
+from sktree.tree import DecisionTreeClassifier
+
+seed = 12345
+rng = np.random.default_rng(seed)
+
+# %%
+# Simulate data
+# -------------
+# We simulate the two feature sets and the target variable, then combine them
+# into a single dataset to perform hypothesis testing.
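+#
+# The target follows a logistic model on the first ten informative features:
+# ``y ~ Bernoulli(expit(beta * sum(X[:, :10])))``, so only the first ten columns
+# of the first feature set carry signal about ``y``; all remaining columns are
+# independent Gaussian noise.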
+
+n_samples = 1000
+n_features_set = 500
+mean = 1.0
+sigma = 2.0
+beta = 5.0
+
+unimportant_mean = 0.0
+unimportant_sigma = 4.5
+
+# first sample the informative features, and then the uninformative features
+X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10))
+X_important = np.hstack(
+    [
+        X_important,
+        rng.normal(
+            loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10)
+        ),
+    ]
+)
+
+X_unimportant = rng.normal(
+    loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set)
+)
+X = np.hstack([X_important, X_unimportant])
+
+# simulate the binary target variable
+y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples)
+
+# %%
+# Perform hypothesis testing using Mutual Information
+# ---------------------------------------------------
+# Here, we use :class:`~sktree.stats.FeatureImportanceForestClassifier` to perform the
+# hypothesis test. The test statistic is computed by comparing the metric (i.e. mutual
+# information) estimated between two forests. One forest is trained on the original
+# dataset, and one forest is trained on a permuted dataset, where the rows of the
+# ``covariate_index`` columns are shuffled randomly.
+#
+# The null distribution is then estimated in an efficient manner using the framework of
+# :footcite:`coleman2022scalable`. The sample evaluations of each forest (i.e. the
+# posteriors) are sampled randomly ``n_repeats`` times to generate a null distribution.
+# The p-value is then computed as the proportion of samples in the null distribution
+# that are less than the observed test statistic.
+
+n_estimators = 200
+max_features = "sqrt"
+test_size = 0.2
+n_repeats = 1000
+n_jobs = -1
+
+est = FeatureImportanceForestClassifier(
+    estimator=HonestForestClassifier(
+        n_estimators=n_estimators,
+        max_features=max_features,
+        tree_estimator=DecisionTreeClassifier(),
+        random_state=seed,
+        honest_fraction=0.7,
+        n_jobs=n_jobs,
+    ),
+    random_state=seed,
+    test_size=test_size,
+    permute_per_tree=True,
+    sample_dataset_per_tree=False,
+)
+
+print(
+    f"Permutation per tree: {est.permute_per_tree} and sampling dataset per tree: "
+    f"{est.sample_dataset_per_tree}"
+)
+
+# we test the first feature set, which is important and thus should yield a p-value < 0.05
+stat, pvalue = est.test(
+    X, y, covariate_index=np.arange(n_features_set, dtype=int), metric="mi", n_repeats=n_repeats
+)
+print(f"Estimated MI difference: {stat} with p-value: {pvalue}")
+
+# we test the second feature set, which is unimportant and thus should yield a p-value > 0.05
+stat, pvalue = est.test(
+    X,
+    y,
+    covariate_index=np.arange(n_features_set, dtype=int) + n_features_set,
+    metric="mi",
+    n_repeats=n_repeats,
+)
+print(f"Estimated MI difference: {stat} with p-value: {pvalue}")
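+
+# %%
+# A note on the p-value computation
+# ---------------------------------
+# ``est.test`` handles the permutation machinery internally. Purely as a schematic
+# sketch of the proportion-based p-value described above (not the estimator's actual
+# implementation), where ``null_dist`` and ``observed_stat`` are hypothetical
+# placeholders:
+null_dist = rng.normal(loc=0.0, scale=0.01, size=n_repeats)  # hypothetical null statistics
+observed_stat = -0.05  # hypothetical observed test statistic
+# proportion of null samples below the observed statistic, with a +1 correction
+# (a common convention to avoid zero p-values)
+pvalue_sketch = (1 + (null_dist <= observed_stat).sum()) / (1 + n_repeats)
+print(f"Schematic p-value: {pvalue_sketch}")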
+
+# %%
+# References
+# ----------
+# .. footbibliography::
diff --git a/examples/multiview/README.txt b/examples/multiview/README.txt
new file mode 100644
index 000000000..127ad6e57
--- /dev/null
+++ b/examples/multiview/README.txt
@@ -0,0 +1,6 @@
+.. _multiview_examples:

+Multi-view learning with Decision-trees
+---------------------------------------
+
+Examples demonstrating multi-view learning using random forest variants.
diff --git a/examples/multiview/plot_multiview_dtc.py b/examples/multiview/plot_multiview_dtc.py
new file mode 100644
index 000000000..84089547e
--- /dev/null
+++ b/examples/multiview/plot_multiview_dtc.py
@@ -0,0 +1,124 @@
+"""
+============================================================
+Analyze a multi-view dataset with a multi-view random forest
+============================================================
+
+An example comparing :class:`~sktree.MultiViewRandomForestClassifier` with
+:class:`~sktree.RandomForestClassifier` on simulated multi-view datasets.
+
+We simulate a two-view dataset where the first view contains a low-dimensional
+informative subspace and the second view contains only noise features. As the
+dimensionality of the noise view grows while the sample size stays fixed, a
+regular random forest increasingly splits on uninformative features, whereas the
+multi-view random forest is told where each view ends via its
+``feature_set_ends`` parameter. We compare the two models using cross-validated
+accuracy.
+"""
+
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from sklearn.datasets import make_blobs
+from sklearn.model_selection import cross_val_score
+
+from sktree import MultiViewRandomForestClassifier, RandomForestClassifier
+
+seed = 12345
+rng = np.random.default_rng(seed)
+
+
+def make_multiview_classification(
+    n_samples=100, n_features_1=5, n_features_2=1000, cluster_std=2.0, seed=None
+):
+    rng = np.random.default_rng(seed=seed)
+
+    # Create a high-dimensional multiview dataset with a low-dimensional informative
+    # subspace in one view of the dataset.
+    X0_first, y0 = make_blobs(
+        n_samples=n_samples,
+        cluster_std=cluster_std,
+        n_features=n_features_1,
+        random_state=rng.integers(1, 10000),
+        centers=1,
+    )
+
+    X1_first, y1 = make_blobs(
+        n_samples=n_samples,
+        cluster_std=cluster_std,
+        n_features=n_features_1,
+        random_state=rng.integers(1, 10000),
+        centers=1,
+    )
+    y1[:] = 1
+
+    # pad each class with isotropic noise features to form the second view
+    X0 = np.concatenate([X0_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1)
+    X1 = np.concatenate([X1_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1)
+    X = np.vstack((X0, X1))
+    y = np.hstack((y0, y1))
+
+    return X, y
+
+
+# %%
+# Simulate data
+# -------------
+# We simulate a 2-view dataset where the first view contains an informative
+# low-dimensional subspace of five dimensions, while the second view contains only
+# noise and varies from five to a thousand dimensions. The sample size is kept
+# fixed, so we can compare the performance of regular Random Forests with
+# Multi-view Random Forests.
+
+n_samples = 1000
+n_features_views = np.linspace(5, 1000, 2).astype(int)
+
+datasets = []
+for idx, n_features in enumerate(n_features_views):
+    X, y = make_multiview_classification(
+        n_samples=n_samples,
+        n_features_1=5,
+        n_features_2=n_features,
+        cluster_std=2.0,
+        seed=seed + idx,
+    )
+    datasets.append((X, y))
+
+# %%
+# Fit Random Forest and Multi-view Random Forest
+# ----------------------------------------------
+# Here, we fit both forests over all the datasets.
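+#
+# ``feature_set_ends`` tells the multi-view forest where each view stops: with five
+# informative features followed by ``n_features`` noise features, the column
+# boundaries are ``[5, n_features + 5]``, as computed in the loop below. For
+# instance, ``n_features = 1000`` gives ``feature_set_ends = [5, 1005]``: columns
+# 0-4 form view one and columns 5-1004 form view two.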
+
+n_estimators = 50
+n_jobs = -1
+
+scores = defaultdict(list)
+
+for (X, y), n_features in zip(datasets, n_features_views):
+    feature_set_ends = np.array([5, n_features + 5])
+
+    rf = RandomForestClassifier(
+        n_estimators=n_estimators,
+        n_jobs=n_jobs,
+    )
+
+    mvrf = MultiViewRandomForestClassifier(
+        n_estimators=n_estimators,
+        n_jobs=n_jobs,
+        feature_set_ends=feature_set_ends,
+    )
+
+    # obtain the cross-validation score for each model
+    rf_score = cross_val_score(rf, X, y, cv=2).mean()
+    mvrf_score = cross_val_score(mvrf, X, y, cv=2).mean()
+
+    scores['rf'].append(rf_score)
+    scores['mvrf'].append(mvrf_score)
+
+# %%
+# Visualize scores and compare performance
+# ----------------------------------------
+# Now, we can compare the performance from the cross-validation experiment.
+
+scores['n_features'] = n_features_views
+df = pd.DataFrame(scores)
+
+# melt the dataframe to make it easier to plot with seaborn
+df = pd.melt(df, id_vars=['n_features'], var_name='model', value_name='score')
+
+fig, ax = plt.subplots()
+sns.lineplot(data=df, x='n_features', y='score', hue='model', ax=ax)
+ax.set_ylabel('CV Score')
+ax.set_xlabel('Number of features in second view')
+ax.set_title('Random Forest vs Multi-view Random Forest')
+plt.show()
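+
+# %%
+# The score table can also be inspected directly. A small sketch, assuming the
+# melted ``df`` from above is in scope:
+print(df.pivot(index='n_features', columns='model', values='score'))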