Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DOC] Add Quantile forest examples #147

Merged
merged 14 commits into from
Oct 24, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/whats_new/v0.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@ the project since version inception, including:

* `Adam Li`_
* `Sambit Panda`_
* `Yuxin Bai`_
SUKI-O marked this conversation as resolved.
Show resolved Hide resolved

6 changes: 6 additions & 0 deletions examples/quantile_predictions/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
.. _quantile_examples:

Quantile Predictions with Random Forest
---------------------------------------

Examples demonstrating how to generate quantile predictions using RandomForest class.
SUKI-O marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
"""
========================================================
Predicting with different quantile interpolation methods
========================================================

An example comparison of interpolation methods that can be applied during
prediction when the desired quantile lies between two data points.

"""

from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Create toy dataset.
X = np.array([[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1]])
y = np.array([-2, -1, 0, 1, 2])

interpolations = ["linear", "lower", "higher", "midpoint", "nearest"]
colors = ["#006aff", "#ffd237", "#0d4599", "#f2a619", "#a6e5ff"]
quantiles = [0.025, 0.5, 0.975]

y_medians = []
y_errs = []
est = RandomForestRegressor(
n_estimators=1,
random_state=0,
)
# fit the model
est.fit(X, y)
# get the leaf nodes that each sample fell into
leaf_ids = est.apply(X)
# create a list of dictionary that maps node to samples that fell into it
# for each tree
node_to_indices = []
for tree in range(leaf_ids.shape[1]):
d = defaultdict(list)
for id, leaf in enumerate(leaf_ids[:, tree]):
d[leaf].append(id)
node_to_indices.append(d)
# drop the X_test to the trained tree and
# get the indices of leaf nodes that fall into it
leaf_ids_test = est.apply(X)
# for each samples, collect the indices of the samples that fell into
# the same leaf node for each tree
y_pred_quantile = []
for sample in range(leaf_ids_test.shape[0]):
li = [
node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1])
]
# merge the list of indices into one
idx = [item for sublist in li for item in sublist]
# get the y_train for each corresponding id``
y_pred_quantile.append(y[idx])

for interpolation in interpolations:
# get the quatile preditions for each predicted sample
y_pred = [
np.array(
[
np.quantile(y_pred_quantile[i], quantile, method=interpolation)
for i in range(len(y_pred_quantile))
]
)
for quantile in quantiles
]
y_medians.append(y_pred[1])
y_errs.append(
np.concatenate(
(
[y_pred[1] - y_pred[0]],
[y_pred[2] - y_pred[1]],
),
axis=0,
)
)

SUKI-O marked this conversation as resolved.
Show resolved Hide resolved
sc = plt.scatter(np.arange(len(y)) - 0.35, y, color="k", zorder=10)
ebs = []
for i, (median, y_err) in enumerate(zip(y_medians, y_errs)):
ebs.append(
plt.errorbar(
np.arange(len(y)) + (0.15 * (i + 1)) - 0.35,
median,
yerr=y_err,
color=colors[i],
ecolor=colors[i],
fmt="o",
)
)
plt.xlim([-0.75, len(y) - 0.25])
plt.xticks(np.arange(len(y)), X.tolist())
plt.xlabel("Samples (Feature Values)")
plt.ylabel("Actual and Predicted Values")
plt.legend([sc] + ebs, ["actual"] + interpolations, loc=2)
plt.show()
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
"""
==========================================================
Quantile prediction intervals with Random Forest Regressor
==========================================================

An example of how to generate quantile prediction intervals with
Random Forest Regressor class on the California Housing dataset.

"""

from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import FuncFormatter
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.utils.validation import check_random_state


# function to calculate the quantile predictions
def get_quantile_prediction(estimator, X_train, X_test, y_train, quantiles=[0.5]):
estimator.fit(X_train, y_train)
# get the leaf nodes that each sample fell into
leaf_ids = estimator.apply(X_train)
# create a list of dictionary that maps node to samples that fell into it
# for each tree
node_to_indices = []
for tree in range(leaf_ids.shape[1]):
d = defaultdict(list)
for id, leaf in enumerate(leaf_ids[:, tree]):
d[leaf].append(id)
node_to_indices.append(d)
# drop the X_test to the trained tree and
# get the indices of leaf nodes that fall into it
leaf_ids_test = estimator.apply(X_test)
# for each samples, collect the indices of the samples that fell into
# the same leaf node for each tree
y_pred_quantile = []
for sample in range(leaf_ids_test.shape[0]):
li = [
node_to_indices[tree][leaf_ids_test[sample][tree]]
for tree in range(leaf_ids_test.shape[1])
]
# merge the list of indices into one
idx = [item for sublist in li for item in sublist]
# get the y_train for each corresponding id``
y_pred_quantile.append(y_train[idx])
# get the quatile preditions for each predicted sample
y_preds = [
[np.quantile(y_pred_quantile[i], quantile) for i in range(len(y_pred_quantile))]
for quantile in quantiles
]
return y_preds


rng = check_random_state(0)

dollar_formatter = FuncFormatter(lambda x, p: "$" + format(int(x), ","))

# Load the California Housing Prices dataset.
california = datasets.fetch_california_housing()
n_samples = min(california.target.size, 1000)
perm = rng.permutation(n_samples)
X = california.data[perm]
y = california.target[perm]

rf = RandomForestRegressor(n_estimators=100, random_state=0)

kf = KFold(n_splits=5)
kf.get_n_splits(X)

y_true = []
y_pred = []
y_pred_lower = []
y_pred_upper = []

for train_index, test_index in kf.split(X):
X_train, X_test, y_train, y_test = (
X[train_index],
X[test_index],
y[train_index],
y[test_index],
)

rf.set_params(max_features=X_train.shape[1] // 3)

# Get predictions at 95% prediction intervals and median.
y_pred_i = get_quantile_prediction(rf, X_train, X_test, y_train, quantiles=[0.025, 0.5, 0.975])

y_true = np.concatenate((y_true, y_test))
y_pred = np.concatenate((y_pred, y_pred_i[1]))
y_pred_lower = np.concatenate((y_pred_lower, y_pred_i[0]))
y_pred_upper = np.concatenate((y_pred_upper, y_pred_i[2]))

# Scale data to dollars.
y_true *= 1e5
y_pred *= 1e5
y_pred_lower *= 1e5
y_pred_upper *= 1e5

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

y_pred_interval = y_pred_upper - y_pred_lower
SUKI-O marked this conversation as resolved.
Show resolved Hide resolved
sort_idx = np.argsort(y_pred)
y_true = y_true[sort_idx]
y_pred = y_pred[sort_idx]
y_pred_lower = y_pred_lower[sort_idx]
y_pred_upper = y_pred_upper[sort_idx]
y_min = min(np.minimum(y_true, y_pred))
y_max = max(np.maximum(y_true, y_pred))
y_min = float(np.round((y_min / 10000) - 1, 0) * 10000)
y_max = float(np.round((y_max / 10000) - 1, 0) * 10000)

for low, mid, upp in zip(y_pred_lower, y_pred, y_pred_upper):
ax1.plot([mid, mid], [low, upp], lw=4, c="#e0f2ff")

ax1.plot(y_pred, y_true, c="#f2a619", lw=0, marker=".", ms=5)
ax1.plot(y_pred, y_pred_lower, alpha=0.4, c="#006AFF", lw=0, marker="_", ms=4)
ax1.plot(y_pred, y_pred_upper, alpha=0.4, c="#006AFF", lw=0, marker="_", ms=4)
ax1.plot([y_min, y_max], [y_min, y_max], ls="--", lw=1, c="grey")
ax1.grid(axis="x", color="0.95")
ax1.grid(axis="y", color="0.95")
ax1.xaxis.set_major_formatter(dollar_formatter)
ax1.yaxis.set_major_formatter(dollar_formatter)
ax1.set_xlim(y_min, y_max)
ax1.set_ylim(y_min, y_max)
ax1.set_xlabel("Fitted Values (Conditional Median)")
ax1.set_ylabel("Observed Values")

y_pred_interval = y_pred_upper - y_pred_lower
sort_idx = np.argsort(y_pred_interval)
y_true = y_true[sort_idx]
y_pred_lower = y_pred_lower[sort_idx]
y_pred_upper = y_pred_upper[sort_idx]

# Center data, with the mean of the prediction interval at 0.
mean = (y_pred_lower + y_pred_upper) / 2
y_true -= mean
y_pred_lower -= mean
y_pred_upper -= mean

ax2.plot(y_true, c="#f2a619", lw=0, marker=".", ms=5)
ax2.fill_between(
np.arange(len(y_pred_upper)),
y_pred_lower,
y_pred_upper,
alpha=0.8,
color="#e0f2ff",
)
ax2.plot(np.arange(n_samples), y_pred_lower, alpha=0.8, c="#006aff", lw=2)
ax2.plot(np.arange(n_samples), y_pred_upper, alpha=0.8, c="#006aff", lw=2)
ax2.grid(axis="x", color="0.95")
ax2.grid(axis="y", color="0.95")
ax2.yaxis.set_major_formatter(dollar_formatter)
ax2.set_xlim([0, n_samples])
ax2.set_xlabel("Ordered Samples")
ax2.set_ylabel("Observed Values and Prediction Intervals")

plt.subplots_adjust(top=0.15)
fig.tight_layout(pad=3)

plt.show()
88 changes: 88 additions & 0 deletions examples/quantile_predictions/plot_quantile_toy_example_with_RF.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""
======================================================
Quantile prediction with Random Forest Regressor class
======================================================

An example that demonstrates how to use the Random Forest to generate
quantile predictions such as conditional median and prediction intervals.
The example compares the predictions to a ground truth function used
to generate noisy samples.

"""

from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


def make_toy_dataset(n_samples, seed=0):
rng = np.random.RandomState(seed)

x = rng.uniform(0, 10, size=n_samples)
f = x * np.sin(x)

sigma = 0.25 + x / 10
noise = rng.lognormal(sigma=sigma) - np.exp(sigma**2 / 2)
y = f + noise

return np.atleast_2d(x).T, y


n_samples = 1000
X, y = make_toy_dataset(n_samples)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

xx = np.atleast_2d(np.linspace(0, 10, n_samples)).T

rf = RandomForestRegressor(max_depth=3, random_state=0)
rf.fit(X_train, y_train)

y_pred = rf.predict(xx)

# get the leaf nodes that each sample fell into
leaf_ids = rf.apply(X_train)
# create a list of dictionary that maps node to samples that fell into it
# for each tree
node_to_indices = []
for tree in range(leaf_ids.shape[1]):
d = defaultdict(list)
for id, leaf in enumerate(leaf_ids[:, tree]):
d[leaf].append(id)
node_to_indices.append(d)
# drop the X_test to the trained tree and
# get the indices of leaf nodes that fall into it
leaf_ids_test = rf.apply(xx)
# for each samples, collect the indices of the samples that fell into
# the same leaf node for each tree
y_pred_quatile = []
for sample in range(leaf_ids_test.shape[0]):
li = [
node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1])
]
# merge the list of indices into one
idx = [item for sublist in li for item in sublist]
# get the y_train for each corresponding id
y_pred_quatile.append(y_train[idx])
# get the quatile preditions for each predicted sample
y_pred_low = [np.quantile(y_pred_quatile[i], 0.025) for i in range(len(y_pred_quatile))]
y_pred_med = [np.quantile(y_pred_quatile[i], 0.5) for i in range(len(y_pred_quatile))]
y_pred_upp = [np.quantile(y_pred_quatile[i], 0.975) for i in range(len(y_pred_quatile))]

plt.plot(X_test, y_test, ".", c="#f2a619", label="Test Observations", ms=5)
SUKI-O marked this conversation as resolved.
Show resolved Hide resolved
plt.plot(xx, (xx * np.sin(xx)), c="black", label="$f(x) = x\,\sin(x)$", lw=2)
plt.plot(xx, y_pred_med, c="#006aff", label="Predicted Median", lw=3, ms=5)
plt.fill_between(
xx.ravel(),
y_pred_low,
y_pred_upp,
color="#e0f2ff",
label="Predicted 95% Interval",
)
plt.xlabel("$x$")
plt.ylabel("$f(x)$")
plt.legend(loc="upper left")
plt.show()
Loading