Skip to content

Commit

Permalink
DOC improve example on cyclic feature engineering (scikit-learn#27501)
Browse files Browse the repository at this point in the history
  • Loading branch information
lorentzenchr authored Oct 11, 2023
1 parent fa84c90 commit d7a1144
Showing 1 changed file with 42 additions and 15 deletions.
57 changes: 42 additions & 15 deletions examples/applications/plot_cyclical_feature_engineering.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,14 @@
# .. note::
#
# The fit method of the models used in this notebook all minimize the
# mean squared error to estimate the conditional mean instead of the mean
# absolute error that would fit an estimator of the conditional median.
#
# When reporting performance measure on the test set in the discussion, we
# instead choose to focus on the mean absolute error that is more
# intuitive than the (root) mean squared error. Note, however, that the
# best models for one metric are also the best for the other in this
# study.
# mean squared error to estimate the conditional mean.
# The absolute error, however, would estimate the conditional median.
#
# Nevertheless, when reporting performance measures on the test set in
# the discussion, we choose to focus on the mean absolute error instead
# of the (root) mean squared error because it is more intuitive to
# interpret. Note, however, that in this study the best models for one
# metric are also the best ones in terms of the other metric.
y = df["count"] / df["count"].max()

# %%
Expand Down Expand Up @@ -170,7 +170,10 @@
# efficiently handle heterogeneous tabular data with a mix of categorical and
# numerical features as long as the number of samples is large enough.
#
# Here, we do minimal ordinal encoding for the categorical variables and then
# Here, we use the modern
# :class:`~sklearn.ensemble.HistGradientBoostingRegressor` with native support
# for categorical features. Therefore, we only do minimal ordinal encoding for
# the categorical variables and then
# let the model know that it should treat those as categorical variables by
# using a dedicated tree splitting rule. Since we use an ordinal encoder, we
# pass the list of categorical values explicitly to use a logical order when
Expand Down Expand Up @@ -213,6 +216,9 @@
verbose_feature_names_out=False,
),
HistGradientBoostingRegressor(
max_iter=300,
early_stopping=True,
validation_fraction=0.1,
categorical_features=categorical_columns,
random_state=42,
),
Expand All @@ -222,16 +228,26 @@
#
# Let's evaluate our gradient boosting model with the mean absolute error of the
# relative demand averaged across our 5 time-based cross-validation splits:
import numpy as np


def evaluate(model, X, y, cv):
def evaluate(model, X, y, cv, model_prop=None, model_step=None):
cv_results = cross_validate(
model,
X,
y,
cv=cv,
scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error"],
return_estimator=model_prop is not None,
)
if model_prop is not None:
if model_step is not None:
values = [
getattr(m[model_step], model_prop) for m in cv_results["estimator"]
]
else:
values = [getattr(m, model_prop) for m in cv_results["estimator"]]
print(f"Mean model.{model_prop} = {np.mean(values)}")
mae = -cv_results["test_neg_mean_absolute_error"]
rmse = -cv_results["test_neg_root_mean_squared_error"]
print(
Expand All @@ -240,9 +256,18 @@ def evaluate(model, X, y, cv):
)


evaluate(gbrt_pipeline, X, y, cv=ts_cv)
evaluate(
gbrt_pipeline,
X,
y,
cv=ts_cv,
model_prop="n_iter_",
model_step="histgradientboostingregressor",
)

# %%
# We see that we set `max_iter` large enough for early stopping to take place.
#
# This model has an average error around 4 to 5% of the maximum demand. This is
# quite good for a first trial without any hyper-parameter tuning! We just had
# to make the categorical variables explicit. Note that the time related
Expand All @@ -258,10 +283,8 @@ def evaluate(model, X, y, cv):
#
# As usual for linear models, categorical variables need to be one-hot encoded.
# For consistency, we scale the numerical features to the same 0-1 range using
# class:`sklearn.preprocessing.MinMaxScaler`, although in this case it does not
# :class:`~sklearn.preprocessing.MinMaxScaler`, although in this case it does not
# impact the results much because they are already on comparable scales:
import numpy as np

from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

Expand All @@ -278,10 +301,14 @@ def evaluate(model, X, y, cv):
)


evaluate(naive_linear_pipeline, X, y, cv=ts_cv)
evaluate(
naive_linear_pipeline, X, y, cv=ts_cv, model_prop="alpha_", model_step="ridgecv"
)


# %%
# It is reassuring to see that the selected `alpha_` is within our specified
# range.
#
# The performance is not good: the average error is around 14% of the maximum
# demand. This is more than three times higher than the average error of the
Expand Down

0 comments on commit d7a1144

Please sign in to comment.