Add parameter to select model name #563

Closed
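This change adds an optional model_name argument to predict, predict_proba and predict_all so that a specific saved model can be used instead of the best one. A minimal usage sketch (illustrative only, not part of the diff; "Ensemble" stands in for any model name listed under "saved" in params.json):

# Sketch of the new API; "AutoML_results" and "Ensemble" are illustrative.
from supervised import AutoML
from sklearn import datasets

iris = datasets.load_iris()

automl = AutoML(results_path="AutoML_results", explain_level=0)
automl.fit(iris.data, iris.target)

# Unchanged default: predictions come from the best model.
preds_best = automl.predict(iris.data)

# New in this PR: predict with an explicitly named saved model.
preds_named = automl.predict(iris.data, model_name="Ensemble")
probas_named = automl.predict_proba(iris.data, model_name="Ensemble")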
19 changes: 13 additions & 6 deletions supervised/automl.py
@@ -367,13 +367,15 @@ def fit(
"""
return self._fit(X, y, sample_weight, cv)

def predict(self, X: Union[List, numpy.ndarray, pandas.DataFrame]) -> numpy.ndarray:
def predict(self, X: Union[List, numpy.ndarray, pandas.DataFrame], model_name=None) -> numpy.ndarray:
"""
Computes predictions from AutoML best model.

Arguments:
X (list or numpy.ndarray or pandas.DataFrame):
Input values to make predictions on.
model_name (str):
Name of the saved model to use for predictions. If None, the best model is used.

Returns:
numpy.ndarray:
@@ -384,10 +386,11 @@ def predict(self, X: Union[List, numpy.ndarray, pandas.DataFrame]) -> numpy.ndar
Raises:
AutoMLException: Model has not yet been fitted.
"""
return self._predict(X)

return self._predict(X, model_name)

def predict_proba(
self, X: Union[List, numpy.ndarray, pandas.DataFrame]
self, X: Union[List, numpy.ndarray, pandas.DataFrame], model_name=None
) -> numpy.ndarray:
"""
Computes class probabilities from AutoML best model.
@@ -396,6 +399,8 @@ def predict_proba(
Arguments:
X (list or numpy.ndarray or pandas.DataFrame):
Input values to make predictions on.
model_name (str):
Name of the saved model to use for predictions. If None, the best model is used.

Returns:
numpy.ndarray of shape (n_samples, n_classes):
@@ -405,10 +410,10 @@ def predict_proba(
AutoMLException: Model has not yet been fitted.

"""
return self._predict_proba(X)
return self._predict_proba(X, model_name)

def predict_all(
self, X: Union[List, numpy.ndarray, pandas.DataFrame]
self, X: Union[List, numpy.ndarray, pandas.DataFrame], model_name=None
) -> pandas.DataFrame:
"""
Computes both class probabilities and class labels for classification tasks.
@@ -417,6 +422,8 @@ def predict_all(
Arguments:
X (list or numpy.ndarray or pandas.DataFrame):
Input values to make predictions on.
model_name (str):
Name of the saved model to use for predictions. If None, the best model is used.

Returns:
pandas.DataFrame:
@@ -428,7 +435,7 @@ def predict_all(
AutoMLException: Model has not yet been fitted.

"""
return self._predict_all(X)
return self._predict_all(X, model_name)

def score(
self,
87 changes: 48 additions & 39 deletions supervised/base_automl.py
@@ -83,7 +83,7 @@ def __init__(self):
self._top_models_to_improve = None
self._random_state = 1234
self._models = [] # instances of iterative learner framework or ensemble
self._best_model = None
self._model = None
self._verbose = True
self._threshold = None # used only in classification
self._metrics_details = None
@@ -128,7 +128,7 @@ def _check_can_load(self):
self.load(self.results_path)
self._results_path = self.results_path

def load(self, path):
def load(self, path, model_name=None):
logger.info("Loading AutoML models ...")
try:
params = json.load(open(os.path.join(path, "params.json")))
@@ -174,9 +174,10 @@ def load(self, path):
self._n_jobs = params.get("n_jobs", self._n_jobs)
self._random_state = params.get("random_state", self._random_state)
stacked_models = params.get("stacked")

best_model_name = params.get("best_model")
load_on_predict = params.get("load_on_predict")
load_on_predict = None
if model_name is None:
model_name = params.get("best_model")
load_on_predict = params.get("load_on_predict")
self._fit_level = params.get("fit_level")
lazy_load = not (
self._fit_level is not None and self._fit_level == "finished"
@@ -186,7 +187,7 @@ def load(self, path):
load_models = load_on_predict
# just in case there is check for which models should be loaded
# fix https://github.com/mljar/mljar-supervised/issues/395
models_needed = self.models_needed_on_predict(best_model_name)
models_needed = self.models_needed_on_predict(model_name)
# join them and return unique list
load_models = list(np.unique(load_models + models_needed))

@@ -204,12 +205,14 @@ def load(self, path):
self._models += [m]
models_map[m.get_name()] = m

self._best_model = None
if best_model_name is not None:
self._best_model = models_map.get(best_model_name)
self._model = None
if model_name is not None:
if model_name not in models_map:
raise ValueError(f"model name {model_name} does not exist in file")
self._model = models_map[model_name]

if stacked_models is not None and (
self._best_model._is_stacked or self._fit_level != "finished"
self._model._is_stacked or self._fit_level != "finished"
):
self._stacked_models = []
for stacked_model_name in stacked_models:
@@ -1120,7 +1123,7 @@ def _fit(self, X, y, sample_weight=None, cv=None):
self.verbose_print(
f"AutoML fit time: {np.round(time.time() - self._start_time,2)} seconds"
)
self.verbose_print(f"AutoML best model: {self._best_model.get_name()}")
self.verbose_print(f"AutoML best model: {self._model.get_name()}")

except Exception as e:
raise e
@@ -1143,22 +1146,22 @@ def _update_errors_report(self, model_name, error_msg):

def select_and_save_best(self, show_warnings=False):
# Select best model based on the lowest loss
self._best_model = None
self._model = None
if self._models:
model_list = [
m
for m in self._models
if m.is_valid() and m.is_fast_enough(self._max_single_prediction_time)
]
if model_list:
self._best_model = min(
self._model = min(
model_list,
key=lambda x: x.get_final_loss(),
)
# if none selected please select again and warn the user
if (
len(self._models)
and self._best_model is None
and self._model is None
and self._max_single_prediction_time is not None
):
if show_warnings:
@@ -1171,7 +1174,7 @@ def select_and_save_best(self, show_warnings=False):
)
self.verbose_print(msg)

self._best_model = min(
self._model = min(
[m for m in self._models if m.is_valid()],
key=lambda x: x.get_final_loss(),
)
@@ -1204,11 +1207,11 @@ def select_and_save_best(self, show_warnings=False):
"saved": self._model_subpaths,
"fit_level": self._fit_level,
}
if self._best_model is not None:
params["best_model"] = self._best_model.get_name()
if self._model is not None:
params["best_model"] = self._model.get_name()
load_on_predict = []
load_on_predict += self._best_model.involved_model_names()
if self._best_model._is_stacked and self._stacked_models is not None:
load_on_predict += self._model.involved_model_names()
if self._model._is_stacked and self._stacked_models is not None:
for m in self._stacked_models:
load_on_predict += m.involved_model_names()
params["load_on_predict"] = list(np.unique(load_on_predict))
@@ -1224,7 +1227,7 @@ def select_and_save_best(self, show_warnings=False):
# save report
ldb.insert(loc=0, column="Best model", value="")
ldb.loc[
ldb.name == self._best_model.get_name(), "Best model"
ldb.name == self._model.get_name(), "Best model"
] = "**the best**"
ldb["name"] = [f"[{m}]({m}/README.md)" for m in ldb["name"].values]

@@ -1287,11 +1290,10 @@ def models_needed_on_predict(self, required_model_name):
)

def _base_predict(self, X, model=None):

if model is None:
if self._best_model is None:
if self._model is None:
self.load(self.results_path)
model = self._best_model
model = self._model

if model is None:
raise AutoMLException(
@@ -1356,9 +1358,10 @@ def _base_predict(self, X, model=None):
else:
return predictions

def _predict(self, X):

predictions = self._base_predict(X)
def _predict(self, X, model_name=None):
if model_name is not None:
self.load(self._results_path, model_name)
predictions = self._base_predict(X, self._model)
# Return predictions
# If classification task the result is in column 'label'
# If regression task the result is in column 'prediction'
@@ -1368,7 +1371,7 @@ def _predict(self, X):
else predictions["prediction"].to_numpy()
)

def _predict_proba(self, X):
def _predict_proba(self, X, model_name):
# Check is task type is correct
if self._ml_task == REGRESSION:
raise AutoMLException(
Expand All @@ -1378,11 +1381,17 @@ def _predict_proba(self, X):
# Make and return predictions
# If classification task the result is in column 'label'
# Need to drop `label` column.
return self._base_predict(X).drop(["label"], axis=1).to_numpy()

def _predict_all(self, X):
model = None
if model_name is not None:
model = self.load(self._results_path, model_name)
return self._base_predict(X, model).drop(["label"], axis=1).to_numpy()

def _predict_all(self, X, model_name):
model = None
if model_name is not None:
model = self.load(self.results_path, model_name)
# Make and return predictions
return self._base_predict(X)
return self._base_predict(X, model)

def _score(self, X, y=None, sample_weight=None):
# y default must be None for scikit-learn compatibility
@@ -2025,23 +2034,23 @@ def _validate_random_state(self):
check_positive_integer(self.random_state, "random_state")

def to_json(self):
if self._best_model is None:
if self._model is None:
return None

return {
"best_model": self._best_model.to_json(),
"best_model": self._model.to_json(),
"threshold": self._threshold,
"ml_task": self._ml_task,
}

def from_json(self, json_data):

if json_data["best_model"]["algorithm_short_name"] == "Ensemble":
self._best_model = Ensemble()
self._best_model.from_json(json_data["best_model"])
self._model = Ensemble()
self._model.from_json(json_data["best_model"])
else:
self._best_model = ModelFramework(json_data["best_model"].get("params"))
self._best_model.from_json(json_data["best_model"])
self._model = ModelFramework(json_data["best_model"].get("params"))
self._model.from_json(json_data["best_model"])
self._threshold = json_data.get("threshold")

self._ml_task = json_data.get("ml_task")
@@ -2254,7 +2263,7 @@ def _report(self, width=900, height=1200):

def _need_retrain(self, X, y, sample_weight, decrease):

metric = self._best_model.get_metric()
metric = self._model.get_metric()

X, y, sample_weight = ExcludeRowsMissingTarget.transform(
X, y, sample_weight, warn=True
@@ -2270,7 +2279,7 @@ def _need_retrain(self, X, y, sample_weight, decrease):
sign = -1.0 if Metric.optimize_negative(metric.name) else 1.0

new_score = metric(y, prediction, sample_weight)
old_score = self._best_model.get_final_loss()
old_score = self._model.get_final_loss()

change = np.abs((old_score - new_score) / old_score)

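In outline, the model selection added to load() in base_automl.py behaves like the simplified sketch below (illustrative only, not the actual implementation; lazy loading and error wrapping are omitted):

# Simplified sketch of the selection logic in BaseAutoML.load().
def resolve_model(params, models_map, model_name=None):
    if model_name is None:
        # No explicit request: fall back to the best model recorded at fit time.
        model_name = params.get("best_model")
    if model_name is not None and model_name not in models_map:
        raise ValueError(f"model name {model_name} does not exist in file")
    return models_map.get(model_name)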
40 changes: 40 additions & 0 deletions tests/tests_automl/test_specific_model.py
@@ -0,0 +1,40 @@
import os
import unittest
import pytest
import json
import shutil

import supervised.exceptions
from supervised import AutoML
from sklearn import datasets

iris = datasets.load_iris()

class ModelSelectionTest(unittest.TestCase):

automl_dir = "model_selection_tests"

def tearDown(self):
shutil.rmtree(self.automl_dir, ignore_errors=True)

def test_choose_model(self):
model = AutoML(
explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir
)
model.fit(iris.data, iris.target)
params = json.load(open(os.path.join(self.automl_dir, "params.json")))
for model_name in params["saved"]:
model.predict(iris.data, model_name)
model.predict_all(iris.data, model_name)
model.predict_proba(iris.data, model_name)

def test_raise_with_wrong_model(self):
model = AutoML(
explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir
)
model.fit(iris.data, iris.target)
msg = "Cannot load AutoML directory. model name random_name does not exist in file"
with pytest.raises(supervised.exceptions.AutoMLException, match=msg):
model.predict(iris.data, "random_name")