From 93ac6e72148e058091541f858056ac264cd36a30 Mon Sep 17 00:00:00 2001 From: Michael Van de Steene <124588413+michael-nml@users.noreply.github.com> Date: Mon, 20 Nov 2023 12:10:18 +0100 Subject: [PATCH] Fix handling `NaN` values when fitting JS univariate drift (#340) * Add column & method for univariate fitting errors * Refactor to use single data cleaning method * Filter NaN's when fitting JS * Refactor data cleaning to accept columns argument Previously the data cleaning method operated by accepting multiple dataframes and inspecting each dataframe separately for `NaN`'s. Depending on how the data is processed after cleaning, splitting columns into separate dataframes can be rather annoying. To avoid that, this commit changes the method to accept a single dataframe and a columns argument. The columns argument specifies which column subsets should be inspected for `NaN`'s, enabling the same behaviour using a more convenient syntax. * Remove errors and use warning behaviour instead The performance calculator for binary classification had checks in place to generate an exception if the prediction column contains nothing but `NaN`'s. This behaviour contradicts the warning functionality that is in the same functions that should return `NaN` and issue a warning. It is also inconsistent with other calculators which do issue a warning instead of raising an error. This commit removes the errors and relies on the existing warning functionality. 
* Refactor more data cleaning methods * Deal with mypy overload issue --------- Co-authored-by: Niels Nuyttens --- nannyml/base.py | 38 ++++++++-- nannyml/drift/univariate/calculator.py | 63 ++++++++++------- nannyml/drift/univariate/methods.py | 25 +++---- .../performance_calculation/metrics/base.py | 22 ------ .../metrics/binary_classification.py | 70 ++++--------------- .../metrics/multiclass_classification.py | 22 +++--- .../metrics/regression.py | 16 ++--- .../confidence_based/metrics.py | 28 ++------ .../direct_loss_estimation/metrics.py | 16 ++--- 9 files changed, 126 insertions(+), 174 deletions(-) diff --git a/nannyml/base.py b/nannyml/base.py index 6ef629fb4..953354dc9 100644 --- a/nannyml/base.py +++ b/nannyml/base.py @@ -8,7 +8,7 @@ import copy import logging from abc import ABC, abstractmethod -from typing import Generic, List, Optional, Tuple, TypeVar, Union +from typing import Generic, Iterable, List, Optional, Tuple, TypeVar, Union, overload import numpy as np import pandas as pd @@ -533,12 +533,38 @@ def _column_is_categorical(column: pd.Series) -> bool: return column.dtype in ['object', 'string', 'category', 'bool'] -def _remove_missing_data(column: pd.Series): - if isinstance(column, pd.Series): - column = column.dropna().reset_index(drop=True) +@overload +def _remove_nans(data: pd.Series) -> pd.Series: + ... + +@overload +def _remove_nans(data: pd.DataFrame, columns: Optional[Iterable[Union[str, Iterable[str]]]]) -> pd.DataFrame: + ... + + +def _remove_nans( + data: Union[pd.Series, pd.DataFrame], columns: Optional[Iterable[Union[str, Iterable[str]]]] = None +) -> Union[pd.Series, pd.DataFrame]: + """Remove rows with NaN values in the specified columns. + + If no columns are given, drop rows with NaN values in any column. If columns are given, drop rows with NaN values + in the specified columns. If a set of columns is given, drop rows with NaN values in all of the columns in the set. 
+ """ + # If no columns are given, drop rows with NaN values in any columns + if columns is None: + mask = ~data.isna() + if isinstance(mask, pd.DataFrame): + mask = mask.all(axis=1) else: - column = column[~np.isnan(column)] - return column + mask = np.ones(len(data), dtype=bool) + for column_selector in columns: + nans = data[column_selector].isna() + if isinstance(nans, pd.DataFrame): + nans = nans.all(axis=1) + mask &= ~nans + + # NaN values have been dropped. Try to infer types again + return data[mask].reset_index(drop=True).infer_objects() def _column_is_continuous(column: pd.Series) -> bool: diff --git a/nannyml/drift/univariate/calculator.py b/nannyml/drift/univariate/calculator.py index 6f2052989..1ce569b3c 100644 --- a/nannyml/drift/univariate/calculator.py +++ b/nannyml/drift/univariate/calculator.py @@ -39,7 +39,7 @@ from nannyml.chunk import Chunker from nannyml.drift.univariate.methods import FeatureType, Method, MethodFactory from nannyml.drift.univariate.result import Result -from nannyml.exceptions import InvalidArgumentsException +from nannyml.exceptions import CalculatorException, InvalidArgumentsException from nannyml.thresholds import ConstantThreshold, StandardDeviationThreshold, Threshold from nannyml.usage_logging import UsageEvent, log_usage @@ -271,34 +271,45 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs) -> UnivariateDrift if column_name not in self.categorical_column_names: self.categorical_column_names.append(column_name) + timestamps = reference_data[self.timestamp_column_name] if self.timestamp_column_name else None for column_name in self.continuous_column_names: - self._column_to_models_mapping[column_name] += [ - MethodFactory.create( - key=method, - feature_type=FeatureType.CONTINUOUS, - chunker=self.chunker, - computation_params=self.computation_params or {}, - threshold=self.thresholds[method], - ).fit( - reference_data=reference_data[column_name], - timestamps=reference_data[self.timestamp_column_name] if 
self.timestamp_column_name else None, - ) - for method in self.continuous_method_names - ] + methods = [] + for method in self.continuous_method_names: + try: + methods.append( + MethodFactory.create( + key=method, + feature_type=FeatureType.CONTINUOUS, + chunker=self.chunker, + computation_params=self.computation_params or {}, + threshold=self.thresholds[method], + ).fit( + reference_data=reference_data[column_name], + timestamps=timestamps, + ) + ) + except Exception as ex: + raise CalculatorException(f"Failed to fit method {method} for column {column_name}: {ex!r}") from ex + self._column_to_models_mapping[column_name] = methods for column_name in self.categorical_column_names: - self._column_to_models_mapping[column_name] += [ - MethodFactory.create( - key=method, - feature_type=FeatureType.CATEGORICAL, - chunker=self.chunker, - threshold=self.thresholds[method], - ).fit( - reference_data=reference_data[column_name], - timestamps=reference_data[self.timestamp_column_name] if self.timestamp_column_name else None, - ) - for method in self.categorical_method_names - ] + methods = [] + for method in self.categorical_method_names: + try: + methods.append( + MethodFactory.create( + key=method, + feature_type=FeatureType.CATEGORICAL, + chunker=self.chunker, + threshold=self.thresholds[method], + ).fit( + reference_data=reference_data[column_name], + timestamps=timestamps, + ) + ) + except Exception as ex: + raise CalculatorException(f"Failed to fit method {method} for column {column_name}: {ex!r}") from ex + self._column_to_models_mapping[column_name] = methods self.result = self._calculate(reference_data) self.result.data['chunk', 'chunk', 'period'] = 'reference' diff --git a/nannyml/drift/univariate/methods.py b/nannyml/drift/univariate/methods.py index e165964b6..84a7d3c12 100644 --- a/nannyml/drift/univariate/methods.py +++ b/nannyml/drift/univariate/methods.py @@ -29,7 +29,7 @@ from scipy.stats import chi2_contingency, ks_2samp, wasserstein_distance from 
nannyml._typing import Self -from nannyml.base import _column_is_categorical, _remove_missing_data +from nannyml.base import _remove_nans, _column_is_categorical from nannyml.chunk import Chunker from nannyml.exceptions import InvalidArgumentsException, NotFittedException from nannyml.thresholds import Threshold, calculate_threshold_values @@ -278,6 +278,7 @@ def __init__(self, **kwargs) -> None: self._reference_proba_in_bins: np.ndarray def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None): + reference_data = _remove_nans(reference_data) if _column_is_categorical(reference_data): treat_as_type = 'cat' else: @@ -305,7 +306,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None def _calculate(self, data: pd.Series): reference_proba_in_bins = copy(self._reference_proba_in_bins) - data = _remove_missing_data(data) + data = _remove_nans(data) if data.empty: return np.nan if self._treat_as_type == 'cont': @@ -374,7 +375,7 @@ def __init__(self, **kwargs) -> None: self.n_bins = kwargs['computation_params'].get('n_bins', 10_000) def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data = _remove_missing_data(reference_data) + reference_data = _remove_nans(reference_data) if (self.calculation_method == 'auto' and len(reference_data) < 10_000) or self.calculation_method == 'exact': self._reference_data = reference_data else: @@ -389,7 +390,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None return self def _calculate(self, data: pd.Series): - data = _remove_missing_data(data) + data = _remove_nans(data) if data.empty: return np.nan if not self._fitted: @@ -443,13 +444,13 @@ def __init__(self, **kwargs) -> None: self._fitted = False def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data = _remove_missing_data(reference_data) + reference_data = _remove_nans(reference_data) 
self._reference_data_vcs = reference_data.value_counts().loc[lambda v: v != 0] self._fitted = True return self def _calculate(self, data: pd.Series): - data = _remove_missing_data(data) + data = _remove_nans(data) if data.empty: return np.nan if not self._fitted: @@ -505,7 +506,7 @@ def __init__(self, **kwargs) -> None: self._reference_proba: Optional[dict] = None def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data = _remove_missing_data(reference_data) + reference_data = _remove_nans(reference_data) ref_labels = reference_data.unique() self._reference_proba = {label: (reference_data == label).sum() / len(reference_data) for label in ref_labels} @@ -516,7 +517,7 @@ def _calculate(self, data: pd.Series): raise NotFittedException( "tried to call 'calculate' on an unfitted method " f"{self.display_name}. Please run 'fit' first" ) - data = _remove_missing_data(data) + data = _remove_nans(data) if data.empty: return np.nan data_labels = data.unique() @@ -574,7 +575,7 @@ def __init__(self, **kwargs) -> None: self.n_bins = kwargs['computation_params'].get('n_bins', 10_000) def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data = _remove_missing_data(reference_data) + reference_data = _remove_nans(reference_data) if (self.calculation_method == 'auto' and len(reference_data) < 10_000) or self.calculation_method == 'exact': self._reference_data = reference_data else: @@ -592,7 +593,7 @@ def _calculate(self, data: pd.Series): raise NotFittedException( "tried to call 'calculate' on an unfitted method " f"{self.display_name}. 
Please run 'fit' first" ) - data = _remove_missing_data(data) + data = _remove_nans(data) if data.empty: return np.nan if ( @@ -668,7 +669,7 @@ def __init__(self, **kwargs) -> None: self._reference_proba_in_bins: np.ndarray def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data = _remove_missing_data(reference_data) + reference_data = _remove_nans(reference_data) if _column_is_categorical(reference_data): treat_as_type = 'cat' else: @@ -695,7 +696,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None return self def _calculate(self, data: pd.Series): - data = _remove_missing_data(data) + data = _remove_nans(data) if data.empty: return np.nan reference_proba_in_bins = copy(self._reference_proba_in_bins) diff --git a/nannyml/performance_calculation/metrics/base.py b/nannyml/performance_calculation/metrics/base.py index 49da27ff0..18e2ee412 100644 --- a/nannyml/performance_calculation/metrics/base.py +++ b/nannyml/performance_calculation/metrics/base.py @@ -255,25 +255,3 @@ def inner_wrapper(wrapped_class: Type[Metric]) -> Type[Metric]: return wrapped_class return inner_wrapper - - -def _common_data_cleaning(y_true: pd.Series, y_pred: Union[pd.Series, pd.DataFrame]): - y_true, y_pred = ( - y_true.reset_index(drop=True), - y_pred.reset_index(drop=True), - ) - - if isinstance(y_pred, pd.DataFrame): - y_true = y_true[~y_pred.isna().all(axis=1)] - else: - y_true = y_true[~y_pred.isna()] - y_pred.dropna(inplace=True) - - y_pred = y_pred[~y_true.isna()] - y_true.dropna(inplace=True) - - # NaN values have been dropped. 
Try to infer types again - y_pred = y_pred.infer_objects() - y_true = y_true.infer_objects() - - return y_true, y_pred diff --git a/nannyml/performance_calculation/metrics/binary_classification.py b/nannyml/performance_calculation/metrics/binary_classification.py index 7ea66ec8e..f39ae5392 100644 --- a/nannyml/performance_calculation/metrics/binary_classification.py +++ b/nannyml/performance_calculation/metrics/binary_classification.py @@ -9,10 +9,10 @@ from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score from nannyml._typing import ProblemType -from nannyml.base import _list_missing +from nannyml.base import _remove_nans, _list_missing from nannyml.chunk import Chunk, Chunker from nannyml.exceptions import InvalidArgumentsException -from nannyml.performance_calculation.metrics.base import Metric, MetricFactory, _common_data_cleaning +from nannyml.performance_calculation.metrics.base import Metric, MetricFactory from nannyml.sampling_error.binary_classification import ( accuracy_sampling_error, accuracy_sampling_error_components, @@ -93,12 +93,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred_proba], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred_proba)) y_true = data[self.y_true] y_pred = data[self.y_pred_proba] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) - if y_true.nunique() <= 1: warnings.warn("Calculated ROC-AUC score contains NaN values.") return np.nan @@ -162,12 +161,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) - if 
(y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated F1-score contains NaN values.") return np.nan @@ -230,12 +228,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) - if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Precision score contains NaN values.") return np.nan @@ -298,12 +295,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) - if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Recall score contains NaN values.") return np.nan @@ -366,17 +362,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - f"could not calculate metric {self.display_name}: " "prediction column contains no data" - ) - - y_true, y_pred = _common_data_cleaning(y_true, y_pred) - if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Specificity score contains NaN values.") return np.nan @@ -440,17 +430,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - if 
y_pred.isna().all(): - raise InvalidArgumentsException( - f"could not calculate metric '{self.display_name}': " "prediction column contains no data" - ) - - y_true, y_pred = _common_data_cleaning(y_true, y_pred) - if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Accuracy score contains NaN values.") return np.nan @@ -547,19 +531,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - f"could not calculate metric '{self.name}': " "prediction column contains no data" - ) - - y_true, y_pred = _common_data_cleaning(y_true, y_pred) - if y_true is None: - warnings.warn("Calculated Business Value contains NaN values.") - return np.NaN if y_true.shape[0] == 0: warnings.warn("Calculated Business Value contains NaN values.") return np.NaN @@ -743,16 +719,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate_true_positives(self, data: pd.DataFrame) -> float: _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - "could not calculate metric true_positive. 
prediction column contains no data" - ) - - y_true, y_pred = _common_data_cleaning(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated true_positives contain NaN values.") return np.nan @@ -772,16 +743,11 @@ def _calculate_true_positives(self, data: pd.DataFrame) -> float: def _calculate_true_negatives(self, data: pd.DataFrame) -> float: _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - "could not calculate metric true_negative. prediction column contains no data" - ) - - y_true, y_pred = _common_data_cleaning(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated true_negatives contain NaN values.") return np.nan @@ -801,16 +767,11 @@ def _calculate_true_negatives(self, data: pd.DataFrame) -> float: def _calculate_false_positives(self, data: pd.DataFrame) -> float: _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - "could not calculate metric false_positive. prediction column contains no data" - ) - - y_true, y_pred = _common_data_cleaning(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated false_positives contain NaN values.") return np.nan @@ -830,16 +791,11 @@ def _calculate_false_positives(self, data: pd.DataFrame) -> float: def _calculate_false_negatives(self, data: pd.DataFrame) -> float: _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - "could not calculate metric false_negative. 
prediction column contains no data" - ) - - y_true, y_pred = _common_data_cleaning(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated false_negatives contain NaN values.") return np.nan diff --git a/nannyml/performance_calculation/metrics/multiclass_classification.py b/nannyml/performance_calculation/metrics/multiclass_classification.py index 33ba37849..3fb34fe55 100644 --- a/nannyml/performance_calculation/metrics/multiclass_classification.py +++ b/nannyml/performance_calculation/metrics/multiclass_classification.py @@ -24,10 +24,10 @@ from sklearn.preprocessing import LabelBinarizer, label_binarize from nannyml._typing import ProblemType, class_labels, model_output_column_names -from nannyml.base import _list_missing +from nannyml.base import _remove_nans, _list_missing from nannyml.chunk import Chunker from nannyml.exceptions import InvalidArgumentsException -from nannyml.performance_calculation.metrics.base import Metric, MetricFactory, _common_data_cleaning +from nannyml.performance_calculation.metrics.base import Metric, MetricFactory from nannyml.sampling_error.multiclass_classification import ( accuracy_sampling_error, accuracy_sampling_error_components, @@ -116,6 +116,7 @@ def _calculate(self, data: pd.DataFrame): ) _list_missing([self.y_true] + model_output_column_names(self.y_pred_proba), data) + data = _remove_nans(data, (self.y_true, self.y_pred_proba.values())) labels, class_probability_columns = [], [] for label in sorted(list(self.y_pred_proba.keys())): @@ -123,19 +124,18 @@ def _calculate(self, data: pd.DataFrame): class_probability_columns.append(self.y_pred_proba[label]) y_true = data[self.y_true] - y_pred = data[class_probability_columns] + y_pred_proba = data[class_probability_columns] - if y_pred.isna().all().any(): + if y_pred_proba.isna().all().any(): raise InvalidArgumentsException( f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = 
_common_data_cleaning(y_true, y_pred) if y_true.nunique() <= 1: warnings.warn("Calculated ROC-AUC score contains NaN values.") return np.nan else: - return roc_auc_score(y_true, y_pred, multi_class='ovr', average='macro', labels=labels) + return roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='macro', labels=labels) def _sampling_error(self, data: pd.DataFrame) -> float: return auroc_sampling_error(self._sampling_error_components, data) @@ -208,6 +208,7 @@ def _calculate(self, data: pd.DataFrame): ) _list_missing([self.y_true, self.y_pred], data) + data = _remove_nans(data, (self.y_true, self.y_pred)) labels = sorted(list(self.y_pred_proba.keys())) y_true = data[self.y_true] @@ -218,7 +219,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated F1-score contains NaN values.") return np.nan @@ -296,6 +296,7 @@ def _calculate(self, data: pd.DataFrame): ) _list_missing([self.y_true, self.y_pred], data) + data = _remove_nans(data, (self.y_true, self.y_pred)) labels = sorted(list(self.y_pred_proba.keys())) y_true = data[self.y_true] @@ -306,7 +307,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Precision score contains NaN values.") return np.nan @@ -384,6 +384,7 @@ def _calculate(self, data: pd.DataFrame): ) _list_missing([self.y_true, self.y_pred], data) + data = _remove_nans(data, (self.y_true, self.y_pred)) labels = sorted(list(self.y_pred_proba.keys())) y_true = data[self.y_true] @@ -394,7 +395,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column 
contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Recall score contains NaN values.") return np.nan @@ -472,6 +472,7 @@ def _calculate(self, data: pd.DataFrame): ) _list_missing([self.y_true, self.y_pred], data) + data = _remove_nans(data, (self.y_true, self.y_pred)) labels = sorted(list(self.y_pred_proba.keys())) y_true = data[self.y_true] @@ -482,7 +483,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Specificity score contains NaN values.") return np.nan @@ -557,6 +557,7 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], data) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] @@ -566,7 +567,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric '{self.display_name}': " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Accuracy score contains NaN values.") return np.nan diff --git a/nannyml/performance_calculation/metrics/regression.py b/nannyml/performance_calculation/metrics/regression.py index 0c75f83f2..31e9ade27 100644 --- a/nannyml/performance_calculation/metrics/regression.py +++ b/nannyml/performance_calculation/metrics/regression.py @@ -13,8 +13,8 @@ ) from nannyml._typing import ProblemType -from nannyml.base import _list_missing, _raise_exception_for_negative_values -from nannyml.performance_calculation.metrics.base import Metric, MetricFactory, _common_data_cleaning +from nannyml.base import _remove_nans, _list_missing, 
_raise_exception_for_negative_values +from nannyml.performance_calculation.metrics.base import Metric, MetricFactory from nannyml.sampling_error.regression import ( mae_sampling_error, mae_sampling_error_components, @@ -76,11 +76,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -134,11 +134,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -192,11 +192,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -250,11 +250,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -313,11 +313,11 @@ def _fit(self, reference_data: 
pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -371,11 +371,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan diff --git a/nannyml/performance_estimation/confidence_based/metrics.py b/nannyml/performance_estimation/confidence_based/metrics.py index 4899d8249..08e8a242f 100644 --- a/nannyml/performance_estimation/confidence_based/metrics.py +++ b/nannyml/performance_estimation/confidence_based/metrics.py @@ -31,6 +31,7 @@ import nannyml.sampling_error.binary_classification as bse import nannyml.sampling_error.multiclass_classification as mse from nannyml._typing import ModelOutputsType, ProblemType, class_labels +from nannyml.base import _remove_nans from nannyml.chunk import Chunk, Chunker from nannyml.exceptions import CalculatorException, InvalidArgumentsException from nannyml.performance_estimation.confidence_based import SUPPORTED_METRIC_VALUES @@ -234,30 +235,13 @@ def _common_cleaning( ) y_pred_proba_column_name = self.y_pred_proba - clean_targets = self.y_true in data.columns and not data[self.y_true].isna().all() - - y_pred_proba = data[y_pred_proba_column_name] - y_pred = data[self.y_pred] - y_true = data[self.y_true] if clean_targets else None - - # Create mask to filter out NaN values - mask = ~(y_pred.isna() | y_pred_proba.isna()) - if clean_targets: - mask = mask | 
~(y_true.isna()) + data = _remove_nans(data, [self.y_pred, y_pred_proba_column_name]) - # Drop missing values (NaN/None) - y_pred_proba = y_pred_proba[mask] - y_pred = y_pred[mask] - if clean_targets: - y_true = y_true[mask] - - # NaN values have been dropped. Try to infer types again - y_pred_proba = y_pred_proba.infer_objects() - y_pred = y_pred.infer_objects() + clean_targets = self.y_true in data.columns and not data[self.y_true].isna().all() if clean_targets: - y_true = y_true.infer_objects() + data = _remove_nans(data, [self.y_true]) - return y_pred_proba, y_pred, y_true + return data[y_pred_proba_column_name], data[self.y_pred], (data[self.y_true] if clean_targets else None) def get_chunk_record(self, chunk_data: pd.DataFrame) -> Dict: """Returns a dictionary containing the performance metrics for a given chunk. @@ -1584,7 +1568,7 @@ def _realized_performance(self, data: pd.DataFrame) -> float: if y_true.shape[0] == 0: warnings.warn("Calculated Business Value contains NaN values.") return np.NaN - + tp_value = self.business_value_matrix[1, 1] tn_value = self.business_value_matrix[0, 0] fp_value = self.business_value_matrix[0, 1] diff --git a/nannyml/performance_estimation/direct_loss_estimation/metrics.py b/nannyml/performance_estimation/direct_loss_estimation/metrics.py index 2daf6b99c..03e35a654 100644 --- a/nannyml/performance_estimation/direct_loss_estimation/metrics.py +++ b/nannyml/performance_estimation/direct_loss_estimation/metrics.py @@ -29,7 +29,7 @@ ) from nannyml._typing import ProblemType -from nannyml.base import _raise_exception_for_negative_values +from nannyml.base import _raise_exception_for_negative_values, _remove_nans from nannyml.chunk import Chunk, Chunker from nannyml.exceptions import InvalidArgumentsException from nannyml.sampling_error.regression import ( @@ -271,18 +271,14 @@ def __eq__(self, other): """Establishes equality by comparing all properties.""" return self.display_name == other.display_name and self.column_name == 
other.column_name - def _common_cleaning(self, data: pd.DataFrame) -> Tuple[pd.Series, pd.Series]: - clean_targets = self.y_true in data.columns and not data[self.y_true].isna().all() + def _common_cleaning(self, data: pd.DataFrame) -> Tuple[pd.Series, Optional[pd.Series]]: + data = _remove_nans(data, [self.y_pred]) - y_pred = data[self.y_pred] + clean_targets = self.y_true in data.columns and not data[self.y_true].isna().all() if clean_targets: - y_true = data[self.y_true] - y_pred = y_pred[~y_true.isna()] - y_true.dropna(inplace=True) - else: - y_true = None + data = _remove_nans(data, [self.y_pred, self.y_true]) - return y_pred, y_true + return data[self.y_pred], (data[self.y_true] if clean_targets else None) def _train_direct_error_estimation_model( self,