From 55ff6a32211c258aa13786f6e45b46fe25246822 Mon Sep 17 00:00:00 2001 From: Michael Van de Steene Date: Sun, 19 Nov 2023 19:00:53 +0100 Subject: [PATCH 1/7] Add column & method for univariate fitting errors --- nannyml/drift/univariate/calculator.py | 63 +++++++++++++++----------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/nannyml/drift/univariate/calculator.py b/nannyml/drift/univariate/calculator.py index 6f2052989..1ce569b3c 100644 --- a/nannyml/drift/univariate/calculator.py +++ b/nannyml/drift/univariate/calculator.py @@ -39,7 +39,7 @@ from nannyml.chunk import Chunker from nannyml.drift.univariate.methods import FeatureType, Method, MethodFactory from nannyml.drift.univariate.result import Result -from nannyml.exceptions import InvalidArgumentsException +from nannyml.exceptions import CalculatorException, InvalidArgumentsException from nannyml.thresholds import ConstantThreshold, StandardDeviationThreshold, Threshold from nannyml.usage_logging import UsageEvent, log_usage @@ -271,34 +271,45 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs) -> UnivariateDrift if column_name not in self.categorical_column_names: self.categorical_column_names.append(column_name) + timestamps = reference_data[self.timestamp_column_name] if self.timestamp_column_name else None for column_name in self.continuous_column_names: - self._column_to_models_mapping[column_name] += [ - MethodFactory.create( - key=method, - feature_type=FeatureType.CONTINUOUS, - chunker=self.chunker, - computation_params=self.computation_params or {}, - threshold=self.thresholds[method], - ).fit( - reference_data=reference_data[column_name], - timestamps=reference_data[self.timestamp_column_name] if self.timestamp_column_name else None, - ) - for method in self.continuous_method_names - ] + methods = [] + for method in self.continuous_method_names: + try: + methods.append( + MethodFactory.create( + key=method, + feature_type=FeatureType.CONTINUOUS, + chunker=self.chunker, + computation_params=self.computation_params or {}, + threshold=self.thresholds[method], + ).fit( + reference_data=reference_data[column_name], + timestamps=timestamps, + ) + ) + except Exception as ex: + raise CalculatorException(f"Failed to fit method {method} for column {column_name}: {ex!r}") from ex + self._column_to_models_mapping[column_name] = methods for column_name in self.categorical_column_names: - self._column_to_models_mapping[column_name] += [ - MethodFactory.create( - key=method, - feature_type=FeatureType.CATEGORICAL, - chunker=self.chunker, - threshold=self.thresholds[method], - ).fit( - reference_data=reference_data[column_name], - timestamps=reference_data[self.timestamp_column_name] if self.timestamp_column_name else None, - ) - for method in self.categorical_method_names - ] + methods = [] + for method in self.categorical_method_names: + try: + methods.append( + MethodFactory.create( + key=method, + feature_type=FeatureType.CATEGORICAL, + chunker=self.chunker, + threshold=self.thresholds[method], + ).fit( + reference_data=reference_data[column_name], + timestamps=timestamps, + ) + ) + except Exception as ex: + raise CalculatorException(f"Failed to fit method {method} for column {column_name}: {ex!r}") from ex + self._column_to_models_mapping[column_name] = methods self.result = self._calculate(reference_data) self.result.data['chunk', 'chunk', 'period'] = 'reference' From 0cfcf9264d409ef82bfb54b63898aea22da0e89f Mon Sep 17 00:00:00 2001 From: Michael Van de Steene Date: Sun, 19 Nov 2023 19:18:00 +0100 
Subject: [PATCH 2/7] Refactor to use single data cleaning method --- nannyml/base.py | 17 +++++++----- nannyml/drift/univariate/methods.py | 24 ++++++++--------- .../performance_calculation/metrics/base.py | 22 ---------------- .../metrics/binary_classification.py | 26 +++++++++---------- .../metrics/multiclass_classification.py | 16 ++++++------ .../metrics/regression.py | 16 ++++++------ 6 files changed, 52 insertions(+), 69 deletions(-) diff --git a/nannyml/base.py b/nannyml/base.py index 6ef629fb4..6e313daa6 100644 --- a/nannyml/base.py +++ b/nannyml/base.py @@ -533,12 +533,17 @@ def _column_is_categorical(column: pd.Series) -> bool: return column.dtype in ['object', 'string', 'category', 'bool'] -def _remove_missing_data(column: pd.Series): - if isinstance(column, pd.Series): - column = column.dropna().reset_index(drop=True) - else: - column = column[~np.isnan(column)] - return column +def _clean_data(*data: Union[pd.Series, pd.DataFrame]) -> Tuple[pd.DataFrame, ...]: + """Remove rows with NaN values from the given data.""" + mask = np.ones(len(data[0]), dtype=bool) + for df in data: + if isinstance(df, pd.DataFrame): + mask &= ~df.isna().all(axis=1) + else: + mask &= ~df.isna() + + # NaN values have been dropped. Try to infer types again + return tuple(df[mask].reset_index(drop=True).infer_objects() for df in data) def _column_is_continuous(column: pd.Series) -> bool: diff --git a/nannyml/drift/univariate/methods.py b/nannyml/drift/univariate/methods.py index e165964b6..73e61a1a1 100644 --- a/nannyml/drift/univariate/methods.py +++ b/nannyml/drift/univariate/methods.py @@ -29,7 +29,7 @@ from scipy.stats import chi2_contingency, ks_2samp, wasserstein_distance from nannyml._typing import Self -from nannyml.base import _column_is_categorical, _remove_missing_data +from nannyml.base import _clean_data, _column_is_categorical from nannyml.chunk import Chunker from nannyml.exceptions import InvalidArgumentsException, NotFittedException from nannyml.thresholds import Threshold, calculate_threshold_values @@ -305,7 +305,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None def _calculate(self, data: pd.Series): reference_proba_in_bins = copy(self._reference_proba_in_bins) - data = _remove_missing_data(data) + data, = _clean_data(data) if data.empty: return np.nan if self._treat_as_type == 'cont': @@ -374,7 +374,7 @@ def __init__(self, **kwargs) -> None: self.n_bins = kwargs['computation_params'].get('n_bins', 10_000) def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data = _remove_missing_data(reference_data) + reference_data, = _clean_data(reference_data) if (self.calculation_method == 'auto' and len(reference_data) < 10_000) or self.calculation_method == 'exact': self._reference_data = reference_data else: @@ -389,7 +389,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None return self def _calculate(self, data: pd.Series): - data = _remove_missing_data(data) + data, = _clean_data(data) if data.empty: return np.nan if not self._fitted: @@ -443,13 +443,13 @@ def __init__(self, **kwargs) -> None: self._fitted = False def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data = _remove_missing_data(reference_data) + reference_data, = _clean_data(reference_data) self._reference_data_vcs = reference_data.value_counts().loc[lambda v: v != 0] self._fitted = True return self def _calculate(self, data: pd.Series): - data = 
_remove_missing_data(data) + data, = _clean_data(data) if data.empty: return np.nan if not self._fitted: @@ -505,7 +505,7 @@ def __init__(self, **kwargs) -> None: self._reference_proba: Optional[dict] = None def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data = _remove_missing_data(reference_data) + reference_data, = _clean_data(reference_data) ref_labels = reference_data.unique() self._reference_proba = {label: (reference_data == label).sum() / len(reference_data) for label in ref_labels} @@ -516,7 +516,7 @@ def _calculate(self, data: pd.Series): raise NotFittedException( "tried to call 'calculate' on an unfitted method " f"{self.display_name}. Please run 'fit' first" ) - data = _remove_missing_data(data) + data, = _clean_data(data) if data.empty: return np.nan data_labels = data.unique() @@ -574,7 +574,7 @@ def __init__(self, **kwargs) -> None: self.n_bins = kwargs['computation_params'].get('n_bins', 10_000) def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data = _remove_missing_data(reference_data) + reference_data, = _clean_data(reference_data) if (self.calculation_method == 'auto' and len(reference_data) < 10_000) or self.calculation_method == 'exact': self._reference_data = reference_data else: @@ -592,7 +592,7 @@ def _calculate(self, data: pd.Series): raise NotFittedException( "tried to call 'calculate' on an unfitted method " f"{self.display_name}. Please run 'fit' first" ) - data = _remove_missing_data(data) + data, = _clean_data(data) if data.empty: return np.nan if ( @@ -668,7 +668,7 @@ def __init__(self, **kwargs) -> None: self._reference_proba_in_bins: np.ndarray def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data = _remove_missing_data(reference_data) + reference_data, = _clean_data(reference_data) if _column_is_categorical(reference_data): treat_as_type = 'cat' else: @@ -695,7 +695,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None return self def _calculate(self, data: pd.Series): - data = _remove_missing_data(data) + data, = _clean_data(data) if data.empty: return np.nan reference_proba_in_bins = copy(self._reference_proba_in_bins) diff --git a/nannyml/performance_calculation/metrics/base.py b/nannyml/performance_calculation/metrics/base.py index 49da27ff0..18e2ee412 100644 --- a/nannyml/performance_calculation/metrics/base.py +++ b/nannyml/performance_calculation/metrics/base.py @@ -255,25 +255,3 @@ def inner_wrapper(wrapped_class: Type[Metric]) -> Type[Metric]: return wrapped_class return inner_wrapper - - -def _common_data_cleaning(y_true: pd.Series, y_pred: Union[pd.Series, pd.DataFrame]): - y_true, y_pred = ( - y_true.reset_index(drop=True), - y_pred.reset_index(drop=True), - ) - - if isinstance(y_pred, pd.DataFrame): - y_true = y_true[~y_pred.isna().all(axis=1)] - else: - y_true = y_true[~y_pred.isna()] - y_pred.dropna(inplace=True) - - y_pred = y_pred[~y_true.isna()] - y_true.dropna(inplace=True) - - # NaN values have been dropped. 
Try to infer types again - y_pred = y_pred.infer_objects() - y_true = y_true.infer_objects() - - return y_true, y_pred diff --git a/nannyml/performance_calculation/metrics/binary_classification.py b/nannyml/performance_calculation/metrics/binary_classification.py index 7ea66ec8e..4c210c56d 100644 --- a/nannyml/performance_calculation/metrics/binary_classification.py +++ b/nannyml/performance_calculation/metrics/binary_classification.py @@ -9,10 +9,10 @@ from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score from nannyml._typing import ProblemType -from nannyml.base import _list_missing +from nannyml.base import _clean_data, _list_missing from nannyml.chunk import Chunk, Chunker from nannyml.exceptions import InvalidArgumentsException -from nannyml.performance_calculation.metrics.base import Metric, MetricFactory, _common_data_cleaning +from nannyml.performance_calculation.metrics.base import Metric, MetricFactory from nannyml.sampling_error.binary_classification import ( accuracy_sampling_error, accuracy_sampling_error_components, @@ -97,7 +97,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred_proba] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.nunique() <= 1: warnings.warn("Calculated ROC-AUC score contains NaN values.") @@ -166,7 +166,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated F1-score contains NaN values.") @@ -234,7 +234,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Precision score contains NaN values.") @@ -302,7 +302,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Recall score contains NaN values.") @@ -375,7 +375,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Specificity score contains NaN values.") @@ -449,7 +449,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric '{self.display_name}': " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Accuracy score contains NaN values.") @@ -556,7 +556,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric '{self.name}': " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true is None: warnings.warn("Calculated Business Value contains NaN values.") return np.NaN @@ -752,7 +752,7 @@ def 
_calculate_true_positives(self, data: pd.DataFrame) -> float: "could not calculate metric true_positive. prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated true_positives contain NaN values.") return np.nan @@ -781,7 +781,7 @@ def _calculate_true_negatives(self, data: pd.DataFrame) -> float: "could not calculate metric true_negative. prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated true_negatives contain NaN values.") return np.nan @@ -810,7 +810,7 @@ def _calculate_false_positives(self, data: pd.DataFrame) -> float: "could not calculate metric false_positive. prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated false_positives contain NaN values.") return np.nan @@ -839,7 +839,7 @@ def _calculate_false_negatives(self, data: pd.DataFrame) -> float: "could not calculate metric false_negative. prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated false_negatives contain NaN values.") return np.nan diff --git a/nannyml/performance_calculation/metrics/multiclass_classification.py b/nannyml/performance_calculation/metrics/multiclass_classification.py index 33ba37849..928eb2b46 100644 --- a/nannyml/performance_calculation/metrics/multiclass_classification.py +++ b/nannyml/performance_calculation/metrics/multiclass_classification.py @@ -24,10 +24,10 @@ from sklearn.preprocessing import LabelBinarizer, label_binarize from nannyml._typing import ProblemType, class_labels, model_output_column_names -from nannyml.base import _list_missing +from nannyml.base import _clean_data, _list_missing from nannyml.chunk import Chunker from nannyml.exceptions import InvalidArgumentsException -from nannyml.performance_calculation.metrics.base import Metric, MetricFactory, _common_data_cleaning +from nannyml.performance_calculation.metrics.base import Metric, MetricFactory from nannyml.sampling_error.multiclass_classification import ( accuracy_sampling_error, accuracy_sampling_error_components, @@ -130,7 +130,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.nunique() <= 1: warnings.warn("Calculated ROC-AUC score contains NaN values.") return np.nan @@ -218,7 +218,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated F1-score contains NaN values.") return np.nan @@ -306,7 +306,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 
1): warnings.warn("Calculated Precision score contains NaN values.") return np.nan @@ -394,7 +394,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Recall score contains NaN values.") return np.nan @@ -482,7 +482,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Specificity score contains NaN values.") return np.nan @@ -566,7 +566,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric '{self.display_name}': " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Accuracy score contains NaN values.") return np.nan diff --git a/nannyml/performance_calculation/metrics/regression.py b/nannyml/performance_calculation/metrics/regression.py index 0c75f83f2..8b0c65399 100644 --- a/nannyml/performance_calculation/metrics/regression.py +++ b/nannyml/performance_calculation/metrics/regression.py @@ -13,8 +13,8 @@ ) from nannyml._typing import ProblemType -from nannyml.base import _list_missing, _raise_exception_for_negative_values -from nannyml.performance_calculation.metrics.base import Metric, MetricFactory, _common_data_cleaning +from nannyml.base import _clean_data, _list_missing, _raise_exception_for_negative_values +from nannyml.performance_calculation.metrics.base import Metric, MetricFactory from nannyml.sampling_error.regression import ( mae_sampling_error, mae_sampling_error_components, @@ -80,7 +80,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -138,7 +138,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -196,7 +196,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -254,7 +254,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -317,7 +317,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -375,7 +375,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + 
y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan From 369efbfe9ec05d915e01a3d330e9764d0571eb0a Mon Sep 17 00:00:00 2001 From: Michael Van de Steene Date: Sun, 19 Nov 2023 19:24:34 +0100 Subject: [PATCH 3/7] Filter NaN's when fitting JS --- nannyml/drift/univariate/methods.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nannyml/drift/univariate/methods.py b/nannyml/drift/univariate/methods.py index 73e61a1a1..899adec85 100644 --- a/nannyml/drift/univariate/methods.py +++ b/nannyml/drift/univariate/methods.py @@ -278,6 +278,7 @@ def __init__(self, **kwargs) -> None: self._reference_proba_in_bins: np.ndarray def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None): + reference_data, = _clean_data(reference_data) if _column_is_categorical(reference_data): treat_as_type = 'cat' else: From a0e5b4e32740db782c6034237ca8fb0d5cd3ea2d Mon Sep 17 00:00:00 2001 From: Michael Van de Steene Date: Sun, 19 Nov 2023 23:09:48 +0100 Subject: [PATCH 4/7] Refactor data cleaning to accept columns argument Previously the data cleaning method operated by accepting multiple dataframes and inspecting each dataframe separately for `NaN`'s. Depending on how the data is processed after cleaning, splitting columns into separate dataframes can be rather annoying. To avoid that, this commit changes the method to accept a single dataframe and a columns argument. The columns argument specifies which column subsets should be inspected for `NaN`'s, enabling the same behaviour using a more convenient syntax. --- nannyml/base.py | 41 ++++++++++++++----- nannyml/drift/univariate/methods.py | 26 ++++++------ .../metrics/binary_classification.py | 31 ++++++-------- .../metrics/multiclass_classification.py | 20 ++++----- .../metrics/regression.py | 14 +++---- 5 files changed, 73 insertions(+), 59 deletions(-) diff --git a/nannyml/base.py b/nannyml/base.py index 6e313daa6..a7958ce3f 100644 --- a/nannyml/base.py +++ b/nannyml/base.py @@ -8,7 +8,7 @@ import copy import logging from abc import ABC, abstractmethod -from typing import Generic, List, Optional, Tuple, TypeVar, Union +from typing import Generic, Iterable, List, Optional, Tuple, TypeVar, Union, overload import numpy as np import pandas as pd @@ -533,17 +533,38 @@ def _column_is_categorical(column: pd.Series) -> bool: return column.dtype in ['object', 'string', 'category', 'bool'] -def _clean_data(*data: Union[pd.Series, pd.DataFrame]) -> Tuple[pd.DataFrame, ...]: - """Remove rows with NaN values from the given data.""" - mask = np.ones(len(data[0]), dtype=bool) - for df in data: - if isinstance(df, pd.DataFrame): - mask &= ~df.isna().all(axis=1) - else: - mask &= ~df.isna() +@overload +def _remove_nans(data: pd.Series, columns: None) -> pd.Series: + ... + +@overload +def _remove_nans(data: pd.DataFrame, columns: Optional[Iterable[Union[str, Iterable[str]]]]) -> pd.DataFrame: + ... + + +def _remove_nans( + data: Union[pd.Series, pd.DataFrame], columns: Optional[Iterable[Union[str, Iterable[str]]]] = None +) -> Union[pd.Series, pd.DataFrame]: + """Remove rows with NaN values in the specified columns. + + If no columns are given, drop rows with NaN values in any column. If columns are given, drop rows with NaN values + in the specified columns. If a set of columns is given, drop rows with NaN values in all of the columns in the set.
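+
+    For example (illustrative column names), the call below drops rows where 'y_true' is NaN, as well as
+    rows in which both 'prob_a' and 'prob_b' are NaN:
+
+        _remove_nans(data, columns=['y_true', ['prob_a', 'prob_b']])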
+ """ + # If no columns are given, drop rows with NaN values in any columns + if columns is None: + mask = ~data.isna() + if isinstance(mask, pd.DataFrame): + mask = mask.all(axis=1) + else: + mask = np.ones(len(data), dtype=bool) + for column_selector in columns: + nans = data[column_selector].isna() + if isinstance(nans, pd.DataFrame): + nans = nans.all(axis=1) + mask &= ~nans # NaN values have been dropped. Try to infer types again - return tuple(df[mask].reset_index(drop=True).infer_objects() for df in data) + return data[mask].reset_index(drop=True).infer_objects() def _column_is_continuous(column: pd.Series) -> bool: diff --git a/nannyml/drift/univariate/methods.py b/nannyml/drift/univariate/methods.py index 899adec85..84a7d3c12 100644 --- a/nannyml/drift/univariate/methods.py +++ b/nannyml/drift/univariate/methods.py @@ -29,7 +29,7 @@ from scipy.stats import chi2_contingency, ks_2samp, wasserstein_distance from nannyml._typing import Self -from nannyml.base import _clean_data, _column_is_categorical +from nannyml.base import _remove_nans, _column_is_categorical from nannyml.chunk import Chunker from nannyml.exceptions import InvalidArgumentsException, NotFittedException from nannyml.thresholds import Threshold, calculate_threshold_values @@ -278,7 +278,7 @@ def __init__(self, **kwargs) -> None: self._reference_proba_in_bins: np.ndarray def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None): - reference_data, = _clean_data(reference_data) + reference_data = _remove_nans(reference_data) if _column_is_categorical(reference_data): treat_as_type = 'cat' else: @@ -306,7 +306,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None def _calculate(self, data: pd.Series): reference_proba_in_bins = copy(self._reference_proba_in_bins) - data, = _clean_data(data) + data = _remove_nans(data) if data.empty: return np.nan if self._treat_as_type == 'cont': @@ -375,7 +375,7 @@ def __init__(self, **kwargs) -> None: self.n_bins = kwargs['computation_params'].get('n_bins', 10_000) def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data, = _clean_data(reference_data) + reference_data = _remove_nans(reference_data) if (self.calculation_method == 'auto' and len(reference_data) < 10_000) or self.calculation_method == 'exact': self._reference_data = reference_data else: @@ -390,7 +390,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None return self def _calculate(self, data: pd.Series): - data, = _clean_data(data) + data = _remove_nans(data) if data.empty: return np.nan if not self._fitted: @@ -444,13 +444,13 @@ def __init__(self, **kwargs) -> None: self._fitted = False def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data, = _clean_data(reference_data) + reference_data = _remove_nans(reference_data) self._reference_data_vcs = reference_data.value_counts().loc[lambda v: v != 0] self._fitted = True return self def _calculate(self, data: pd.Series): - data, = _clean_data(data) + data = _remove_nans(data) if data.empty: return np.nan if not self._fitted: @@ -506,7 +506,7 @@ def __init__(self, **kwargs) -> None: self._reference_proba: Optional[dict] = None def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data, = _clean_data(reference_data) + reference_data = _remove_nans(reference_data) ref_labels = reference_data.unique() self._reference_proba = {label: 
(reference_data == label).sum() / len(reference_data) for label in ref_labels} @@ -517,7 +517,7 @@ def _calculate(self, data: pd.Series): raise NotFittedException( "tried to call 'calculate' on an unfitted method " f"{self.display_name}. Please run 'fit' first" ) - data, = _clean_data(data) + data = _remove_nans(data) if data.empty: return np.nan data_labels = data.unique() @@ -575,7 +575,7 @@ def __init__(self, **kwargs) -> None: self.n_bins = kwargs['computation_params'].get('n_bins', 10_000) def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data, = _clean_data(reference_data) + reference_data = _remove_nans(reference_data) if (self.calculation_method == 'auto' and len(reference_data) < 10_000) or self.calculation_method == 'exact': self._reference_data = reference_data else: @@ -593,7 +593,7 @@ def _calculate(self, data: pd.Series): raise NotFittedException( "tried to call 'calculate' on an unfitted method " f"{self.display_name}. Please run 'fit' first" ) - data, = _clean_data(data) + data = _remove_nans(data) if data.empty: return np.nan if ( @@ -669,7 +669,7 @@ def __init__(self, **kwargs) -> None: self._reference_proba_in_bins: np.ndarray def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data, = _clean_data(reference_data) + reference_data = _remove_nans(reference_data) if _column_is_categorical(reference_data): treat_as_type = 'cat' else: @@ -696,7 +696,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None return self def _calculate(self, data: pd.Series): - data, = _clean_data(data) + data = _remove_nans(data) if data.empty: return np.nan reference_proba_in_bins = copy(self._reference_proba_in_bins) diff --git a/nannyml/performance_calculation/metrics/binary_classification.py b/nannyml/performance_calculation/metrics/binary_classification.py index 4c210c56d..e41ec382b 100644 --- a/nannyml/performance_calculation/metrics/binary_classification.py +++ b/nannyml/performance_calculation/metrics/binary_classification.py @@ -9,7 +9,7 @@ from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score from nannyml._typing import ProblemType -from nannyml.base import _clean_data, _list_missing +from nannyml.base import _remove_nans, _list_missing from nannyml.chunk import Chunk, Chunker from nannyml.exceptions import InvalidArgumentsException from nannyml.performance_calculation.metrics.base import Metric, MetricFactory @@ -93,12 +93,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred_proba], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred_proba)) y_true = data[self.y_true] y_pred = data[self.y_pred_proba] - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.nunique() <= 1: warnings.warn("Calculated ROC-AUC score contains NaN values.") return np.nan @@ -162,12 +161,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated F1-score contains NaN values.") return np.nan @@ -230,12 +228,11 @@
def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) - if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Precision score contains NaN values.") return np.nan @@ -298,12 +295,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) - if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Recall score contains NaN values.") return np.nan @@ -366,6 +362,7 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] @@ -375,8 +372,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) - if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Specificity score contains NaN values.") return np.nan @@ -440,6 +435,7 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] @@ -449,8 +445,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric '{self.display_name}': " "prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) - if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Accuracy score contains NaN values.") return np.nan @@ -547,6 +541,7 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] @@ -556,7 +551,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric '{self.name}': " "prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if y_true is None: warnings.warn("Calculated Business Value contains NaN values.") return np.NaN @@ -743,6 +737,7 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate_true_positives(self, data: pd.DataFrame) -> float: _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] @@ -752,7 +747,6 @@ def _calculate_true_positives(self, data: pd.DataFrame) -> float: "could not calculate metric true_positive. 
prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated true_positives contain NaN values.") return np.nan @@ -772,6 +766,7 @@ def _calculate_true_positives(self, data: pd.DataFrame) -> float: def _calculate_true_negatives(self, data: pd.DataFrame) -> float: _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] @@ -781,7 +776,6 @@ def _calculate_true_negatives(self, data: pd.DataFrame) -> float: "could not calculate metric true_negative. prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated true_negatives contain NaN values.") return np.nan @@ -801,6 +795,7 @@ def _calculate_true_negatives(self, data: pd.DataFrame) -> float: def _calculate_false_positives(self, data: pd.DataFrame) -> float: _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] @@ -809,8 +804,6 @@ def _calculate_false_positives(self, data: pd.DataFrame) -> float: raise InvalidArgumentsException( "could not calculate metric false_positive. prediction column contains no data" ) - - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated false_positives contain NaN values.") return np.nan @@ -830,6 +823,7 @@ def _calculate_false_positives(self, data: pd.DataFrame) -> float: def _calculate_false_negatives(self, data: pd.DataFrame) -> float: _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] @@ -839,7 +833,6 @@ def _calculate_false_negatives(self, data: pd.DataFrame) -> float: "could not calculate metric false_negative. 
prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated false_negatives contain NaN values.") return np.nan diff --git a/nannyml/performance_calculation/metrics/multiclass_classification.py b/nannyml/performance_calculation/metrics/multiclass_classification.py index 928eb2b46..3fb34fe55 100644 --- a/nannyml/performance_calculation/metrics/multiclass_classification.py +++ b/nannyml/performance_calculation/metrics/multiclass_classification.py @@ -24,7 +24,7 @@ from sklearn.preprocessing import LabelBinarizer, label_binarize from nannyml._typing import ProblemType, class_labels, model_output_column_names -from nannyml.base import _clean_data, _list_missing +from nannyml.base import _remove_nans, _list_missing from nannyml.chunk import Chunker from nannyml.exceptions import InvalidArgumentsException from nannyml.performance_calculation.metrics.base import Metric, MetricFactory @@ -116,6 +116,7 @@ def _calculate(self, data: pd.DataFrame): ) _list_missing([self.y_true] + model_output_column_names(self.y_pred_proba), data) + data = _remove_nans(data, (self.y_true, self.y_pred_proba.values())) labels, class_probability_columns = [], [] for label in sorted(list(self.y_pred_proba.keys())): @@ -123,19 +124,18 @@ def _calculate(self, data: pd.DataFrame): class_probability_columns.append(self.y_pred_proba[label]) y_true = data[self.y_true] - y_pred = data[class_probability_columns] + y_pred_proba = data[class_probability_columns] - if y_pred.isna().all().any(): + if y_pred_proba.isna().all().any(): raise InvalidArgumentsException( f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.nunique() <= 1: warnings.warn("Calculated ROC-AUC score contains NaN values.") return np.nan else: - return roc_auc_score(y_true, y_pred, multi_class='ovr', average='macro', labels=labels) + return roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='macro', labels=labels) def _sampling_error(self, data: pd.DataFrame) -> float: return auroc_sampling_error(self._sampling_error_components, data) @@ -208,6 +208,7 @@ def _calculate(self, data: pd.DataFrame): ) _list_missing([self.y_true, self.y_pred], data) + data = _remove_nans(data, (self.y_true, self.y_pred)) labels = sorted(list(self.y_pred_proba.keys())) y_true = data[self.y_true] @@ -218,7 +219,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated F1-score contains NaN values.") return np.nan @@ -296,6 +296,7 @@ def _calculate(self, data: pd.DataFrame): ) _list_missing([self.y_true, self.y_pred], data) + data = _remove_nans(data, (self.y_true, self.y_pred)) labels = sorted(list(self.y_pred_proba.keys())) y_true = data[self.y_true] @@ -306,7 +307,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Precision score contains NaN values.") return np.nan @@ -384,6 +384,7 @@ def _calculate(self, data: pd.DataFrame): ) _list_missing([self.y_true, self.y_pred], data) + data = _remove_nans(data, (self.y_true, self.y_pred)) labels = sorted(list(self.y_pred_proba.keys())) 
y_true = data[self.y_true] @@ -394,7 +395,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Recall score contains NaN values.") return np.nan @@ -472,6 +472,7 @@ def _calculate(self, data: pd.DataFrame): ) _list_missing([self.y_true, self.y_pred], data) + data = _remove_nans(data, (self.y_true, self.y_pred)) labels = sorted(list(self.y_pred_proba.keys())) y_true = data[self.y_true] @@ -482,7 +483,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Specificity score contains NaN values.") return np.nan @@ -557,6 +557,7 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], data) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] @@ -566,7 +567,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric '{self.display_name}': " "prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Accuracy score contains NaN values.") return np.nan diff --git a/nannyml/performance_calculation/metrics/regression.py b/nannyml/performance_calculation/metrics/regression.py index 8b0c65399..31e9ade27 100644 --- a/nannyml/performance_calculation/metrics/regression.py +++ b/nannyml/performance_calculation/metrics/regression.py @@ -13,7 +13,7 @@ ) from nannyml._typing import ProblemType -from nannyml.base import _clean_data, _list_missing, _raise_exception_for_negative_values +from nannyml.base import _remove_nans, _list_missing, _raise_exception_for_negative_values from nannyml.performance_calculation.metrics.base import Metric, MetricFactory from nannyml.sampling_error.regression import ( mae_sampling_error, @@ -76,11 +76,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -134,11 +134,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -192,11 +192,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -250,11 +250,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, 
data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -313,11 +313,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -371,11 +371,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan From de1677227dce6c74671f31b9882f860843b0c837 Mon Sep 17 00:00:00 2001 From: Michael Van de Steene Date: Sun, 19 Nov 2023 23:14:23 +0100 Subject: [PATCH 5/7] Remove errors and use warning behaviour instead The performance calculator for binary classification had checks in place to raise an exception if the prediction column contains nothing but `NaN`'s. This behaviour contradicts the warning functionality in those same functions, which should return `NaN` and issue a warning instead. It is also inconsistent with the other calculators, which issue a warning rather than raising an error. This commit removes the errors and relies on the existing warning functionality.
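For illustration (a sketch with hypothetical data, assuming a fitted binary classification metric), a
chunk whose prediction column holds nothing but `NaN`'s now degrades gracefully instead of raising:

    data = pd.DataFrame({'y_true': [0, 1, 1], 'y_pred': [np.nan] * 3})
    metric._calculate(data)
    # before: raised InvalidArgumentsException("could not calculate metric ...")
    # after:  issues a warning such as "Calculated ... contains NaN values." and returns np.nan,
    #         since _remove_nans drops every row whose prediction is NaN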
--- .../metrics/binary_classification.py | 37 ------------------- 1 file changed, 37 deletions(-) diff --git a/nannyml/performance_calculation/metrics/binary_classification.py b/nannyml/performance_calculation/metrics/binary_classification.py index e41ec382b..f39ae5392 100644 --- a/nannyml/performance_calculation/metrics/binary_classification.py +++ b/nannyml/performance_calculation/metrics/binary_classification.py @@ -367,11 +367,6 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - f"could not calculate metric {self.display_name}: " "prediction column contains no data" - ) - if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Specificity score contains NaN values.") return np.nan @@ -440,11 +435,6 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - f"could not calculate metric '{self.display_name}': " "prediction column contains no data" - ) - if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Accuracy score contains NaN values.") return np.nan @@ -546,14 +536,6 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - f"could not calculate metric '{self.name}': " "prediction column contains no data" - ) - - if y_true is None: - warnings.warn("Calculated Business Value contains NaN values.") - return np.NaN if y_true.shape[0] == 0: warnings.warn("Calculated Business Value contains NaN values.") return np.NaN @@ -742,11 +724,6 @@ def _calculate_true_positives(self, data: pd.DataFrame) -> float: y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - "could not calculate metric true_positive. prediction column contains no data" - ) - if y_true.empty or y_pred.empty: warnings.warn("Calculated true_positives contain NaN values.") return np.nan @@ -771,11 +748,6 @@ def _calculate_true_negatives(self, data: pd.DataFrame) -> float: y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - "could not calculate metric true_negative. prediction column contains no data" - ) - if y_true.empty or y_pred.empty: warnings.warn("Calculated true_negatives contain NaN values.") return np.nan @@ -800,10 +772,6 @@ def _calculate_false_positives(self, data: pd.DataFrame) -> float: y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - "could not calculate metric false_positive. prediction column contains no data" - ) if y_true.empty or y_pred.empty: warnings.warn("Calculated false_positives contain NaN values.") return np.nan @@ -828,11 +796,6 @@ def _calculate_false_negatives(self, data: pd.DataFrame) -> float: y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - "could not calculate metric false_negative. 
prediction column contains no data" - ) - if y_true.empty or y_pred.empty: warnings.warn("Calculated false_negatives contain NaN values.") return np.nan From 71314191d26750ccad6e54a978360499b58f9a15 Mon Sep 17 00:00:00 2001 From: Michael Van de Steene Date: Mon, 20 Nov 2023 00:03:16 +0100 Subject: [PATCH 6/7] Refactor more data cleaning methods --- .../confidence_based/metrics.py | 28 ++++--------------- .../direct_loss_estimation/metrics.py | 16 ++++------- 2 files changed, 12 insertions(+), 32 deletions(-) diff --git a/nannyml/performance_estimation/confidence_based/metrics.py b/nannyml/performance_estimation/confidence_based/metrics.py index 4899d8249..08e8a242f 100644 --- a/nannyml/performance_estimation/confidence_based/metrics.py +++ b/nannyml/performance_estimation/confidence_based/metrics.py @@ -31,6 +31,7 @@ import nannyml.sampling_error.binary_classification as bse import nannyml.sampling_error.multiclass_classification as mse from nannyml._typing import ModelOutputsType, ProblemType, class_labels +from nannyml.base import _remove_nans from nannyml.chunk import Chunk, Chunker from nannyml.exceptions import CalculatorException, InvalidArgumentsException from nannyml.performance_estimation.confidence_based import SUPPORTED_METRIC_VALUES @@ -234,30 +235,13 @@ def _common_cleaning( ) y_pred_proba_column_name = self.y_pred_proba - clean_targets = self.y_true in data.columns and not data[self.y_true].isna().all() - - y_pred_proba = data[y_pred_proba_column_name] - y_pred = data[self.y_pred] - y_true = data[self.y_true] if clean_targets else None - - # Create mask to filter out NaN values - mask = ~(y_pred.isna() | y_pred_proba.isna()) - if clean_targets: - mask = mask | ~(y_true.isna()) + data = _remove_nans(data, [self.y_pred, y_pred_proba_column_name]) - # Drop missing values (NaN/None) - y_pred_proba = y_pred_proba[mask] - y_pred = y_pred[mask] - if clean_targets: - y_true = y_true[mask] - - # NaN values have been dropped. Try to infer types again - y_pred_proba = y_pred_proba.infer_objects() - y_pred = y_pred.infer_objects() + clean_targets = self.y_true in data.columns and not data[self.y_true].isna().all() if clean_targets: - y_true = y_true.infer_objects() + data = _remove_nans(data, [self.y_true]) - return y_pred_proba, y_pred, y_true + return data[y_pred_proba_column_name], data[self.y_pred], (data[self.y_true] if clean_targets else None) def get_chunk_record(self, chunk_data: pd.DataFrame) -> Dict: """Returns a dictionary containing the performance metrics for a given chunk. 
@@ -1584,7 +1568,7 @@ def _realized_performance(self, data: pd.DataFrame) -> float: if y_true.shape[0] == 0: warnings.warn("Calculated Business Value contains NaN values.") return np.NaN - + tp_value = self.business_value_matrix[1, 1] tn_value = self.business_value_matrix[0, 0] fp_value = self.business_value_matrix[0, 1] diff --git a/nannyml/performance_estimation/direct_loss_estimation/metrics.py b/nannyml/performance_estimation/direct_loss_estimation/metrics.py index 2daf6b99c..03e35a654 100644 --- a/nannyml/performance_estimation/direct_loss_estimation/metrics.py +++ b/nannyml/performance_estimation/direct_loss_estimation/metrics.py @@ -29,7 +29,7 @@ ) from nannyml._typing import ProblemType -from nannyml.base import _raise_exception_for_negative_values +from nannyml.base import _raise_exception_for_negative_values, _remove_nans from nannyml.chunk import Chunk, Chunker from nannyml.exceptions import InvalidArgumentsException from nannyml.sampling_error.regression import ( @@ -271,18 +271,14 @@ def __eq__(self, other): """Establishes equality by comparing all properties.""" return self.display_name == other.display_name and self.column_name == other.column_name - def _common_cleaning(self, data: pd.DataFrame) -> Tuple[pd.Series, pd.Series]: - clean_targets = self.y_true in data.columns and not data[self.y_true].isna().all() + def _common_cleaning(self, data: pd.DataFrame) -> Tuple[pd.Series, Optional[pd.Series]]: + data = _remove_nans(data, [self.y_pred]) - y_pred = data[self.y_pred] + clean_targets = self.y_true in data.columns and not data[self.y_true].isna().all() if clean_targets: - y_true = data[self.y_true] - y_pred = y_pred[~y_true.isna()] - y_true.dropna(inplace=True) - else: - y_true = None + data = _remove_nans(data, [self.y_pred, self.y_true]) - return y_pred, y_true + return data[self.y_pred], (data[self.y_true] if clean_targets else None) def _train_direct_error_estimation_model( self, From 59ba73ce8e26e322e349a451612191787da4d684 Mon Sep 17 00:00:00 2001 From: Niels Nuyttens Date: Mon, 20 Nov 2023 11:57:53 +0100 Subject: [PATCH 7/7] Deal with mypy overload issue --- nannyml/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nannyml/base.py b/nannyml/base.py index a7958ce3f..953354dc9 100644 --- a/nannyml/base.py +++ b/nannyml/base.py @@ -534,7 +534,7 @@ def _column_is_categorical(column: pd.Series) -> bool: @overload -def _remove_nans(data: pd.Series, columns: None) -> pd.Series: +def _remove_nans(data: pd.Series) -> pd.Series: ... @overload