Fix handling NaN values when fitting JS univariate drift (#340)
* Add column & method for univariate fitting errors

* Refactor to use single data cleaning method

* Filter NaNs when fitting JS

* Refactor data cleaning to accept columns argument

Previously the data cleaning method accepted multiple dataframes and
inspected each dataframe separately for `NaN`s. Depending on how the
data is processed after cleaning, splitting columns into separate
dataframes can be rather inconvenient.

To avoid that, this commit changes the method to accept a single
dataframe and a columns argument. The columns argument specifies which
column subsets should be inspected for `NaN`s, enabling the same
behaviour with a more convenient syntax (see the sketch below).
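
For illustration, a minimal sketch of the new calling convention, assuming a toy dataframe (the column names and data below are hypothetical; `_remove_nans` is the internal helper added in `nannyml/base.py`):

    import numpy as np
    import pandas as pd

    from nannyml.base import _remove_nans

    df = pd.DataFrame(
        {
            'y_true': [0, 1, np.nan, 1],
            'prob_a': [0.1, np.nan, 0.3, 0.4],
            'prob_b': [0.9, np.nan, 0.7, 0.6],
        }
    )

    # Series overload: drop NaN rows from a single column.
    clean_series = _remove_nans(df['y_true'])

    # DataFrame with a columns argument: drop rows where 'y_true' is NaN,
    # or where *all* columns in the ['prob_a', 'prob_b'] subset are NaN.
    clean_df = _remove_nans(df, columns=['y_true', ['prob_a', 'prob_b']])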

* Remove errors and use warning behaviour instead

The performance calculator for binary classification had checks in place
that raised an exception if the prediction column contained nothing but
`NaN`s. This behaviour contradicts the warning functionality in the same
functions, which should return `NaN` and issue a warning instead. It is
also inconsistent with other calculators, which issue a warning rather
than raising an error.

This commit removes the errors and relies on the existing warning
functionality.
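
As a rough illustration of the warning behaviour being relied on, a simplified sketch (not the actual NannyML implementation; the function name and message are made up):

    import warnings

    import numpy as np
    import pandas as pd


    def example_metric(y_true: pd.Series, y_pred: pd.Series) -> float:
        # Sketch: when the prediction column holds no usable values,
        # warn and return NaN instead of raising an exception.
        if y_pred.isna().all():
            warnings.warn("'y_pred' contains no data, returning NaN instead of calculating the metric")
            return np.nan
        # Stand-in for a real metric calculation.
        return float((y_true == y_pred).mean())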

* Refactor more data cleaning methods

* Deal with mypy overload issue
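
For context, this concerns the `typing.overload` declarations on `_remove_nans` shown in the `nannyml/base.py` diff below; a stripped-down sketch of the pattern (illustrative only, implementation elided):

    from typing import Iterable, Optional, Union, overload

    import pandas as pd


    @overload
    def _remove_nans(data: pd.Series) -> pd.Series:
        ...


    @overload
    def _remove_nans(data: pd.DataFrame, columns: Optional[Iterable[Union[str, Iterable[str]]]]) -> pd.DataFrame:
        ...


    def _remove_nans(data, columns=None):
        # Single runtime implementation; the overloads above only tell mypy
        # that a Series input yields a Series and a DataFrame yields a DataFrame.
        ...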

---------

Co-authored-by: Niels Nuyttens <niels@nannyml.com>
michael-nml and nnansters authored Nov 20, 2023
1 parent 51105c7 commit 93ac6e7
Showing 9 changed files with 126 additions and 174 deletions.
38 changes: 32 additions & 6 deletions nannyml/base.py
@@ -8,7 +8,7 @@
 import copy
 import logging
 from abc import ABC, abstractmethod
-from typing import Generic, List, Optional, Tuple, TypeVar, Union
+from typing import Generic, Iterable, List, Optional, Tuple, TypeVar, Union, overload
 
 import numpy as np
 import pandas as pd
@@ -533,12 +533,38 @@ def _column_is_categorical(column: pd.Series) -> bool:
     return column.dtype in ['object', 'string', 'category', 'bool']
 
 
-def _remove_missing_data(column: pd.Series):
-    if isinstance(column, pd.Series):
-        column = column.dropna().reset_index(drop=True)
+@overload
+def _remove_nans(data: pd.Series) -> pd.Series:
+    ...
+
+@overload
+def _remove_nans(data: pd.DataFrame, columns: Optional[Iterable[Union[str, Iterable[str]]]]) -> pd.DataFrame:
+    ...
+
+
+def _remove_nans(
+    data: Union[pd.Series, pd.DataFrame], columns: Optional[Iterable[Union[str, Iterable[str]]]] = None
+) -> Tuple[pd.DataFrame, ...]:
+    """Remove rows with NaN values in the specified columns.
+
+    If no columns are given, drop rows with NaN values in any column. If columns are given, drop rows with NaN values
+    in the specified columns. If a set of columns is given, drop rows with NaN values in all of the columns in the set.
+    """
+    # If no columns are given, drop rows with NaN values in any columns
+    if columns is None:
+        mask = ~data.isna()
+        if isinstance(mask, pd.DataFrame):
+            mask = mask.all(axis=1)
     else:
-        column = column[~np.isnan(column)]
-    return column
+        mask = np.ones(len(data), dtype=bool)
+        for column_selector in columns:
+            nans = data[column_selector].isna()
+            if isinstance(nans, pd.DataFrame):
+                nans = nans.all(axis=1)
+            mask &= ~nans
+
+    # NaN values have been dropped. Try to infer types again
+    return data[mask].reset_index(drop=True).infer_objects()
 
 
 def _column_is_continuous(column: pd.Series) -> bool:
63 changes: 37 additions & 26 deletions nannyml/drift/univariate/calculator.py
@@ -39,7 +39,7 @@
 from nannyml.chunk import Chunker
 from nannyml.drift.univariate.methods import FeatureType, Method, MethodFactory
 from nannyml.drift.univariate.result import Result
-from nannyml.exceptions import InvalidArgumentsException
+from nannyml.exceptions import CalculatorException, InvalidArgumentsException
 from nannyml.thresholds import ConstantThreshold, StandardDeviationThreshold, Threshold
 from nannyml.usage_logging import UsageEvent, log_usage
 
@@ -271,34 +271,45 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs) -> UnivariateDrift
             if column_name not in self.categorical_column_names:
                 self.categorical_column_names.append(column_name)
+        timestamps = reference_data[self.timestamp_column_name] if self.timestamp_column_name else None
         for column_name in self.continuous_column_names:
-            self._column_to_models_mapping[column_name] += [
-                MethodFactory.create(
-                    key=method,
-                    feature_type=FeatureType.CONTINUOUS,
-                    chunker=self.chunker,
-                    computation_params=self.computation_params or {},
-                    threshold=self.thresholds[method],
-                ).fit(
-                    reference_data=reference_data[column_name],
-                    timestamps=reference_data[self.timestamp_column_name] if self.timestamp_column_name else None,
-                )
-                for method in self.continuous_method_names
-            ]
+            methods = []
+            for method in self.continuous_method_names:
+                try:
+                    methods.append(
+                        MethodFactory.create(
+                            key=method,
+                            feature_type=FeatureType.CONTINUOUS,
+                            chunker=self.chunker,
+                            computation_params=self.computation_params or {},
+                            threshold=self.thresholds[method],
+                        ).fit(
+                            reference_data=reference_data[column_name],
+                            timestamps=timestamps,
+                        )
+                    )
+                except Exception as ex:
+                    raise CalculatorException(f"Failed to fit method {method} for column {column_name}: {ex!r}") from ex
+            self._column_to_models_mapping[column_name] = methods
 
         for column_name in self.categorical_column_names:
-            self._column_to_models_mapping[column_name] += [
-                MethodFactory.create(
-                    key=method,
-                    feature_type=FeatureType.CATEGORICAL,
-                    chunker=self.chunker,
-                    threshold=self.thresholds[method],
-                ).fit(
-                    reference_data=reference_data[column_name],
-                    timestamps=reference_data[self.timestamp_column_name] if self.timestamp_column_name else None,
-                )
-                for method in self.categorical_method_names
-            ]
+            methods = []
+            for method in self.categorical_method_names:
+                try:
+                    methods.append(
+                        MethodFactory.create(
+                            key=method,
+                            feature_type=FeatureType.CATEGORICAL,
+                            chunker=self.chunker,
+                            threshold=self.thresholds[method],
+                        ).fit(
+                            reference_data=reference_data[column_name],
+                            timestamps=timestamps,
+                        )
+                    )
+                except Exception as ex:
+                    raise CalculatorException(f"Failed to fit method {method} for column {column_name}: {ex!r}") from ex
+            self._column_to_models_mapping[column_name] = methods
 
         self.result = self._calculate(reference_data)
         self.result.data['chunk', 'chunk', 'period'] = 'reference'
25 changes: 13 additions & 12 deletions nannyml/drift/univariate/methods.py
@@ -29,7 +29,7 @@
 from scipy.stats import chi2_contingency, ks_2samp, wasserstein_distance
 
 from nannyml._typing import Self
-from nannyml.base import _column_is_categorical, _remove_missing_data
+from nannyml.base import _remove_nans, _column_is_categorical
 from nannyml.chunk import Chunker
 from nannyml.exceptions import InvalidArgumentsException, NotFittedException
 from nannyml.thresholds import Threshold, calculate_threshold_values
@@ -278,6 +278,7 @@ def __init__(self, **kwargs) -> None:
         self._reference_proba_in_bins: np.ndarray
 
     def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None):
+        reference_data = _remove_nans(reference_data)
         if _column_is_categorical(reference_data):
             treat_as_type = 'cat'
         else:
@@ -305,7 +306,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None
 
     def _calculate(self, data: pd.Series):
         reference_proba_in_bins = copy(self._reference_proba_in_bins)
-        data = _remove_missing_data(data)
+        data = _remove_nans(data)
         if data.empty:
             return np.nan
         if self._treat_as_type == 'cont':
@@ -374,7 +375,7 @@ def __init__(self, **kwargs) -> None:
         self.n_bins = kwargs['computation_params'].get('n_bins', 10_000)
 
     def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
-        reference_data = _remove_missing_data(reference_data)
+        reference_data = _remove_nans(reference_data)
         if (self.calculation_method == 'auto' and len(reference_data) < 10_000) or self.calculation_method == 'exact':
             self._reference_data = reference_data
         else:
@@ -389,7 +390,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None
         return self
 
     def _calculate(self, data: pd.Series):
-        data = _remove_missing_data(data)
+        data = _remove_nans(data)
         if data.empty:
             return np.nan
         if not self._fitted:
@@ -443,13 +444,13 @@ def __init__(self, **kwargs) -> None:
         self._fitted = False
 
     def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
-        reference_data = _remove_missing_data(reference_data)
+        reference_data = _remove_nans(reference_data)
         self._reference_data_vcs = reference_data.value_counts().loc[lambda v: v != 0]
         self._fitted = True
         return self
 
     def _calculate(self, data: pd.Series):
-        data = _remove_missing_data(data)
+        data = _remove_nans(data)
         if data.empty:
             return np.nan
         if not self._fitted:
@@ -505,7 +506,7 @@ def __init__(self, **kwargs) -> None:
         self._reference_proba: Optional[dict] = None
 
     def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
-        reference_data = _remove_missing_data(reference_data)
+        reference_data = _remove_nans(reference_data)
         ref_labels = reference_data.unique()
         self._reference_proba = {label: (reference_data == label).sum() / len(reference_data) for label in ref_labels}
 
@@ -516,7 +517,7 @@ def _calculate(self, data: pd.Series):
             raise NotFittedException(
                 "tried to call 'calculate' on an unfitted method " f"{self.display_name}. Please run 'fit' first"
             )
-        data = _remove_missing_data(data)
+        data = _remove_nans(data)
         if data.empty:
             return np.nan
         data_labels = data.unique()
@@ -574,7 +575,7 @@ def __init__(self, **kwargs) -> None:
         self.n_bins = kwargs['computation_params'].get('n_bins', 10_000)
 
     def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
-        reference_data = _remove_missing_data(reference_data)
+        reference_data = _remove_nans(reference_data)
         if (self.calculation_method == 'auto' and len(reference_data) < 10_000) or self.calculation_method == 'exact':
             self._reference_data = reference_data
         else:
@@ -592,7 +593,7 @@ def _calculate(self, data: pd.Series):
             raise NotFittedException(
                 "tried to call 'calculate' on an unfitted method " f"{self.display_name}. Please run 'fit' first"
             )
-        data = _remove_missing_data(data)
+        data = _remove_nans(data)
         if data.empty:
             return np.nan
         if (
@@ -668,7 +669,7 @@ def __init__(self, **kwargs) -> None:
         self._reference_proba_in_bins: np.ndarray
 
     def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
-        reference_data = _remove_missing_data(reference_data)
+        reference_data = _remove_nans(reference_data)
         if _column_is_categorical(reference_data):
             treat_as_type = 'cat'
         else:
@@ -695,7 +696,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None
         return self
 
     def _calculate(self, data: pd.Series):
-        data = _remove_missing_data(data)
+        data = _remove_nans(data)
         if data.empty:
             return np.nan
         reference_proba_in_bins = copy(self._reference_proba_in_bins)
22 changes: 0 additions & 22 deletions nannyml/performance_calculation/metrics/base.py
@@ -255,25 +255,3 @@ def inner_wrapper(wrapped_class: Type[Metric]) -> Type[Metric]:
         return wrapped_class
 
     return inner_wrapper
-
-
-def _common_data_cleaning(y_true: pd.Series, y_pred: Union[pd.Series, pd.DataFrame]):
-    y_true, y_pred = (
-        y_true.reset_index(drop=True),
-        y_pred.reset_index(drop=True),
-    )
-
-    if isinstance(y_pred, pd.DataFrame):
-        y_true = y_true[~y_pred.isna().all(axis=1)]
-    else:
-        y_true = y_true[~y_pred.isna()]
-    y_pred.dropna(inplace=True)
-
-    y_pred = y_pred[~y_true.isna()]
-    y_true.dropna(inplace=True)
-
-    # NaN values have been dropped. Try to infer types again
-    y_pred = y_pred.infer_objects()
-    y_true = y_true.infer_objects()
-
-    return y_true, y_pred
