Fix handling NaN values when fitting JS univariate drift (#340)
* Add column & method for univariate fitting errors

* Refactor to use single data cleaning method

* Filter NaNs when fitting JS

* Refactor data cleaning to accept columns argument

Previously the data cleaning method accepted multiple dataframes and
inspected each dataframe separately for `NaN`s. Depending on how the
data is processed after cleaning, splitting columns into separate
dataframes can be rather inconvenient.

To avoid that, this commit changes the method to accept a single
dataframe and a columns argument. The columns argument specifies which
column subsets should be inspected for `NaN`s, enabling the same
behaviour with a more convenient syntax (see the sketch below).
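
For illustration, a minimal sketch of the new calling convention, assuming a toy dataframe (the column names and data below are hypothetical; `_remove_nans` is the internal helper added in `nannyml/base.py`):

    import numpy as np
    import pandas as pd

    from nannyml.base import _remove_nans

    df = pd.DataFrame(
        {
            'y_true': [0, 1, np.nan, 1],
            'prob_a': [0.1, np.nan, 0.3, 0.4],
            'prob_b': [0.9, np.nan, 0.7, 0.6],
        }
    )

    # Series overload: drop NaN rows from a single column.
    clean_series = _remove_nans(df['y_true'])

    # DataFrame with a columns argument: drop rows where 'y_true' is NaN,
    # or where *all* columns in the ['prob_a', 'prob_b'] subset are NaN.
    clean_df = _remove_nans(df, columns=['y_true', ['prob_a', 'prob_b']])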

* Remove errors and use warning behaviour instead

The performance calculator for binary classification had checks in place
that raised an exception if the prediction column contained nothing but
`NaN`s. This behaviour contradicts the warning functionality in the same
functions, which should return `NaN` and issue a warning instead. It is
also inconsistent with other calculators, which issue a warning rather
than raising an error.

This commit removes the errors and relies on the existing warning
functionality.
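
As a rough illustration of the warning behaviour being relied on, a simplified sketch (not the actual NannyML implementation; the function name and message are made up):

    import warnings

    import numpy as np
    import pandas as pd


    def example_metric(y_true: pd.Series, y_pred: pd.Series) -> float:
        # Sketch: when the prediction column holds no usable values,
        # warn and return NaN instead of raising an exception.
        if y_pred.isna().all():
            warnings.warn("'y_pred' contains no data, returning NaN instead of calculating the metric")
            return np.nan
        # Stand-in for a real metric calculation.
        return float((y_true == y_pred).mean())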

* Refactor more data cleaning methods

* Deal with mypy overload issue
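
For context, this concerns the `typing.overload` declarations on `_remove_nans` shown in the `nannyml/base.py` diff below; a stripped-down sketch of the pattern (illustrative only, implementation elided):

    from typing import Iterable, Optional, Union, overload

    import pandas as pd


    @overload
    def _remove_nans(data: pd.Series) -> pd.Series:
        ...


    @overload
    def _remove_nans(data: pd.DataFrame, columns: Optional[Iterable[Union[str, Iterable[str]]]]) -> pd.DataFrame:
        ...


    def _remove_nans(data, columns=None):
        # Single runtime implementation; the overloads above only tell mypy
        # that a Series input yields a Series and a DataFrame yields a DataFrame.
        ...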

---------

Co-authored-by: Niels Nuyttens <niels@nannyml.com>
michael-nml and nnansters authored Nov 20, 2023
1 parent 51105c7 commit 93ac6e7
Showing 9 changed files with 126 additions and 174 deletions.
38 changes: 32 additions & 6 deletions nannyml/base.py
@@ -8,7 +8,7 @@
 import copy
 import logging
 from abc import ABC, abstractmethod
-from typing import Generic, List, Optional, Tuple, TypeVar, Union
+from typing import Generic, Iterable, List, Optional, Tuple, TypeVar, Union, overload
 
 import numpy as np
 import pandas as pd
@@ -533,12 +533,38 @@ def _column_is_categorical(column: pd.Series) -> bool:
     return column.dtype in ['object', 'string', 'category', 'bool']
 
 
-def _remove_missing_data(column: pd.Series):
-    if isinstance(column, pd.Series):
-        column = column.dropna().reset_index(drop=True)
+@overload
+def _remove_nans(data: pd.Series) -> pd.Series:
+    ...
+
+@overload
+def _remove_nans(data: pd.DataFrame, columns: Optional[Iterable[Union[str, Iterable[str]]]]) -> pd.DataFrame:
+    ...
+
+
+def _remove_nans(
+    data: Union[pd.Series, pd.DataFrame], columns: Optional[Iterable[Union[str, Iterable[str]]]] = None
+) -> Tuple[pd.DataFrame, ...]:
+    """Remove rows with NaN values in the specified columns.
+
+    If no columns are given, drop rows with NaN values in any column. If columns are given, drop rows with NaN values
+    in the specified columns. If a set of columns is given, drop rows with NaN values in all of the columns in the set.
+    """
+    # If no columns are given, drop rows with NaN values in any columns
+    if columns is None:
+        mask = ~data.isna()
+        if isinstance(mask, pd.DataFrame):
+            mask = mask.all(axis=1)
     else:
-        column = column[~np.isnan(column)]
-    return column
+        mask = np.ones(len(data), dtype=bool)
+        for column_selector in columns:
+            nans = data[column_selector].isna()
+            if isinstance(nans, pd.DataFrame):
+                nans = nans.all(axis=1)
+            mask &= ~nans
+
+    # NaN values have been dropped. Try to infer types again
+    return data[mask].reset_index(drop=True).infer_objects()
 
 
 def _column_is_continuous(column: pd.Series) -> bool:
63 changes: 37 additions & 26 deletions nannyml/drift/univariate/calculator.py
@@ -39,7 +39,7 @@
 from nannyml.chunk import Chunker
 from nannyml.drift.univariate.methods import FeatureType, Method, MethodFactory
 from nannyml.drift.univariate.result import Result
-from nannyml.exceptions import InvalidArgumentsException
+from nannyml.exceptions import CalculatorException, InvalidArgumentsException
 from nannyml.thresholds import ConstantThreshold, StandardDeviationThreshold, Threshold
 from nannyml.usage_logging import UsageEvent, log_usage
 
@@ -271,34 +271,45 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs) -> UnivariateDrift
             if column_name not in self.categorical_column_names:
                 self.categorical_column_names.append(column_name)
+        timestamps = reference_data[self.timestamp_column_name] if self.timestamp_column_name else None
         for column_name in self.continuous_column_names:
-            self._column_to_models_mapping[column_name] += [
-                MethodFactory.create(
-                    key=method,
-                    feature_type=FeatureType.CONTINUOUS,
-                    chunker=self.chunker,
-                    computation_params=self.computation_params or {},
-                    threshold=self.thresholds[method],
-                ).fit(
-                    reference_data=reference_data[column_name],
-                    timestamps=reference_data[self.timestamp_column_name] if self.timestamp_column_name else None,
-                )
-                for method in self.continuous_method_names
-            ]
+            methods = []
+            for method in self.continuous_method_names:
+                try:
+                    methods.append(
+                        MethodFactory.create(
+                            key=method,
+                            feature_type=FeatureType.CONTINUOUS,
+                            chunker=self.chunker,
+                            computation_params=self.computation_params or {},
+                            threshold=self.thresholds[method],
+                        ).fit(
+                            reference_data=reference_data[column_name],
+                            timestamps=timestamps,
+                        )
+                    )
+                except Exception as ex:
+                    raise CalculatorException(f"Failed to fit method {method} for column {column_name}: {ex!r}") from ex
+            self._column_to_models_mapping[column_name] = methods
 
         for column_name in self.categorical_column_names:
-            self._column_to_models_mapping[column_name] += [
-                MethodFactory.create(
-                    key=method,
-                    feature_type=FeatureType.CATEGORICAL,
-                    chunker=self.chunker,
-                    threshold=self.thresholds[method],
-                ).fit(
-                    reference_data=reference_data[column_name],
-                    timestamps=reference_data[self.timestamp_column_name] if self.timestamp_column_name else None,
-                )
-                for method in self.categorical_method_names
-            ]
+            methods = []
+            for method in self.categorical_method_names:
+                try:
+                    methods.append(
+                        MethodFactory.create(
+                            key=method,
+                            feature_type=FeatureType.CATEGORICAL,
+                            chunker=self.chunker,
+                            threshold=self.thresholds[method],
+                        ).fit(
+                            reference_data=reference_data[column_name],
+                            timestamps=timestamps,
+                        )
+                    )
+                except Exception as ex:
+                    raise CalculatorException(f"Failed to fit method {method} for column {column_name}: {ex!r}") from ex
+            self._column_to_models_mapping[column_name] = methods
 
         self.result = self._calculate(reference_data)
         self.result.data['chunk', 'chunk', 'period'] = 'reference'
25 changes: 13 additions & 12 deletions nannyml/drift/univariate/methods.py
@@ -29,7 +29,7 @@
 from scipy.stats import chi2_contingency, ks_2samp, wasserstein_distance
 
 from nannyml._typing import Self
-from nannyml.base import _column_is_categorical, _remove_missing_data
+from nannyml.base import _remove_nans, _column_is_categorical
 from nannyml.chunk import Chunker
 from nannyml.exceptions import InvalidArgumentsException, NotFittedException
 from nannyml.thresholds import Threshold, calculate_threshold_values
@@ -278,6 +278,7 @@ def __init__(self, **kwargs) -> None:
         self._reference_proba_in_bins: np.ndarray
 
     def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None):
+        reference_data = _remove_nans(reference_data)
         if _column_is_categorical(reference_data):
             treat_as_type = 'cat'
         else:
@@ -305,7 +306,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None
 
     def _calculate(self, data: pd.Series):
         reference_proba_in_bins = copy(self._reference_proba_in_bins)
-        data = _remove_missing_data(data)
+        data = _remove_nans(data)
         if data.empty:
             return np.nan
         if self._treat_as_type == 'cont':
@@ -374,7 +375,7 @@ def __init__(self, **kwargs) -> None:
         self.n_bins = kwargs['computation_params'].get('n_bins', 10_000)
 
     def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
-        reference_data = _remove_missing_data(reference_data)
+        reference_data = _remove_nans(reference_data)
         if (self.calculation_method == 'auto' and len(reference_data) < 10_000) or self.calculation_method == 'exact':
             self._reference_data = reference_data
         else:
@@ -389,7 +390,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None
         return self
 
     def _calculate(self, data: pd.Series):
-        data = _remove_missing_data(data)
+        data = _remove_nans(data)
         if data.empty:
             return np.nan
         if not self._fitted:
@@ -443,13 +444,13 @@ def __init__(self, **kwargs) -> None:
         self._fitted = False
 
     def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
-        reference_data = _remove_missing_data(reference_data)
+        reference_data = _remove_nans(reference_data)
         self._reference_data_vcs = reference_data.value_counts().loc[lambda v: v != 0]
         self._fitted = True
         return self
 
     def _calculate(self, data: pd.Series):
-        data = _remove_missing_data(data)
+        data = _remove_nans(data)
         if data.empty:
             return np.nan
         if not self._fitted:
@@ -505,7 +506,7 @@ def __init__(self, **kwargs) -> None:
         self._reference_proba: Optional[dict] = None
 
     def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
-        reference_data = _remove_missing_data(reference_data)
+        reference_data = _remove_nans(reference_data)
         ref_labels = reference_data.unique()
         self._reference_proba = {label: (reference_data == label).sum() / len(reference_data) for label in ref_labels}
 
@@ -516,7 +517,7 @@ def _calculate(self, data: pd.Series):
             raise NotFittedException(
                 "tried to call 'calculate' on an unfitted method " f"{self.display_name}. Please run 'fit' first"
             )
-        data = _remove_missing_data(data)
+        data = _remove_nans(data)
         if data.empty:
             return np.nan
         data_labels = data.unique()
@@ -574,7 +575,7 @@ def __init__(self, **kwargs) -> None:
         self.n_bins = kwargs['computation_params'].get('n_bins', 10_000)
 
     def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
-        reference_data = _remove_missing_data(reference_data)
+        reference_data = _remove_nans(reference_data)
         if (self.calculation_method == 'auto' and len(reference_data) < 10_000) or self.calculation_method == 'exact':
             self._reference_data = reference_data
         else:
@@ -592,7 +593,7 @@ def _calculate(self, data: pd.Series):
             raise NotFittedException(
                 "tried to call 'calculate' on an unfitted method " f"{self.display_name}. Please run 'fit' first"
             )
-        data = _remove_missing_data(data)
+        data = _remove_nans(data)
         if data.empty:
             return np.nan
         if (
@@ -668,7 +669,7 @@ def __init__(self, **kwargs) -> None:
         self._reference_proba_in_bins: np.ndarray
 
     def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
-        reference_data = _remove_missing_data(reference_data)
+        reference_data = _remove_nans(reference_data)
         if _column_is_categorical(reference_data):
             treat_as_type = 'cat'
         else:
@@ -695,7 +696,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None
         return self
 
     def _calculate(self, data: pd.Series):
-        data = _remove_missing_data(data)
+        data = _remove_nans(data)
         if data.empty:
             return np.nan
         reference_proba_in_bins = copy(self._reference_proba_in_bins)
22 changes: 0 additions & 22 deletions nannyml/performance_calculation/metrics/base.py
@@ -255,25 +255,3 @@ def inner_wrapper(wrapped_class: Type[Metric]) -> Type[Metric]:
         return wrapped_class
 
     return inner_wrapper
-
-
-def _common_data_cleaning(y_true: pd.Series, y_pred: Union[pd.Series, pd.DataFrame]):
-    y_true, y_pred = (
-        y_true.reset_index(drop=True),
-        y_pred.reset_index(drop=True),
-    )
-
-    if isinstance(y_pred, pd.DataFrame):
-        y_true = y_true[~y_pred.isna().all(axis=1)]
-    else:
-        y_true = y_true[~y_pred.isna()]
-    y_pred.dropna(inplace=True)
-
-    y_pred = y_pred[~y_true.isna()]
-    y_true.dropna(inplace=True)
-
-    # NaN values have been dropped. Try to infer types again
-    y_pred = y_pred.infer_objects()
-    y_true = y_true.infer_objects()
-
-    return y_true, y_pred
