Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix handling NaN values when fitting JS univariate drift #340

Merged
merged 7 commits into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 32 additions & 6 deletions nannyml/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import copy
import logging
from abc import ABC, abstractmethod
from typing import Generic, List, Optional, Tuple, TypeVar, Union
from typing import Generic, Iterable, List, Optional, Tuple, TypeVar, Union, overload

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -533,12 +533,38 @@
return column.dtype in ['object', 'string', 'category', 'bool']


def _remove_missing_data(column: pd.Series):
if isinstance(column, pd.Series):
column = column.dropna().reset_index(drop=True)
@overload
def _remove_nans(data: pd.Series) -> pd.Series:
...

Check warning on line 538 in nannyml/base.py

View check run for this annotation

Codecov / codecov/patch

nannyml/base.py#L538

Added line #L538 was not covered by tests

@overload
def _remove_nans(data: pd.DataFrame, columns: Optional[Iterable[Union[str, Iterable[str]]]]) -> pd.DataFrame:
...

Check warning on line 542 in nannyml/base.py

View check run for this annotation

Codecov / codecov/patch

nannyml/base.py#L542

Added line #L542 was not covered by tests


def _remove_nans(
data: Union[pd.Series, pd.DataFrame], columns: Optional[Iterable[Union[str, Iterable[str]]]] = None
) -> Tuple[pd.DataFrame, ...]:
"""Remove rows with NaN values in the specified columns.

If no columns are given, drop rows with NaN values in any column. If columns are given, drop rows with NaN values
in the specified columns. If a set of columns is given, drop rows with NaN values in all of the columns in the set.
"""
# If no columns are given, drop rows with NaN values in any columns
if columns is None:
mask = ~data.isna()
if isinstance(mask, pd.DataFrame):
mask = mask.all(axis=1)

Check warning on line 557 in nannyml/base.py

View check run for this annotation

Codecov / codecov/patch

nannyml/base.py#L557

Added line #L557 was not covered by tests
else:
column = column[~np.isnan(column)]
return column
mask = np.ones(len(data), dtype=bool)
for column_selector in columns:
nans = data[column_selector].isna()
if isinstance(nans, pd.DataFrame):
nans = nans.all(axis=1)
mask &= ~nans

# NaN values have been dropped. Try to infer types again
return data[mask].reset_index(drop=True).infer_objects()


def _column_is_continuous(column: pd.Series) -> bool:
Expand Down
63 changes: 37 additions & 26 deletions nannyml/drift/univariate/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from nannyml.chunk import Chunker
from nannyml.drift.univariate.methods import FeatureType, Method, MethodFactory
from nannyml.drift.univariate.result import Result
from nannyml.exceptions import InvalidArgumentsException
from nannyml.exceptions import CalculatorException, InvalidArgumentsException
from nannyml.thresholds import ConstantThreshold, StandardDeviationThreshold, Threshold
from nannyml.usage_logging import UsageEvent, log_usage

Expand Down Expand Up @@ -271,34 +271,45 @@
if column_name not in self.categorical_column_names:
self.categorical_column_names.append(column_name)

timestamps = reference_data[self.timestamp_column_name] if self.timestamp_column_name else None
for column_name in self.continuous_column_names:
self._column_to_models_mapping[column_name] += [
MethodFactory.create(
key=method,
feature_type=FeatureType.CONTINUOUS,
chunker=self.chunker,
computation_params=self.computation_params or {},
threshold=self.thresholds[method],
).fit(
reference_data=reference_data[column_name],
timestamps=reference_data[self.timestamp_column_name] if self.timestamp_column_name else None,
)
for method in self.continuous_method_names
]
methods = []
for method in self.continuous_method_names:
try:
methods.append(
MethodFactory.create(
key=method,
feature_type=FeatureType.CONTINUOUS,
chunker=self.chunker,
computation_params=self.computation_params or {},
threshold=self.thresholds[method],
).fit(
reference_data=reference_data[column_name],
timestamps=timestamps,
)
)
except Exception as ex:
raise CalculatorException(f"Failed to fit method {method} for column {column_name}: {ex!r}") from ex

Check warning on line 292 in nannyml/drift/univariate/calculator.py

View check run for this annotation

Codecov / codecov/patch

nannyml/drift/univariate/calculator.py#L291-L292

Added lines #L291 - L292 were not covered by tests
self._column_to_models_mapping[column_name] = methods

for column_name in self.categorical_column_names:
self._column_to_models_mapping[column_name] += [
MethodFactory.create(
key=method,
feature_type=FeatureType.CATEGORICAL,
chunker=self.chunker,
threshold=self.thresholds[method],
).fit(
reference_data=reference_data[column_name],
timestamps=reference_data[self.timestamp_column_name] if self.timestamp_column_name else None,
)
for method in self.categorical_method_names
]
methods = []
for method in self.categorical_method_names:
try:
methods.append(
MethodFactory.create(
key=method,
feature_type=FeatureType.CATEGORICAL,
chunker=self.chunker,
threshold=self.thresholds[method],
).fit(
reference_data=reference_data[column_name],
timestamps=timestamps,
)
)
except Exception as ex:
raise CalculatorException(f"Failed to fit method {method} for column {column_name}: {ex!r}") from ex

Check warning on line 311 in nannyml/drift/univariate/calculator.py

View check run for this annotation

Codecov / codecov/patch

nannyml/drift/univariate/calculator.py#L310-L311

Added lines #L310 - L311 were not covered by tests
self._column_to_models_mapping[column_name] = methods

self.result = self._calculate(reference_data)
self.result.data['chunk', 'chunk', 'period'] = 'reference'
Expand Down
25 changes: 13 additions & 12 deletions nannyml/drift/univariate/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from scipy.stats import chi2_contingency, ks_2samp, wasserstein_distance

from nannyml._typing import Self
from nannyml.base import _column_is_categorical, _remove_missing_data
from nannyml.base import _remove_nans, _column_is_categorical
from nannyml.chunk import Chunker
from nannyml.exceptions import InvalidArgumentsException, NotFittedException
from nannyml.thresholds import Threshold, calculate_threshold_values
Expand Down Expand Up @@ -278,6 +278,7 @@ def __init__(self, **kwargs) -> None:
self._reference_proba_in_bins: np.ndarray

def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None):
reference_data = _remove_nans(reference_data)
if _column_is_categorical(reference_data):
treat_as_type = 'cat'
else:
Expand Down Expand Up @@ -305,7 +306,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None

def _calculate(self, data: pd.Series):
reference_proba_in_bins = copy(self._reference_proba_in_bins)
data = _remove_missing_data(data)
data = _remove_nans(data)
if data.empty:
return np.nan
if self._treat_as_type == 'cont':
Expand Down Expand Up @@ -374,7 +375,7 @@ def __init__(self, **kwargs) -> None:
self.n_bins = kwargs['computation_params'].get('n_bins', 10_000)

def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
reference_data = _remove_missing_data(reference_data)
reference_data = _remove_nans(reference_data)
if (self.calculation_method == 'auto' and len(reference_data) < 10_000) or self.calculation_method == 'exact':
self._reference_data = reference_data
else:
Expand All @@ -389,7 +390,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None
return self

def _calculate(self, data: pd.Series):
data = _remove_missing_data(data)
data = _remove_nans(data)
if data.empty:
return np.nan
if not self._fitted:
Expand Down Expand Up @@ -443,13 +444,13 @@ def __init__(self, **kwargs) -> None:
self._fitted = False

def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
reference_data = _remove_missing_data(reference_data)
reference_data = _remove_nans(reference_data)
self._reference_data_vcs = reference_data.value_counts().loc[lambda v: v != 0]
self._fitted = True
return self

def _calculate(self, data: pd.Series):
data = _remove_missing_data(data)
data = _remove_nans(data)
if data.empty:
return np.nan
if not self._fitted:
Expand Down Expand Up @@ -505,7 +506,7 @@ def __init__(self, **kwargs) -> None:
self._reference_proba: Optional[dict] = None

def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
reference_data = _remove_missing_data(reference_data)
reference_data = _remove_nans(reference_data)
ref_labels = reference_data.unique()
self._reference_proba = {label: (reference_data == label).sum() / len(reference_data) for label in ref_labels}

Expand All @@ -516,7 +517,7 @@ def _calculate(self, data: pd.Series):
raise NotFittedException(
"tried to call 'calculate' on an unfitted method " f"{self.display_name}. Please run 'fit' first"
)
data = _remove_missing_data(data)
data = _remove_nans(data)
if data.empty:
return np.nan
data_labels = data.unique()
Expand Down Expand Up @@ -574,7 +575,7 @@ def __init__(self, **kwargs) -> None:
self.n_bins = kwargs['computation_params'].get('n_bins', 10_000)

def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
reference_data = _remove_missing_data(reference_data)
reference_data = _remove_nans(reference_data)
if (self.calculation_method == 'auto' and len(reference_data) < 10_000) or self.calculation_method == 'exact':
self._reference_data = reference_data
else:
Expand All @@ -592,7 +593,7 @@ def _calculate(self, data: pd.Series):
raise NotFittedException(
"tried to call 'calculate' on an unfitted method " f"{self.display_name}. Please run 'fit' first"
)
data = _remove_missing_data(data)
data = _remove_nans(data)
if data.empty:
return np.nan
if (
Expand Down Expand Up @@ -668,7 +669,7 @@ def __init__(self, **kwargs) -> None:
self._reference_proba_in_bins: np.ndarray

def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
reference_data = _remove_missing_data(reference_data)
reference_data = _remove_nans(reference_data)
if _column_is_categorical(reference_data):
treat_as_type = 'cat'
else:
Expand All @@ -695,7 +696,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None
return self

def _calculate(self, data: pd.Series):
data = _remove_missing_data(data)
data = _remove_nans(data)
if data.empty:
return np.nan
reference_proba_in_bins = copy(self._reference_proba_in_bins)
Expand Down
22 changes: 0 additions & 22 deletions nannyml/performance_calculation/metrics/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,25 +255,3 @@ def inner_wrapper(wrapped_class: Type[Metric]) -> Type[Metric]:
return wrapped_class

return inner_wrapper


def _common_data_cleaning(y_true: pd.Series, y_pred: Union[pd.Series, pd.DataFrame]):
y_true, y_pred = (
y_true.reset_index(drop=True),
y_pred.reset_index(drop=True),
)

if isinstance(y_pred, pd.DataFrame):
y_true = y_true[~y_pred.isna().all(axis=1)]
else:
y_true = y_true[~y_pred.isna()]
y_pred.dropna(inplace=True)

y_pred = y_pred[~y_true.isna()]
y_true.dropna(inplace=True)

# NaN values have been dropped. Try to infer types again
y_pred = y_pred.infer_objects()
y_true = y_true.infer_objects()

return y_true, y_pred
Loading
Loading