From 55d8f279d512e44861407086761079464699d74d Mon Sep 17 00:00:00 2001 From: chkoar Date: Mon, 3 Feb 2020 12:29:55 +0200 Subject: [PATCH] Better in-out support for pandas. --- imblearn/base.py | 49 +++---------------- .../over_sampling/_random_over_sampler.py | 18 +------ imblearn/over_sampling/_smote.py | 18 +------ .../_random_under_sampler.py | 18 +------ imblearn/utils/_validation.py | 45 +++++++++++++++++ imblearn/utils/estimator_checks.py | 27 ++++++---- 6 files changed, 75 insertions(+), 100 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index c5d6b0185..13033ca40 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -14,6 +14,7 @@ from sklearn.utils.multiclass import check_classification_targets from .utils import check_sampling_strategy, check_target_type +from .utils._validation import _OutputReconstructor class SamplerMixin(BaseEstimator, metaclass=ABCMeta): @@ -80,21 +81,10 @@ def fit_resample(self, X, y): output = self._fit_resample(X, y) - if self._X_columns is not None or self._y_name is not None: - import pandas as pd - - if self._X_columns is not None: - X_ = pd.DataFrame(output[0], columns=self._X_columns) - X_ = X_.astype(self._X_dtypes) - else: - X_ = output[0] - y_ = (label_binarize(output[1], np.unique(y)) if binarize_y else output[1]) - if self._y_name is not None: - y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) - + X_, y_ = self._reconstructor.reconstruct(output[0], y_) return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) # define an alias for back-compatibility @@ -137,22 +127,7 @@ def __init__(self, sampling_strategy="auto"): self.sampling_strategy = sampling_strategy def _check_X_y(self, X, y, accept_sparse=None): - if hasattr(X, "loc"): - # store information to build dataframe - self._X_columns = X.columns - self._X_dtypes = X.dtypes - else: - self._X_columns = None - self._X_dtypes = None - - if hasattr(y, "loc"): - # store information to build a series - self._y_name = y.name - self._y_dtype = y.dtype - else: - self._y_name = None - self._y_dtype = None - + self._reconstructor = _OutputReconstructor(X, y) if accept_sparse is None: accept_sparse = ["csr", "csc"] y, binarize_y = check_target_type(y, indicate_one_vs_all=True) @@ -265,8 +240,8 @@ def fit_resample(self, X, y): y_resampled : array-like of shape (n_samples_new,) The corresponding label of `X_resampled`. """ - # store the columns name to reconstruct a dataframe - self._columns = X.columns if hasattr(X, "loc") else None + self._reconstructor = _OutputReconstructor(X, y) + if self.validate: check_classification_targets(y) X, y, binarize_y = self._check_X_y( @@ -280,22 +255,12 @@ def fit_resample(self, X, y): output = self._fit_resample(X, y) if self.validate: - if self._X_columns is not None or self._y_name is not None: - import pandas as pd - - if self._X_columns is not None: - X_ = pd.DataFrame(output[0], columns=self._X_columns) - X_ = X_.astype(self._X_dtypes) - else: - X_ = output[0] y_ = (label_binarize(output[1], np.unique(y)) if binarize_y else output[1]) - - if self._y_name is not None: - y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) - + X_, y_ = self._reconstructor.reconstruct(output[0], y_) return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) + return output def _fit_resample(self, X, y): diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index afcb89da5..8addb2a87 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -15,6 +15,7 @@ from ..utils import check_target_type from ..utils import Substitution from ..utils._docstring import _random_state_docstring +from ..utils._validation import _OutputReconstructor @Substitution( @@ -75,22 +76,7 @@ def __init__(self, sampling_strategy="auto", random_state=None): self.random_state = random_state def _check_X_y(self, X, y): - if hasattr(X, "loc"): - # store information to build dataframe - self._X_columns = X.columns - self._X_dtypes = X.dtypes - else: - self._X_columns = None - self._X_dtypes = None - - if hasattr(y, "loc"): - # store information to build a series - self._y_name = y.name - self._y_dtype = y.dtype - else: - self._y_name = None - self._y_dtype = None - + self._reconstructor = _OutputReconstructor(X, y) y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X = check_array(X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index cea14cfd2..8d8de9a0a 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -31,6 +31,7 @@ from ..utils import Substitution from ..utils._docstring import _n_jobs_docstring from ..utils._docstring import _random_state_docstring +from ..utils._validation import _OutputReconstructor class BaseSMOTE(BaseOverSampler): @@ -891,22 +892,7 @@ def _check_X_y(self, X, y): """Overwrite the checking to let pass some string for categorical features. """ - if hasattr(X, "loc"): - # store information to build dataframe - self._X_columns = X.columns - self._X_dtypes = X.dtypes - else: - self._X_columns = None - self._X_dtypes = None - - if hasattr(y, "loc"): - # store information to build a series - self._y_name = y.name - self._y_dtype = y.dtype - else: - self._y_name = None - self._y_dtype = None - + self._reconstructor = _OutputReconstructor(X, y) y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None) return X, y, binarize_y diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index 8d7c08c93..fb90f2bfb 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -15,6 +15,7 @@ from ...utils import check_target_type from ...utils import Substitution from ...utils._docstring import _random_state_docstring +from ...utils._validation import _OutputReconstructor @Substitution( @@ -81,22 +82,7 @@ def __init__( self.replacement = replacement def _check_X_y(self, X, y): - if hasattr(X, "loc"): - # store information to build dataframe - self._X_columns = X.columns - self._X_dtypes = X.dtypes - else: - self._X_columns = None - self._X_dtypes = None - - if hasattr(y, "loc"): - # store information to build a series - self._y_name = y.name - self._y_dtype = y.dtype - else: - self._y_name = None - self._y_dtype = None - + self._reconstructor = _OutputReconstructor(X, y) y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X = check_array(X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index d1b0069b7..0f9f2f9c3 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -27,6 +27,51 @@ TARGET_KIND = ("binary", "multiclass", "multilabel-indicator") +class _OutputReconstructor: + """A class for converting input types to numpy and back.""" + + def __init__(self, X, y): + self.x_props = self._gets_props(X) + self.y_props = self._gets_props(y) + + def reconstruct(self, X, y): + X = self._transfrom(X, self.x_props) + y = self._transfrom(y, self.y_props) + return X, y + + def _gets_props(self, array): + props = {} + props["type"] = array.__class__.__name__ + props["columns"] = getattr(array, "columns", None) + props["name"] = getattr(array, "name", None) + props["dtypes"] = getattr(array, "dtypes", None) + return props + + def _transfrom(self, array, props): + type_ = props["type"].lower() + msg="Could not convert to {}".format(type_) + if type_ == "list": + ret = array.tolist() + elif type_ == "dataframe": + try: + import pandas as pd + ret = pd.DataFrame(array, columns=props["columns"]) + ret = ret.astype(props["dtypes"]) + except Exception: + warnings.warn(msg) + elif type_ == "series": + try: + import pandas as pd + ret = pd.Series(array, + dtype=props["dtypes"], + name=props["name"]) + except Exception: + warnings.warn(msg) + else: + ret = array + return ret + + def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): """Check the objects is consistent to be a NN. diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 51a039f85..8f094397d 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -242,8 +242,9 @@ def check_samplers_pandas(name, Sampler): weights=[0.2, 0.3, 0.5], random_state=0, ) - X_pd = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) - y_pd = pd.Series(y, name="class") + X_df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) + y_df = pd.DataFrame(y) + y_s = pd.Series(y, name="class") sampler = Sampler() if isinstance(Sampler(), NearMiss): samplers = [Sampler(version=version) for version in (1, 2, 3)] @@ -253,16 +254,22 @@ def check_samplers_pandas(name, Sampler): for sampler in samplers: set_random_state(sampler) - X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y_pd) + X_res_df, y_res_s = sampler.fit_resample(X_df, y_s) + X_res_df, y_res_df = sampler.fit_resample(X_df, y_df) X_res, y_res = sampler.fit_resample(X, y) - # check that we return a pandas dataframe if a dataframe was given in - assert isinstance(X_res_pd, pd.DataFrame) - assert isinstance(y_res_pd, pd.Series) - assert X_pd.columns.to_list() == X_res_pd.columns.to_list() - assert y_pd.name == y_res_pd.name - assert_allclose(X_res_pd.to_numpy(), X_res) - assert_allclose(y_res_pd.to_numpy(), y_res) + # check that we return the same type for dataframes or seires types + assert isinstance(X_res_df, pd.DataFrame) + assert isinstance(y_res_df, pd.DataFrame) + assert isinstance(y_res_s, pd.Series) + + assert X_df.columns.to_list() == X_res_df.columns.to_list() + assert y_df.columns.to_list() == y_res_df.columns.to_list() + assert y_s.name == y_res_s.name + + assert_allclose(X_res_df.to_numpy(), X_res) + assert_allclose(y_res_df.to_numpy().ravel(), y_res) + assert_allclose(y_res_s.to_numpy(), y_res) def check_samplers_multiclass_ova(name, Sampler):