From 4ba28030731012ee20df67152ae4aa7859733166 Mon Sep 17 00:00:00 2001 From: Christos Aridas Date: Wed, 12 Feb 2020 17:52:36 +0200 Subject: [PATCH] BUG Better in-out support with different arrays type (#681) --- imblearn/base.py | 49 +++------------- .../over_sampling/_random_over_sampler.py | 17 ------ imblearn/over_sampling/_smote.py | 16 ----- .../_prototype_selection/_nearmiss.py | 6 +- .../_random_under_sampler.py | 16 ----- imblearn/utils/_validation.py | 36 ++++++++++++ imblearn/utils/estimator_checks.py | 58 +++++++++++++++---- imblearn/utils/tests/test_validation.py | 51 ++++++++++++++++ 8 files changed, 146 insertions(+), 103 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index c5d6b0185..014e4dd9f 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -14,6 +14,7 @@ from sklearn.utils.multiclass import check_classification_targets from .utils import check_sampling_strategy, check_target_type +from .utils._validation import ArraysTransformer class SamplerMixin(BaseEstimator, metaclass=ABCMeta): @@ -72,6 +73,7 @@ def fit_resample(self, X, y): The corresponding label of `X_resampled`. 
""" check_classification_targets(y) + arrays_transformer = ArraysTransformer(X, y) X, y, binarize_y = self._check_X_y(X, y) self.sampling_strategy_ = check_sampling_strategy( @@ -80,21 +82,10 @@ def fit_resample(self, X, y): output = self._fit_resample(X, y) - if self._X_columns is not None or self._y_name is not None: - import pandas as pd - - if self._X_columns is not None: - X_ = pd.DataFrame(output[0], columns=self._X_columns) - X_ = X_.astype(self._X_dtypes) - else: - X_ = output[0] - y_ = (label_binarize(output[1], np.unique(y)) if binarize_y else output[1]) - if self._y_name is not None: - y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) - + X_, y_ = arrays_transformer.transform(output[0], y_) return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) # define an alias for back-compatibility @@ -137,22 +128,6 @@ def __init__(self, sampling_strategy="auto"): self.sampling_strategy = sampling_strategy def _check_X_y(self, X, y, accept_sparse=None): - if hasattr(X, "loc"): - # store information to build dataframe - self._X_columns = X.columns - self._X_dtypes = X.dtypes - else: - self._X_columns = None - self._X_dtypes = None - - if hasattr(y, "loc"): - # store information to build a series - self._y_name = y.name - self._y_dtype = y.dtype - else: - self._y_name = None - self._y_dtype = None - if accept_sparse is None: accept_sparse = ["csr", "csc"] y, binarize_y = check_target_type(y, indicate_one_vs_all=True) @@ -265,8 +240,8 @@ def fit_resample(self, X, y): y_resampled : array-like of shape (n_samples_new,) The corresponding label of `X_resampled`. 
""" - # store the columns name to reconstruct a dataframe - self._columns = X.columns if hasattr(X, "loc") else None + arrays_transformer = ArraysTransformer(X, y) + if self.validate: check_classification_targets(y) X, y, binarize_y = self._check_X_y( @@ -280,22 +255,12 @@ def fit_resample(self, X, y): output = self._fit_resample(X, y) if self.validate: - if self._X_columns is not None or self._y_name is not None: - import pandas as pd - - if self._X_columns is not None: - X_ = pd.DataFrame(output[0], columns=self._X_columns) - X_ = X_.astype(self._X_dtypes) - else: - X_ = output[0] y_ = (label_binarize(output[1], np.unique(y)) if binarize_y else output[1]) - - if self._y_name is not None: - y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) - + X_, y_ = arrays_transformer.transform(output[0], y_) return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) + return output def _fit_resample(self, X, y): diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index afcb89da5..fbe2f17f9 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -16,7 +16,6 @@ from ..utils import Substitution from ..utils._docstring import _random_state_docstring - @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, random_state=_random_state_docstring, @@ -75,22 +74,6 @@ def __init__(self, sampling_strategy="auto", random_state=None): self.random_state = random_state def _check_X_y(self, X, y): - if hasattr(X, "loc"): - # store information to build dataframe - self._X_columns = X.columns - self._X_dtypes = X.dtypes - else: - self._X_columns = None - self._X_dtypes = None - - if hasattr(y, "loc"): - # store information to build a series - self._y_name = y.name - self._y_dtype = y.dtype - else: - self._y_name = None - self._y_dtype = None - y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X = check_array(X, accept_sparse=["csr", 
"csc"], dtype=None, force_all_finite=False) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index cea14cfd2..961ce55c5 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -891,22 +891,6 @@ def _check_X_y(self, X, y): """Overwrite the checking to let pass some string for categorical features. """ - if hasattr(X, "loc"): - # store information to build dataframe - self._X_columns = X.columns - self._X_dtypes = X.dtypes - else: - self._X_columns = None - self._X_dtypes = None - - if hasattr(y, "loc"): - # store information to build a series - self._y_name = y.name - self._y_dtype = y.dtype - else: - self._y_name = None - self._y_dtype = None - y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None) return X, y, binarize_y diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py index af8a13dde..386463d5c 100644 --- a/imblearn/under_sampling/_prototype_selection/_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/_nearmiss.py @@ -247,8 +247,10 @@ def _fit_resample(self, X, y): _safe_indexing(X, minority_class_indices) ) idx_vec_farthest = np.unique(idx_vec.reshape(-1)) - X_class_selected = _safe_indexing(X_class, idx_vec_farthest) - y_class_selected = _safe_indexing(y_class, idx_vec_farthest) + X_class_selected = _safe_indexing( + X_class, idx_vec_farthest) + y_class_selected = _safe_indexing( + y_class, idx_vec_farthest) dist_vec, idx_vec = self.nn_.kneighbors( X_class_selected, n_neighbors=self.nn_.n_neighbors diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index 8d7c08c93..900d8e3fe 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py 
@@ -81,22 +81,6 @@ def __init__( self.replacement = replacement def _check_X_y(self, X, y): - if hasattr(X, "loc"): - # store information to build dataframe - self._X_columns = X.columns - self._X_dtypes = X.dtypes - else: - self._X_columns = None - self._X_dtypes = None - - if hasattr(y, "loc"): - # store information to build a series - self._y_name = y.name - self._y_dtype = y.dtype - else: - self._y_name = None - self._y_dtype = None - y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X = check_array(X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index d1b0069b7..dccc0dd4d 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -27,6 +27,42 @@ TARGET_KIND = ("binary", "multiclass", "multilabel-indicator") +class ArraysTransformer: + """A class to convert sampler output arrays to their original types.""" + + def __init__(self, X, y): + self.x_props = self._gets_props(X) + self.y_props = self._gets_props(y) + + def transform(self, X, y): + X = self._transfrom_one(X, self.x_props) + y = self._transfrom_one(y, self.y_props) + return X, y + + def _gets_props(self, array): + props = {} + props["type"] = array.__class__.__name__ + props["columns"] = getattr(array, "columns", None) + props["name"] = getattr(array, "name", None) + props["dtypes"] = getattr(array, "dtypes", None) + return props + + def _transfrom_one(self, array, props): + type_ = props["type"].lower() + if type_ == "list": + ret = array.tolist() + elif type_ == "dataframe": + import pandas as pd + ret = pd.DataFrame(array, columns=props["columns"]) + ret = ret.astype(props["dtypes"]) + elif type_ == "series": + import pandas as pd + ret = pd.Series(array, dtype=props["dtypes"], name=props["name"]) + else: + ret = array + return ret + + def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): """Check the objects is consistent to be a NN. 
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 51a039f85..7bd77c2f3 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -41,6 +41,7 @@ def _yield_sampler_checks(name, Estimator): yield check_samplers_sampling_strategy_fit_resample yield check_samplers_sparse yield check_samplers_pandas + yield check_samplers_list yield check_samplers_multiclass_ova yield check_samplers_preserve_dtype yield check_samplers_sample_indices @@ -242,8 +243,9 @@ def check_samplers_pandas(name, Sampler): weights=[0.2, 0.3, 0.5], random_state=0, ) - X_pd = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) - y_pd = pd.Series(y, name="class") + X_df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) + y_df = pd.DataFrame(y) + y_s = pd.Series(y, name="class") sampler = Sampler() if isinstance(Sampler(), NearMiss): samplers = [Sampler(version=version) for version in (1, 2, 3)] @@ -253,16 +255,52 @@ def check_samplers_pandas(name, Sampler): for sampler in samplers: set_random_state(sampler) - X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y_pd) + X_res_df, y_res_s = sampler.fit_resample(X_df, y_s) + X_res_df, y_res_df = sampler.fit_resample(X_df, y_df) X_res, y_res = sampler.fit_resample(X, y) - # check that we return a pandas dataframe if a dataframe was given in - assert isinstance(X_res_pd, pd.DataFrame) - assert isinstance(y_res_pd, pd.Series) - assert X_pd.columns.to_list() == X_res_pd.columns.to_list() - assert y_pd.name == y_res_pd.name - assert_allclose(X_res_pd.to_numpy(), X_res) - assert_allclose(y_res_pd.to_numpy(), y_res) + # check that we return the same type for dataframes or series types + assert isinstance(X_res_df, pd.DataFrame) + assert isinstance(y_res_df, pd.DataFrame) + assert isinstance(y_res_s, pd.Series) + + assert X_df.columns.to_list() == X_res_df.columns.to_list() + assert y_df.columns.to_list() == y_res_df.columns.to_list() + assert y_s.name == y_res_s.name + + 
assert_allclose(X_res_df.to_numpy(), X_res) + assert_allclose(y_res_df.to_numpy().ravel(), y_res) + assert_allclose(y_res_s.to_numpy(), y_res) + + +def check_samplers_list(name, Sampler): + # Check that the samplers can handle simple lists + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0, + ) + X_list = X.tolist() + y_list = y.tolist() + sampler = Sampler() + if isinstance(sampler, NearMiss): + samplers = [Sampler(version=version) for version in (1, 2, 3)] + + else: + samplers = [sampler] + + for sampler in samplers: + set_random_state(sampler) + X_res, y_res = sampler.fit_resample(X, y) + X_res_list, y_res_list = sampler.fit_resample(X_list, y_list) + + assert isinstance(X_res_list, list) + assert isinstance(y_res_list, list) + + assert_allclose(X_res, X_res_list) + assert_allclose(y_res, y_res_list) def check_samplers_multiclass_ova(name, Sampler): diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index 634f502f0..a40b47f4b 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -17,6 +17,7 @@ from imblearn.utils import check_neighbors_object from imblearn.utils import check_sampling_strategy from imblearn.utils import check_target_type +from imblearn.utils._validation import ArraysTransformer multiclass_target = np.array([1] * 50 + [2] * 100 + [3] * 25) binary_target = np.array([1] * 25 + [0] * 100) @@ -315,3 +316,53 @@ def test_sampling_strategy_check_order( sampling_strategy, y, sampling_type ) assert sampling_strategy_ == expected_result + + +def test_arrays_transformer_plain_list(): + X = np.array([[0, 0], [1, 1]]) + y = np.array([[0, 0], [1, 1]]) + + arrays_transformer = ArraysTransformer(X.tolist(), y.tolist()) + X_res, y_res = arrays_transformer.transform(X, y) + assert isinstance(X_res, list) + assert isinstance(y_res, list) + + +def test_arrays_transformer_numpy(): + X = np.array([[0, 0], 
[1, 1]]) + y = np.array([[0, 0], [1, 1]]) + + arrays_transformer = ArraysTransformer(X, y) + X_res, y_res = arrays_transformer.transform(X, y) + assert isinstance(X_res, np.ndarray) + assert isinstance(y_res, np.ndarray) + + +def test_arrays_transformer_pandas(): + pd = pytest.importorskip("pandas") + + X = np.array([[0, 0], [1, 1]]) + y = np.array([0, 1]) + + X_df = pd.DataFrame(X, columns=["a", "b"]) + X_df = X_df.astype(int) + y_df = pd.DataFrame(y, columns=["target", ]) + y_df = y_df.astype(int) + y_s = pd.Series(y, name="target", dtype=int) + + # DataFrame and DataFrame case + arrays_transformer = ArraysTransformer(X_df, y_df) + X_res, y_res = arrays_transformer.transform(X, y) + assert isinstance(X_res, pd.DataFrame) + assert_array_equal(X_res.columns, X_df.columns) + assert_array_equal(X_res.dtypes, X_df.dtypes) + assert isinstance(y_res, pd.DataFrame) + assert_array_equal(y_res.columns, y_df.columns) + assert_array_equal(y_res.dtypes, y_df.dtypes) + + # DataFrames and Series case + arrays_transformer = ArraysTransformer(X_df, y_s) + _, y_res = arrays_transformer.transform(X, y) + assert isinstance(y_res, pd.Series) + assert_array_equal(y_res.name, y_s.name) + assert_array_equal(y_res.dtype, y_s.dtype)