Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Better in-out support #681

Merged
merged 9 commits into from
Feb 12, 2020
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 7 additions & 42 deletions imblearn/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from sklearn.utils.multiclass import check_classification_targets

from .utils import check_sampling_strategy, check_target_type
from .utils._validation import OutputFormater


class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
Expand Down Expand Up @@ -72,6 +73,7 @@ def fit_resample(self, X, y):
The corresponding label of `X_resampled`.
"""
check_classification_targets(y)
self._formater = OutputFormater(X, y)
X, y, binarize_y = self._check_X_y(X, y)

self.sampling_strategy_ = check_sampling_strategy(
Expand All @@ -80,21 +82,10 @@ def fit_resample(self, X, y):

output = self._fit_resample(X, y)

if self._X_columns is not None or self._y_name is not None:
import pandas as pd

if self._X_columns is not None:
X_ = pd.DataFrame(output[0], columns=self._X_columns)
X_ = X_.astype(self._X_dtypes)
else:
X_ = output[0]

y_ = (label_binarize(output[1], np.unique(y))
if binarize_y else output[1])

if self._y_name is not None:
y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name)

X_, y_ = self._formater.format(output[0], y_)
return (X_, y_) if len(output) == 2 else (X_, y_, output[2])

# define an alias for back-compatibility
Expand Down Expand Up @@ -137,22 +128,6 @@ def __init__(self, sampling_strategy="auto"):
self.sampling_strategy = sampling_strategy

def _check_X_y(self, X, y, accept_sparse=None):
if hasattr(X, "loc"):
# store information to build dataframe
self._X_columns = X.columns
self._X_dtypes = X.dtypes
else:
self._X_columns = None
self._X_dtypes = None

if hasattr(y, "loc"):
# store information to build a series
self._y_name = y.name
self._y_dtype = y.dtype
else:
self._y_name = None
self._y_dtype = None

if accept_sparse is None:
accept_sparse = ["csr", "csc"]
y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
Expand Down Expand Up @@ -265,8 +240,8 @@ def fit_resample(self, X, y):
y_resampled : array-like of shape (n_samples_new,)
The corresponding label of `X_resampled`.
"""
# store the columns name to reconstruct a dataframe
self._columns = X.columns if hasattr(X, "loc") else None
self._formater = OutputFormater(X, y)

if self.validate:
check_classification_targets(y)
X, y, binarize_y = self._check_X_y(
Expand All @@ -280,22 +255,12 @@ def fit_resample(self, X, y):
output = self._fit_resample(X, y)

if self.validate:
if self._X_columns is not None or self._y_name is not None:
import pandas as pd

if self._X_columns is not None:
X_ = pd.DataFrame(output[0], columns=self._X_columns)
X_ = X_.astype(self._X_dtypes)
else:
X_ = output[0]

y_ = (label_binarize(output[1], np.unique(y))
if binarize_y else output[1])

if self._y_name is not None:
y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name)

X_, y_ = self._formater.format(output[0], y_)
return (X_, y_) if len(output) == 2 else (X_, y_, output[2])

return output

def _fit_resample(self, X, y):
Expand Down
17 changes: 0 additions & 17 deletions imblearn/over_sampling/_random_over_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from ..utils import Substitution
from ..utils._docstring import _random_state_docstring


@Substitution(
sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
random_state=_random_state_docstring,
Expand Down Expand Up @@ -75,22 +74,6 @@ def __init__(self, sampling_strategy="auto", random_state=None):
self.random_state = random_state

def _check_X_y(self, X, y):
if hasattr(X, "loc"):
# store information to build dataframe
self._X_columns = X.columns
self._X_dtypes = X.dtypes
else:
self._X_columns = None
self._X_dtypes = None

if hasattr(y, "loc"):
# store information to build a series
self._y_name = y.name
self._y_dtype = y.dtype
else:
self._y_name = None
self._y_dtype = None

y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
X = check_array(X, accept_sparse=["csr", "csc"], dtype=None,
force_all_finite=False)
Expand Down
16 changes: 0 additions & 16 deletions imblearn/over_sampling/_smote.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,22 +891,6 @@ def _check_X_y(self, X, y):
"""Overwrite the checking to let pass some string for categorical
features.
"""
if hasattr(X, "loc"):
# store information to build dataframe
self._X_columns = X.columns
self._X_dtypes = X.dtypes
else:
self._X_columns = None
self._X_dtypes = None

if hasattr(y, "loc"):
# store information to build a series
self._y_name = y.name
self._y_dtype = y.dtype
else:
self._y_name = None
self._y_dtype = None

y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None)
return X, y, binarize_y
Expand Down
6 changes: 4 additions & 2 deletions imblearn/under_sampling/_prototype_selection/_nearmiss.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,8 +247,10 @@ def _fit_resample(self, X, y):
_safe_indexing(X, minority_class_indices)
)
idx_vec_farthest = np.unique(idx_vec.reshape(-1))
X_class_selected = _safe_indexing(X_class, idx_vec_farthest)
y_class_selected = _safe_indexing(y_class, idx_vec_farthest)
X_class_selected = _safe_indexing(
X_class, idx_vec_farthest)
y_class_selected = _safe_indexing(
y_class, idx_vec_farthest)

dist_vec, idx_vec = self.nn_.kneighbors(
X_class_selected, n_neighbors=self.nn_.n_neighbors
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,22 +81,6 @@ def __init__(
self.replacement = replacement

def _check_X_y(self, X, y):
if hasattr(X, "loc"):
# store information to build dataframe
self._X_columns = X.columns
self._X_dtypes = X.dtypes
else:
self._X_columns = None
self._X_dtypes = None

if hasattr(y, "loc"):
# store information to build a series
self._y_name = y.name
self._y_dtype = y.dtype
else:
self._y_name = None
self._y_dtype = None

y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
X = check_array(X, accept_sparse=["csr", "csc"], dtype=None,
force_all_finite=False)
Expand Down
36 changes: 36 additions & 0 deletions imblearn/utils/_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,42 @@
TARGET_KIND = ("binary", "multiclass", "multilabel-indicator")


class OutputFormater:
    """Record the container types of ``X`` and ``y`` and restore them later.

    At construction time the class name, ``columns``, ``name`` and
    ``dtypes`` attributes of both inputs are stored (``None`` when an
    attribute is missing). :meth:`format` then converts resampled data
    back into a ``list``, ``DataFrame`` or ``Series`` according to what
    was recorded; any other input type is returned unchanged.

    Parameters
    ----------
    X : object
        The original feature container passed by the user.

    y : object
        The original target container passed by the user.
    """

    def __init__(self, X, y):
        self.x_props = self._gets_props(X)
        self.y_props = self._gets_props(y)

    def format(self, X, y):
        """Convert ``X`` and ``y`` back to the recorded container types.

        Parameters
        ----------
        X : object
            Resampled features, converted according to ``self.x_props``.

        y : object
            Resampled targets, converted according to ``self.y_props``.

        Returns
        -------
        X, y : tuple
            The two converted containers.
        """
        X = self._transfrom(X, self.x_props)
        y = self._transfrom(y, self.y_props)
        return X, y

    # NOTE(review): the spellings ``_gets_props`` / ``_transfrom`` are kept
    # as found so that any existing reference to them keeps working.
    def _gets_props(self, array):
        """Return a dict describing ``array`` (class name and pandas-like
        metadata, each ``None`` when the attribute does not exist)."""
        return {
            "type": array.__class__.__name__,
            "columns": getattr(array, "columns", None),
            "name": getattr(array, "name", None),
            "dtypes": getattr(array, "dtypes", None),
        }

    def _transfrom(self, array, props):
        """Convert ``array`` to the container described by ``props``.

        The decision is based on the lower-cased class name stored in
        ``props["type"]``; unrecognised names leave ``array`` untouched.
        """
        type_ = props["type"].lower()
        if type_ == "list":
            # Tolerate values without ``tolist`` (e.g. already a list)
            # instead of failing with AttributeError.
            to_list = getattr(array, "tolist", None)
            return to_list() if to_list is not None else list(array)
        if type_ == "dataframe":
            # imported lazily: only needed when a DataFrame was passed in
            import pandas as pd

            frame = pd.DataFrame(array, columns=props["columns"])
            return frame.astype(props["dtypes"])
        if type_ == "series":
            import pandas as pd

            return pd.Series(
                array, dtype=props["dtypes"], name=props["name"]
            )
        return array


def check_neighbors_object(nn_name, nn_object, additional_neighbor=0):
"""Check the objects is consistent to be a NN.

Expand Down
58 changes: 48 additions & 10 deletions imblearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def _yield_sampler_checks(name, Estimator):
yield check_samplers_sampling_strategy_fit_resample
yield check_samplers_sparse
yield check_samplers_pandas
yield check_samplers_list
yield check_samplers_multiclass_ova
yield check_samplers_preserve_dtype
yield check_samplers_sample_indices
Expand Down Expand Up @@ -242,8 +243,9 @@ def check_samplers_pandas(name, Sampler):
weights=[0.2, 0.3, 0.5],
random_state=0,
)
X_pd = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
y_pd = pd.Series(y, name="class")
X_df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
y_df = pd.DataFrame(y)
y_s = pd.Series(y, name="class")
sampler = Sampler()
if isinstance(Sampler(), NearMiss):
samplers = [Sampler(version=version) for version in (1, 2, 3)]
Expand All @@ -253,16 +255,52 @@ def check_samplers_pandas(name, Sampler):

for sampler in samplers:
set_random_state(sampler)
X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y_pd)
X_res_df, y_res_s = sampler.fit_resample(X_df, y_s)
X_res_df, y_res_df = sampler.fit_resample(X_df, y_df)
X_res, y_res = sampler.fit_resample(X, y)

# check that we return a pandas dataframe if a dataframe was given in
assert isinstance(X_res_pd, pd.DataFrame)
assert isinstance(y_res_pd, pd.Series)
assert X_pd.columns.to_list() == X_res_pd.columns.to_list()
assert y_pd.name == y_res_pd.name
assert_allclose(X_res_pd.to_numpy(), X_res)
assert_allclose(y_res_pd.to_numpy(), y_res)
# check that we return the same type for dataframes or series types
assert isinstance(X_res_df, pd.DataFrame)
assert isinstance(y_res_df, pd.DataFrame)
assert isinstance(y_res_s, pd.Series)

assert X_df.columns.to_list() == X_res_df.columns.to_list()
assert y_df.columns.to_list() == y_res_df.columns.to_list()
assert y_s.name == y_res_s.name

assert_allclose(X_res_df.to_numpy(), X_res)
assert_allclose(y_res_df.to_numpy().ravel(), y_res)
assert_allclose(y_res_s.to_numpy(), y_res)


def check_samplers_list(name, Sampler):
    # Check that the samplers can handle plain Python lists: list inputs
    # must give list outputs matching the results obtained from arrays.
    X, y = make_classification(
        n_samples=1000,
        n_classes=3,
        n_informative=4,
        weights=[0.2, 0.3, 0.5],
        random_state=0,
    )
    X_list, y_list = X.tolist(), y.tolist()

    default_sampler = Sampler()
    # NearMiss is checked once per ``version``; any other sampler once
    samplers = (
        [Sampler(version=version) for version in (1, 2, 3)]
        if isinstance(default_sampler, NearMiss)
        else [default_sampler]
    )

    for sampler in samplers:
        set_random_state(sampler)
        X_res, y_res = sampler.fit_resample(X, y)
        X_res_list, y_res_list = sampler.fit_resample(X_list, y_list)

        for resampled in (X_res_list, y_res_list):
            assert isinstance(resampled, list)

        assert_allclose(X_res, X_res_list)
        assert_allclose(y_res, y_res_list)


def check_samplers_multiclass_ova(name, Sampler):
Expand Down
41 changes: 41 additions & 0 deletions imblearn/utils/tests/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from imblearn.utils import check_neighbors_object
from imblearn.utils import check_sampling_strategy
from imblearn.utils import check_target_type
from imblearn.utils._validation import OutputFormater

multiclass_target = np.array([1] * 50 + [2] * 100 + [3] * 25)
binary_target = np.array([1] * 25 + [0] * 100)
Expand Down Expand Up @@ -315,3 +316,43 @@ def test_sampling_strategy_check_order(
sampling_strategy, y, sampling_type
)
assert sampling_strategy_ == expected_result


def test_output_formater_plain_list():
    # Inputs given as plain lists must come back as plain lists that
    # hold the same values as the arrays handed to ``format``.
    X = np.array([[0, 0], [1, 1]])
    y = np.array([[0, 0], [1, 1]])

    formater = OutputFormater(X.tolist(), y.tolist())
    X_res, y_res = formater.format(X, y)

    assert isinstance(X_res, list)
    assert isinstance(y_res, list)
    # the container type alone is not enough: the content must survive too
    assert X_res == X.tolist()
    assert y_res == y.tolist()


def test_output_formater_pandas():
    # pandas is optional: skip the whole test when it is not installed
    pd = pytest.importorskip("pandas")

    X = np.array([[0, 0], [1, 1]])
    y = np.array([0, 1])

    X_df = pd.DataFrame(X, columns=["a", "b"]).astype(int)
    y_df = pd.DataFrame(y, columns=["target"]).astype(int)
    y_s = pd.Series(y, name="target", dtype=int)

    # DataFrame features with a DataFrame target: both results must be
    # DataFrames carrying the original columns and dtypes
    X_res, y_res = OutputFormater(X_df, y_df).format(X, y)
    for result, reference in ((X_res, X_df), (y_res, y_df)):
        assert isinstance(result, pd.DataFrame)
        assert_array_equal(result.columns, reference.columns)
        assert_array_equal(result.dtypes, reference.dtypes)

    # DataFrame features with a Series target: the target must be a
    # Series carrying the original name and dtype
    _, y_res = OutputFormater(X_df, y_s).format(X, y)
    assert isinstance(y_res, pd.Series)
    assert_array_equal(y_res.name, y_s.name)
    assert_array_equal(y_res.dtype, y_s.dtype)