From 0ac2c92d8d103dc716f99db23bb6fae66d3d7601 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 27 Aug 2018 15:45:08 +0200
Subject: [PATCH 01/15] API: define fit_resample only without any fit
---
 doc/combine.rst | 4 +-
 doc/ensemble.rst | 4 +-
 doc/introduction.rst | 2 +-
 doc/miscellaneous.rst | 2 +-
 doc/over_sampling.rst | 10 +-
 doc/under_sampling.rst | 24 +--
 .../plot_over_sampling_benchmark_lfw.py | 2 +-
 examples/combine/plot_comparison_combine.py | 2 +-
 examples/combine/plot_smote_enn.py | 2 +-
 examples/combine/plot_smote_tomek.py | 2 +-
 examples/ensemble/plot_balance_cascade.py | 2 +-
 examples/ensemble/plot_easy_ensemble.py | 2 +-
 examples/over-sampling/plot_adasyn.py | 2 +-
 .../plot_comparison_over_sampling.py | 4 +-
 .../plot_random_over_sampling.py | 2 +-
 examples/over-sampling/plot_smote.py | 2 +-
 examples/plot_outlier_rejections.py | 2 +-
 examples/plot_sampling_strategy_usage.py | 18 +-
 .../under-sampling/plot_cluster_centroids.py | 4 +-
 .../plot_comparison_under_sampling.py | 2 +-
 .../plot_condensed_nearest_neighbour.py | 2 +-
 .../under-sampling/plot_enn_renn_allknn.py | 6 +-
 .../plot_illustration_tomek_links.py | 2 +-
 .../plot_instance_hardness_threshold.py | 2 +-
 examples/under-sampling/plot_nearmiss.py | 2 +-
 .../plot_neighbourhood_cleaning_rule.py | 2 +-
 .../plot_one_sided_selection.py | 2 +-
 .../plot_random_under_sampler.py | 2 +-
 examples/under-sampling/plot_tomek_links.py | 2 +-
 imblearn/base.py | 170 +++++-------------
 imblearn/combine/_smote_enn.py | 34 +---
 imblearn/combine/_smote_tomek.py | 35 +---
 imblearn/combine/tests/test_smote_enn.py | 14 +-
 imblearn/combine/tests/test_smote_tomek.py | 12 +-
 imblearn/datasets/_imbalance.py | 2 +-
 imblearn/ensemble/_balance_cascade.py | 35 +---
 imblearn/ensemble/_easy_ensemble.py | 6 +-
 imblearn/ensemble/base.py | 23 +--
 .../ensemble/tests/test_balance_cascade.py | 16 +-
 imblearn/ensemble/tests/test_easy_ensemble.py | 10 +-
 imblearn/keras/_generator.py | 2 +-
 imblearn/over_sampling/_adasyn.py | 4 +-
 .../over_sampling/_random_over_sampler.py | 4 +-
 imblearn/over_sampling/_smote.py | 20 ++-
 imblearn/over_sampling/tests/test_adasyn.py | 18 +-
 .../tests/test_random_over_sampler.py | 16 +-
 imblearn/over_sampling/tests/test_smote.py | 38 ++--
 imblearn/pipeline.py | 60 +++----
 imblearn/tensorflow/_generator.py | 2 +-
 imblearn/tests/test_base.py | 12 +-
 imblearn/tests/test_pipeline.py | 44 ++---
 .../_cluster_centroids.py | 4 +-
 .../tests/test_cluster_centroids.py | 30 ++--
 .../_condensed_nearest_neighbour.py | 4 +-
 .../_edited_nearest_neighbours.py | 20 +--
 .../_instance_hardness_threshold.py | 4 +-
 .../_prototype_selection/_nearmiss.py | 4 +-
 .../_neighbourhood_cleaning_rule.py | 6 +-
 .../_one_sided_selection.py | 6 +-
 .../_random_under_sampler.py | 4 +-
 .../_prototype_selection/_tomek_links.py | 4 +-
 .../_prototype_selection/tests/test_allknn.py | 24 +--
 .../tests/test_condensed_nearest_neighbour.py | 18 +-
 .../tests/test_edited_nearest_neighbours.py | 20 +--
 .../tests/test_instance_hardness_threshold.py | 20 +--
 .../tests/test_nearmiss.py | 24 +--
 .../tests/test_neighbourhood_cleaning_rule.py | 24 +--
 .../tests/test_one_sided_selection.py | 14 +-
 .../tests/test_random_under_sampler.py | 18 +-
 ...test_repeated_edited_nearest_neighbours.py | 22 +--
 .../tests/test_tomek_links.py | 10 +-
 imblearn/utils/__init__.py | 3 +-
 imblearn/utils/_validation.py | 41 +----
 imblearn/utils/estimator_checks.py | 46 +++--
 imblearn/utils/tests/test_validation.py | 30 ----
 75 files changed, 428 insertions(+), 665 deletions(-)
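A minimal usage sketch of the sampler API after this change; the dataset and
its parameters below are arbitrary illustrations, not values taken from the
test suite:

    from collections import Counter
    from sklearn.datasets import make_classification
    from imblearn.under_sampling import RandomUnderSampler

    X, y = make_classification(n_samples=1000, weights=[0.9, 0.1],
                               random_state=0)
    rus = RandomUnderSampler(random_state=0)
    # Resampling is now a single step: fit(X, y) merely delegates to
    # fit_resample(X, y) and returns self, and fit_sample is kept as a
    # backward-compatible alias of fit_resample.
    X_res, y_res = rus.fit_resample(X, y)
    print(sorted(Counter(y_res).items()))

As the diff below shows, fit_resample validates X and y and computes
sampling_strategy_ itself, so the hash_X_y bookkeeping that previously
checked that the same arrays were passed to fit and sample is removed.

diff --git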
a/doc/combine.rst b/doc/combine.rst index 165fcc7f0..c8cd21ff9 100644 --- a/doc/combine.rst +++ b/doc/combine.rst @@ -33,12 +33,12 @@ to their former samplers:: [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.combine import SMOTEENN >>> smote_enn = SMOTEENN(random_state=0) - >>> X_resampled, y_resampled = smote_enn.fit_sample(X, y) + >>> X_resampled, y_resampled = smote_enn.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4060), (1, 4381), (2, 3502)] >>> from imblearn.combine import SMOTETomek >>> smote_tomek = SMOTETomek(random_state=0) - >>> X_resampled, y_resampled = smote_tomek.fit_sample(X, y) + >>> X_resampled, y_resampled = smote_tomek.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4499), (1, 4566), (2, 4413)] diff --git a/doc/ensemble.rst b/doc/ensemble.rst index bc49fb699..814afb2c1 100644 --- a/doc/ensemble.rst +++ b/doc/ensemble.rst @@ -33,7 +33,7 @@ under-sampling the original set:: [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.ensemble import EasyEnsemble >>> ee = EasyEnsemble(random_state=0, n_subsets=10) - >>> X_resampled, y_resampled = ee.fit_sample(X, y) + >>> X_resampled, y_resampled = ee.fit_resample(X, y) >>> print(X_resampled.shape) (10, 192, 2) >>> print(sorted(Counter(y_resampled[0]).items())) @@ -55,7 +55,7 @@ parameter ``n_max_subset`` and an additional bootstraping can be activated with >>> bc = BalanceCascade(random_state=0, ... estimator=LogisticRegression(random_state=0), ... n_max_subset=4) - >>> X_resampled, y_resampled = bc.fit_sample(X, y) + >>> X_resampled, y_resampled = bc.fit_resample(X, y) >>> print(X_resampled.shape) (4, 192, 2) >>> print(sorted(Counter(y_resampled[0]).items())) diff --git a/doc/introduction.rst b/doc/introduction.rst index 24c9aca36..0c5bf7a4a 100644 --- a/doc/introduction.rst +++ b/doc/introduction.rst @@ -26,7 +26,7 @@ and adding a sampling functionality through the ``sample`` method: Fitting and sampling can also be done in one step:: - data_resampled, targets_resampled = obj.fit_sample(data, targets) + data_resampled, targets_resampled = obj.fit_resample(data, targets) Imbalanced-learn samplers accept the same inputs that in scikit-learn: diff --git a/doc/miscellaneous.rst b/doc/miscellaneous.rst index 5734f5c66..9ec380ee4 100644 --- a/doc/miscellaneous.rst +++ b/doc/miscellaneous.rst @@ -28,7 +28,7 @@ to retain the 10 first elements of the array ``X`` and ``y``:: >>> def func(X, y): ... return X[:10], y[:10] >>> sampler = FunctionSampler(func=func) - >>> X_res, y_res = sampler.fit_sample(X, y) + >>> X_res, y_res = sampler.fit_resample(X, y) >>> np.all(X_res == X[:10]) True >>> np.all(y_res == y[:10]) diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst index 296e44e4b..4e7ea4d4d 100644 --- a/doc/over_sampling.rst +++ b/doc/over_sampling.rst @@ -27,7 +27,7 @@ randomly sampling with replacement the current available samples. The ... class_sep=0.8, random_state=0) >>> from imblearn.over_sampling import RandomOverSampler >>> ros = RandomOverSampler(random_state=0) - >>> X_resampled, y_resampled = ros.fit_sample(X, y) + >>> X_resampled, y_resampled = ros.fit_resample(X, y) >>> from collections import Counter >>> print(sorted(Counter(y_resampled).items())) [(0, 4674), (1, 4674), (2, 4674)] @@ -59,7 +59,7 @@ In addition, :class:`RandomOverSampler` allows to sample heterogeneous data >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], ... 
dtype=np.object) >>> y_hetero = np.array([0, 0, 1]) - >>> X_resampled, y_resampled = ros.fit_sample(X_hetero, y_hetero) + >>> X_resampled, y_resampled = ros.fit_resample(X_hetero, y_hetero) >>> print(X_resampled) [['xxx' 1 1.0] ['yyy' 2 2.0] @@ -82,11 +82,11 @@ to over-sample minority classes: (i) the Synthetic Minority Oversampling Techniq can be used in the same manner:: >>> from imblearn.over_sampling import SMOTE, ADASYN - >>> X_resampled, y_resampled = SMOTE().fit_sample(X, y) + >>> X_resampled, y_resampled = SMOTE().fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4674), (1, 4674), (2, 4674)] >>> clf_smote = LinearSVC().fit(X_resampled, y_resampled) - >>> X_resampled, y_resampled = ADASYN().fit_sample(X, y) + >>> X_resampled, y_resampled = ADASYN().fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4673), (1, 4662), (2, 4674)] >>> clf_adasyn = LinearSVC().fit(X_resampled, y_resampled) @@ -147,7 +147,7 @@ The :class:`BorderlineSMOTE` and :class:`SVMSMOTE` offer some variant of the SMO algorithm:: >>> from imblearn.over_sampling import BorderlineSMOTE - >>> X_resampled, y_resampled = BorderlineSMOTE().fit_sample(X, y) + >>> X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4674), (1, 4674), (2, 4674)] diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst index a45375c4b..c621d40cb 100644 --- a/doc/under_sampling.rst +++ b/doc/under_sampling.rst @@ -32,7 +32,7 @@ K-means method instead of the original samples:: [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.under_sampling import ClusterCentroids >>> cc = ClusterCentroids(random_state=0) - >>> X_resampled, y_resampled = cc.fit_sample(X, y) + >>> X_resampled, y_resampled = cc.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 64), (2, 64)] @@ -82,7 +82,7 @@ randomly selecting a subset of data for the targeted classes:: >>> from imblearn.under_sampling import RandomUnderSampler >>> rus = RandomUnderSampler(random_state=0) - >>> X_resampled, y_resampled = rus.fit_sample(X, y) + >>> X_resampled, y_resampled = rus.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 64), (2, 64)] @@ -99,7 +99,7 @@ by considering independently each targeted class:: >>> print(np.vstack({tuple(row) for row in X_resampled}).shape) (192, 2) >>> rus = RandomUnderSampler(random_state=0, replacement=True) - >>> X_resampled, y_resampled = rus.fit_sample(X, y) + >>> X_resampled, y_resampled = rus.fit_resample(X, y) >>> print(np.vstack({tuple(row) for row in X_resampled}).shape) (181, 2) @@ -109,7 +109,7 @@ In addition, :class:`RandomUnderSampler` allows to sample heterogeneous data >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], ... 
dtype=np.object) >>> y_hetero = np.array([0, 0, 1]) - >>> X_resampled, y_resampled = rus.fit_sample(X_hetero, y_hetero) + >>> X_resampled, y_resampled = rus.fit_resample(X_hetero, y_hetero) >>> print(X_resampled) [['xxx' 1 1.0] ['zzz' 3 3.0]] @@ -126,7 +126,7 @@ be selected with the parameter ``version``:: >>> from imblearn.under_sampling import NearMiss >>> nm1 = NearMiss(version=1) - >>> X_resampled_nm1, y_resampled = nm1.fit_sample(X, y) + >>> X_resampled_nm1, y_resampled = nm1.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 64), (2, 64)] @@ -261,7 +261,7 @@ the sample inspected to keep it in the dataset:: [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.under_sampling import EditedNearestNeighbours >>> enn = EditedNearestNeighbours() - >>> X_resampled, y_resampled = enn.fit_sample(X, y) + >>> X_resampled, y_resampled = enn.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 213), (2, 4568)] @@ -275,7 +275,7 @@ Generally, repeating the algorithm will delete more data:: >>> from imblearn.under_sampling import RepeatedEditedNearestNeighbours >>> renn = RepeatedEditedNearestNeighbours() - >>> X_resampled, y_resampled = renn.fit_sample(X, y) + >>> X_resampled, y_resampled = renn.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 208), (2, 4551)] @@ -285,7 +285,7 @@ internal nearest neighbors algorithm is increased at each iteration:: >>> from imblearn.under_sampling import AllKNN >>> allknn = AllKNN() - >>> X_resampled, y_resampled = allknn.fit_sample(X, y) + >>> X_resampled, y_resampled = allknn.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 220), (2, 4601)] @@ -323,7 +323,7 @@ The :class:`CondensedNearestNeighbour` can be used in the following manner:: >>> from imblearn.under_sampling import CondensedNearestNeighbour >>> cnn = CondensedNearestNeighbour(random_state=0) - >>> X_resampled, y_resampled = cnn.fit_sample(X, y) + >>> X_resampled, y_resampled = cnn.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 24), (2, 115)] @@ -338,7 +338,7 @@ used as:: >>> from imblearn.under_sampling import OneSidedSelection >>> oss = OneSidedSelection(random_state=0) - >>> X_resampled, y_resampled = oss.fit_sample(X, y) + >>> X_resampled, y_resampled = oss.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 174), (2, 4403)] @@ -352,7 +352,7 @@ neighbors classifier. The class can be used as:: >>> from imblearn.under_sampling import NeighbourhoodCleaningRule >>> ncr = NeighbourhoodCleaningRule() - >>> X_resampled, y_resampled = ncr.fit_sample(X, y) + >>> X_resampled, y_resampled = ncr.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 234), (2, 4666)] @@ -380,7 +380,7 @@ removed. The class can be used as:: >>> from imblearn.under_sampling import InstanceHardnessThreshold >>> iht = InstanceHardnessThreshold(random_state=0, ... 
estimator=LogisticRegression()) - >>> X_resampled, y_resampled = iht.fit_sample(X, y) + >>> X_resampled, y_resampled = iht.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 64), (2, 64)] diff --git a/examples/applications/plot_over_sampling_benchmark_lfw.py b/examples/applications/plot_over_sampling_benchmark_lfw.py index e65e6446b..83d8c15f1 100644 --- a/examples/applications/plot_over_sampling_benchmark_lfw.py +++ b/examples/applications/plot_over_sampling_benchmark_lfw.py @@ -39,7 +39,7 @@ def sample(self, X, y): def fit(self, X, y): return self - def fit_sample(self, X, y): + def fit_resample(self, X, y): return self.sample(X, y) diff --git a/examples/combine/plot_comparison_combine.py b/examples/combine/plot_comparison_combine.py index 63180b5fa..acba13272 100644 --- a/examples/combine/plot_comparison_combine.py +++ b/examples/combine/plot_comparison_combine.py @@ -47,7 +47,7 @@ def create_dataset(n_samples=1000, weights=(0.01, 0.01, 0.98), n_classes=3, def plot_resampling(X, y, sampling, ax): - X_res, y_res = sampling.fit_sample(X, y) + X_res, y_res = sampling.fit_resample(X, y) ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor='k') # make nice plotting ax.spines['top'].set_visible(False) diff --git a/examples/combine/plot_smote_enn.py b/examples/combine/plot_smote_enn.py index b7d80e1ad..087e41aea 100644 --- a/examples/combine/plot_smote_enn.py +++ b/examples/combine/plot_smote_enn.py @@ -32,7 +32,7 @@ # Apply SMOTE + ENN sm = SMOTEENN() -X_resampled, y_resampled = sm.fit_sample(X, y) +X_resampled, y_resampled = sm.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately diff --git a/examples/combine/plot_smote_tomek.py b/examples/combine/plot_smote_tomek.py index fc2596303..42d7bd885 100644 --- a/examples/combine/plot_smote_tomek.py +++ b/examples/combine/plot_smote_tomek.py @@ -32,7 +32,7 @@ # Apply SMOTE + Tomek links sm = SMOTETomek() -X_resampled, y_resampled = sm.fit_sample(X, y) +X_resampled, y_resampled = sm.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately diff --git a/examples/ensemble/plot_balance_cascade.py b/examples/ensemble/plot_balance_cascade.py index 0999551be..4af50318f 100644 --- a/examples/ensemble/plot_balance_cascade.py +++ b/examples/ensemble/plot_balance_cascade.py @@ -32,7 +32,7 @@ # Apply Balance Cascade method bc = BalanceCascade() -X_resampled, y_resampled = bc.fit_sample(X, y) +X_resampled, y_resampled = bc.fit_resample(X, y) X_res_vis = [] for X_res in X_resampled: X_res_vis.append(pca.transform(X_res)) diff --git a/examples/ensemble/plot_easy_ensemble.py b/examples/ensemble/plot_easy_ensemble.py index eab121073..6f22dc919 100644 --- a/examples/ensemble/plot_easy_ensemble.py +++ b/examples/ensemble/plot_easy_ensemble.py @@ -32,7 +32,7 @@ # Apply Easy Ensemble ee = EasyEnsemble(n_subsets=3) -X_resampled, y_resampled = ee.fit_sample(X, y) +X_resampled, y_resampled = ee.fit_resample(X, y) X_res_vis = [] for X_res in X_resampled: X_res_vis.append(pca.transform(X_res)) diff --git a/examples/over-sampling/plot_adasyn.py b/examples/over-sampling/plot_adasyn.py index 19f234faa..c6248a9cf 100644 --- a/examples/over-sampling/plot_adasyn.py +++ b/examples/over-sampling/plot_adasyn.py @@ -33,7 +33,7 @@ # Apply the random over-sampling ada = ADASYN() -X_resampled, y_resampled = ada.fit_sample(X, y) +X_resampled, y_resampled = ada.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack 
the axes array immediately diff --git a/examples/over-sampling/plot_comparison_over_sampling.py b/examples/over-sampling/plot_comparison_over_sampling.py index 41d395594..29a7b657c 100644 --- a/examples/over-sampling/plot_comparison_over_sampling.py +++ b/examples/over-sampling/plot_comparison_over_sampling.py @@ -49,7 +49,7 @@ def create_dataset(n_samples=1000, weights=(0.01, 0.01, 0.98), n_classes=3, def plot_resampling(X, y, sampling, ax): - X_res, y_res = sampling.fit_sample(X, y) + X_res, y_res = sampling.fit_resample(X, y) ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor='k') # make nice plotting ax.spines['top'].set_visible(False) @@ -144,7 +144,7 @@ def sample(self, X, y): def _sample(self, X, y): pass - def fit_sample(self, X, y): + def fit_resample(self, X, y): return X, y diff --git a/examples/over-sampling/plot_random_over_sampling.py b/examples/over-sampling/plot_random_over_sampling.py index 5165bc52d..a01817e8a 100644 --- a/examples/over-sampling/plot_random_over_sampling.py +++ b/examples/over-sampling/plot_random_over_sampling.py @@ -32,7 +32,7 @@ # Apply the random over-sampling ros = RandomOverSampler() -X_resampled, y_resampled = ros.fit_sample(X, y) +X_resampled, y_resampled = ros.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately diff --git a/examples/over-sampling/plot_smote.py b/examples/over-sampling/plot_smote.py index 591720c2e..808382cb7 100644 --- a/examples/over-sampling/plot_smote.py +++ b/examples/over-sampling/plot_smote.py @@ -57,7 +57,7 @@ def plot_resampling(ax, X, y, title): y_resampled = [] X_res_vis = [] for method in sm: - X_res, y_res = method.fit_sample(X, y) + X_res, y_res = method.fit_resample(X, y) X_resampled.append(X_res) y_resampled.append(y_res) X_res_vis.append(pca.transform(X_res)) diff --git a/examples/plot_outlier_rejections.py b/examples/plot_outlier_rejections.py index cb740bae9..1edf1afd3 100644 --- a/examples/plot_outlier_rejections.py +++ b/examples/plot_outlier_rejections.py @@ -73,7 +73,7 @@ def outlier_rejection(X, y): reject_sampler = FunctionSampler(func=outlier_rejection) -X_inliers, y_inliers = reject_sampler.fit_sample(X_train, y_train) +X_inliers, y_inliers = reject_sampler.fit_resample(X_train, y_train) plot_scatter(X_inliers, y_inliers, 'Training data without outliers') pipe = make_pipeline(FunctionSampler(func=outlier_rejection), diff --git a/examples/plot_sampling_strategy_usage.py b/examples/plot_sampling_strategy_usage.py index 7819add0b..315328b78 100644 --- a/examples/plot_sampling_strategy_usage.py +++ b/examples/plot_sampling_strategy_usage.py @@ -87,7 +87,7 @@ def my_autopct(pct): sampling_strategy = 0.8 rus = RandomUnderSampler(sampling_strategy=sampling_strategy) -X_res, y_res = rus.fit_sample(binary_X, binary_y) +X_res, y_res = rus.fit_resample(binary_X, binary_y) print('Information of the iris data set after making it ' 'balanced using a float and an under-sampling method: \n ' 'sampling_strategy={} \n y: {}' @@ -102,7 +102,7 @@ def my_autopct(pct): # class, respectively. 
ros = RandomOverSampler(sampling_strategy=sampling_strategy) -X_res, y_res = ros.fit_sample(binary_X, binary_y) +X_res, y_res = ros.fit_resample(binary_X, binary_y) print('Information of the iris data set after making it ' 'balanced using a float and an over-sampling method: \n ' 'sampling_strategy={} \n y: {}' @@ -122,7 +122,7 @@ def my_autopct(pct): sampling_strategy = 'not minority' rus = RandomUnderSampler(sampling_strategy=sampling_strategy) -X_res, y_res = rus.fit_sample(X, y) +X_res, y_res = rus.fit_resample(X, y) print('Information of the iris data set after making it ' 'balanced by under-sampling: \n sampling_strategy={} \n y: {}' .format(sampling_strategy, Counter(y_res))) @@ -131,7 +131,7 @@ def my_autopct(pct): sampling_strategy = 'not majority' ros = RandomOverSampler(sampling_strategy=sampling_strategy) -X_res, y_res = ros.fit_sample(X, y) +X_res, y_res = ros.fit_resample(X, y) print('Information of the iris data set after making it ' 'balanced by over-sampling: \n sampling_strategy={} \n y: {}' .format(sampling_strategy, Counter(y_res))) @@ -143,7 +143,7 @@ def my_autopct(pct): sampling_strategy = 'not minority' tl = TomekLinks(sampling_strategy) -X_res, y_res = tl.fit_sample(X, y) +X_res, y_res = tl.fit_resample(X, y) print('Information of the iris data set after making it ' 'balanced by cleaning sampling: \n sampling_strategy={} \n y: {}' .format(sampling_strategy, Counter(y_res))) @@ -162,7 +162,7 @@ def my_autopct(pct): sampling_strategy = {0: 10, 1: 15, 2: 20} rus = RandomUnderSampler(sampling_strategy=sampling_strategy) -X_res, y_res = rus.fit_sample(X, y) +X_res, y_res = rus.fit_resample(X, y) print('Information of the iris data set after making it ' 'balanced by under-sampling: \n sampling_strategy={} \n y: {}' .format(sampling_strategy, Counter(y_res))) @@ -171,7 +171,7 @@ def my_autopct(pct): sampling_strategy = {0: 25, 1: 35, 2: 47} ros = RandomOverSampler(sampling_strategy=sampling_strategy) -X_res, y_res = ros.fit_sample(X, y) +X_res, y_res = ros.fit_resample(X, y) print('Information of the iris data set after making it ' 'balanced by over-sampling: \n sampling_strategy={} \n y: {}' .format(sampling_strategy, Counter(y_res))) @@ -187,7 +187,7 @@ def my_autopct(pct): sampling_strategy = [0, 1, 2] tl = TomekLinks(sampling_strategy=sampling_strategy) -X_res, y_res = tl.fit_sample(X, y) +X_res, y_res = tl.fit_resample(X, y) print('Information of the iris data set after making it ' 'balanced by cleaning sampling: \n sampling_strategy={} \n y: {}' .format(sampling_strategy, Counter(y_res))) @@ -212,7 +212,7 @@ def ratio_multiplier(y): X_res, y_res = (RandomUnderSampler(sampling_strategy=ratio_multiplier) - .fit_sample(X, y)) + .fit_resample(X, y)) print('Information of the iris data set after balancing using a callable' ' mode:\n ratio={} \n y: {}'.format(ratio_multiplier, Counter(y_res))) diff --git a/examples/under-sampling/plot_cluster_centroids.py b/examples/under-sampling/plot_cluster_centroids.py index d13b669e3..678d4613a 100644 --- a/examples/under-sampling/plot_cluster_centroids.py +++ b/examples/under-sampling/plot_cluster_centroids.py @@ -33,12 +33,12 @@ # Apply Cluster Centroids cc = ClusterCentroids() -X_resampled, y_resampled = cc.fit_sample(X, y) +X_resampled, y_resampled = cc.fit_resample(X, y) X_res_vis_soft = pca.transform(X_resampled) # Use hard voting instead of soft voting cc = ClusterCentroids(voting='hard') -X_resampled, y_resampled = cc.fit_sample(X, y) +X_resampled, y_resampled = cc.fit_resample(X, y) X_res_vis_hard = 
pca.transform(X_resampled) # Two subplots, unpack the axes array immediately diff --git a/examples/under-sampling/plot_comparison_under_sampling.py b/examples/under-sampling/plot_comparison_under_sampling.py index 3a2c427a3..a175d6193 100644 --- a/examples/under-sampling/plot_comparison_under_sampling.py +++ b/examples/under-sampling/plot_comparison_under_sampling.py @@ -53,7 +53,7 @@ def create_dataset(n_samples=1000, weights=(0.01, 0.01, 0.98), n_classes=3, def plot_resampling(X, y, sampling, ax): - X_res, y_res = sampling.fit_sample(X, y) + X_res, y_res = sampling.fit_resample(X, y) ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor='k') # make nice plotting ax.spines['top'].set_visible(False) diff --git a/examples/under-sampling/plot_condensed_nearest_neighbour.py b/examples/under-sampling/plot_condensed_nearest_neighbour.py index 1bb4597b4..632cc68bf 100644 --- a/examples/under-sampling/plot_condensed_nearest_neighbour.py +++ b/examples/under-sampling/plot_condensed_nearest_neighbour.py @@ -33,7 +33,7 @@ # Apply Condensed Nearest Neighbours cnn = CondensedNearestNeighbour(return_indices=True) -X_resampled, y_resampled, idx_resampled = cnn.fit_sample(X, y) +X_resampled, y_resampled, idx_resampled = cnn.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) fig = plt.figure() diff --git a/examples/under-sampling/plot_enn_renn_allknn.py b/examples/under-sampling/plot_enn_renn_allknn.py index e8b50dec6..66c6c3419 100644 --- a/examples/under-sampling/plot_enn_renn_allknn.py +++ b/examples/under-sampling/plot_enn_renn_allknn.py @@ -58,7 +58,7 @@ def plot_resampling(ax, X, y, title): # Apply the ENN print('ENN') enn = EditedNearestNeighbours(return_indices=True) -X_resampled, y_resampled, idx_resampled = enn.fit_sample(X, y) +X_resampled, y_resampled, idx_resampled = enn.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled) reduction_str = ('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) / @@ -72,7 +72,7 @@ def plot_resampling(ax, X, y, title): # Apply the RENN print('RENN') renn = RepeatedEditedNearestNeighbours(return_indices=True) -X_resampled, y_resampled, idx_resampled = renn.fit_sample(X, y) +X_resampled, y_resampled, idx_resampled = renn.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled) reduction_str = ('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) / @@ -86,7 +86,7 @@ def plot_resampling(ax, X, y, title): # Apply the AllKNN print('AllKNN') allknn = AllKNN(return_indices=True) -X_resampled, y_resampled, idx_resampled = allknn.fit_sample(X, y) +X_resampled, y_resampled, idx_resampled = allknn.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled) reduction_str = ('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) / diff --git a/examples/under-sampling/plot_illustration_tomek_links.py b/examples/under-sampling/plot_illustration_tomek_links.py index 3f3ff469b..ca5dd9f78 100644 --- a/examples/under-sampling/plot_illustration_tomek_links.py +++ b/examples/under-sampling/plot_illustration_tomek_links.py @@ -79,7 +79,7 @@ def make_plot_despine(ax): title_arr, [TomekLinks(sampling_strategy='auto'), TomekLinks(sampling_strategy='all')]): - X_res, y_res = sampler.fit_sample(np.vstack((X_minority, X_majority)), + X_res, y_res = sampler.fit_resample(np.vstack((X_minority, X_majority)), np.array([0] * 
X_minority.shape[0] + [1] * X_majority.shape[0])) ax.scatter(X_res[y_res == 0][:, 0], X_res[y_res == 0][:, 1], diff --git a/examples/under-sampling/plot_instance_hardness_threshold.py b/examples/under-sampling/plot_instance_hardness_threshold.py index cdea26699..8aab71899 100644 --- a/examples/under-sampling/plot_instance_hardness_threshold.py +++ b/examples/under-sampling/plot_instance_hardness_threshold.py @@ -62,7 +62,7 @@ def plot_resampling(ax, X, y, title): iht = InstanceHardnessThreshold(sampling_strategy=sampling_strategy, estimator=LogisticRegression(), return_indices=True) - X_res, y_res, idx_res = iht.fit_sample(X, y) + X_res, y_res, idx_res = iht.fit_resample(X, y) X_res_vis = pca.transform(X_res) plot_resampling(ax, X_res_vis, y_res, 'Instance Hardness Threshold ({})' diff --git a/examples/under-sampling/plot_nearmiss.py b/examples/under-sampling/plot_nearmiss.py index f1e232399..e6e7ecee8 100644 --- a/examples/under-sampling/plot_nearmiss.py +++ b/examples/under-sampling/plot_nearmiss.py @@ -57,7 +57,7 @@ def plot_resampling(ax, X, y, title): X_res_vis = [] idx_samples_removed = [] for method in nm: - X_res, y_res, idx_res = method.fit_sample(X, y) + X_res, y_res, idx_res = method.fit_resample(X, y) X_resampled.append(X_res) y_resampled.append(y_res) X_res_vis.append(pca.transform(X_res)) diff --git a/examples/under-sampling/plot_neighbourhood_cleaning_rule.py b/examples/under-sampling/plot_neighbourhood_cleaning_rule.py index ea437396c..136de12ea 100644 --- a/examples/under-sampling/plot_neighbourhood_cleaning_rule.py +++ b/examples/under-sampling/plot_neighbourhood_cleaning_rule.py @@ -33,7 +33,7 @@ # Apply neighbourhood cleaning rule ncl = NeighbourhoodCleaningRule(return_indices=True) -X_resampled, y_resampled, idx_resampled = ncl.fit_sample(X, y) +X_resampled, y_resampled, idx_resampled = ncl.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) fig = plt.figure() diff --git a/examples/under-sampling/plot_one_sided_selection.py b/examples/under-sampling/plot_one_sided_selection.py index 5d859c8b3..d361e160f 100644 --- a/examples/under-sampling/plot_one_sided_selection.py +++ b/examples/under-sampling/plot_one_sided_selection.py @@ -33,7 +33,7 @@ # Apply One-Sided Selection oss = OneSidedSelection(return_indices=True) -X_resampled, y_resampled, idx_resampled = oss.fit_sample(X, y) +X_resampled, y_resampled, idx_resampled = oss.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) fig = plt.figure() diff --git a/examples/under-sampling/plot_random_under_sampler.py b/examples/under-sampling/plot_random_under_sampler.py index f8b3f8fdb..122f3e099 100644 --- a/examples/under-sampling/plot_random_under_sampler.py +++ b/examples/under-sampling/plot_random_under_sampler.py @@ -33,7 +33,7 @@ # Apply the random under-sampling rus = RandomUnderSampler(return_indices=True) -X_resampled, y_resampled, idx_resampled = rus.fit_sample(X, y) +X_resampled, y_resampled, idx_resampled = rus.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) fig = plt.figure() diff --git a/examples/under-sampling/plot_tomek_links.py b/examples/under-sampling/plot_tomek_links.py index 39feb4aae..06265d561 100644 --- a/examples/under-sampling/plot_tomek_links.py +++ b/examples/under-sampling/plot_tomek_links.py @@ -34,7 +34,7 @@ # remove Tomek links tl = TomekLinks(return_indices=True) -X_resampled, y_resampled, idx_resampled = tl.fit_sample(X_syn, y_syn) +X_resampled, y_resampled, idx_resampled = tl.fit_resample(X_syn, y_syn) fig = plt.figure() ax = fig.add_subplot(1, 1, 1) diff --git 
a/imblearn/base.py b/imblearn/base.py index 0cb7dfecf..c5e419f40 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -16,9 +16,7 @@ from sklearn.externals import six from sklearn.preprocessing import label_binarize from sklearn.utils import check_X_y -from sklearn.utils.validation import check_is_fitted - -from .utils import check_sampling_strategy, check_target_type, hash_X_y +from .utils import check_sampling_strategy, check_target_type from .utils.deprecation import deprecate_parameter @@ -31,7 +29,11 @@ class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)): _estimator_type = 'sampler' - def sample(self, X, y): + def fit(self, X, y): + self.fit_resample(X, y) + return self + + def fit_resample(self, X, y): """Resample the dataset. Parameters @@ -44,21 +46,22 @@ def sample(self, X, y): Returns ------- - X_resampled : {ndarray, sparse matrix}, shape \ + X_resampled : {array-like, sparse matrix}, shape \ (n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : array-like, shape (n_samples_new,) The corresponding label of `X_resampled` """ - # Check the consistency of X and y + self._deprecate_ratio() + X, y, binarize_y = self._check_X_y(X, y) - check_is_fitted(self, 'sampling_strategy_') - self._check_X_y_hash(X, y) + self.sampling_strategy_ = check_sampling_strategy( + self.sampling_strategy, y, self._sampling_type) - output = self._sample(X, y) + output = self._fit_resample(X, y) if binarize_y: y_sampled = label_binarize(output[1], np.unique(y)) @@ -66,35 +69,13 @@ def sample(self, X, y): return output[0], y_sampled else: return output[0], y_sampled, output[2] - else: - return output - - def fit_sample(self, X, y): - """Fit the statistics and resample the data directly. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : array-like, shape (n_samples,) - Corresponding label for each sample in X. - - Returns - ------- - X_resampled : {array-like, sparse matrix}, shape \ -(n_samples_new, n_features) - The array containing the resampled data. - - y_resampled : array-like, shape (n_samples_new,) - The corresponding label of `X_resampled` - - """ + return output - return self.fit(X, y).sample(X, y) + # define an alias for back-compatibility + fit_sample = fit_resample @abstractmethod - def _sample(self, X, y): + def _fit_resample(self, X, y): """Base method defined in each sampler to defined the sampling strategy. @@ -118,18 +99,6 @@ def _sample(self, X, y): """ pass - def __getstate__(self): - """Prevent logger from being pickled.""" - object_dictionary = self.__dict__.copy() - del object_dictionary['logger'] - return object_dictionary - - def __setstate__(self, dict): - """Re-open the logger.""" - logger = logging.getLogger(self.__module__) - self.__dict__.update(dict) - self.logger = logger - class BaseSampler(SamplerMixin): """Base class for sampling algorithms. 
@@ -150,13 +119,6 @@ def _check_X_y(X, y): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) return X, y, binarize_y - def _check_X_y_hash(self, X, y): - """Private function to check that the X and y in fitting are the same - than in sampling.""" - X_hash, y_hash = hash_X_y(X, y) - if self.X_hash_ != X_hash or self.y_hash_ != y_hash: - raise RuntimeError("X and y need to be same array earlier fitted.") - @property def ratio_(self): # FIXME: remove in 0.6 @@ -171,38 +133,24 @@ def _deprecate_ratio(self): deprecate_parameter(self, '0.4', 'ratio', 'sampling_strategy') self.sampling_strategy = self.ratio - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : array-like, shape (n_samples,) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - self._deprecate_ratio() - X, y, _ = self._check_X_y(X, y) - self.X_hash_, self.y_hash_ = hash_X_y(X, y) - # _sampling_type is defined in the children base class - self.sampling_strategy_ = check_sampling_strategy( - self.sampling_strategy, y, self._sampling_type) + def __getstate__(self): + """Prevent logger from being pickled.""" + object_dictionary = self.__dict__.copy() + del object_dictionary['logger'] + return object_dictionary - return self + def __setstate__(self, dict): + """Re-open the logger.""" + logger = logging.getLogger(self.__module__) + self.__dict__.update(dict) + self.logger = logger def _identity(X, y): return X, y -class FunctionSampler(SamplerMixin): +class FunctionSampler(BaseSampler): """Construct a sampler from calling an arbitrary callable. Read more in the :ref:`User Guide `. @@ -241,7 +189,7 @@ class FunctionSampler(SamplerMixin): >>> def func(X, y): ... return X[:10], y[:10] >>> sampler = FunctionSampler(func=func) - >>> X_res, y_res = sampler.fit_sample(X, y) + >>> X_res, y_res = sampler.fit_resample(X, y) >>> np.all(X_res == X[:10]) True >>> np.all(y_res == y[:10]) @@ -253,67 +201,29 @@ class FunctionSampler(SamplerMixin): >>> from imblearn.under_sampling import RandomUnderSampler >>> def func(X, y, sampling_strategy, random_state): ... return RandomUnderSampler(sampling_strategy=sampling_strategy, - ... random_state=random_state).fit_sample(X, y) + ... random_state=random_state).fit_resample(X, y) >>> sampler = FunctionSampler(func=func, ... kw_args={'sampling_strategy': 'auto', ... 'random_state': 0}) - >>> X_res, y_res = sampler.fit_sample(X, y) + >>> X_res, y_res = sampler.fit_resample(X, y) >>> print('Resampled dataset shape {}'.format( ... sorted(Counter(y_res).items()))) Resampled dataset shape [(0, 100), (1, 100)] """ + _sampling_type = 'bypass' + def __init__(self, func=None, accept_sparse=True, kw_args=None): + super(FunctionSampler, self).__init__() self.func = func self.accept_sparse = accept_sparse self.kw_args = kw_args self.logger = logging.getLogger(__name__) - def fit(self, X, y): - y = check_target_type(y) - X, y = check_X_y( - X, - y, - accept_sparse=['csr', 'csc'] if self.accept_sparse else False) - self.X_hash_, self.y_hash_ = hash_X_y(X, y) - # when using a sampler, ratio_ is supposed to exist after fit - self.sampling_strategy_ = 'is_fitted' - - return self - - @property - def ratio_(self): - # FIXME: remove in 0.6 - warnings.warn("'ratio' and 'ratio_' are deprecated. 
Use " - "'sampling_strategy' and 'sampling_strategy_' instead.", - DeprecationWarning) - return self.sampling_strategy_ - - def _sample(self, X, y, func=None, kw_args=None): - y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X, y = check_X_y( - X, - y, - accept_sparse=['csr', 'csc'] if self.accept_sparse else False) - check_is_fitted(self, 'sampling_strategy_') - X_hash, y_hash = hash_X_y(X, y) - if self.X_hash_ != X_hash or self.y_hash_ != y_hash: - raise RuntimeError("X and y need to be same array earlier fitted.") - - if func is None: - func = _identity - - output = func(X, y, **(kw_args if self.kw_args else {})) - - if binarize_y: - y_sampled = label_binarize(output[1], np.unique(y)) - if len(output) == 2: - return output[0], y_sampled - else: - return output[0], y_sampled, output[2] - else: - return output - - def sample(self, X, y): - return self._sample(X, y, func=self.func, kw_args=self.kw_args) + def _fit_resample(self, X, y): + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'] + if self.accept_sparse else False) + func = _identity if self.func is None else self.func + output = func(X, y, **(self.kw_args if self.kw_args else {})) + return output diff --git a/imblearn/combine/_smote_enn.py b/imblearn/combine/_smote_enn.py index 4618264c5..4bce0052f 100644 --- a/imblearn/combine/_smote_enn.py +++ b/imblearn/combine/_smote_enn.py @@ -15,7 +15,7 @@ from ..over_sampling import SMOTE from ..over_sampling.base import BaseOverSampler from ..under_sampling import EditedNearestNeighbours -from ..utils import check_target_type, hash_X_y +from ..utils import check_target_type from ..utils import Substitution from ..utils._docstring import _random_state_docstring @@ -79,11 +79,12 @@ class SMOTEENN(BaseSampler): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> sme = SMOTEENN(random_state=42) - >>> X_res, y_res = sme.fit_sample(X, y) + >>> X_res, y_res = sme.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 900, 1: 881}}) """ + _sampling_type = 'over-sampling' def __init__(self, sampling_strategy='auto', @@ -124,32 +125,11 @@ def _validate_estimator(self): else: self.enn_ = EditedNearestNeighbours(sampling_strategy='all') - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : array-like, shape (n_samples,) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. 
- - """ + def _fit_resample(self, X, y): + self._validate_estimator() y = check_target_type(y) X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) self.sampling_strategy_ = self.sampling_strategy - self.X_hash_, self.y_hash_ = hash_X_y(X, y) - - return self - - def _sample(self, X, y): - self._validate_estimator() - X_res, y_res = self.smote_.fit_sample(X, y) - return self.enn_.fit_sample(X_res, y_res) + X_res, y_res = self.smote_.fit_resample(X, y) + return self.enn_.fit_resample(X_res, y_res) diff --git a/imblearn/combine/_smote_tomek.py b/imblearn/combine/_smote_tomek.py index ce6143ab4..43340e91d 100644 --- a/imblearn/combine/_smote_tomek.py +++ b/imblearn/combine/_smote_tomek.py @@ -16,7 +16,7 @@ from ..over_sampling import SMOTE from ..over_sampling.base import BaseOverSampler from ..under_sampling import TomekLinks -from ..utils import check_target_type, hash_X_y +from ..utils import check_target_type from ..utils import Substitution from ..utils._docstring import _random_state_docstring @@ -86,12 +86,14 @@ class SMOTETomek(BaseSampler): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> smt = SMOTETomek(random_state=42) - >>> X_res, y_res = smt.fit_sample(X, y) + >>> X_res, y_res = smt.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 900, 1: 900}}) """ + _sampling_type = 'over-sampling' + def __init__(self, sampling_strategy='auto', random_state=None, @@ -132,32 +134,11 @@ def _validate_estimator(self): else: self.tomek_ = TomekLinks(sampling_strategy='all') - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : array-like, shape (n_samples,) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. 
- - """ + def _fit_resample(self, X, y): + self._validate_estimator() y = check_target_type(y) X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) self.sampling_strategy_ = self.sampling_strategy - self.X_hash_, self.y_hash_ = hash_X_y(X, y) - - return self - - def _sample(self, X, y): - self._validate_estimator() - X_res, y_res = self.smote_.fit_sample(X, y) - return self.tomek_.fit_sample(X_res, y_res) + X_res, y_res = self.smote_.fit_resample(X, y) + return self.tomek_.fit_resample(X_res, y_res) diff --git a/imblearn/combine/tests/test_smote_enn.py b/imblearn/combine/tests/test_smote_enn.py index 793a7a967..4cd921868 100644 --- a/imblearn/combine/tests/test_smote_enn.py +++ b/imblearn/combine/tests/test_smote_enn.py @@ -34,7 +34,7 @@ def test_sample_regular(): smote = SMOTEENN(random_state=RND_SEED) - X_resampled, y_resampled = smote.fit_sample(X, Y) + X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176], [ 0.61319159, -0.11571667 @@ -51,7 +51,7 @@ def test_sample_regular_pass_smote_enn(): enn=EditedNearestNeighbours( sampling_strategy='all', random_state=RND_SEED), random_state=RND_SEED) - X_resampled, y_resampled = smote.fit_sample(X, Y) + X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176], [ 0.61319159, -0.11571667 @@ -66,7 +66,7 @@ def test_sample_regular_half(): sampling_strategy = {0: 10, 1: 12} smote = SMOTEENN( sampling_strategy=sampling_strategy, random_state=RND_SEED) - X_resampled, y_resampled = smote.fit_sample(X, Y) + X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array([[1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.08711622, 0.93259929]]) @@ -80,7 +80,7 @@ def test_validate_estimator_init(): enn = EditedNearestNeighbours( random_state=RND_SEED, sampling_strategy='all') smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED) - X_resampled, y_resampled = smt.fit_sample(X, Y) + X_resampled, y_resampled = smt.fit_resample(X, Y) X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176], [ 0.61319159, -0.11571667 ], [0.66052536, -0.28246518], [-0.28162401, -2.10400981], @@ -92,7 +92,7 @@ def test_validate_estimator_init(): def test_validate_estimator_default(): smt = SMOTEENN(random_state=RND_SEED) - X_resampled, y_resampled = smt.fit_sample(X, Y) + X_resampled, y_resampled = smt.fit_resample(X, Y) X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176], [ 0.61319159, -0.11571667 ], [0.66052536, -0.28246518], [-0.28162401, -2.10400981], @@ -107,7 +107,7 @@ def test_error_wrong_object(): enn = 'rnd' smt = SMOTEENN(smote=smote, random_state=RND_SEED) with raises(ValueError, match="smote needs to be a SMOTE"): - smt.fit_sample(X, Y) + smt.fit_resample(X, Y) smt = SMOTEENN(enn=enn, random_state=RND_SEED) with raises(ValueError, match="enn needs to be an "): - smt.fit_sample(X, Y) + smt.fit_resample(X, Y) diff --git a/imblearn/combine/tests/test_smote_tomek.py b/imblearn/combine/tests/test_smote_tomek.py index 362653bd5..06a519c51 100644 --- a/imblearn/combine/tests/test_smote_tomek.py +++ b/imblearn/combine/tests/test_smote_tomek.py @@ -34,7 +34,7 @@ def test_sample_regular(): smote = SMOTETomek(random_state=RND_SEED) - X_resampled, y_resampled = smote.fit_sample(X, Y) + X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array([[0.68481731, 0.51935141], [1.34192108, -0.13367336], [ 0.62366841, -0.21312976 ], [1.61091956, -0.40283504], [-0.37162401, @@ 
-54,7 +54,7 @@ def test_sample_regular_half(): sampling_strategy = {0: 9, 1: 12} smote = SMOTETomek( sampling_strategy=sampling_strategy, random_state=RND_SEED) - X_resampled, y_resampled = smote.fit_sample(X, Y) + X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array([[0.68481731, 0.51935141], [0.62366841, -0.21312976], [ 1.61091956, -0.40283504 ], [-0.37162401, -2.19400981], [0.74680821, @@ -72,7 +72,7 @@ def test_validate_estimator_init(): smote = SMOTE(random_state=RND_SEED) tomek = TomekLinks(random_state=RND_SEED, sampling_strategy='all') smt = SMOTETomek(smote=smote, tomek=tomek, random_state=RND_SEED) - X_resampled, y_resampled = smt.fit_sample(X, Y) + X_resampled, y_resampled = smt.fit_resample(X, Y) X_gt = np.array([[0.68481731, 0.51935141], [1.34192108, -0.13367336], [ 0.62366841, -0.21312976 ], [1.61091956, -0.40283504], [-0.37162401, @@ -90,7 +90,7 @@ def test_validate_estimator_init(): def test_validate_estimator_default(): smt = SMOTETomek(random_state=RND_SEED) - X_resampled, y_resampled = smt.fit_sample(X, Y) + X_resampled, y_resampled = smt.fit_resample(X, Y) X_gt = np.array([[0.68481731, 0.51935141], [1.34192108, -0.13367336], [ 0.62366841, -0.21312976 ], [1.61091956, -0.40283504], [-0.37162401, @@ -111,7 +111,7 @@ def test_error_wrong_object(): tomek = 'rnd' smt = SMOTETomek(smote=smote, random_state=RND_SEED) with raises(ValueError, match="smote needs to be a SMOTE"): - smt.fit_sample(X, Y) + smt.fit_resample(X, Y) smt = SMOTETomek(tomek=tomek, random_state=RND_SEED) with raises(ValueError, match="tomek needs to be a TomekLinks"): - smt.fit_sample(X, Y) + smt.fit_resample(X, Y) diff --git a/imblearn/datasets/_imbalance.py b/imblearn/datasets/_imbalance.py index 332065a66..6634febb4 100644 --- a/imblearn/datasets/_imbalance.py +++ b/imblearn/datasets/_imbalance.py @@ -121,7 +121,7 @@ def make_imbalance(X, sampling_strategy=sampling_strategy_, replacement=False, random_state=random_state) - X_resampled, y_resampled = rus.fit_sample(X, y) + X_resampled, y_resampled = rus.fit_resample(X, y) LOGGER.info('Make the dataset imbalanced: %s', Counter(y_resampled)) return X_resampled, y_resampled diff --git a/imblearn/ensemble/_balance_cascade.py b/imblearn/ensemble/_balance_cascade.py index 7532446bf..753876c8f 100644 --- a/imblearn/ensemble/_balance_cascade.py +++ b/imblearn/ensemble/_balance_cascade.py @@ -12,6 +12,7 @@ from sklearn.neighbors import KNeighborsClassifier from sklearn.utils import check_random_state, safe_indexing from sklearn.model_selection import cross_val_predict +from sklearn.utils import check_X_y from .base import BaseEnsembleSampler from ..under_sampling.base import BaseUnderSampler @@ -92,7 +93,7 @@ class BalanceCascade(BaseEnsembleSampler): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> bc = BalanceCascade(random_state=42) - >>> X_res, y_res = bc.fit_sample(X, y) + >>> X_res, y_res = bc.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res[0])) \ # doctest: +ELLIPSIS Resampled dataset shape Counter({{...}}) @@ -113,29 +114,6 @@ def __init__(self, self.estimator = estimator self.n_max_subset = n_max_subset - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : array-like, shape (n_samples,) - Corresponding label for each sample in X. 
- - Returns - ------- - self : object, - Return self. - - """ - super(BalanceCascade, self).fit(X, y) - y = check_target_type(y) - self.sampling_strategy_ = check_sampling_strategy( - self.sampling_strategy, y, 'under-sampling') - return self - def _validate_estimator(self): """Private function to create the classifier""" @@ -151,9 +129,12 @@ def _validate_estimator(self): self.logger.debug(self.estimator_) - def _sample(self, X, y): + def _fit_resample(self, X, y): self._validate_estimator() + self.sampling_strategy_ = check_sampling_strategy( + self.sampling_strategy, y, 'under-sampling') + random_state = check_random_state(self.random_state) # array to know which samples are available to be taken @@ -168,9 +149,9 @@ def _sample(self, X, y): target_stats = Counter( safe_indexing(y, np.flatnonzero(samples_mask))) # store the index of the data to under-sample - index_under_sample = np.empty((0, ), dtype=y.dtype) + index_under_sample = np.empty((0, ), dtype=np.int) # value which will be picked at each round - index_constant = np.empty((0, ), dtype=y.dtype) + index_constant = np.empty((0, ), dtype=np.int) for target_class in target_stats.keys(): if target_class in self.sampling_strategy_.keys(): n_samples = self.sampling_strategy_[target_class] diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py index 7dba2553d..1a343d05c 100644 --- a/imblearn/ensemble/_easy_ensemble.py +++ b/imblearn/ensemble/_easy_ensemble.py @@ -94,7 +94,7 @@ class EasyEnsemble(BaseEnsembleSampler): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> ee = EasyEnsemble(random_state=42) - >>> X_res, y_res = ee.fit_sample(X, y) + >>> X_res, y_res = ee.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res[0])) Resampled dataset shape Counter({{0: 100, 1: 100}}) @@ -114,7 +114,7 @@ def __init__(self, self.replacement = replacement self.n_subsets = n_subsets - def _sample(self, X, y): + def _fit_resample(self, X, y): random_state = check_random_state(self.random_state) X_resampled = [] @@ -128,7 +128,7 @@ def _sample(self, X, y): return_indices=True, random_state=random_state.randint(MAX_INT), replacement=self.replacement) - sel_x, sel_y, sel_idx = rus.fit_sample(X, y) + sel_x, sel_y, sel_idx = rus.fit_resample(X, y) X_resampled.append(sel_x) y_resampled.append(sel_y) if self.return_indices: diff --git a/imblearn/ensemble/base.py b/imblearn/ensemble/base.py index ed012d9db..0563124bf 100644 --- a/imblearn/ensemble/base.py +++ b/imblearn/ensemble/base.py @@ -4,16 +4,14 @@ # Authors: Guillaume Lemaitre # License: MIT -import warnings - import numpy as np from sklearn.preprocessing import label_binarize from sklearn.utils import check_X_y -from sklearn.utils.validation import check_is_fitted from ..base import BaseSampler from ..utils import check_target_type +from ..utils import check_sampling_strategy class BaseEnsembleSampler(BaseSampler): @@ -25,15 +23,7 @@ class BaseEnsembleSampler(BaseSampler): _sampling_type = 'ensemble' - @property - def ratio_(self): - warnings.warn( - "'ratio' and 'ratio_' are deprecated. " - "Use 'sampling_strategy' and 'sampling_strategy_' instead.", - DeprecationWarning) - return self.sampling_strategy_ - - def sample(self, X, y): + def fit_resample(self, X, y): """Resample the dataset. Parameters @@ -56,13 +46,12 @@ def sample(self, X, y): """ # Ensemble are a bit specific since they are returning an array of # resampled arrays. 
- y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y, binarize_y = self._check_X_y(X, y) - check_is_fitted(self, 'sampling_strategy_') - self._check_X_y_hash(X, y) + self.sampling_strategy_ = check_sampling_strategy( + self.sampling_strategy, y, self._sampling_type) - output = self._sample(X, y) + output = self._fit_resample(X, y) if binarize_y: y_resampled = output[1] diff --git a/imblearn/ensemble/tests/test_balance_cascade.py b/imblearn/ensemble/tests/test_balance_cascade.py index a56d2a607..3ffc71672 100644 --- a/imblearn/ensemble/tests/test_balance_cascade.py +++ b/imblearn/ensemble/tests/test_balance_cascade.py @@ -32,13 +32,13 @@ Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) -def test_fit_sample_auto(): +def test_fit_resample_auto(): sampling_strategy = 'auto' bc = BalanceCascade( sampling_strategy=sampling_strategy, random_state=RND_SEED, return_indices=True) - X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) + X_resampled, y_resampled, idx_under = bc.fit_resample(X, Y) X_gt = np.array( [[[1.15514042, 0.0129463], [0.08711622, 0.93259929], [0.70472253, @@ -70,11 +70,11 @@ def test_fit_sample_auto(): assert_array_equal(idx_under, idx_gt) -def test_fit_sample_half(): +def test_fit_resample_half(): sampling_strategy = {0: 8, 1: 10} bc = BalanceCascade( sampling_strategy=sampling_strategy, random_state=RND_SEED) - X_resampled, y_resampled = bc.fit_sample(X, Y) + X_resampled, y_resampled = bc.fit_resample(X, Y) X_gt = np.array([[[-0.41635887, -0.38299653], [0.53366841, -0.30312976], [ 1.25192108, -0.22367336 ], [1.70580611, -0.11219234], [1.52091956, -0.49283504], [ @@ -91,7 +91,7 @@ def test_fit_sample_half(): assert_array_equal(y_resampled, y_gt) -def test_fit_sample_auto_early_stop(): +def test_fit_resample_auto_early_stop(): sampling_strategy = 'auto' estimator = LinearSVC(random_state=RND_SEED) bc = BalanceCascade( @@ -100,7 +100,7 @@ def test_fit_sample_auto_early_stop(): return_indices=False, estimator=estimator, n_max_subset=1) - X_resampled, y_resampled = bc.fit_sample(X, Y) + X_resampled, y_resampled = bc.fit_resample(X, Y) X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929], [ 0.70472253, -0.73309052 ], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [ @@ -124,7 +124,7 @@ def test_give_classifier_obj(): random_state=RND_SEED, return_indices=False, estimator=estimator) - X_resampled, y_resampled = bc.fit_sample(X, Y) + X_resampled, y_resampled = bc.fit_resample(X, Y) X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929], [ 0.70472253, -0.73309052 ], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [ @@ -149,4 +149,4 @@ def test_give_classifier_wrong_obj(): return_indices=True, estimator=classifier) with raises(ValueError, match="Invalid parameter `estimator`"): - bc.fit_sample(X, Y) + bc.fit_resample(X, Y) diff --git a/imblearn/ensemble/tests/test_easy_ensemble.py b/imblearn/ensemble/tests/test_easy_ensemble.py index b3ac7eaea..06599e473 100644 --- a/imblearn/ensemble/tests/test_easy_ensemble.py +++ b/imblearn/ensemble/tests/test_easy_ensemble.py @@ -46,7 +46,7 @@ def test_ee_init(): @pytest.mark.filterwarnings('ignore:Class EasyEnsemble is deprecated') -def test_fit_sample_auto(): +def test_fit_resample_auto(): # Define the sampling_strategy parameter sampling_strategy = 'auto' @@ -58,7 +58,7 @@ def test_fit_sample_auto(): n_subsets=3) # Get the different subset - X_resampled, y_resampled, idx_under = ee.fit_sample(X, Y) + 
X_resampled, y_resampled, idx_under = ee.fit_resample(X, Y) X_gt = np.array([[[0.85117925, 1.0185556], [-0.58539673, 0.62515052], [1.35269503, 0.44812421], [0.5220963, 0.11349303], @@ -79,7 +79,7 @@ def test_fit_sample_auto(): @pytest.mark.filterwarnings('ignore:Class EasyEnsemble is deprecated') -def test_fit_sample_half(): +def test_fit_resample_half(): # Define the sampling_strategy parameter sampling_strategy = {0: 2, 1: 3, 2: 3} @@ -90,7 +90,7 @@ def test_fit_sample_half(): n_subsets=3) # Get the different subset - X_resampled, y_resampled = ee.fit_sample(X, Y) + X_resampled, y_resampled = ee.fit_resample(X, Y) X_gt = np.array([[[-0.58539673, 0.62515052], [0.85117925, 1.0185556], [1.35269503, 0.44812421], [-1.23195149, 0.15427291], @@ -119,7 +119,7 @@ def test_random_state_none(): ee = EasyEnsemble(sampling_strategy=sampling_strategy, random_state=None) # Get the different subset - X_resampled, y_resampled = ee.fit_sample(X, Y) + X_resampled, y_resampled = ee.fit_resample(X, Y) @pytest.mark.parametrize("n_estimators", [10, 20]) diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py index 2ccff0759..a92e8ea9d 100644 --- a/imblearn/keras/_generator.py +++ b/imblearn/keras/_generator.py @@ -124,7 +124,7 @@ def _sample(self): self.sampler_.set_params(return_indices=True) set_random_state(self.sampler_, random_state) - _, _, self.indices_ = self.sampler_.fit_sample(self.X, self.y) + _, _, self.indices_ = self.sampler_.fit_resample(self.X, self.y) # shuffle the indices since the sampler are packing them by class random_state.shuffle(self.indices_) diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py index 9ee6bdd04..00e3c57a4 100644 --- a/imblearn/over_sampling/_adasyn.py +++ b/imblearn/over_sampling/_adasyn.py @@ -82,7 +82,7 @@ class ADASYN(BaseOverSampler): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> ada = ADASYN(random_state=42) - >>> X_res, y_res = ada.fit_sample(X, y) + >>> X_res, y_res = ada.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 904, 1: 900}}) @@ -106,7 +106,7 @@ def _validate_estimator(self): 'n_neighbors', self.n_neighbors, additional_neighbor=1) self.nn_.set_params(**{'n_jobs': self.n_jobs}) - def _sample(self, X, y): + def _fit_resample(self, X, y): self._validate_estimator() random_state = check_random_state(self.random_state) diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index 8d070f4c6..b01a95a8e 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -67,7 +67,7 @@ class RandomOverSampler(BaseOverSampler): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> ros = RandomOverSampler(random_state=42) - >>> X_res, y_res = ros.fit_sample(X, y) + >>> X_res, y_res = ros.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 900, 1: 900}}) @@ -88,7 +88,7 @@ def _check_X_y(X, y): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None) return X, y, binarize_y - def _sample(self, X, y): + def _fit_resample(self, X, y): random_state = check_random_state(self.random_state) target_stats = Counter(y) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 189fc56bd..3675f9af7 100644 --- a/imblearn/over_sampling/_smote.py +++ 
b/imblearn/over_sampling/_smote.py @@ -250,7 +250,7 @@ class BorderlineSMOTE(BaseSMOTE): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> sm = BorderlineSMOTE(random_state=42) - >>> X_res, y_res = sm.fit_sample(X, y) + >>> X_res, y_res = sm.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 900, 1: 900}}) @@ -279,6 +279,10 @@ def _validate_estimator(self): '"borderline-1" and "borderline-2".' 'Got {} instead.'.format(self.kind)) + # FIXME: rename _sample -> _fit_resample in 0.6 + def _fit_resample(self, X, y): + return self._sample(X, y) + def _sample(self, X, y): self._validate_estimator() @@ -426,7 +430,7 @@ class SVMSMOTE(BaseSMOTE): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> sm = SVMSMOTE(random_state=42) - >>> X_res, y_res = sm.fit_sample(X, y) + >>> X_res, y_res = sm.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 900, 1: 900}}) @@ -461,6 +465,10 @@ def _validate_estimator(self): raise_isinstance_error('svm_estimator', [SVC], self.svm_estimator) + # FIXME: rename _sample -> _fit_resample in 0.6 + def _fit_resample(self, X, y): + return self._sample(X, y) + def _sample(self, X, y): self._validate_estimator() random_state = check_random_state(self.random_state) @@ -645,7 +653,7 @@ class SMOTE(SVMSMOTE, BorderlineSMOTE): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> sm = SMOTE(random_state=42) - >>> X_res, y_res = sm.fit_sample(X, y) + >>> X_res, y_res = sm.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 900, 1: 900}}) @@ -727,10 +735,9 @@ def _validate_estimator(self): self.nn_m_.set_params(**{'n_jobs': self.n_jobs}) # FIXME: to be removed in 0.6 - def fit(self, X, y): + def _fit_resample(self, X, y): self._validate_estimator() - BaseSMOTE.fit(self, X, y) - return self + return self._sample(X, y) def _sample(self, X, y): # FIXME: uncomment in version 0.6 diff --git a/imblearn/over_sampling/tests/test_adasyn.py b/imblearn/over_sampling/tests/test_adasyn.py index 8534e53ca..94223f9bc 100644 --- a/imblearn/over_sampling/tests/test_adasyn.py +++ b/imblearn/over_sampling/tests/test_adasyn.py @@ -37,15 +37,9 @@ def test_ada_init(): assert ada.random_state == RND_SEED -def test_ada_fit(): +def test_ada_fit_resample(): ada = ADASYN(random_state=RND_SEED) - ada.fit(X, Y) - assert ada.sampling_strategy_ == {0: 4} - - -def test_ada_fit_sample(): - ada = ADASYN(random_state=RND_SEED) - X_resampled, y_resampled = ada.fit_sample(X, Y) + X_resampled, y_resampled = ada.fit_resample(X, Y) X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ 1.25192108, -0.22367336 ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ @@ -72,13 +66,13 @@ def test_ada_fit_sampling_strategy_error(): sampling_strategy = {0: 9, 1: 12} ada = ADASYN(sampling_strategy=sampling_strategy, random_state=RND_SEED) with raises(ValueError, match="No samples will be generated."): - ada.fit_sample(X, Y) + ada.fit_resample(X, Y) -def test_ada_fit_sample_nn_obj(): +def test_ada_fit_resample_nn_obj(): nn = NearestNeighbors(n_neighbors=6) ada = ADASYN(random_state=RND_SEED, n_neighbors=nn) - X_resampled, y_resampled = ada.fit_sample(X, Y) + X_resampled, y_resampled = ada.fit_resample(X, Y) X_gt = np.array([[0.11622591,
-0.0317206], [0.77481731, 0.60935141], [ 1.25192108, -0.22367336 ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ @@ -105,4 +99,4 @@ def test_ada_wrong_nn_obj(): nn = 'rnd' ada = ADASYN(random_state=RND_SEED, n_neighbors=nn) with raises(ValueError, match="has to be one of"): - ada.fit_sample(X, Y) + ada.fit_resample(X, Y) diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index c9bd37a42..484f6110a 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -29,9 +29,9 @@ def test_ros_init(): assert ros.random_state == RND_SEED -def test_ros_fit_sample(): +def test_ros_fit_resample(): ros = RandomOverSampler(random_state=RND_SEED) - X_resampled, y_resampled = ros.fit_sample(X, Y) + X_resampled, y_resampled = ros.fit_resample(X, Y) X_gt = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], [ 0.20792588, 1.49407907 ], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [ @@ -45,11 +45,11 @@ def test_ros_fit_sample(): assert_array_equal(y_resampled, y_gt) -def test_ros_fit_sample_half(): +def test_ros_fit_resample_half(): sampling_strategy = {0: 3, 1: 7} ros = RandomOverSampler( sampling_strategy=sampling_strategy, random_state=RND_SEED) - X_resampled, y_resampled = ros.fit_sample(X, Y) + X_resampled, y_resampled = ros.fit_resample(X, Y) X_gt = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], [ 0.20792588, 1.49407907 ], [0.47104475, 0.44386323], [0.22950086, @@ -63,7 +63,7 @@ def test_ros_fit_sample_half(): def test_random_over_sampling_return_indices(): ros = RandomOverSampler(return_indices=True, random_state=RND_SEED) - X_resampled, y_resampled, sample_indices = ros.fit_sample(X, Y) + X_resampled, y_resampled, sample_indices = ros.fit_resample(X, Y) X_gt = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], [ 0.20792588, 1.49407907 ], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [ @@ -78,12 +78,12 @@ def test_random_over_sampling_return_indices(): assert_array_equal(np.sort(np.unique(sample_indices)), np.arange(len(X))) -def test_multiclass_fit_sample(): +def test_multiclass_fit_resample(): y = Y.copy() y[5] = 2 y[6] = 2 ros = RandomOverSampler(random_state=RND_SEED) - X_resampled, y_resampled = ros.fit_sample(X, y) + X_resampled, y_resampled = ros.fit_resample(X, y) count_y_res = Counter(y_resampled) assert count_y_res[0] == 5 assert count_y_res[1] == 5 @@ -95,7 +95,7 @@ def test_random_over_sampling_heterogeneous_data(): dtype=np.object) y = np.array([0, 0, 1]) ros = RandomOverSampler(random_state=RND_SEED) - X_res, y_res = ros.fit_sample(X_hetero, y) + X_res, y_res = ros.fit_resample(X_hetero, y) assert X_res.shape[0] == 4 assert y_res.shape[0] == 4 diff --git a/imblearn/over_sampling/tests/test_smote.py b/imblearn/over_sampling/tests/test_smote.py index 5e5a22800..4f42795df 100644 --- a/imblearn/over_sampling/tests/test_smote.py +++ b/imblearn/over_sampling/tests/test_smote.py @@ -35,12 +35,12 @@ def test_smote_wrong_kind(): kind = 'rnd' smote = SMOTE(kind=kind, random_state=RND_SEED) with pytest.raises(ValueError, match="Unknown kind for SMOTE"): - smote.fit_sample(X, Y) + smote.fit_resample(X, Y) def test_sample_regular(): smote = SMOTE(random_state=RND_SEED) - X_resampled, y_resampled = smote.fit_sample(X, Y) + X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ 1.25192108, -0.22367336 ], [0.53366841, 
-0.30312976], [1.52091956, -0.49283504], [ @@ -67,7 +67,7 @@ def test_sample_regular_half(): sampling_strategy = {0: 9, 1: 12} smote = SMOTE( sampling_strategy=sampling_strategy, random_state=RND_SEED) - X_resampled, y_resampled = smote.fit_sample(X, Y) + X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ 1.25192108, -0.22367336 ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ @@ -91,7 +91,7 @@ def test_sample_regular_half(): def test_sample_borderline1(): kind = 'borderline1' smote = SMOTE(random_state=RND_SEED, kind=kind) - X_resampled, y_resampled = smote.fit_sample(X, Y) + X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ 1.25192108, -0.22367336 ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ @@ -118,7 +118,7 @@ def test_sample_borderline1(): def test_sample_borderline2(): kind = 'borderline2' smote = SMOTE(random_state=RND_SEED, kind=kind) - X_resampled, y_resampled = smote.fit_sample(X, Y) + X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ 1.25192108, -0.22367336 ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ @@ -146,7 +146,7 @@ def test_sample_borderline2(): def test_sample_svm(): kind = 'svm' smote = SMOTE(random_state=RND_SEED, kind=kind) - X_resampled, y_resampled = smote.fit_sample(X, Y) + X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], @@ -179,13 +179,13 @@ def test_sample_svm(): @pytest.mark.filterwarnings('ignore:"kind" is deprecated in 0.4 and will be') @pytest.mark.filterwarnings('ignore:"m_neighbors" is deprecated in 0.4 and') -def test_fit_sample_nn_obj(): +def test_fit_resample_nn_obj(): kind = 'borderline1' nn_m = NearestNeighbors(n_neighbors=11) nn_k = NearestNeighbors(n_neighbors=6) smote = SMOTE( random_state=RND_SEED, kind=kind, k_neighbors=nn_k, m_neighbors=nn_m) - X_resampled, y_resampled = smote.fit_sample(X, Y) + X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ 1.25192108, -0.22367336 ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ @@ -211,7 +211,7 @@ def test_fit_sample_nn_obj(): def test_sample_regular_with_nn(): nn_k = NearestNeighbors(n_neighbors=6) smote = SMOTE(random_state=RND_SEED, k_neighbors=nn_k) - X_resampled, y_resampled = smote.fit_sample(X, Y) + X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ 1.25192108, -0.22367336 ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ @@ -243,18 +243,18 @@ def test_wrong_nn(): smote = SMOTE( random_state=RND_SEED, kind=kind, k_neighbors=nn_k, m_neighbors=nn_m) with pytest.raises(ValueError, match="has to be one of"): - smote.fit_sample(X, Y) + smote.fit_resample(X, Y) nn_k = 'rnd' nn_m = NearestNeighbors(n_neighbors=10) smote = SMOTE( random_state=RND_SEED, kind=kind, k_neighbors=nn_k, m_neighbors=nn_m) with pytest.raises(ValueError, match="has to be one of"): - smote.fit_sample(X, Y) + smote.fit_resample(X, Y) kind = 'regular' nn_k = 'rnd' smote = SMOTE(random_state=RND_SEED, kind=kind, k_neighbors=nn_k) with pytest.raises(ValueError, match="has to be one of"): - smote.fit_sample(X, Y) + smote.fit_resample(X, Y) @pytest.mark.filterwarnings('ignore:"kind" is deprecated in 0.4 and will be') @@ -267,7 +267,7 
+267,7 @@ def test_sample_with_nn_svm(): svm = SVC(random_state=RND_SEED) smote = SMOTE( random_state=RND_SEED, kind=kind, k_neighbors=nn_k, svm_estimator=svm) - X_resampled, y_resampled = smote.fit_sample(X, Y) + X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], @@ -308,13 +308,13 @@ def test_sample_regular_wrong_svm(): random_state=RND_SEED, kind=kind, k_neighbors=nn_k, svm_estimator=svm) with pytest.raises(ValueError, match="has to be one of"): - smote.fit_sample(X, Y) + smote.fit_resample(X, Y) def test_borderline_smote_wrong_kind(): bsmote = BorderlineSMOTE(kind='rand') with pytest.raises(ValueError, match='The possible "kind" of algorithm'): - bsmote.fit_sample(X, Y) + bsmote.fit_resample(X, Y) @pytest.mark.parametrize('kind', ['borderline-1', 'borderline-2']) @@ -324,8 +324,8 @@ def test_borderline_smote(kind): k_neighbors=NearestNeighbors(n_neighbors=6), m_neighbors=NearestNeighbors(n_neighbors=11)) - X_res_1, y_res_1 = bsmote.fit_sample(X, Y) - X_res_2, y_res_2 = bsmote_nn.fit_sample(X, Y) + X_res_1, y_res_1 = bsmote.fit_resample(X, Y) + X_res_2, y_res_2 = bsmote_nn.fit_resample(X, Y) assert_allclose(X_res_1, X_res_2) assert_array_equal(y_res_1, y_res_2) @@ -338,8 +338,8 @@ def test_svm_smote(): m_neighbors=NearestNeighbors(n_neighbors=11), svm_estimator=SVC(random_state=42)) - X_res_1, y_res_1 = svm_smote.fit_sample(X, Y) - X_res_2, y_res_2 = svm_smote_nn.fit_sample(X, Y) + X_res_1, y_res_1 = svm_smote.fit_resample(X, Y) + X_res_2, y_res_2 = svm_smote_nn.fit_resample(X, Y) assert_allclose(X_res_1, X_res_2) assert_array_equal(y_res_1, y_res_2) diff --git a/imblearn/pipeline.py b/imblearn/pipeline.py index ecb4c8b6c..95b0f0257 100644 --- a/imblearn/pipeline.py +++ b/imblearn/pipeline.py @@ -45,7 +45,7 @@ class Pipeline(pipeline.Pipeline): ---------- steps : list List of (name, transform) tuples (implementing - fit/transform/fit_sample) that are chained, in the order in which they + fit/transform/fit_resample) that are chained, in the order in which they are chained, with the last object an estimator. memory : Instance of joblib.Memory or string, optional (default=None) @@ -110,12 +110,6 @@ class Pipeline(pipeline.Pipeline): # BaseEstimator interface - def __init__(self, steps, memory=None): - # shallow copy of steps - self.steps = tosequence(steps) - self._validate_steps() - self.memory = memory - def _validate_steps(self): names, estimators = zip(*self.steps) @@ -129,16 +123,18 @@ def _validate_steps(self): for t in transformers: if t is None: continue - if (not (hasattr(t, "fit") or hasattr(t, "fit_transform") or - hasattr(t, "fit_sample")) or not - (hasattr(t, "transform") or hasattr(t, "sample"))): + if (not (hasattr(t, "fit") or + hasattr(t, "fit_transform") or + hasattr(t, "fit_resample")) or + not (hasattr(t, "transform") or + hasattr(t, "fit_resample"))): raise TypeError( "All intermediate steps of the chain should " "be estimators that implement fit and transform or sample " "(but not both) '%s' (type %s) doesn't)" % (t, type(t))) - if ((hasattr(t, "fit_sample") and hasattr(t, "fit_transform")) or - (hasattr(t, "sample") and hasattr(t, "transform"))): + if (hasattr(t, "fit_resample") and (hasattr(t, "fit_transform") or + hasattr(t, "transform"))): raise TypeError( "All intermediate steps of the chain should " "be estimators that implement fit and transform or sample."
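The contract enforced by the rewritten `_validate_steps` can be illustrated with a minimal sketch. The `ToyScaler` and `ToySampler` classes below are hypothetical stand-ins, not part of imbalanced-learn: an intermediate step is valid when it exposes either the transformer interface (`fit`/`transform`) or the sampler interface (`fit_resample`), while a step exposing both now makes `_validate_steps` raise a `TypeError`::

    import numpy as np

    class ToyScaler:
        # valid intermediate step: transformer interface only
        def fit(self, X, y=None):
            self.mean_ = np.asarray(X).mean(axis=0)
            return self

        def transform(self, X):
            return np.asarray(X) - self.mean_

    class ToySampler:
        # valid intermediate step: sampler interface only
        def fit_resample(self, X, y):
            X, y = np.asarray(X), np.asarray(y)
            keep = np.arange(y.shape[0]) % 2 == 0  # toy rule: keep every other row
            return X[keep], y[keep]

    # A step defining both transform and fit_resample would be rejected by
    # Pipeline._validate_steps with the TypeError shown above.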
@@ -171,7 +167,7 @@ def _fit(self, X, y=None, **fit_params): " 'memory={!r}' instead.".format(memory)) fit_transform_one_cached = memory.cache(_fit_transform_one) - fit_sample_one_cached = memory.cache(_fit_sample_one) + fit_resample_one_cached = memory.cache(_fit_resample_one) fit_params_steps = dict((name, {}) for name, step in self.steps if step is not None) @@ -196,8 +192,8 @@ def _fit(self, X, y=None, **fit_params): Xt, fitted_transformer = fit_transform_one_cached( cloned_transformer, None, Xt, yt, **fit_params_steps[name]) - elif hasattr(cloned_transformer, "sample"): - Xt, yt, fitted_transformer = fit_sample_one_cached( + elif hasattr(cloned_transformer, "fit_resample"): + Xt, yt, fitted_transformer = fit_resample_one_cached( cloned_transformer, Xt, yt, **fit_params_steps[name]) # Replace the transformer of the step with the fitted # transformer. This is necessary when loading the transformer @@ -278,11 +274,11 @@ def fit_transform(self, X, y=None, **fit_params): return last_step.fit(Xt, yt, **fit_params).transform(Xt) @if_delegate_has_method(delegate='_final_estimator') - def fit_sample(self, X, y=None, **fit_params): + def fit_resample(self, X, y=None, **fit_params): """Fit the model and sample with the final estimator Fits all the transformers/samplers one after the other and - transform/sample the data, then uses fit_sample on transformed + transform/sample the data, then uses fit_resample on transformed data with the final estimator. Parameters @@ -313,8 +309,8 @@ def fit_sample(self, X, y=None, **fit_params): Xt, yt, fit_params = self._fit(X, y, **fit_params) if last_step is None: return Xt - elif hasattr(last_step, 'fit_sample'): - return last_step.fit_sample(Xt, yt, **fit_params) + elif hasattr(last_step, 'fit_resample'): + return last_step.fit_resample(Xt, yt, **fit_params) @if_delegate_has_method(delegate='_final_estimator') def sample(self, X, y): @@ -335,15 +331,15 @@ def sample(self, X, y): for name, transform in self.steps[:-1]: if transform is None: continue - if hasattr(transform, "fit_sample"): + if hasattr(transform, "fit_resample"): # XXX: Calling sample in pipeline it means that the # last estimator is a sampler. Samplers don't carry - # the sampled data. So, call 'fit_sample' in all intermediate + # the sampled data. So, call 'fit_resample' in all intermediate # steps to get the sampled data for the last estimator. 
- Xt, y = transform.fit_sample(Xt, y) + Xt, y = transform.fit_resample(Xt, y) else: Xt = transform.transform(Xt) - return self.steps[-1][-1].fit_sample(Xt, y) + return self.steps[-1][-1].fit_resample(Xt, y) @if_delegate_has_method(delegate='_final_estimator') def predict(self, X): @@ -365,7 +361,7 @@ def predict(self, X): for _, transform in self.steps[:-1]: if transform is None: continue - if hasattr(transform, "fit_sample"): + if hasattr(transform, "fit_resample"): pass else: Xt = transform.transform(Xt) @@ -421,7 +417,7 @@ def predict_proba(self, X): for _, transform in self.steps[:-1]: if transform is None: continue - if hasattr(transform, "fit_sample"): + if hasattr(transform, "fit_resample"): pass else: Xt = transform.transform(Xt) @@ -447,7 +443,7 @@ def decision_function(self, X): for _, transform in self.steps[:-1]: if transform is None: continue - if hasattr(transform, "fit_sample"): + if hasattr(transform, "fit_resample"): pass else: Xt = transform.transform(Xt) @@ -473,7 +469,7 @@ def predict_log_proba(self, X): for _, transform in self.steps[:-1]: if transform is None: continue - if hasattr(transform, "fit_sample"): + if hasattr(transform, "fit_resample"): pass else: Xt = transform.transform(Xt) @@ -506,7 +502,7 @@ def _transform(self, X): for name, transform in self.steps: if transform is None: continue - if hasattr(transform, "fit_sample"): + if hasattr(transform, "fit_resample"): pass else: Xt = transform.transform(Xt) @@ -541,7 +537,7 @@ def _inverse_transform(self, X): for name, transform in self.steps[::-1]: if transform is None: continue - if hasattr(transform, "fit_sample"): + if hasattr(transform, "fit_resample"): pass else: Xt = transform.inverse_transform(Xt) @@ -573,7 +569,7 @@ def score(self, X, y=None, sample_weight=None): for _, transform in self.steps[:-1]: if transform is None: continue - if hasattr(transform, "fit_sample"): + if hasattr(transform, "fit_resample"): pass else: Xt = transform.transform(Xt) @@ -594,8 +590,8 @@ def _fit_transform_one(transformer, weight, X, y, **fit_params): return res * weight, transformer -def _fit_sample_one(sampler, X, y, **fit_params): - X_res, y_res = sampler.fit_sample(X, y, **fit_params) +def _fit_resample_one(sampler, X, y, **fit_params): + X_res, y_res = sampler.fit_resample(X, y, **fit_params) return X_res, y_res, sampler diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py index 1a21c106c..0f124e88c 100644 --- a/imblearn/tensorflow/_generator.py +++ b/imblearn/tensorflow/_generator.py @@ -129,7 +129,7 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None, sampler_.set_params(return_indices=True) set_random_state(sampler_, random_state) - _, _, indices = sampler_.fit_sample(X, y) + _, _, indices = sampler_.fit_resample(X, y) # shuffle the indices since the sampler are packing them by class random_state.shuffle(indices) diff --git a/imblearn/tests/test_base.py b/imblearn/tests/test_base.py index 4450d4515..ca8203093 100644 --- a/imblearn/tests/test_base.py +++ b/imblearn/tests/test_base.py @@ -28,14 +28,14 @@ def test_function_sampler_reject_sparse(): TypeError, match="A sparse matrix was passed, " "but dense data is required"): - sampler.fit(X_sparse, y) + sampler.fit_resample(X_sparse, y) @pytest.mark.parametrize("X, y", [(X, y), (sparse.csr_matrix(X), y), (sparse.csc_matrix(X), y)]) def test_function_sampler_identity(X, y): sampler = FunctionSampler() - X_res, y_res = sampler.fit_sample(X, y) + X_res, y_res = sampler.fit_resample(X, y) assert_allclose_dense_sparse(X_res, 
X) assert_array_equal(y_res, y) @@ -47,7 +47,7 @@ def func(X, y): return X[:10], y[:10] sampler = FunctionSampler(func=func) - X_res, y_res = sampler.fit_sample(X, y) + X_res, y_res = sampler.fit_resample(X, y) assert_allclose_dense_sparse(X_res, X[:10]) assert_array_equal(y_res, y[:10]) @@ -58,12 +58,12 @@ def test_function_sampler_func_kwargs(X, y): def func(X, y, sampling_strategy, random_state): rus = RandomUnderSampler( sampling_strategy=sampling_strategy, random_state=random_state) - return rus.fit_sample(X, y) + return rus.fit_resample(X, y) sampler = FunctionSampler( func=func, kw_args={'sampling_strategy': 'auto', 'random_state': 0}) - X_res, y_res = sampler.fit_sample(X, y) - X_res_2, y_res_2 = RandomUnderSampler(random_state=0).fit_sample(X, y) + X_res, y_res = sampler.fit_resample(X, y) + X_res_2, y_res_2 = RandomUnderSampler(random_state=0).fit_resample(X, y) assert_allclose_dense_sparse(X_res, X_res_2) assert_array_equal(y_res, y_res_2) diff --git a/imblearn/tests/test_pipeline.py b/imblearn/tests/test_pipeline.py index db11504ed..15f876b63 100644 --- a/imblearn/tests/test_pipeline.py +++ b/imblearn/tests/test_pipeline.py @@ -12,7 +12,6 @@ import numpy as np from pytest import raises -from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_allclose @@ -143,19 +142,13 @@ def fit(self, X, y): class DummySampler(NoTrans): """Samplers which returns a balanced number of samples""" - def fit(self, X, y): + def fit_resample(self, X, y): self.means_ = np.mean(X, axis=0) # store timestamp to figure out whether the result of 'fit' has been # cached or not self.timestamp_ = time.time() - return self - - def sample(self, X, y): return X, y - def fit_sample(self, X, y): - return self.fit(X, y).sample(X, y) - class FitTransformSample(NoTrans): """Estimator implementing both transform and sample @@ -164,9 +157,13 @@ class FitTransformSample(NoTrans): def fit(self, X, y, should_succeed=False): pass - def sample(self, X, y=None): + def fit_resample(self, X, y=None): return X, y + def fit_transform(self, X, y=None): + self.fit(X, y) + return self.transform(X) + def transform(self, X, y=None): return X @@ -807,20 +803,17 @@ def test_pipeline_sample(): pipeline = Pipeline([('rus', rus)]) # test transform and fit_transform: - X_trans, y_trans = pipeline.fit(X, y).sample(X, y) - X_trans2, y_trans2 = pipeline.fit_sample(X, y) - X_trans3, y_trans3 = rus.fit_sample(X, y) + X_trans, y_trans = pipeline.fit_resample(X, y) + X_trans2, y_trans2 = rus.fit_resample(X, y) assert_allclose(X_trans, X_trans2, rtol=R_TOL) - assert_allclose(X_trans, X_trans3, rtol=R_TOL) assert_allclose(y_trans, y_trans2, rtol=R_TOL) - assert_allclose(y_trans, y_trans3, rtol=R_TOL) pca = PCA() pipeline = Pipeline([('pca', PCA()), ('rus', rus)]) - X_trans, y_trans = pipeline.fit(X, y).sample(X, y) + X_trans, y_trans = pipeline.fit_resample(X, y) X_pca = pca.fit_transform(X) - X_trans2, y_trans2 = rus.fit_sample(X_pca, y) + X_trans2, y_trans2 = rus.fit_resample(X_pca, y) # We round the value near to zero.
It seems that PCA has some issue # with that X_trans[np.bitwise_and(X_trans < R_TOL, X_trans > -R_TOL)] = 0 @@ -936,8 +929,7 @@ def test_pipeline_none_sampler_sample(): rus = RandomUnderSampler(random_state=0) pipe = make_pipeline(None, rus) - pipe.fit(X, y) - pipe.sample(X, y) + pipe.fit_resample(X, y) def test_pipeline_none_transformer(): @@ -1045,12 +1037,12 @@ def test_pipeline_fit_then_sample_with_sampler_last_estimator(): rus = RandomUnderSampler(random_state=42) enn = ENN() pipeline = make_pipeline(rus, enn) - X_fit_sample_resampled, y_fit_sample_resampled = pipeline.fit_sample(X, y) + X_fit_resample_resampled, y_fit_resample_resampled = pipeline.fit_resample(X, y) pipeline = make_pipeline(rus, enn) pipeline.fit(X, y) - X_fit_then_sample_res, y_fit_then_sample_res = pipeline.sample(X, y) - assert_array_equal(X_fit_sample_resampled, X_fit_then_sample_res) - assert_array_equal(y_fit_sample_resampled, y_fit_then_sample_res) + X_fit_then_sample_res, y_fit_then_sample_res = pipeline.fit_resample(X, y) + assert_array_equal(X_fit_resample_resampled, X_fit_then_sample_res) + assert_array_equal(y_fit_resample_resampled, y_fit_then_sample_res) def test_pipeline_fit_then_sample_3_samplers_with_sampler_last_estimator(): @@ -1069,12 +1061,12 @@ def test_pipeline_fit_then_sample_3_samplers_with_sampler_last_estimator(): rus = RandomUnderSampler(random_state=42) enn = ENN() pipeline = make_pipeline(rus, enn, rus) - X_fit_sample_resampled, y_fit_sample_resampled = pipeline.fit_sample(X, y) + X_fit_resample_resampled, y_fit_resample_resampled = pipeline.fit_resample(X, y) pipeline = make_pipeline(rus, enn, rus) pipeline.fit(X, y) - X_fit_then_sample_res, y_fit_then_sample_res = pipeline.sample(X, y) - assert_array_equal(X_fit_sample_resampled, X_fit_then_sample_res) - assert_array_equal(y_fit_sample_resampled, y_fit_then_sample_res) + X_fit_then_sample_res, y_fit_then_sample_res = pipeline.fit_resample(X, y) + assert_array_equal(X_fit_resample_resampled, X_fit_then_sample_res) + assert_array_equal(y_fit_resample_resampled, y_fit_then_sample_res) def test_make_pipeline_memory(): diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py index 94c49bbdc..190d8f9e5 100644 --- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py @@ -87,7 +87,7 @@ class ClusterCentroids(BaseUnderSampler): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> cc = ClusterCentroids(random_state=42) - >>> X_res, y_res = cc.fit_sample(X, y) + >>> X_res, y_res = cc.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) ... 
# doctest: +ELLIPSIS Resampled dataset shape Counter({{...}}) @@ -135,7 +135,7 @@ def _generate_sample(self, X, y, centroids, target_class): return X_new, y_new - def _sample(self, X, y): + def _fit_resample(self, X, y): self._validate_estimator() if self.voting == 'auto': diff --git a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py index 4983ae06e..cda6d5549 100644 --- a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py @@ -23,20 +23,20 @@ R_TOL = 1e-4 -def test_fit_sample_check_voting(): +def test_fit_resample_check_voting(): cc = ClusterCentroids(random_state=RND_SEED) - cc.fit_sample(X, Y) + cc.fit_resample(X, Y) assert cc.voting_ == 'soft' cc = ClusterCentroids(random_state=RND_SEED) - cc.fit_sample(sparse.csr_matrix(X), Y) + cc.fit_resample(sparse.csr_matrix(X), Y) assert cc.voting_ == 'hard' -def test_fit_sample_auto(): +def test_fit_resample_auto(): sampling_strategy = 'auto' cc = ClusterCentroids( sampling_strategy=sampling_strategy, random_state=RND_SEED) - X_resampled, y_resampled = cc.fit_sample(X, Y) + X_resampled, y_resampled = cc.fit_resample(X, Y) X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], [0.13347175, 0.12167502], [0.06738818, -0.529627], [0.17901516, 0.69860992], [0.094035, -2.55298982]]) @@ -45,11 +45,11 @@ def test_fit_sample_auto(): assert_array_equal(y_resampled, y_gt) -def test_fit_sample_half(): +def test_fit_resample_half(): sampling_strategy = {0: 3, 1: 6} cc = ClusterCentroids( sampling_strategy=sampling_strategy, random_state=RND_SEED) - X_resampled, y_resampled = cc.fit_sample(X, Y) + X_resampled, y_resampled = cc.fit_resample(X, Y) X_gt = np.array([[0.92923648, 0.76103773], [0.13347175, 0.12167502], [ 0.47104475, 0.44386323 ], [0.09125309, -0.85409574], [0.19220316, 0.32337101], @@ -61,19 +61,19 @@ def test_fit_sample_half(): assert_array_equal(y_resampled, y_gt) -def test_multiclass_fit_sample(): +def test_multiclass_fit_resample(): y = Y.copy() y[5] = 2 y[6] = 2 cc = ClusterCentroids(random_state=RND_SEED) - X_resampled, y_resampled = cc.fit_sample(X, y) + X_resampled, y_resampled = cc.fit_resample(X, y) count_y_res = Counter(y_resampled) assert count_y_res[0] == 2 assert count_y_res[1] == 2 assert count_y_res[2] == 2 -def test_fit_sample_object(): +def test_fit_resample_object(): sampling_strategy = 'auto' cluster = KMeans(random_state=RND_SEED) cc = ClusterCentroids( @@ -81,7 +81,7 @@ def test_fit_sample_object(): random_state=RND_SEED, estimator=cluster) - X_resampled, y_resampled = cc.fit_sample(X, Y) + X_resampled, y_resampled = cc.fit_resample(X, Y) X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], [0.13347175, 0.12167502], [0.06738818, -0.529627], [0.17901516, 0.69860992], [0.094035, -2.55298982]]) @@ -100,7 +100,7 @@ def test_fit_hard_voting(): estimator=cluster, voting=voting) - X_resampled, y_resampled = cc.fit_sample(X, Y) + X_resampled, y_resampled = cc.fit_resample(X, Y) X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], [0.13347175, 0.12167502], [0.09125309, -0.85409574], [0.12372842, 0.6536186], [0.094035, -2.55298982]]) @@ -111,7 +111,7 @@ def test_fit_hard_voting(): assert np.any(np.all(x == X, axis=1)) -def test_fit_sample_error(): +def test_fit_resample_error(): sampling_strategy = 'auto' cluster = 'rnd' cc = ClusterCentroids( @@ -119,7 +119,7 @@ def 
test_fit_sample_error(): random_state=RND_SEED, estimator=cluster) with raises(ValueError, match="has to be a KMeans clustering"): - cc.fit_sample(X, Y) + cc.fit_resample(X, Y) voting = 'unknown' cc = ClusterCentroids( @@ -127,4 +127,4 @@ def test_fit_sample_error(): voting=voting, random_state=RND_SEED) with raises(ValueError, match="needs to be one of"): - cc.fit_sample(X, Y) + cc.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py index 3624302cf..3b6388203 100644 --- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py @@ -91,7 +91,7 @@ class CondensedNearestNeighbour(BaseCleaningSampler): >>> print('Original dataset shape %s' % Counter(y)) # doctest: +SKIP Original dataset shape Counter({{1: 500, -1: 268}}) # doctest: +SKIP >>> cnn = CondensedNearestNeighbour(random_state=42) # doctest: +SKIP - >>> X_res, y_res = cnn.fit_sample(X, y) #doctest: +SKIP + >>> X_res, y_res = cnn.fit_resample(X, y) #doctest: +SKIP >>> print('Resampled dataset shape %s' % Counter(y_res)) # doctest: +SKIP Resampled dataset shape Counter({{-1: 268, 1: 227}}) # doctest: +SKIP @@ -128,7 +128,7 @@ def _validate_estimator(self): ' inhereited from KNeighborsClassifier.' ' Got {} instead.'.format(type(self.n_neighbors))) - def _sample(self, X, y): + def _fit_resample(self, X, y): self._validate_estimator() random_state = check_random_state(self.random_state) diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index 1949a20cf..d89435b21 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -102,7 +102,7 @@ class EditedNearestNeighbours(BaseCleaningSampler): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> enn = EditedNearestNeighbours() - >>> X_res, y_res = enn.fit_sample(X, y) + >>> X_res, y_res = enn.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{1: 887, 0: 100}}) @@ -138,7 +138,7 @@ def _validate_estimator(self): if self.kind_sel not in SEL_KIND: raise NotImplementedError - def _sample(self, X, y): + def _fit_resample(self, X, y): self._validate_estimator() idx_under = np.empty((0, ), dtype=int) @@ -257,7 +257,7 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> renn = RepeatedEditedNearestNeighbours() - >>> X_res, y_res = renn.fit_sample(X, y) + >>> X_res, y_res = renn.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{1: 887, 0: 100}}) @@ -303,7 +303,7 @@ def _validate_estimator(self): n_jobs=self.n_jobs, ratio=self.ratio) - def _sample(self, X, y): + def _fit_resample(self, X, y): self._validate_estimator() X_, y_ = X, y @@ -316,9 +316,9 @@ def _sample(self, X, y): prev_len = y_.shape[0] if self.return_indices: - X_enn, y_enn, idx_enn = self.enn_.fit_sample(X_, y_) + X_enn, y_enn, idx_enn = self.enn_.fit_resample(X_, y_) else: - X_enn, y_enn = self.enn_.fit_sample(X_, y_) + X_enn, y_enn = self.enn_.fit_resample(X_, y_) # Check the stopping criterion # 
1. If there is no changes for the vector y @@ -444,7 +444,7 @@ class without early stopping. >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> allknn = AllKNN() - >>> X_res, y_res = allknn.fit_sample(X, y) + >>> X_res, y_res = allknn.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{1: 887, 0: 100}}) @@ -489,7 +489,7 @@ def _validate_estimator(self): n_jobs=self.n_jobs, ratio=self.ratio) - def _sample(self, X, y): + def _fit_resample(self, X, y): self._validate_estimator() X_, y_ = X, y @@ -503,9 +503,9 @@ def _sample(self, X, y): self.enn_.n_neighbors = curr_size_ngh if self.return_indices: - X_enn, y_enn, idx_enn = self.enn_.fit_sample(X_, y_) + X_enn, y_enn, idx_enn = self.enn_.fit_resample(X_, y_) else: - X_enn, y_enn = self.enn_.fit_sample(X_, y_) + X_enn, y_enn = self.enn_.fit_resample(X_, y_) # Check the stopping criterion # 1. If the number of samples in the other class become inferior to diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 1ecb8ec64..69624db31 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -89,7 +89,7 @@ class InstanceHardnessThreshold(BaseCleaningSampler): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> iht = InstanceHardnessThreshold(random_state=42) - >>> X_res, y_res = iht.fit_sample(X, y) + >>> X_res, y_res = iht.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{1: 840, 0: 100}}) @@ -125,7 +125,7 @@ def _validate_estimator(self): raise ValueError('Invalid parameter `estimator`. 
Got {}.'.format( type(self.estimator))) - def _sample(self, X, y): + def _fit_resample(self, X, y): self._validate_estimator() target_stats = Counter(y) diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py index 4467784e3..8048b8e2c 100644 --- a/imblearn/under_sampling/_prototype_selection/_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/_nearmiss.py @@ -97,7 +97,7 @@ class NearMiss(BaseUnderSampler): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> nm = NearMiss() - >>> X_res, y_res = nm.fit_sample(X, y) + >>> X_res, y_res = nm.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 100, 1: 100}}) @@ -211,7 +211,7 @@ def _validate_estimator(self): raise ValueError('Parameter `version` must be 1, 2 or 3, got' ' {}'.format(self.version)) - def _sample(self, X, y): + def _fit_resample(self, X, y): self._validate_estimator() idx_under = np.empty((0, ), dtype=int) diff --git a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py index 22191d0e9..716802d25 100644 --- a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py @@ -96,7 +96,7 @@ class NeighbourhoodCleaningRule(BaseCleaningSampler): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> ncr = NeighbourhoodCleaningRule() - >>> X_res, y_res = ncr.fit_sample(X, y) + >>> X_res, y_res = ncr.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{1: 877, 0: 100}}) @@ -139,7 +139,7 @@ def _validate_estimator(self): "'threshold_cleaning' is a value between 0 and 1." " Got {} instead.".format(self.threshold_cleaning)) - def _sample(self, X, y): + def _fit_resample(self, X, y): self._validate_estimator() enn = EditedNearestNeighbours( sampling_strategy=self.sampling_strategy, @@ -148,7 +148,7 @@ def _sample(self, X, y): kind_sel='mode', n_jobs=self.n_jobs, ratio=self.ratio) - _, _, index_not_a1 = enn.fit_sample(X, y) + _, _, index_not_a1 = enn.fit_resample(X, y) index_a1 = np.ones(y.shape, dtype=bool) index_a1[index_not_a1] = False index_a1 = np.flatnonzero(index_a1) diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index aa2ba1464..c23f547e8 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -85,7 +85,7 @@ class OneSidedSelection(BaseCleaningSampler): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> oss = OneSidedSelection(random_state=42) - >>> X_res, y_res = oss.fit_sample(X, y) + >>> X_res, y_res = oss.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{1: 495, 0: 100}}) @@ -122,7 +122,7 @@ def _validate_estimator(self): ' inhereited from KNeighborsClassifier.' 
' Got {} instead.'.format(type(self.n_neighbors))) - def _sample(self, X, y): + def _fit_resample(self, X, y): self._validate_estimator() random_state = check_random_state(self.random_state) @@ -170,7 +170,7 @@ def _sample(self, X, y): # apply Tomek cleaning tl = TomekLinks( sampling_strategy=self.sampling_strategy_, return_indices=True) - X_cleaned, y_cleaned, idx_cleaned = tl.fit_sample( + X_cleaned, y_cleaned, idx_cleaned = tl.fit_resample( X_resampled, y_resampled) idx_under = safe_indexing(idx_under, idx_cleaned) diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index 80cd0aad0..d1ac8f4cc 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -68,7 +68,7 @@ class RandomUnderSampler(BaseUnderSampler): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> rus = RandomUnderSampler(random_state=42) - >>> X_res, y_res = rus.fit_sample(X, y) + >>> X_res, y_res = rus.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 100, 1: 100}}) @@ -92,7 +92,7 @@ def _check_X_y(X, y): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None) return X, y, binarize_y - def _sample(self, X, y): + def _fit_resample(self, X, y): random_state = check_random_state(self.random_state) idx_under = np.empty((0, ), dtype=int) diff --git a/imblearn/under_sampling/_prototype_selection/_tomek_links.py b/imblearn/under_sampling/_prototype_selection/_tomek_links.py index 52d1b1cf6..39de438e7 100644 --- a/imblearn/under_sampling/_prototype_selection/_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/_tomek_links.py @@ -74,7 +74,7 @@ class TomekLinks(BaseCleaningSampler): >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> tl = TomekLinks() - >>> X_res, y_res = tl.fit_sample(X, y) + >>> X_res, y_res = tl.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{1: 897, 0: 100}}) @@ -134,7 +134,7 @@ def is_tomek(y, nn_index, class_type): return links - def _sample(self, X, y): + def _fit_resample(self, X, y): # check for deprecated random_state if self.random_state is not None: deprecate_parameter(self, '0.4', 'random_state') diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py index 60c6275f4..4ea4d5977 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py @@ -47,9 +47,9 @@ R_TOL = 1e-4 -def test_allknn_fit_sample(): +def test_allknn_fit_resample(): allknn = AllKNN() - X_resampled, y_resampled = allknn.fit_sample(X, Y) + X_resampled, y_resampled = allknn.fit_resample(X, Y) X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ -0.46226554, -0.50481004 @@ -90,15 +90,15 @@ def test_all_knn_allow_minority(): random_state=0) allknn = AllKNN(allow_minority=True) - X_res_1, y_res_1 = allknn.fit_sample(X, y) + X_res_1, y_res_1 = allknn.fit_resample(X, y) allknn = AllKNN() - X_res_2, y_res_2 = allknn.fit_sample(X, y) + X_res_2, y_res_2 = allknn.fit_resample(X, y) assert len(y_res_1) < len(y_res_2) -def test_allknn_fit_sample_with_indices(): +def 
test_allknn_fit_resample_with_indices(): allknn = AllKNN(return_indices=True) - X_resampled, y_resampled, idx_under = allknn.fit_sample(X, Y) + X_resampled, y_resampled, idx_under = allknn.fit_resample(X, Y) X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ -0.46226554, -0.50481004 @@ -130,9 +130,9 @@ def test_allknn_fit_sample_with_indices(): assert_allclose(idx_under, idx_gt, rtol=R_TOL) -def test_allknn_fit_sample_mode(): +def test_allknn_fit_resample_mode(): allknn = AllKNN(kind_sel='mode') - X_resampled, y_resampled = allknn.fit_sample(X, Y) + X_resampled, y_resampled = allknn.fit_resample(X, Y) X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ -0.46226554, -0.50481004 @@ -162,10 +162,10 @@ def test_allknn_fit_sample_mode(): assert_array_equal(y_resampled, y_gt) -def test_allknn_fit_sample_with_nn_object(): +def test_allknn_fit_resample_with_nn_object(): nn = NearestNeighbors(n_neighbors=4) allknn = AllKNN(n_neighbors=nn, kind_sel='mode') - X_resampled, y_resampled = allknn.fit_sample(X, Y) + X_resampled, y_resampled = allknn.fit_resample(X, Y) X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ -0.46226554, -0.50481004 @@ -199,11 +199,11 @@ def test_alknn_not_good_object(): nn = 'rnd' allknn = AllKNN(n_neighbors=nn, kind_sel='mode') with raises(ValueError): - allknn.fit_sample(X, Y) + allknn.fit_resample(X, Y) def test_deprecation_random_state(): allknn = AllKNN(random_state=0) with warns( DeprecationWarning, match="'random_state' is deprecated from 0.4"): - allknn.fit_sample(X, Y) + allknn.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py index e45a51b24..fba3c0937 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py @@ -37,9 +37,9 @@ def test_cnn_init(): assert cnn.n_jobs == 1 -def test_cnn_fit_sample(): +def test_cnn_fit_resample(): cnn = CondensedNearestNeighbour(random_state=RND_SEED) - X_resampled, y_resampled = cnn.fit_sample(X, Y) + X_resampled, y_resampled = cnn.fit_resample(X, Y) X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ 0.05230552, 0.09043907 @@ -52,9 +52,9 @@ def test_cnn_fit_sample(): assert_array_equal(y_resampled, y_gt) -def test_cnn_fit_sample_with_indices(): +def test_cnn_fit_resample_with_indices(): cnn = CondensedNearestNeighbour(return_indices=True, random_state=RND_SEED) - X_resampled, y_resampled, idx_under = cnn.fit_sample(X, Y) + X_resampled, y_resampled, idx_under = cnn.fit_resample(X, Y) X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ 0.05230552, 0.09043907 @@ -69,10 +69,10 @@ def test_cnn_fit_sample_with_indices(): assert_array_equal(idx_under, idx_gt) -def test_cnn_fit_sample_with_object(): +def test_cnn_fit_resample_with_object(): knn = KNeighborsClassifier(n_neighbors=1) cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=knn) - X_resampled, y_resampled = cnn.fit_sample(X, Y) + X_resampled, y_resampled = cnn.fit_resample(X, Y) X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ 0.05230552, 0.09043907 @@ -85,13 +85,13 @@ def test_cnn_fit_sample_with_object(): assert_array_equal(y_resampled, y_gt) cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=1) - X_resampled, y_resampled = 
cnn.fit_sample(X, Y) + X_resampled, y_resampled = cnn.fit_resample(X, Y) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) -def test_cnn_fit_sample_with_wrong_object(): +def test_cnn_fit_resample_with_wrong_object(): knn = 'rnd' cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=knn) with raises(ValueError, match="has to be a int or an "): - cnn.fit_sample(X, Y) + cnn.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py index bc8c825b6..a5f85df9c 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py @@ -39,9 +39,9 @@ def test_enn_init(): assert enn.n_jobs == 1 -def test_enn_fit_sample(): +def test_enn_fit_resample(): enn = EditedNearestNeighbours() - X_resampled, y_resampled = enn.fit_sample(X, Y) + X_resampled, y_resampled = enn.fit_resample(X, Y) X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ 2.59928271, 0.93323465 @@ -52,9 +52,9 @@ def test_enn_fit_sample(): assert_array_equal(y_resampled, y_gt) -def test_enn_fit_sample_with_indices(): +def test_enn_fit_resample_with_indices(): enn = EditedNearestNeighbours(return_indices=True) - X_resampled, y_resampled, idx_under = enn.fit_sample(X, Y) + X_resampled, y_resampled, idx_under = enn.fit_resample(X, Y) X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ 2.59928271, 0.93323465 @@ -67,9 +67,9 @@ def test_enn_fit_sample_with_indices(): assert_array_equal(idx_under, idx_gt) -def test_enn_fit_sample_mode(): +def test_enn_fit_resample_mode(): enn = EditedNearestNeighbours(kind_sel='mode') - X_resampled, y_resampled = enn.fit_sample(X, Y) + X_resampled, y_resampled = enn.fit_resample(X, Y) X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ 2.59928271, 0.93323465 @@ -84,10 +84,10 @@ def test_enn_fit_sample_mode(): assert_array_equal(y_resampled, y_gt) -def test_enn_fit_sample_with_nn_object(): +def test_enn_fit_resample_with_nn_object(): nn = NearestNeighbors(n_neighbors=4) enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel='mode') - X_resampled, y_resampled = enn.fit_sample(X, Y) + X_resampled, y_resampled = enn.fit_resample(X, Y) X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ 2.59928271, 0.93323465 @@ -106,11 +106,11 @@ def test_enn_not_good_object(): nn = 'rnd' enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel='mode') with raises(ValueError, match="has to be one of"): - enn.fit_sample(X, Y) + enn.fit_resample(X, Y) def test_deprecation_random_state(): enn = EditedNearestNeighbours(random_state=0) with warns( DeprecationWarning, match="'random_state' is deprecated from 0.4"): - enn.fit_sample(X, Y) + enn.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py index a5fbf6931..8a4bd5d71 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py @@ -36,9 +36,9 @@ def test_iht_init(): assert iht.random_state == RND_SEED -def test_iht_fit_sample(): +def test_iht_fit_resample(): iht = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED) - 
X_resampled, y_resampled = iht.fit_sample(X, Y) + X_resampled, y_resampled = iht.fit_resample(X, Y) X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ -0.65571327, 0.42412021 @@ -52,10 +52,10 @@ def test_iht_fit_sample(): assert_array_equal(y_resampled, y_gt) -def test_iht_fit_sample_with_indices(): +def test_iht_fit_resample_with_indices(): iht = InstanceHardnessThreshold( ESTIMATOR, return_indices=True, random_state=RND_SEED) - X_resampled, y_resampled, idx_under = iht.fit_sample(X, Y) + X_resampled, y_resampled, idx_under = iht.fit_resample(X, Y) X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ -0.65571327, 0.42412021 @@ -71,11 +71,11 @@ def test_iht_fit_sample_with_indices(): assert_array_equal(idx_under, idx_gt) -def test_iht_fit_sample_half(): +def test_iht_fit_resample_half(): sampling_strategy = {0: 6, 1: 8} iht = InstanceHardnessThreshold( ESTIMATOR, sampling_strategy=sampling_strategy, random_state=RND_SEED) - X_resampled, y_resampled = iht.fit_sample(X, Y) + X_resampled, y_resampled = iht.fit_resample(X, Y) X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ -0.65571327, 0.42412021 @@ -90,10 +90,10 @@ def test_iht_fit_sample_half(): assert_array_equal(y_resampled, y_gt) -def test_iht_fit_sample_class_obj(): +def test_iht_fit_resample_class_obj(): est = GradientBoostingClassifier(random_state=RND_SEED) iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED) - X_resampled, y_resampled = iht.fit_sample(X, Y) + X_resampled, y_resampled = iht.fit_resample(X, Y) X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ -0.65571327, 0.42412021 @@ -107,9 +107,9 @@ def test_iht_fit_sample_class_obj(): assert_array_equal(y_resampled, y_gt) -def test_iht_fit_sample_wrong_class_obj(): +def test_iht_fit_resample_wrong_class_obj(): from sklearn.cluster import KMeans est = KMeans() iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED) with raises(ValueError, match="Invalid parameter `estimator`"): - iht.fit_sample(X, Y) + iht.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py index b120e85bb..b84021113 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py @@ -32,7 +32,7 @@ def test_nearmiss_wrong_version(): version = 1000 nm = NearMiss(version=version) with raises(ValueError, match="must be 1, 2 or 3"): - nm.fit_sample(X, Y) + nm.fit_resample(X, Y) def test_nm_wrong_nn_obj(): @@ -44,7 +44,7 @@ def test_nm_wrong_nn_obj(): return_indices=True, n_neighbors=nn) with raises(ValueError, match="has to be one of"): - nm.fit_sample(X, Y) + nm.fit_resample(X, Y) nn3 = 'rnd' nn = NearestNeighbors(n_neighbors=3) nm3 = NearMiss( @@ -54,10 +54,10 @@ def test_nm_wrong_nn_obj(): n_neighbors=nn, n_neighbors_ver3=nn3) with raises(ValueError, match="has to be one of"): - nm3.fit_sample(X, Y) + nm3.fit_resample(X, Y) -def test_nm_fit_sample_auto(): +def test_nm_fit_resample_auto(): sampling_strategy = 'auto' X_gt = [ np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ @@ -83,12 +83,12 @@ def test_nm_fit_sample_auto(): ] for version_idx, version in enumerate(VERSION_NEARMISS): nm = NearMiss(sampling_strategy=sampling_strategy, version=version) - X_resampled, y_resampled = nm.fit_sample(X, Y) + X_resampled, y_resampled = nm.fit_resample(X, Y) assert_array_equal(X_resampled, 
X_gt[version_idx]) assert_array_equal(y_resampled, y_gt[version_idx]) -def test_nm_fit_sample_auto_indices(): +def test_nm_fit_resample_auto_indices(): sampling_strategy = 'auto' X_gt = [ np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ @@ -122,13 +122,13 @@ def test_nm_fit_sample_auto_indices(): sampling_strategy=sampling_strategy, version=version, return_indices=True) - X_resampled, y_resampled, idx_under = nm.fit_sample(X, Y) + X_resampled, y_resampled, idx_under = nm.fit_resample(X, Y) assert_array_equal(X_resampled, X_gt[version_idx]) assert_array_equal(y_resampled, y_gt[version_idx]) assert_array_equal(idx_under, idx_gt[version_idx]) -def test_nm_fit_sample_float_sampling_strategy(): +def test_nm_fit_resample_float_sampling_strategy(): sampling_strategy = {0: 3, 1: 4, 2: 4} X_gt = [ np.array([[-0.20497017, -0.26630228], [-0.80809175, -1.09917302], [ @@ -158,12 +158,12 @@ def test_nm_fit_sample_float_sampling_strategy(): for version_idx, version in enumerate(VERSION_NEARMISS): nm = NearMiss(sampling_strategy=sampling_strategy, version=version) - X_resampled, y_resampled = nm.fit_sample(X, Y) + X_resampled, y_resampled = nm.fit_resample(X, Y) assert_array_equal(X_resampled, X_gt[version_idx]) assert_array_equal(y_resampled, y_gt[version_idx]) -def test_nm_fit_sample_nn_obj(): +def test_nm_fit_resample_nn_obj(): sampling_strategy = 'auto' nn = NearestNeighbors(n_neighbors=3) X_gt = [ @@ -193,7 +193,7 @@ def test_nm_fit_sample_nn_obj(): sampling_strategy=sampling_strategy, version=version, n_neighbors=nn) - X_resampled, y_resampled = nm.fit_sample(X, Y) + X_resampled, y_resampled = nm.fit_resample(X, Y) assert_array_equal(X_resampled, X_gt[version_idx]) assert_array_equal(y_resampled, y_gt[version_idx]) @@ -202,4 +202,4 @@ def test_deprecation_random_state(): nm = NearMiss(random_state=0) with warns( DeprecationWarning, match="'random_state' is deprecated from 0.4"): - nm.fit_sample(X, Y) + nm.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py index 9d2c51920..c2b3a0143 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py @@ -31,19 +31,19 @@ def test_ncr_error(): match=("'threshold_cleaning' is a value between" " 0 and 1")): NeighbourhoodCleaningRule( - threshold_cleaning=threshold_cleaning).fit_sample(X, Y) + threshold_cleaning=threshold_cleaning).fit_resample(X, Y) threshold_cleaning = 10 with raises( ValueError, match=("'threshold_cleaning' is a value between" " 0 and 1")): NeighbourhoodCleaningRule( - threshold_cleaning=threshold_cleaning).fit_sample(X, Y) + threshold_cleaning=threshold_cleaning).fit_resample(X, Y) -def test_ncr_fit_sample(): +def test_ncr_fit_resample(): ncr = NeighbourhoodCleaningRule() - X_resampled, y_resampled = ncr.fit_sample(X, Y) + X_resampled, y_resampled = ncr.fit_resample(X, Y) X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278], [ -0.20413357, 0.64628718 @@ -56,9 +56,9 @@ def test_ncr_fit_sample(): assert_array_equal(y_resampled, y_gt) -def test_ncr_fit_sample_mode(): +def test_ncr_fit_resample_mode(): ncr = NeighbourhoodCleaningRule(kind_sel='mode') - X_resampled, y_resampled = ncr.fit_sample(X, Y) + X_resampled, y_resampled = ncr.fit_resample(X, Y) X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278], [ 
-0.20413357, 0.64628718 @@ -71,9 +71,9 @@ def test_ncr_fit_sample_mode(): assert_array_equal(y_resampled, y_gt) -def test_ncr_fit_sample_with_indices(): +def test_ncr_fit_resample_with_indices(): ncr = NeighbourhoodCleaningRule(return_indices=True) - X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y) + X_resampled, y_resampled, idx_under = ncr.fit_resample(X, Y) X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278], [ -0.20413357, 0.64628718 @@ -88,10 +88,10 @@ def test_ncr_fit_sample_with_indices(): assert_array_equal(idx_under, idx_gt) -def test_ncr_fit_sample_nn_obj(): +def test_ncr_fit_resample_nn_obj(): nn = NearestNeighbors(n_neighbors=4) ncr = NeighbourhoodCleaningRule(return_indices=True, n_neighbors=nn) - X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y) + X_resampled, y_resampled, idx_under = ncr.fit_resample(X, Y) X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278], [ -0.20413357, 0.64628718 @@ -110,11 +110,11 @@ def test_ncr_wrong_nn_obj(): nn = 'rnd' ncr = NeighbourhoodCleaningRule(return_indices=True, n_neighbors=nn) with raises(ValueError, match="has to be one of"): - ncr.fit_sample(X, Y) + ncr.fit_resample(X, Y) def test_deprecation_random_state(): ncr = NeighbourhoodCleaningRule(random_state=0) with warns( DeprecationWarning, match="'random_state' is deprecated from 0.4"): - ncr.fit_sample(X, Y) + ncr.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py index cce6c386d..2e8c1af2c 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py @@ -35,9 +35,9 @@ def test_oss_init(): assert oss.random_state == RND_SEED -def test_oss_fit_sample(): +def test_oss_fit_resample(): oss = OneSidedSelection(random_state=RND_SEED) - X_resampled, y_resampled = oss.fit_sample(X, Y) + X_resampled, y_resampled = oss.fit_resample(X, Y) X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ -0.65571327, 0.42412021 @@ -51,9 +51,9 @@ def test_oss_fit_sample(): assert_array_equal(y_resampled, y_gt) -def test_oss_fit_sample_with_indices(): +def test_oss_fit_resample_with_indices(): oss = OneSidedSelection(return_indices=True, random_state=RND_SEED) - X_resampled, y_resampled, idx_under = oss.fit_sample(X, Y) + X_resampled, y_resampled, idx_under = oss.fit_resample(X, Y) X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ -0.65571327, 0.42412021 @@ -72,7 +72,7 @@ def test_oss_fit_sample_with_indices(): def test_oss_with_object(): knn = KNeighborsClassifier(n_neighbors=1) oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn) - X_resampled, y_resampled = oss.fit_sample(X, Y) + X_resampled, y_resampled = oss.fit_resample(X, Y) X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ -0.65571327, 0.42412021 @@ -86,7 +86,7 @@ def test_oss_with_object(): assert_array_equal(y_resampled, y_gt) knn = 1 oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn) - X_resampled, y_resampled = oss.fit_sample(X, Y) + X_resampled, y_resampled = oss.fit_resample(X, Y) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -95,4 +95,4 @@ def test_oss_with_wrong_object(): knn = 'rnd' oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn) with raises(ValueError, match="has to be a int"): - oss.fit_sample(X, Y) + 
oss.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py index 109bf0235..eecb23b64 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py @@ -21,9 +21,9 @@ Y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) -def test_rus_fit_sample(): +def test_rus_fit_resample(): rus = RandomUnderSampler(random_state=RND_SEED, replacement=True) - X_resampled, y_resampled = rus.fit_sample(X, Y) + X_resampled, y_resampled = rus.fit_resample(X, Y) X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], [0.13347175, 0.12167502], [0.09125309, -0.85409574], @@ -34,10 +34,10 @@ def test_rus_fit_sample(): assert_array_equal(y_resampled, y_gt) -def test_rus_fit_sample_with_indices(): +def test_rus_fit_resample_with_indices(): rus = RandomUnderSampler( return_indices=True, random_state=RND_SEED, replacement=True) - X_resampled, y_resampled, idx_under = rus.fit_sample(X, Y) + X_resampled, y_resampled, idx_under = rus.fit_resample(X, Y) X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], [0.13347175, 0.12167502], [0.09125309, -0.85409574], @@ -49,13 +49,13 @@ def test_rus_fit_sample_with_indices(): assert_array_equal(idx_under, idx_gt) -def test_rus_fit_sample_half(): +def test_rus_fit_resample_half(): sampling_strategy = {0: 3, 1: 6} rus = RandomUnderSampler( sampling_strategy=sampling_strategy, random_state=RND_SEED, replacement=True) - X_resampled, y_resampled = rus.fit_sample(X, Y) + X_resampled, y_resampled = rus.fit_resample(X, Y) X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], [ 0.92923648, 0.76103773 @@ -67,12 +67,12 @@ def test_rus_fit_sample_half(): assert_array_equal(y_resampled, y_gt) -def test_multiclass_fit_sample(): +def test_multiclass_fit_resample(): y = Y.copy() y[5] = 2 y[6] = 2 rus = RandomUnderSampler(random_state=RND_SEED) - X_resampled, y_resampled = rus.fit_sample(X, y) + X_resampled, y_resampled = rus.fit_resample(X, y) count_y_res = Counter(y_resampled) assert count_y_res[0] == 2 assert count_y_res[1] == 2 @@ -84,7 +84,7 @@ def test_random_under_sampling_heterogeneous_data(): dtype=np.object) y = np.array([0, 0, 1]) rus = RandomUnderSampler(random_state=RND_SEED) - X_res, y_res = rus.fit_sample(X_hetero, y) + X_res, y_res = rus.fit_resample(X_hetero, y) assert X_res.shape[0] == 2 assert y_res.shape[0] == 2 diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py index b50c8dbab..10cf1e1c3 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py @@ -57,12 +57,12 @@ def test_renn_iter_wrong(): max_iter = -1 renn = RepeatedEditedNearestNeighbours(max_iter=max_iter) with raises(ValueError): - renn.fit_sample(X, Y) + renn.fit_resample(X, Y) -def test_renn_fit_sample(): +def test_renn_fit_resample(): renn = RepeatedEditedNearestNeighbours() - X_resampled, y_resampled = renn.fit_sample(X, Y) + X_resampled, y_resampled = renn.fit_resample(X, Y) X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ -0.46226554, -0.50481004 @@ -88,9 +88,9 @@ def test_renn_fit_sample(): 
assert_array_equal(y_resampled, y_gt) -def test_renn_fit_sample_with_indices(): +def test_renn_fit_resample_with_indices(): renn = RepeatedEditedNearestNeighbours(return_indices=True) - X_resampled, y_resampled, idx_under = renn.fit_sample(X, Y) + X_resampled, y_resampled, idx_under = renn.fit_resample(X, Y) X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ -0.46226554, -0.50481004 @@ -121,9 +121,9 @@ def test_renn_fit_sample_with_indices(): assert_array_equal(idx_under, idx_gt) -def test_renn_fit_sample_mode_object(): +def test_renn_fit_resample_mode_object(): renn = RepeatedEditedNearestNeighbours(kind_sel='mode') - X_resampled, y_resampled = renn.fit_sample(X, Y) + X_resampled, y_resampled = renn.fit_resample(X, Y) X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ -0.46226554, -0.50481004 @@ -155,10 +155,10 @@ def test_renn_fit_sample_mode_object(): assert_array_equal(y_resampled, y_gt) -def test_renn_fit_sample_mode(): +def test_renn_fit_resample_mode(): nn = NearestNeighbors(n_neighbors=4) renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, kind_sel='mode') - X_resampled, y_resampled = renn.fit_sample(X, Y) + X_resampled, y_resampled = renn.fit_resample(X, Y) X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ -0.46226554, -0.50481004 @@ -194,11 +194,11 @@ def test_renn_not_good_object(): nn = 'rnd' renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, kind_sel='mode') with raises(ValueError): - renn.fit_sample(X, Y) + renn.fit_resample(X, Y) def test_deprecation_random_state(): renn = RepeatedEditedNearestNeighbours(random_state=0) with warns( DeprecationWarning, match="'random_state' is deprecated from 0.4"): - renn.fit_sample(X, Y) + renn.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py b/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py index c2b9d84f2..22aaa156c 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py @@ -32,9 +32,9 @@ def test_tl_init(): assert tl.n_jobs == 1 -def test_tl_fit_sample(): +def test_tl_fit_resample(): tl = TomekLinks() - X_resampled, y_resampled = tl.fit_sample(X, Y) + X_resampled, y_resampled = tl.fit_resample(X, Y) X_gt = np.array([[0.31230513, 0.1216318], [0.68481731, 0.51935141], [ 1.34192108, -0.13367336 @@ -51,9 +51,9 @@ def test_tl_fit_sample(): assert_array_equal(y_resampled, y_gt) -def test_tl_fit_sample_with_indices(): +def test_tl_fit_resample_with_indices(): tl = TomekLinks(return_indices=True) - X_resampled, y_resampled, idx_under = tl.fit_sample(X, Y) + X_resampled, y_resampled, idx_under = tl.fit_resample(X, Y) X_gt = np.array([[0.31230513, 0.1216318], [0.68481731, 0.51935141], [ 1.34192108, -0.13367336 @@ -77,4 +77,4 @@ def test_deprecation_random_state(): tl = TomekLinks(random_state=0) with warns( DeprecationWarning, match="'random_state' is deprecated from 0.4"): - tl.fit_sample(X, Y) + tl.fit_resample(X, Y) diff --git a/imblearn/utils/__init__.py b/imblearn/utils/__init__.py index eda8af1f9..ce53cca31 100644 --- a/imblearn/utils/__init__.py +++ b/imblearn/utils/__init__.py @@ -6,11 +6,10 @@ from ._validation import check_neighbors_object from ._validation import check_target_type -from ._validation import hash_X_y from ._validation import check_ratio from ._validation import check_sampling_strategy __all__ = [ - 'Substitution', 'check_neighbors_object', 'check_target_type', 
'hash_X_y', + 'Substitution', 'check_neighbors_object', 'check_target_type', 'check_sampling_strategy', 'check_ratio' ] diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index bc6d9ecc9..27120364c 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -14,14 +14,14 @@ from sklearn.base import clone from sklearn.neighbors.base import KNeighborsMixin from sklearn.neighbors import NearestNeighbors -from sklearn.externals import six, joblib +from sklearn.externals import six from sklearn.utils.multiclass import type_of_target from sklearn.utils.deprecation import deprecated from ..exceptions import raise_isinstance_error SAMPLING_KIND = ('over-sampling', 'under-sampling', 'clean-sampling', - 'ensemble') + 'ensemble', 'bypass') TARGET_KIND = ('binary', 'multiclass', 'multilabel-indicator') @@ -95,41 +95,6 @@ def check_target_type(y, indicate_one_vs_all=False): return y.argmax(axis=1) if type_y == 'multilabel-indicator' else y -def hash_X_y(X, y, n_samples=10, n_features=5): - """Compute hash of the input arrays. - - Parameters - ---------- - X : array_like, shape (n_samples, n_features) - The ``X`` array. - - y : ndarray, shape (n_samples) - The ``y`` array. - - n_samples : int, optional - The number of samples to use to compute the hash. Default is 100. - - n_features : int, optional - The number of features to use to compute the hash. Default is 10. - - Returns - ------- - X_hash: str - Hash identifier of the ``X`` matrix. - y_hash: str - Hash identifier of the ``y`` matrix. - """ - row_idx = slice(None, None, max(1, X.shape[0] // n_samples)) - col_idx = slice(None, None, max(1, X.shape[1] // n_features)) - - X_subset = (X.iloc[row_idx, col_idx] - if hasattr(X, 'iloc') else X[row_idx, col_idx]) - y_subset = (y.iloc[row_idx] - if hasattr(y, 'iloc') else y[row_idx]) - - return joblib.hash(X_subset), joblib.hash(y_subset) - - def _sampling_strategy_all(y, sampling_type): """Returns sampling target by targeting all classes.""" target_stats = Counter(y) @@ -461,7 +426,7 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): raise ValueError("The target 'y' needs to have more than 1 class." 
" Got {} class instead".format(np.unique(y).size)) - if sampling_type == 'ensemble': + if sampling_type in ('ensemble', 'bypass'): return sampling_strategy if isinstance(sampling_strategy, six.string_types): diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index f8e2bd7ff..c525a4a4b 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -48,8 +48,6 @@ def monkey_patch_check_dtype_object(name, estimator_orig): estimator = clone(estimator_orig) estimator.fit(X, y) - if hasattr(estimator, "sample"): - estimator.sample(X, y) try: estimator.fit(X, y.astype(object)) @@ -71,9 +69,9 @@ def _yield_sampler_checks(name, Estimator): yield check_samplers_no_fit_error yield check_samplers_X_consistancy_sample yield check_samplers_fit - yield check_samplers_fit_sample - yield check_samplers_ratio_fit_sample - yield check_samplers_sampling_strategy_fit_sample + yield check_samplers_fit_resample + yield check_samplers_ratio_fit_resample + yield check_samplers_sampling_strategy_fit_resample yield check_samplers_sparse yield check_samplers_pandas yield check_samplers_multiclass_ova @@ -172,7 +170,7 @@ def check_samplers_fit(name, Sampler): assert hasattr(sampler, 'sampling_strategy_') -def check_samplers_fit_sample(name, Sampler): +def check_samplers_fit_resample(name, Sampler): sampler = Sampler() X, y = make_classification( n_samples=1000, @@ -181,7 +179,7 @@ def check_samplers_fit_sample(name, Sampler): weights=[0.2, 0.3, 0.5], random_state=0) target_stats = Counter(y) - X_res, y_res = sampler.fit_sample(X, y) + X_res, y_res = sampler.fit_resample(X, y) if isinstance(sampler, BaseOverSampler): target_stats_res = Counter(y_res) n_samples = max(target_stats.values()) @@ -203,7 +201,7 @@ def check_samplers_fit_sample(name, Sampler): # FIXME remove in 0.6 -> ratio will be deprecated -def check_samplers_ratio_fit_sample(name, Sampler): +def check_samplers_ratio_fit_resample(name, Sampler): if name not in DONT_SUPPORT_RATIO: # in this test we will force all samplers to not change the class 1 X, y = make_classification( @@ -217,27 +215,27 @@ def check_samplers_ratio_fit_sample(name, Sampler): if isinstance(sampler, BaseOverSampler): ratio = {2: 498, 0: 498} sampler.set_params(ratio=ratio) - X_res, y_res = sampler.fit_sample(X, y) + X_res, y_res = sampler.fit_resample(X, y) assert Counter(y_res)[1] == expected_stat elif isinstance(sampler, BaseUnderSampler): ratio = {2: 201, 0: 201} sampler.set_params(ratio=ratio) - X_res, y_res = sampler.fit_sample(X, y) + X_res, y_res = sampler.fit_resample(X, y) assert Counter(y_res)[1] == expected_stat elif isinstance(sampler, BaseCleaningSampler): ratio = {2: 201, 0: 201} sampler.set_params(ratio=ratio) - X_res, y_res = sampler.fit_sample(X, y) + X_res, y_res = sampler.fit_resample(X, y) assert Counter(y_res)[1] == expected_stat if isinstance(sampler, BaseEnsembleSampler): ratio = {2: 201, 0: 201} sampler.set_params(ratio=ratio) - X_res, y_res = sampler.fit_sample(X, y) + X_res, y_res = sampler.fit_resample(X, y) y_ensemble = y_res[0] assert Counter(y_ensemble)[1] == expected_stat -def check_samplers_sampling_strategy_fit_sample(name, Sampler): +def check_samplers_sampling_strategy_fit_resample(name, Sampler): # in this test we will force all samplers to not change the class 1 X, y = make_classification( n_samples=1000, @@ -250,22 +248,22 @@ def check_samplers_sampling_strategy_fit_sample(name, Sampler): if isinstance(sampler, BaseOverSampler): sampling_strategy = {2: 498, 0: 498} 
sampler.set_params(sampling_strategy=sampling_strategy) - X_res, y_res = sampler.fit_sample(X, y) + X_res, y_res = sampler.fit_resample(X, y) assert Counter(y_res)[1] == expected_stat elif isinstance(sampler, BaseUnderSampler): sampling_strategy = {2: 201, 0: 201} sampler.set_params(sampling_strategy=sampling_strategy) - X_res, y_res = sampler.fit_sample(X, y) + X_res, y_res = sampler.fit_resample(X, y) assert Counter(y_res)[1] == expected_stat elif isinstance(sampler, BaseCleaningSampler): sampling_strategy = {2: 201, 0: 201} sampler.set_params(sampling_strategy=sampling_strategy) - X_res, y_res = sampler.fit_sample(X, y) + X_res, y_res = sampler.fit_resample(X, y) assert Counter(y_res)[1] == expected_stat if isinstance(sampler, BaseEnsembleSampler): sampling_strategy = {2: 201, 0: 201} sampler.set_params(sampling_strategy=sampling_strategy) - X_res, y_res = sampler.fit_sample(X, y) + X_res, y_res = sampler.fit_resample(X, y) y_ensemble = y_res[0] assert Counter(y_ensemble)[1] == expected_stat @@ -300,8 +298,8 @@ def check_samplers_sparse(name, Sampler): for sampler in samplers: set_random_state(sampler) - X_res_sparse, y_res_sparse = sampler.fit_sample(X_sparse, y) - X_res, y_res = sampler.fit_sample(X, y) + X_res_sparse, y_res_sparse = sampler.fit_resample(X_sparse, y) + X_res, y_res = sampler.fit_resample(X, y) if not isinstance(sampler, BaseEnsembleSampler): assert sparse.issparse(X_res_sparse) assert_allclose(X_res_sparse.A, X_res) @@ -339,8 +337,8 @@ def check_samplers_pandas(name, Sampler): for sampler in samplers: set_random_state(sampler) - X_res_pd, y_res_pd = sampler.fit_sample(X_pd, y_pd) - X_res, y_res = sampler.fit_sample(X, y) + X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y_pd) + X_res, y_res = sampler.fit_resample(X, y) assert_allclose(X_res_pd, X_res) assert_allclose(y_res_pd, y_res) @@ -356,8 +354,8 @@ def check_samplers_multiclass_ova(name, Sampler): y_ova = label_binarize(y, np.unique(y)) sampler = Sampler() set_random_state(sampler) - X_res, y_res = sampler.fit_sample(X, y) - X_res_ova, y_res_ova = sampler.fit_sample(X, y_ova) + X_res, y_res = sampler.fit_resample(X, y) + X_res_ova, y_res_ova = sampler.fit_resample(X, y_ova) assert_allclose(X_res, X_res_ova) if issubclass(Sampler, BaseEnsembleSampler): for batch_y, batch_y_ova in zip(y_res, y_res_ova): @@ -380,6 +378,6 @@ def check_samplers_preserve_dtype(name, Sampler): y = y.astype(np.int32) sampler = Sampler() set_random_state(sampler) - X_res, y_res = sampler.fit_sample(X, y) + X_res, y_res = sampler.fit_resample(X, y) assert X.dtype == X_res.dtype assert y.dtype == y_res.dtype diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index b09b3b03c..773896e5e 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -11,15 +11,12 @@ from sklearn.neighbors.base import KNeighborsMixin from sklearn.neighbors import NearestNeighbors -from sklearn.utils import check_random_state -from sklearn.externals import joblib from sklearn.utils.testing import assert_array_equal from imblearn.utils.testing import warns from imblearn.utils import check_neighbors_object from imblearn.utils import check_ratio from imblearn.utils import check_sampling_strategy -from imblearn.utils import hash_X_y from imblearn.utils import check_target_type multiclass_target = np.array([1] * 50 + [2] * 100 + [3] * 25) @@ -362,33 +359,6 @@ def sampling_strategy_func(y, multiplier): assert sampling_strategy_ == {1: 25, 2: 0, 3: 50} -def test_hash_X_y(): - rng = 
check_random_state(0) - X = rng.randn(2000, 20) - y = np.array([0] * 500 + [1] * 1500) - assert hash_X_y(X, y, 10, 10) == (joblib.hash(X[::200, ::2]), - joblib.hash(y[::200])) - - X = rng.randn(5, 2) - y = np.array([0] * 2 + [1] * 3) - # all data will be used in this case - assert hash_X_y(X, y) == (joblib.hash(X), joblib.hash(y)) - - -def test_hash_X_y_pandas(): - pd = pytest.importorskip("pandas") - rng = check_random_state(0) - X = pd.DataFrame(rng.randn(2000, 20)) - y = pd.Series([0] * 500 + [1] * 1500) - assert hash_X_y(X, y, 10, 10) == (joblib.hash(X.iloc[::200, ::2]), - joblib.hash(y.iloc[::200])) - - X = pd.DataFrame(rng.randn(5, 2)) - y = pd.Series([0] * 2 + [1] * 3) - # all data will be used in this case - assert hash_X_y(X, y) == (joblib.hash(X), joblib.hash(y)) - - @pytest.mark.parametrize( "sampling_strategy, sampling_type, expected_result", [({3: 25, 1: 25, 2: 25}, 'under-sampling', From c87267984e9a5adefceaee0c2deccb767b1f186f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 27 Aug 2018 17:42:34 +0200 Subject: [PATCH 02/15] PEP8 --- imblearn/base.py | 31 ++++++++++++++++++++++----- imblearn/ensemble/_balance_cascade.py | 3 +-- imblearn/ensemble/base.py | 2 -- imblearn/over_sampling/_smote.py | 1 - imblearn/pipeline.py | 11 +++++----- imblearn/tests/test_pipeline.py | 6 ++++-- 6 files changed, 36 insertions(+), 18 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index c5e419f40..656e113ec 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -30,7 +30,28 @@ class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)): _estimator_type = 'sampler' def fit(self, X, y): - self.fit_resample(X, y) + """Check inputs and statistics of the sampler. + + You should use ``fit_resample`` in all cases. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Data array. + + y : array-like, shape (n_samples,) + Target array. + + Returns + ------- + self : object + Return the instance itself. + + """ + self._deprecate_ratio() + X, y, _ = self._check_X_y(X, y) + self.sampling_strategy_ = check_sampling_strategy( + self.sampling_strategy, y, self._sampling_type) return self def fit_resample(self, X, y): @@ -67,8 +88,7 @@ def fit_resample(self, X, y): y_sampled = label_binarize(output[1], np.unique(y)) if len(output) == 2: return output[0], y_sampled - else: - return output[0], y_sampled, output[2] + return output[0], y_sampled, output[2] return output # define an alias for back-compatibility @@ -200,8 +220,9 @@ class FunctionSampler(BaseSampler): >>> from collections import Counter >>> from imblearn.under_sampling import RandomUnderSampler >>> def func(X, y, sampling_strategy, random_state): - ... return RandomUnderSampler(sampling_strategy=sampling_strategy, - ... random_state=random_state).fit_resample(X, y) + ... return RandomUnderSampler( + ... sampling_strategy=sampling_strategy, + ... random_state=random_state).fit_resample(X, y) >>> sampler = FunctionSampler(func=func, ... kw_args={'sampling_strategy': 'auto', ... 
'random_state': 0}) diff --git a/imblearn/ensemble/_balance_cascade.py b/imblearn/ensemble/_balance_cascade.py index 753876c8f..67526d233 100644 --- a/imblearn/ensemble/_balance_cascade.py +++ b/imblearn/ensemble/_balance_cascade.py @@ -12,11 +12,10 @@ from sklearn.neighbors import KNeighborsClassifier from sklearn.utils import check_random_state, safe_indexing from sklearn.model_selection import cross_val_predict -from sklearn.utils import check_X_y from .base import BaseEnsembleSampler from ..under_sampling.base import BaseUnderSampler -from ..utils import check_sampling_strategy, check_target_type +from ..utils import check_sampling_strategy from ..utils import Substitution from ..utils._docstring import _random_state_docstring diff --git a/imblearn/ensemble/base.py b/imblearn/ensemble/base.py index 0563124bf..968b57ff4 100644 --- a/imblearn/ensemble/base.py +++ b/imblearn/ensemble/base.py @@ -7,10 +7,8 @@ import numpy as np from sklearn.preprocessing import label_binarize -from sklearn.utils import check_X_y from ..base import BaseSampler -from ..utils import check_target_type from ..utils import check_sampling_strategy diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 3675f9af7..34c56a560 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -737,7 +737,6 @@ def _validate_estimator(self): # FIXME: to be removed in 0.6 def _fit_resample(self, X, y): self._validate_estimator() - print(self._sample) return self._sample(X, y) def _sample(self, X, y): diff --git a/imblearn/pipeline.py b/imblearn/pipeline.py index 95b0f0257..66017b371 100644 --- a/imblearn/pipeline.py +++ b/imblearn/pipeline.py @@ -19,7 +19,6 @@ from sklearn.base import clone from sklearn.externals import six from sklearn.externals.joblib import Memory -from sklearn.utils import tosequence from sklearn.utils.metaestimators import if_delegate_has_method __all__ = ['Pipeline', 'make_pipeline'] @@ -45,8 +44,8 @@ class Pipeline(pipeline.Pipeline): ---------- steps : list List of (name, transform) tuples (implementing - fit/transform/fit_resample) that are chained, in the order in which they - are chained, with the last object an estimator. + fit/transform/fit_resample) that are chained, in the order in which + they are chained, with the last object an estimator. memory : Instance of joblib.Memory or string, optional (default=None) Used to cache the fitted transformers of the pipeline. By default, @@ -126,15 +125,15 @@ def _validate_steps(self): if (not (hasattr(t, "fit") or hasattr(t, "fit_transform") or hasattr(t, "fit_resample")) or - not (hasattr(t, "transform") or - hasattr(t, "fit_resample"))): + not (hasattr(t, "transform") or + hasattr(t, "fit_resample"))): raise TypeError( "All intermediate steps of the chain should " "be estimators that implement fit and transform or sample " "(but not both) '%s' (type %s) doesn't)" % (t, type(t))) if (hasattr(t, "fit_resample") and (hasattr(t, "fit_transform") or - hasattr(t, "transform"))): + hasattr(t, "transform"))): raise TypeError( "All intermediate steps of the chain should " "be estimators that implement fit and transform or sample." 
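For readers following this series, a minimal sketch of how a sampler now slots into the imblearn ``Pipeline`` validated above (an illustration, not part of the patch; the toy dataset is an assumption, while ``make_pipeline``, ``RandomUnderSampler`` and the ``fit_resample`` contract are the library's own):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from imblearn.pipeline import make_pipeline
    from imblearn.under_sampling import RandomUnderSampler

    X, y = make_classification(n_samples=500, weights=[0.9, 0.1],
                               random_state=0)
    # A sampler exposes fit_resample (and no transform), so it is accepted
    # by _validate_steps as an intermediate step.
    pipe = make_pipeline(RandomUnderSampler(random_state=0),
                         LogisticRegression(random_state=0))
    pipe.fit(X, y)     # resampling is applied to the training data only
    pipe.predict(X)    # the prediction path skips the sampler

Note the design choice enforced by ``_validate_steps``: an intermediate step must implement either ``transform`` or ``fit_resample``, never both, so the pipeline always knows whether a step transforms features or resamples rows.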
diff --git a/imblearn/tests/test_pipeline.py b/imblearn/tests/test_pipeline.py index 15f876b63..6033f0617 100644 --- a/imblearn/tests/test_pipeline.py +++ b/imblearn/tests/test_pipeline.py @@ -1037,7 +1037,8 @@ def test_pipeline_fit_then_sample_with_sampler_last_estimator(): rus = RandomUnderSampler(random_state=42) enn = ENN() pipeline = make_pipeline(rus, enn) - X_fit_resample_resampled, y_fit_resample_resampled = pipeline.fit_resample(X, y) + X_fit_resample_resampled, y_fit_resample_resampled = \ + pipeline.fit_resample(X, y) pipeline = make_pipeline(rus, enn) pipeline.fit(X, y) X_fit_then_sample_res, y_fit_then_sample_res = pipeline.fit_resample(X, y) @@ -1061,7 +1062,8 @@ def test_pipeline_fit_then_sample_3_samplers_with_sampler_last_estimator(): rus = RandomUnderSampler(random_state=42) enn = ENN() pipeline = make_pipeline(rus, enn, rus) - X_fit_resample_resampled, y_fit_resample_resampled = pipeline.fit_resample(X, y) + X_fit_resample_resampled, y_fit_resample_resampled = \ + pipeline.fit_resample(X, y) pipeline = make_pipeline(rus, enn, rus) pipeline.fit(X, y) X_fit_then_sample_res, y_fit_then_sample_res = pipeline.fit_resample(X, y) From bbecd3094cb3c6f836449614215ffaa572251f26 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 27 Aug 2018 17:48:12 +0200 Subject: [PATCH 03/15] DOC: add whats new entry --- doc/introduction.rst | 6 +----- doc/whats_new/v0.0.4.rst | 5 +++++ 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/introduction.rst b/doc/introduction.rst index 0c5bf7a4a..6b8aa8cf3 100644 --- a/doc/introduction.rst +++ b/doc/introduction.rst @@ -18,14 +18,10 @@ and adding a sampling functionality through the ``sample`` method: estimator = obj.fit(data, targets) -:Sampler: +:Resampler: To resample a data set, each sampler implements:: - data_resampled, targets_resampled = obj.sample(data, targets) - - Fitting and sampling can also be done in one step:: - data_resampled, targets_resampled = obj.fit_resample(data, targets) Imbalanced-learn samplers accept the same inputs that in scikit-learn: diff --git a/doc/whats_new/v0.0.4.rst b/doc/whats_new/v0.0.4.rst index 732f9c12e..ad5cc53f4 100644 --- a/doc/whats_new/v0.0.4.rst +++ b/doc/whats_new/v0.0.4.rst @@ -18,6 +18,11 @@ API - Enable to use a ``list`` for the cleaning methods to specify the class to sample. :issue:`411` by :user:`Guillaume Lemaitre `. +- Replace ``fit_sample`` by ``fit_resample``. An alias is still available for + backward compatibility. In addition, ``sample`` has been removed to avoid + resampling on a different set of data. + :issue:`xxx` by :user:`Guillaume Lemaitre `. + New features ............ From f7120d805152b17b08fdb3dcefea53f79729de42 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 27 Aug 2018 17:59:19 +0200 Subject: [PATCH 04/15] DOC add issue number --- doc/whats_new/v0.0.4.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.0.4.rst b/doc/whats_new/v0.0.4.rst index ad5cc53f4..0bbe287ca 100644 --- a/doc/whats_new/v0.0.4.rst +++ b/doc/whats_new/v0.0.4.rst @@ -21,7 +21,7 @@ API - Replace ``fit_sample`` by ``fit_resample``. An alias is still available for backward compatibility. In addition, ``sample`` has been removed to avoid resampling on a different set of data. - :issue:`xxx` by :user:`Guillaume Lemaitre `. + :issue:`462` by :user:`Guillaume Lemaitre `. New features ............
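To make the changelog entry above concrete, a short sketch of the rename (illustration only; the toy data is an assumption, the alias is the ``fit_sample = fit_resample`` binding kept in ``imblearn/base.py``):

    from collections import Counter
    from sklearn.datasets import make_classification
    from imblearn.over_sampling import RandomOverSampler

    X, y = make_classification(n_samples=200, weights=[0.9, 0.1],
                               random_state=0)
    ros = RandomOverSampler(random_state=0)
    X_new, y_new = ros.fit_resample(X, y)  # new, preferred spelling
    X_old, y_old = ros.fit_sample(X, y)    # backward-compatible alias
    print(sorted(Counter(y_new).items()))  # both classes equally sized

Because ``fit_sample`` is bound directly to ``fit_resample``, the two calls execute the same code; only the standalone ``sample`` method is gone.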
From 00f8e4419049405cac19aca26770d5e92e4113b6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 27 Aug 2018 18:09:45 +0200 Subject: [PATCH 05/15] PEP8 examples --- examples/applications/porto_seguro_keras_under_sampling.py | 2 +- examples/under-sampling/plot_illustration_tomek_links.py | 4 ++-- examples/under-sampling/plot_instance_hardness_threshold.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/applications/porto_seguro_keras_under_sampling.py b/examples/applications/porto_seguro_keras_under_sampling.py index c154362d9..f1a006660 100644 --- a/examples/applications/porto_seguro_keras_under_sampling.py +++ b/examples/applications/porto_seguro_keras_under_sampling.py @@ -49,7 +49,7 @@ ############################################################################### from sklearn.compose import ColumnTransformer -from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.pipeline import make_pipeline from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import FunctionTransformer diff --git a/examples/under-sampling/plot_illustration_tomek_links.py b/examples/under-sampling/plot_illustration_tomek_links.py index ca5dd9f78..4f070b699 100644 --- a/examples/under-sampling/plot_illustration_tomek_links.py +++ b/examples/under-sampling/plot_illustration_tomek_links.py @@ -80,8 +80,8 @@ def make_plot_despine(ax): [TomekLinks(sampling_strategy='auto'), TomekLinks(sampling_strategy='all')]): X_res, y_res = sampler.fit_resample(np.vstack((X_minority, X_majority)), - np.array([0] * X_minority.shape[0] + - [1] * X_majority.shape[0])) + np.array([0] * X_minority.shape[0] + + [1] * X_majority.shape[0])) ax.scatter(X_res[y_res == 0][:, 0], X_res[y_res == 0][:, 1], label='Minority class', s=200, marker='_') ax.scatter(X_res[y_res == 1][:, 0], X_res[y_res == 1][:, 1], diff --git a/examples/under-sampling/plot_instance_hardness_threshold.py b/examples/under-sampling/plot_instance_hardness_threshold.py index 8aab71899..9d2456b99 100644 --- a/examples/under-sampling/plot_instance_hardness_threshold.py +++ b/examples/under-sampling/plot_instance_hardness_threshold.py @@ -53,9 +53,9 @@ def plot_resampling(ax, X, y, title): axs = [a for ax in axs for a in ax] for ax, sampling_strategy in zip(axs, (0, - {1: 25, 0: 10}, - {1: 14, 0: 10}, - {1: 10, 0: 10})): + {1: 25, 0: 10}, + {1: 14, 0: 10}, + {1: 10, 0: 10})): if sampling_strategy == 0: c0, c1 = plot_resampling(ax, X_vis, y, 'Original set') else: From 9046477d70e44050e72d3637f3b354682e6eac54 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 27 Aug 2018 18:18:36 +0200 Subject: [PATCH 06/15] DOC fix import --- doc/api.rst | 1 - .../plot_comparison_over_sampling.py | 18 ++++-------------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index fccdae47f..2bb58ddc1 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -247,4 +247,3 @@ Imbalance-learn provides some fast-prototyping tools. 
utils.check_neighbors_object utils.check_ratio utils.check_sampling_strategy - utils.hash_X_y diff --git a/examples/over-sampling/plot_comparison_over_sampling.py b/examples/over-sampling/plot_comparison_over_sampling.py index 29a7b657c..587a258d0 100644 --- a/examples/over-sampling/plot_comparison_over_sampling.py +++ b/examples/over-sampling/plot_comparison_over_sampling.py @@ -23,8 +23,7 @@ from imblearn.over_sampling import ADASYN from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE from imblearn.over_sampling import RandomOverSampler -from imblearn.base import SamplerMixin -from imblearn.utils import hash_X_y +from imblearn.base import BaseSampler print(__doc__) @@ -131,20 +130,11 @@ def plot_decision_function(X, y, clf, ax): # Make an identity sampler -class FakeSampler(SamplerMixin): +class FakeSampler(BaseSampler): - def fit(self, X, y): - self.ratio_ = 1 - self.X_hash_ = hash_X_y(X, y) - return self + _sampling_type = 'bypass' - def sample(self, X, y): - return X, - - def _sample(self, X, y): - pass - - def fit_resample(self, X, y): + def _fit_resample(self, X, y): return X, y From 8b3aa5066f0f035fbc4af7f29d3409e2983d367f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 27 Aug 2018 22:31:12 +0200 Subject: [PATCH 07/15] iter --- imblearn/ensemble/base.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/imblearn/ensemble/base.py b/imblearn/ensemble/base.py index 968b57ff4..f5de813b1 100644 --- a/imblearn/ensemble/base.py +++ b/imblearn/ensemble/base.py @@ -58,7 +58,5 @@ def fit_resample(self, X, y): [label_binarize(batch_y, classes) for batch_y in y_resampled]) if len(output) == 2: return output[0], y_resampled_encoded - else: - return output[0], y_resampled_encoded, output[2] - else: - return output + return output[0], y_resampled_encoded, output[2] + return output From 24fd62d631aef35e0d8b90711b292edd329e2b18 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 27 Aug 2018 22:38:10 +0200 Subject: [PATCH 08/15] TST remove sample in pipeline --- imblearn/pipeline.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/imblearn/pipeline.py b/imblearn/pipeline.py index 66017b371..18cea4059 100644 --- a/imblearn/pipeline.py +++ b/imblearn/pipeline.py @@ -311,35 +311,6 @@ def fit_resample(self, X, y=None, **fit_params): elif hasattr(last_step, 'fit_resample'): return last_step.fit_resample(Xt, yt, **fit_params) - @if_delegate_has_method(delegate='_final_estimator') - def sample(self, X, y): - """Sample the data with the final estimator - - Applies transformers/samplers to the data, and the sample - method of the final estimator. Valid only if the final - estimator implements sample. - - Parameters - ---------- - X : iterable - Data to predict on. Must fulfill input requirements of first step - of the pipeline. - - """ - Xt = X - for name, transform in self.steps[:-1]: - if transform is None: - continue - if hasattr(transform, "fit_resample"): - # XXX: Calling sample in pipeline it means that the - # last estimator is a sampler. Samplers don't carry - # the sampled data. So, call 'fit_resample' in all intermediate - # steps to get the sampled data for the last estimator. 
- Xt, y = transform.fit_resample(Xt, y) - else: - Xt = transform.transform(Xt) - return self.steps[-1][-1].fit_resample(Xt, y) - @if_delegate_has_method(delegate='_final_estimator') def predict(self, X): """Apply transformers/samplers to the data, and predict with the final From 59725c78029b23783406575d97175d9a21b22a17 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 27 Aug 2018 23:58:22 +0200 Subject: [PATCH 09/15] TST: make sure samplers common test are run --- imblearn/base.py | 3 ++- imblearn/ensemble/base.py | 1 + imblearn/utils/estimator_checks.py | 24 +----------------------- 3 files changed, 4 insertions(+), 24 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 656e113ec..661f928da 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -16,6 +16,7 @@ from sklearn.externals import six from sklearn.preprocessing import label_binarize from sklearn.utils import check_X_y + from .utils import check_sampling_strategy, check_target_type from .utils.deprecation import deprecate_parameter @@ -72,7 +73,7 @@ def fit_resample(self, X, y): The array containing the resampled data. y_resampled : array-like, shape (n_samples_new,) - The corresponding label of `X_resampled` + The corresponding label of `X_resampled`. """ self._deprecate_ratio() diff --git a/imblearn/ensemble/base.py b/imblearn/ensemble/base.py index f5de813b1..e069d0f52 100644 --- a/imblearn/ensemble/base.py +++ b/imblearn/ensemble/base.py @@ -42,6 +42,7 @@ def fit_resample(self, X, y): The corresponding label of `X_resampled` """ + self._deprecate_ratio() # Ensemble are a bit specific since they are returning an array of # resampled arrays. X, y, binarize_y = self._check_X_y(X, y) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index c525a4a4b..42e0a7201 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -66,8 +66,6 @@ def monkey_patch_check_dtype_object(name, estimator_orig): def _yield_sampler_checks(name, Estimator): yield check_target_type yield check_samplers_one_label - yield check_samplers_no_fit_error - yield check_samplers_X_consistancy_sample yield check_samplers_fit yield check_samplers_fit_resample yield check_samplers_ratio_fit_resample @@ -80,7 +78,7 @@ def _yield_sampler_checks(name, Estimator): def _yield_all_checks(name, estimator): # trigger our checks if this is a SamplerMixin - if hasattr(estimator, 'sample'): + if hasattr(estimator, 'fit_resample'): for check in _yield_sampler_checks(name, estimator): yield check @@ -140,26 +138,6 @@ def check_samplers_one_label(name, Sampler): raise exc -def check_samplers_no_fit_error(name, Sampler): - sampler = Sampler() - X = np.random.random((20, 2)) - y = np.array([1] * 5 + [0] * 15) - with pytest.raises(NotFittedError, match="instance is not fitted yet."): - sampler.sample(X, y) - - -def check_samplers_X_consistancy_sample(name, Sampler): - sampler = Sampler() - X = np.random.random((30, 2)) - y = np.array([1] * 20 + [0] * 10) - sampler.fit(X, y) - X_different = np.random.random((40, 2)) - y_different = y = np.array([1] * 25 + [0] * 15) - msg = "X and y need to be same array earlier" - with pytest.raises(RuntimeError, match=msg): - sampler.sample(X_different, y_different) - - def check_samplers_fit(name, Sampler): sampler = Sampler() X = np.random.random((30, 2)) From 6189206c92160a4c5cf8e8625cc5cd0d9dc14c1e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 28 Aug 2018 00:05:59 +0200 Subject: [PATCH 10/15] PEP8 --- imblearn/utils/estimator_checks.py | 1 
- 1 file changed, 1 deletion(-) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 42e0a7201..3ffcbe4bd 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -22,7 +22,6 @@ from sklearn.preprocessing import label_binarize from sklearn.utils.estimator_checks import check_estimator \ as sklearn_check_estimator, check_parameters_default_constructible -from sklearn.exceptions import NotFittedError from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import set_random_state From 61f53a72df1d20f18806e6c5c9c762aaddc96bb4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 28 Aug 2018 01:02:02 +0200 Subject: [PATCH 11/15] EHN: resample additional arrays apart from X and y --- imblearn/base.py | 23 ++++++++++++++++--- .../_condensed_nearest_neighbour.py | 12 ++++++---- .../_edited_nearest_neighbours.py | 16 +++++++------ .../_instance_hardness_threshold.py | 12 ++++++---- .../_prototype_selection/_nearmiss.py | 12 ++++++---- .../_neighbourhood_cleaning_rule.py | 14 ++++++----- .../_one_sided_selection.py | 11 +++++---- .../_random_under_sampler.py | 13 +++++++---- imblearn/utils/estimator_checks.py | 18 +++++++++++++++ 9 files changed, 91 insertions(+), 40 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 661f928da..9c57a7b58 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -16,6 +16,8 @@ from sklearn.externals import six from sklearn.preprocessing import label_binarize from sklearn.utils import check_X_y +from sklearn.utils import check_consistent_length +from sklearn.utils import indexable from .utils import check_sampling_strategy, check_target_type from .utils.deprecation import deprecate_parameter @@ -55,7 +57,7 @@ def fit(self, X, y): self.sampling_strategy, y, self._sampling_type) return self - def fit_resample(self, X, y): + def fit_resample(self, X, y, *arrays): """Resample the dataset. Parameters @@ -66,6 +68,11 @@ def fit_resample(self, X, y): y : array-like, shape (n_samples,) Corresponding label for each sample in X. + *arrays : sequence of indexables with same length / shape[0] + Allowed inputs are lists, numpy arrays, scipy-sparse matrices or + pandas dataframes. It is the placeholder to sample + ``sample_weight`` array. + Returns ------- X_resampled : {array-like, sparse matrix}, shape \ @@ -75,15 +82,20 @@ def fit_resample(self, X, y): y_resampled : array-like, shape (n_samples_new,) The corresponding label of `X_resampled`. + *arrays : sequence of indexables, shape (n_samples_new,) or \ +(n_samples_new, n_features) + """ self._deprecate_ratio() + arrays = indexable(*arrays) X, y, binarize_y = self._check_X_y(X, y) + check_consistent_length(X, y, *arrays) self.sampling_strategy_ = check_sampling_strategy( self.sampling_strategy, y, self._sampling_type) - output = self._fit_resample(X, y) + output = self._fit_resample(X, y, *arrays) if binarize_y: y_sampled = label_binarize(output[1], np.unique(y)) @@ -96,7 +108,7 @@ def fit_resample(self, X, y): fit_sample = fit_resample @abstractmethod - def _fit_resample(self, X, y): + def _fit_resample(self, X, y, *arrays): """Base method defined in each sampler to defined the sampling strategy. @@ -108,6 +120,11 @@ def _fit_resample(self, X, y): y : array-like, shape (n_samples,) Corresponding label for each sample in X. 
+ *arrays : sequence of indexables with same length / shape[0] + Allowed inputs are lists, numpy arrays, scipy-sparse matrices or + pandas dataframes. It is the placeholder to sample + ``sample_weight`` array. + Returns ------- X_resampled : {array-like, sparse matrix}, shape \ (n_samples_new, n_features) The array containing the resampled data. y_resampled : array-like, shape (n_samples_new,) The corresponding label of `X_resampled`. + *arrays : sequence of indexables, shape (n_samples_new,) or \ +(n_samples_new, n_features) + """ self._deprecate_ratio() + arrays = indexable(*arrays) X, y, binarize_y = self._check_X_y(X, y) + check_consistent_length(X, y, *arrays) self.sampling_strategy_ = check_sampling_strategy( self.sampling_strategy, y, self._sampling_type) - output = self._fit_resample(X, y) + output = self._fit_resample(X, y, *arrays) if binarize_y: y_sampled = label_binarize(output[1], np.unique(y)) @@ -96,7 +108,7 @@ def fit_resample(self, X, y): fit_sample = fit_resample @abstractmethod - def _fit_resample(self, X, y): + def _fit_resample(self, X, y, *arrays): """Base method defined in each sampler to define the sampling strategy. @@ -108,6 +120,11 @@ def _fit_resample(self, X, y): y : array-like, shape (n_samples,) Corresponding label for each sample in X.
b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -9,6 +9,7 @@ from __future__ import division from collections import Counter +from itertools import chain import numpy as np @@ -125,7 +126,7 @@ def _validate_estimator(self): raise ValueError('Invalid parameter `estimator`. Got {}.'.format( type(self.estimator))) - def _fit_resample(self, X, y): + def _fit_resample(self, X, y, *arrays): self._validate_estimator() target_stats = Counter(y) @@ -167,8 +168,9 @@ def _fit_resample(self, X, y): np.flatnonzero(y == target_class)[index_target_class]), axis=0) + resampled_arrays = list(chain.from_iterable( + (safe_indexing(array, idx_under),) for array in (X, y, *arrays))) + if self.return_indices: - return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), - idx_under) - else: - return safe_indexing(X, idx_under), safe_indexing(y, idx_under) + return resampled_arrays + [idx_under] + return resampled_arrays \ No newline at end of file diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py index 8048b8e2c..dbd351ef4 100644 --- a/imblearn/under_sampling/_prototype_selection/_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/_nearmiss.py @@ -8,6 +8,7 @@ import warnings from collections import Counter +from itertools import chain import numpy as np @@ -211,7 +212,7 @@ def _validate_estimator(self): raise ValueError('Parameter `version` must be 1, 2 or 3, got' ' {}'.format(self.version)) - def _fit_resample(self, X, y): + def _fit_resample(self, X, y, *arrays): self._validate_estimator() idx_under = np.empty((0, ), dtype=int) @@ -277,8 +278,9 @@ def _fit_resample(self, X, y): np.flatnonzero(y == target_class)[index_target_class]), axis=0) + resampled_arrays = list(chain.from_iterable( + (safe_indexing(array, idx_under),) for array in (X, y, *arrays))) + if self.return_indices: - return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), - idx_under) - else: - return safe_indexing(X, idx_under), safe_indexing(y, idx_under) + return resampled_arrays + [idx_under] + return resampled_arrays diff --git a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py index 716802d25..147f13753 100644 --- a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py @@ -7,6 +7,7 @@ from __future__ import division, print_function from collections import Counter +from itertools import chain import numpy as np from scipy.stats import mode @@ -139,7 +140,7 @@ def _validate_estimator(self): "'threshold_cleaning' is a value between 0 and 1." 
" Got {} instead.".format(self.threshold_cleaning)) - def _fit_resample(self, X, y): + def _fit_resample(self, X, y, *arrays): self._validate_estimator() enn = EditedNearestNeighbours( sampling_strategy=self.sampling_strategy, @@ -186,9 +187,10 @@ def _fit_resample(self, X, y): selected_samples[union_a1_a2] = False index_target_class = np.flatnonzero(selected_samples) + resampled_arrays = list(chain.from_iterable( + (safe_indexing(array, index_target_class),) + for array in (X, y, *arrays))) + if self.return_indices: - return (safe_indexing(X, index_target_class), safe_indexing( - y, index_target_class), index_target_class) - else: - return (safe_indexing(X, index_target_class), safe_indexing( - y, index_target_class)) + return resampled_arrays + [index_target_class] + return resampled_arrays diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index c23f547e8..c5d0f5b7d 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -7,6 +7,7 @@ from __future__ import division from collections import Counter +from itertools import chain import numpy as np @@ -122,7 +123,7 @@ def _validate_estimator(self): ' inhereited from KNeighborsClassifier.' ' Got {} instead.'.format(type(self.n_neighbors))) - def _fit_resample(self, X, y): + def _fit_resample(self, X, y, *arrays): self._validate_estimator() random_state = check_random_state(self.random_state) @@ -174,7 +175,9 @@ def _fit_resample(self, X, y): X_resampled, y_resampled) idx_under = safe_indexing(idx_under, idx_cleaned) + resampled_arrays = list(chain.from_iterable( + (safe_indexing(array, idx_under),) for array in arrays)) + if self.return_indices: - return (X_cleaned, y_cleaned, idx_under) - else: - return X_cleaned, y_cleaned + return [X_cleaned, y_cleaned] + resampled_arrays +[idx_under] + return [X_cleaned, y_cleaned] + resampled_arrays diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index d1ac8f4cc..10a14ec91 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -6,6 +6,8 @@ from __future__ import division +from itertools import chain + import numpy as np from sklearn.utils import check_X_y, check_random_state, safe_indexing @@ -92,7 +94,7 @@ def _check_X_y(X, y): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None) return X, y, binarize_y - def _fit_resample(self, X, y): + def _fit_resample(self, X, y, *arrays): random_state = check_random_state(self.random_state) idx_under = np.empty((0, ), dtype=int) @@ -112,8 +114,9 @@ def _fit_resample(self, X, y): np.flatnonzero(y == target_class)[index_target_class]), axis=0) + resampled_arrays = list(chain.from_iterable( + (safe_indexing(array, idx_under),) for array in (X, y, *arrays))) + if self.return_indices: - return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), - idx_under) - else: - return safe_indexing(X, idx_under), safe_indexing(y, idx_under) + return resampled_arrays + [idx_under] + return resampled_arrays diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 3ffcbe4bd..f7bab535f 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -10,6 +10,7 @@ import traceback from 
collections import Counter +from inspect import signature import pytest @@ -20,6 +21,7 @@ from sklearn.datasets import make_classification from sklearn.cluster import KMeans from sklearn.preprocessing import label_binarize +from sklearn.utils import check_consistent_length from sklearn.utils.estimator_checks import check_estimator \ as sklearn_check_estimator, check_parameters_default_constructible from sklearn.utils.testing import assert_allclose @@ -73,6 +75,7 @@ def _yield_sampler_checks(name, Estimator): yield check_samplers_pandas yield check_samplers_multiclass_ova yield check_samplers_preserve_dtype + yield check_samplers_resample_sample_weight def _yield_all_checks(name, estimator): @@ -358,3 +361,18 @@ def check_samplers_preserve_dtype(name, Sampler): X_res, y_res = sampler.fit_resample(X, y) assert X.dtype == X_res.dtype assert y.dtype == y_res.dtype + + +def check_samplers_resample_sample_weight(name, Sampler): + # check that X, y, and an additional sample_weight array can be resampled; + # check_consistent_length raises on a mismatch, so no assert is needed. + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0) + sample_weight = np.ones_like(y) + sampler = Sampler() + set_random_state(sampler) + X_res, y_res, sw_res = sampler.fit_resample(X, y, sample_weight) + check_consistent_length(X_res, y_res, sw_res) From 8f86d9861b9c1f7fe49bdd86f28f45ad1b5b4b94 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 28 Aug 2018 14:25:48 +0200 Subject: [PATCH 12/15] FIX only consider sample_weight --- imblearn/base.py | 46 ++++++----- .../_condensed_nearest_neighbour.py | 11 +-- .../_edited_nearest_neighbours.py | 81 ++++++++++++------- .../_instance_hardness_threshold.py | 25 ++++-- .../_prototype_selection/_nearmiss.py | 8 +- .../_neighbourhood_cleaning_rule.py | 9 +-- .../_one_sided_selection.py | 24 +++--- .../_random_under_sampler.py | 11 +-- .../_prototype_selection/_tomek_links.py | 12 +-- 9 files changed, 140 insertions(+), 87 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 9c57a7b58..1aeb9c934 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -57,7 +57,7 @@ def fit(self, X, y): self.sampling_strategy, y, self._sampling_type) return self - def fit_resample(self, X, y, *arrays): + def fit_resample(self, X, y, sample_weight=None): """Resample the dataset. Parameters @@ -68,34 +68,38 @@ def fit_resample(self, X, y, *arrays): y : array-like, shape (n_samples,) Corresponding label for each sample in X. - *arrays : sequence of indexables with same length / shape[0] - Allowed inputs are lists, numpy arrays, scipy-sparse matrices or - pandas dataframes. It is the placeholder to sample - ``sample_weight`` array. + sample_weight : array-like, shape (n_samples,) or None + Sample weights. + Returns ------- - X_resampled : {array-like, sparse matrix}, shape \ + X_resampled : {ndarray, sparse matrix}, shape \ (n_samples_new, n_features) The array containing the resampled data. - y_resampled : array-like, shape (n_samples_new,) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled`. - *arrays : sequence of indexables, shape (n_samples_new,) or \ -(n_samples_new, n_features) + sample_weight_resampled : ndarray, shape (n_samples_new,) + Resampled sample weights. This output is returned only if + ``sample_weight`` was not ``None``. + + idx_resampled : ndarray, shape (n_samples_new,) + Indices of the selected samples. This output is optional and only + available for some samplers if ``return_indices=True``.
""" self._deprecate_ratio() - arrays = indexable(*arrays) X, y, binarize_y = self._check_X_y(X, y) - check_consistent_length(X, y, *arrays) + if sample_weight is not None: + check_consistent_length(X, y, sample_weight) self.sampling_strategy_ = check_sampling_strategy( self.sampling_strategy, y, self._sampling_type) - output = self._fit_resample(X, y, *arrays) + output = self._fit_resample(X, y, sample_weight) if binarize_y: y_sampled = label_binarize(output[1], np.unique(y)) @@ -108,7 +112,7 @@ def fit_resample(self, X, y, *arrays): fit_sample = fit_resample @abstractmethod - def _fit_resample(self, X, y, *arrays): + def _fit_resample(self, X, y, sample_weight=None): """Base method defined in each sampler to defined the sampling strategy. @@ -120,10 +124,8 @@ def _fit_resample(self, X, y, *arrays): y : array-like, shape (n_samples,) Corresponding label for each sample in X. - *arrays : sequence of indexables with same length / shape[0] - Allowed inputs are lists, numpy arrays, scipy-sparse matrices or - pandas dataframes. It is the placeholder to sample - ``sample_weight`` array. + sample_weight : array-like, shape (n_samples,) or None + Sample weights. Returns ------- @@ -132,7 +134,15 @@ def _fit_resample(self, X, y, *arrays): The array containing the resampled data. y_resampled : ndarray, shape (n_samples_new,) - The corresponding label of `X_resampled` + The corresponding label of `X_resampled`. + + sample_weight_resampled : ndarray, shape (n_samples_new,) + Resampled sample weights. This output is returned only if + ``sample_weight`` was not ``None``. + + idx_resampled : ndarray, shape (n_samples_new,) + Indices of the selected features. This output is optional and only + available for some sampler if ``return_indices=True``. """ pass diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py index a5c49ed2a..e20243aaf 100644 --- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py @@ -129,7 +129,7 @@ def _validate_estimator(self): ' inhereited from KNeighborsClassifier.' 
' Got {} instead.'.format(type(self.n_neighbors))) - def _fit_resample(self, X, y, *arrays): + def _fit_resample(self, X, y, sample_weight=None): self._validate_estimator() random_state = check_random_state(self.random_state) @@ -202,9 +202,10 @@ def _fit_resample(self, X, y, *arrays): idx_under = np.concatenate( (idx_under, np.flatnonzero(y == target_class)), axis=0) - resampled_arrays = list(chain.from_iterable( - (safe_indexing(array, idx_under),) for array in (X, y, *arrays))) + resampled_arrays = [safe_indexing(arr, idx_under) + for arr in (X, y, sample_weight) + if arr is not None] if self.return_indices: - return resampled_arrays + [idx_under] - return resampled_arrays + return tuple(resampled_arrays + [idx_under]) + return tuple(resampled_arrays) diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index 72c62e73d..beb90e420 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -9,7 +9,6 @@ from __future__ import division from collections import Counter -from itertools import chain import numpy as np from scipy.stats import mode @@ -139,7 +138,7 @@ def _validate_estimator(self): if self.kind_sel not in SEL_KIND: raise NotImplementedError - def _fit_resample(self, X, y, *arrays): + def _fit_resample(self, X, y, sample_weight=None): self._validate_estimator() idx_under = np.empty((0, ), dtype=int) @@ -169,12 +168,13 @@ def _fit_resample(self, X, y, *arrays): np.flatnonzero(y == target_class)[index_target_class]), axis=0) - resampled_arrays = list(chain.from_iterable( - (safe_indexing(array, idx_under),) for array in (X, y, *arrays))) + resampled_arrays = [safe_indexing(arr, idx_under) + for arr in (X, y, sample_weight) + if arr is not None] if self.return_indices: - return resampled_arrays + [idx_under] - return resampled_arrays + return tuple(resampled_arrays + [idx_under]) + return tuple(resampled_arrays) @Substitution( @@ -305,22 +305,35 @@ def _validate_estimator(self): n_jobs=self.n_jobs, ratio=self.ratio) - def _fit_resample(self, X, y, *arrays): + def _fit_resample(self, X, y, sample_weight=None): self._validate_estimator() - X_, y_ = X, y + X_, y_, sample_weight_ = X, y, sample_weight if self.return_indices: idx_under = np.arange(X.shape[0], dtype=int) target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) - for n_iter in range(self.max_iter): + for _ in range(self.max_iter): prev_len = y_.shape[0] if self.return_indices: - X_enn, y_enn, idx_enn = self.enn_.fit_resample(X_, y_) + resampled_data = self.enn_.fit_resample(X_, y_, sample_weight_) else: - X_enn, y_enn = self.enn_.fit_resample(X_, y_) + resampled_data = self.enn_.fit_resample(X_, y_, sample_weight_) + + # unpacking data + if len(resampled_data) == 2: + X_enn, y_enn = resampled_data + sample_weight_enn = None + elif len(resampled_data) == 3: + if sample_weight_ is not None: + X_enn, y_enn, sample_weight_enn = resampled_data + else: + X_enn, y_enn, idx_enn = resampled_data + sample_weight_enn = None + else: + X_enn, y_enn, sample_weight_enn, idx_enn = resampled_data # Check the stopping criterion # 1. 
If there is no changes for the vector y
@@ -343,25 +356,24 @@ def _fit_resample(self, X, y, *arrays):
            # Case 3
            b_remove_maj_class = (len(stats_enn) < len(target_stats))

-            X_, y_, = X_enn, y_enn
+            X_, y_, sample_weight_ = X_enn, y_enn, sample_weight_enn
+
            if self.return_indices:
                idx_under = idx_under[idx_enn]

            if b_conv or b_min_bec_maj or b_remove_maj_class:
                if b_conv:
+                    X_, y_, sample_weight_ = X_enn, y_enn, sample_weight_enn
                    if self.return_indices:
-                        X_, y_, = X_enn, y_enn
                        idx_under = idx_under[idx_enn]
-                    else:
-                        X_, y_, = X_enn, y_enn
                break

-        X_resampled, y_resampled = X_, y_
+        resampled_arrays = [arr for arr in (X_, y_, sample_weight_)
+                            if arr is not None]

        if self.return_indices:
-            return X_resampled, y_resampled, idx_under
-        else:
-            return X_resampled, y_resampled
+            return tuple(resampled_arrays + [idx_under])
+        return tuple(resampled_arrays)

 @Substitution(
@@ -491,10 +503,10 @@ def _validate_estimator(self):
            n_jobs=self.n_jobs,
            ratio=self.ratio)

-    def _fit_resample(self, X, y, *arrays):
+    def _fit_resample(self, X, y, sample_weight=None):
        self._validate_estimator()

-        X_, y_ = X, y
+        X_, y_, sample_weight_ = X, y, sample_weight

        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)
@@ -505,9 +517,22 @@ def _fit_resample(self, X, y, *arrays):
            self.enn_.n_neighbors = curr_size_ngh

-            if self.return_indices:
-                X_enn, y_enn, idx_enn = self.enn_.fit_resample(X_, y_)
-            else:
-                X_enn, y_enn = self.enn_.fit_resample(X_, y_)
+            resampled_data = self.enn_.fit_resample(X_, y_, sample_weight_)
+
+            # unpack the variable-length output of ``fit_resample``
+            if len(resampled_data) == 2:
+                X_enn, y_enn = resampled_data
+                sample_weight_enn = None
+            elif len(resampled_data) == 3:
+                if sample_weight_ is not None:
+                    X_enn, y_enn, sample_weight_enn = resampled_data
+                else:
+                    X_enn, y_enn, idx_enn = resampled_data
+                    sample_weight_enn = None
+            else:
+                X_enn, y_enn, sample_weight_enn, idx_enn = resampled_data

            # Check the stopping criterion
            # 1.
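The length-based unpacking above appears verbatim in both RENN and AllKNN; it could be factored into a helper along these lines (a sketch under the same conventions, not code from the patch):

    def unpack_fit_resample(output, with_sample_weight):
        """Return (X, y, sample_weight, idx); missing outputs become None."""
        if len(output) == 2:
            X, y = output
            return X, y, None, None
        if len(output) == 3:
            if with_sample_weight:
                X, y, sw = output
                return X, y, sw, None
            X, y, idx = output
            return X, y, None, idx
        X, y, sw, idx = output
        return X, y, sw, idx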
If the number of samples in the other class become inferior to @@ -528,16 +553,16 @@ def _fit_resample(self, X, y, *arrays): # Case 2 b_remove_maj_class = (len(stats_enn) < len(target_stats)) - X_, y_, = X_enn, y_enn + X_, y_, sample_weight_ = X_enn, y_enn, sample_weight_enn if self.return_indices: idx_under = idx_under[idx_enn] if b_min_bec_maj or b_remove_maj_class: break - X_resampled, y_resampled = X_, y_ + resampled_arrays = [arr for arr in (X_, y_, sample_weight_) + if arr is not None] if self.return_indices: - return X_resampled, y_resampled, idx_under - else: - return X_resampled, y_resampled + return tuple(resampled_arrays + [idx_under]) + return tuple(resampled_arrays) diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index eb0aae76b..4656556ff 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -9,7 +9,6 @@ from __future__ import division from collections import Counter -from itertools import chain import numpy as np @@ -17,6 +16,7 @@ from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import StratifiedKFold from sklearn.utils import safe_indexing +from sklearn.utils.fixes import signature from ..base import BaseCleaningSampler from ...utils import Substitution @@ -126,7 +126,7 @@ def _validate_estimator(self): raise ValueError('Invalid parameter `estimator`. Got {}.'.format( type(self.estimator))) - def _fit_resample(self, X, y, *arrays): + def _fit_resample(self, X, y, sample_weight=None): self._validate_estimator() target_stats = Counter(y) @@ -135,13 +135,23 @@ def _fit_resample(self, X, y, *arrays): random_state=self.random_state).split(X, y) probabilities = np.zeros(y.shape[0], dtype=float) + support_sample_weight = "sample_weight" in signature( + self.estimator_.fit).parameters + for train_index, test_index in skf: X_train = safe_indexing(X, train_index) X_test = safe_indexing(X, test_index) y_train = safe_indexing(y, train_index) y_test = safe_indexing(y, test_index) + if sample_weight is not None: + sample_weight_train = safe_indexing(sample_weight, train_index) + else: + sample_weight_train = None - self.estimator_.fit(X_train, y_train) + if support_sample_weight: + self.estimator_.fit(X_train, y_train, sample_weight_train) + else: + self.estimator_.fit(X_train, y_train) probs = self.estimator_.predict_proba(X_test) classes = self.estimator_.classes_ @@ -168,9 +178,10 @@ def _fit_resample(self, X, y, *arrays): np.flatnonzero(y == target_class)[index_target_class]), axis=0) - resampled_arrays = list(chain.from_iterable( - (safe_indexing(array, idx_under),) for array in (X, y, *arrays))) + resampled_arrays = [safe_indexing(arr, idx_under) + for arr in (X, y, sample_weight) + if arr is not None] if self.return_indices: - return resampled_arrays + [idx_under] - return resampled_arrays \ No newline at end of file + return tuple(resampled_arrays + [idx_under]) + return tuple(resampled_arrays) diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py index dbd351ef4..ccf1d68cf 100644 --- a/imblearn/under_sampling/_prototype_selection/_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/_nearmiss.py @@ -8,7 +8,6 @@ import warnings from collections import Counter -from itertools import chain import numpy as np @@ -212,7 +211,7 
@@ def _validate_estimator(self): raise ValueError('Parameter `version` must be 1, 2 or 3, got' ' {}'.format(self.version)) - def _fit_resample(self, X, y, *arrays): + def _fit_resample(self, X, y, sample_weight=None): self._validate_estimator() idx_under = np.empty((0, ), dtype=int) @@ -278,8 +277,9 @@ def _fit_resample(self, X, y, *arrays): np.flatnonzero(y == target_class)[index_target_class]), axis=0) - resampled_arrays = list(chain.from_iterable( - (safe_indexing(array, idx_under),) for array in (X, y, *arrays))) + resampled_arrays = [safe_indexing(arr, idx_under) + for arr in (X, y, sample_weight) + if arr is not None] if self.return_indices: return resampled_arrays + [idx_under] diff --git a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py index 147f13753..1cf548c50 100644 --- a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py @@ -7,7 +7,6 @@ from __future__ import division, print_function from collections import Counter -from itertools import chain import numpy as np from scipy.stats import mode @@ -140,7 +139,7 @@ def _validate_estimator(self): "'threshold_cleaning' is a value between 0 and 1." " Got {} instead.".format(self.threshold_cleaning)) - def _fit_resample(self, X, y, *arrays): + def _fit_resample(self, X, y, sample_weight=None): self._validate_estimator() enn = EditedNearestNeighbours( sampling_strategy=self.sampling_strategy, @@ -187,9 +186,9 @@ def _fit_resample(self, X, y, *arrays): selected_samples[union_a1_a2] = False index_target_class = np.flatnonzero(selected_samples) - resampled_arrays = list(chain.from_iterable( - (safe_indexing(array, index_target_class),) - for array in (X, y, *arrays))) + resampled_arrays = [safe_indexing(arr, index_target_class) + for arr in (X, y, sample_weight) + if arr is not None] if self.return_indices: return resampled_arrays + [index_target_class] diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index c5d0f5b7d..180d96850 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -7,7 +7,6 @@ from __future__ import division from collections import Counter -from itertools import chain import numpy as np @@ -123,7 +122,7 @@ def _validate_estimator(self): ' inhereited from KNeighborsClassifier.' 
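The ``signature`` probe used by ``InstanceHardnessThreshold`` above generalizes to any estimator; a standalone sketch, using ``inspect.signature`` directly (which ``sklearn.utils.fixes.signature`` backports for older Pythons):

    from inspect import signature
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier

    def supports_sample_weight(estimator):
        # True when the estimator's fit accepts a sample_weight array
        return 'sample_weight' in signature(estimator.fit).parameters

    print(supports_sample_weight(RandomForestClassifier()))  # True
    print(supports_sample_weight(KNeighborsClassifier()))    # False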
' Got {} instead.'.format(type(self.n_neighbors))) - def _fit_resample(self, X, y, *arrays): + def _fit_resample(self, X, y, sample_weight=None): self._validate_estimator() random_state = check_random_state(self.random_state) @@ -165,19 +164,24 @@ def _fit_resample(self, X, y, *arrays): idx_under = np.concatenate( (idx_under, np.flatnonzero(y == target_class)), axis=0) - X_resampled = safe_indexing(X, idx_under) - y_resampled = safe_indexing(y, idx_under) + X_res = safe_indexing(X, idx_under) + y_res = safe_indexing(y, idx_under) + sample_weight_res = (safe_indexing(sample_weight, idx_under) + if sample_weight is not None else None) # apply Tomek cleaning tl = TomekLinks( sampling_strategy=self.sampling_strategy_, return_indices=True) - X_cleaned, y_cleaned, idx_cleaned = tl.fit_resample( - X_resampled, y_resampled) + X_res, y_res, idx_cleaned = tl.fit_resample(X_res, y_res, + sample_weight_res) idx_under = safe_indexing(idx_under, idx_cleaned) - resampled_arrays = list(chain.from_iterable( - (safe_indexing(array, idx_under),) for array in arrays)) + sample_weight_res = (safe_indexing(sample_weight_res, idx_cleaned) + if sample_weight_res is not None else None) + + resampled_arrays = [arr for arr in (X_res, y_res, sample_weight_res) + if arr is not None] if self.return_indices: - return [X_cleaned, y_cleaned] + resampled_arrays +[idx_under] - return [X_cleaned, y_cleaned] + resampled_arrays + return tuple(resampled_arrays + [idx_under]) + return tuple(resampled_arrays) diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index 10a14ec91..bf5179247 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -94,7 +94,7 @@ def _check_X_y(X, y): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None) return X, y, binarize_y - def _fit_resample(self, X, y, *arrays): + def _fit_resample(self, X, y, sample_weight=None): random_state = check_random_state(self.random_state) idx_under = np.empty((0, ), dtype=int) @@ -114,9 +114,10 @@ def _fit_resample(self, X, y, *arrays): np.flatnonzero(y == target_class)[index_target_class]), axis=0) - resampled_arrays = list(chain.from_iterable( - (safe_indexing(array, idx_under),) for array in (X, y, *arrays))) + resampled_arrays = [safe_indexing(arr, idx_under) + for arr in (X, y, sample_weight) + if arr is not None] if self.return_indices: - return resampled_arrays + [idx_under] - return resampled_arrays + return tuple(resampled_arrays + [idx_under]) + return tuple(resampled_arrays) diff --git a/imblearn/under_sampling/_prototype_selection/_tomek_links.py b/imblearn/under_sampling/_prototype_selection/_tomek_links.py index 39de438e7..dde7c19b9 100644 --- a/imblearn/under_sampling/_prototype_selection/_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/_tomek_links.py @@ -134,7 +134,7 @@ def is_tomek(y, nn_index, class_type): return links - def _fit_resample(self, X, y): + def _fit_resample(self, X, y, sample_weight=None): # check for deprecated random_state if self.random_state is not None: deprecate_parameter(self, '0.4', 'random_state') @@ -147,8 +147,10 @@ def _fit_resample(self, X, y): links = self.is_tomek(y, nns, self.sampling_strategy_) idx_under = np.flatnonzero(np.logical_not(links)) + resampled_arrays = [safe_indexing(arr, idx_under) + for arr in (X, y, sample_weight) + if arr is not None] + if self.return_indices: - return 
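``OneSidedSelection`` above is one instance of a general pattern: run a first sampler, then feed its complete output, weights included, to a second one. A sketch of that composition, assuming the patched ``fit_resample`` signature (the helper and the sampler pairing are illustrative):

    from imblearn.under_sampling import CondensedNearestNeighbour, TomekLinks

    def chain_resample(first, second, X, y, sample_weight=None):
        # the returned tuple already has the right arity for the next sampler
        resampled = first.fit_resample(X, y, sample_weight)
        return second.fit_resample(*resampled)

    # e.g. CNN-style selection followed by Tomek-link cleaning:
    # X_res, y_res = chain_resample(CondensedNearestNeighbour(),
    #                               TomekLinks(), X, y)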
(safe_indexing(X, idx_under), safe_indexing(y, idx_under), - idx_under) - else: - return (safe_indexing(X, idx_under), safe_indexing(y, idx_under)) + return tuple(resampled_arrays + [idx_under]) + return tuple(resampled_arrays) From 9f600fd15ecb3cc4eea2c512240e7c184b1cf3df Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 28 Aug 2018 23:44:11 +0200 Subject: [PATCH 13/15] iter --- imblearn/base.py | 4 +- imblearn/combine/_smote_enn.py | 6 +- imblearn/combine/_smote_tomek.py | 6 +- imblearn/ensemble/_bagging.py | 17 +++--- imblearn/ensemble/_balance_cascade.py | 2 +- imblearn/ensemble/_easy_ensemble.py | 2 +- imblearn/over_sampling/_adasyn.py | 17 ++++-- .../over_sampling/_random_over_sampler.py | 13 +++-- imblearn/over_sampling/_smote.py | 58 +++++++++++++++---- imblearn/pipeline.py | 1 - .../_cluster_centroids.py | 7 +-- .../_condensed_nearest_neighbour.py | 1 - .../_one_sided_selection.py | 11 ++-- imblearn/utils/estimator_checks.py | 30 +++++----- 14 files changed, 112 insertions(+), 63 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index eddd54229..a4b7f7e7a 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -17,6 +17,7 @@ from sklearn.preprocessing import label_binarize from sklearn.utils import check_X_y from sklearn.utils import check_consistent_length +from sklearn.utils import check_array from .utils import check_sampling_strategy, check_target_type from .utils.deprecation import deprecate_parameter @@ -93,6 +94,7 @@ def fit_resample(self, X, y, sample_weight=None): X, y, binarize_y = self._check_X_y(X, y) if sample_weight is not None: + sample_weight = check_array(sample_weight, ensure_2d=False) check_consistent_length(X, y, sample_weight) self.sampling_strategy_ = check_sampling_strategy( @@ -269,7 +271,7 @@ def __init__(self, func=None, accept_sparse=True, kw_args=None): self.kw_args = kw_args self.logger = logging.getLogger(__name__) - def _fit_resample(self, X, y): + def _fit_resample(self, X, y, sample_weight=None): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'] if self.accept_sparse else False) func = _identity if self.func is None else self.func diff --git a/imblearn/combine/_smote_enn.py b/imblearn/combine/_smote_enn.py index 4bce0052f..87a6ce8da 100644 --- a/imblearn/combine/_smote_enn.py +++ b/imblearn/combine/_smote_enn.py @@ -125,11 +125,11 @@ def _validate_estimator(self): else: self.enn_ = EditedNearestNeighbours(sampling_strategy='all') - def _fit_resample(self, X, y): + def _fit_resample(self, X, y, sample_weight=None): self._validate_estimator() y = check_target_type(y) X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) self.sampling_strategy_ = self.sampling_strategy - X_res, y_res = self.smote_.fit_resample(X, y) - return self.enn_.fit_resample(X_res, y_res) + resampled_arrays = self.smote_.fit_resample(X, y, sample_weight) + return self.enn_.fit_resample(*resampled_arrays) diff --git a/imblearn/combine/_smote_tomek.py b/imblearn/combine/_smote_tomek.py index 43340e91d..9a85043da 100644 --- a/imblearn/combine/_smote_tomek.py +++ b/imblearn/combine/_smote_tomek.py @@ -134,11 +134,11 @@ def _validate_estimator(self): else: self.tomek_ = TomekLinks(sampling_strategy='all') - def _fit_resample(self, X, y): + def _fit_resample(self, X, y, sample_weight=None): self._validate_estimator() y = check_target_type(y) X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) self.sampling_strategy_ = self.sampling_strategy - X_res, y_res = self.smote_.fit_resample(X, y) - return self.tomek_.fit_resample(X_res, y_res) + resampled_arrays = 
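The ``check_array``/``check_consistent_length`` pair added to ``fit_resample`` above can be exercised on its own; a minimal sketch (function name hypothetical):

    import numpy as np
    from sklearn.utils import check_array, check_consistent_length

    def validate_sample_weight(X, y, sample_weight):
        if sample_weight is not None:
            # coerce lists to a 1D ndarray and check the length against X, y
            sample_weight = check_array(sample_weight, ensure_2d=False)
            check_consistent_length(X, y, sample_weight)
        return sample_weight

    X, y = np.zeros((4, 2)), np.array([0, 1, 0, 1])
    print(validate_sample_weight(X, y, [1., 1., .5, .5]))  # [1.  1.  0.5 0.5]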
self.smote_.fit_resample(X, y, sample_weight) + return self.tomek_.fit_resample(*resampled_arrays) diff --git a/imblearn/ensemble/_bagging.py b/imblearn/ensemble/_bagging.py index c6c66c656..9b77d444e 100644 --- a/imblearn/ensemble/_bagging.py +++ b/imblearn/ensemble/_bagging.py @@ -210,7 +210,7 @@ def __init__(self, self.ratio = ratio self.replacement = replacement - def _validate_estimator(self, default=DecisionTreeClassifier()): + def _validate_estimator(self): """Check the estimator and the n_estimator attribute, set the `base_estimator_` attribute.""" if not isinstance(self.n_estimators, (numbers.Integral, np.integer)): @@ -224,12 +224,14 @@ def _validate_estimator(self, default=DecisionTreeClassifier()): if self.base_estimator is not None: base_estimator = clone(self.base_estimator) else: - base_estimator = clone(default) + base_estimator = clone(DecisionTreeClassifier()) - self.base_estimator_ = Pipeline([('sampler', RandomUnderSampler( - sampling_strategy=self.sampling_strategy, - replacement=self.replacement, - ratio=self.ratio)), ('classifier', base_estimator)]) + self.base_estimator_ = Pipeline([ + ('sampler', RandomUnderSampler( + sampling_strategy=self.sampling_strategy, + replacement=self.replacement, + ratio=self.ratio)), + ('classifier', base_estimator)]) def fit(self, X, y): """Build a Bagging ensemble of estimators from the training @@ -248,6 +250,5 @@ def fit(self, X, y): self : object Returns self. """ - # RandomUnderSampler is not supporting sample_weight. We need to pass - # None. + # Pipeline does not support sample_weight return self._fit(X, y, self.max_samples, sample_weight=None) diff --git a/imblearn/ensemble/_balance_cascade.py b/imblearn/ensemble/_balance_cascade.py index 67526d233..c330c43f1 100644 --- a/imblearn/ensemble/_balance_cascade.py +++ b/imblearn/ensemble/_balance_cascade.py @@ -128,7 +128,7 @@ def _validate_estimator(self): self.logger.debug(self.estimator_) - def _fit_resample(self, X, y): + def _fit_resample(self, X, y, sample_weight=None): self._validate_estimator() self.sampling_strategy_ = check_sampling_strategy( diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py index 1a343d05c..fcc9763f7 100644 --- a/imblearn/ensemble/_easy_ensemble.py +++ b/imblearn/ensemble/_easy_ensemble.py @@ -114,7 +114,7 @@ def __init__(self, self.replacement = replacement self.n_subsets = n_subsets - def _fit_resample(self, X, y): + def _fit_resample(self, X, y, sample_weight=None): random_state = check_random_state(self.random_state) X_resampled = [] diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py index 00e3c57a4..d9a7e1784 100644 --- a/imblearn/over_sampling/_adasyn.py +++ b/imblearn/over_sampling/_adasyn.py @@ -106,12 +106,14 @@ def _validate_estimator(self): 'n_neighbors', self.n_neighbors, additional_neighbor=1) self.nn_.set_params(**{'n_jobs': self.n_jobs}) - def _fit_resample(self, X, y): + def _fit_resample(self, X, y, sample_weight=None): self._validate_estimator() random_state = check_random_state(self.random_state) X_resampled = X.copy() y_resampled = y.copy() + if sample_weight is not None: + sample_weight_resampled = sample_weight.copy() for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: @@ -165,8 +167,6 @@ def _fit_resample(self, X, y): X_new = (sparse.csr_matrix( (samples, (row_indices, col_indices)), [np.sum(n_samples_generate), X.shape[1]], dtype=X.dtype)) - y_new = np.array([class_sample] * np.sum(n_samples_generate), - dtype=y.dtype) else: 
x_class_gen = [] for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index, @@ -182,8 +182,13 @@ def _fit_resample(self, X, y): ]) X_new = np.concatenate(x_class_gen).astype(X.dtype) - y_new = np.array([class_sample] * np.sum(n_samples_generate), - dtype=y.dtype) + + y_new = np.array([class_sample] * np.sum(n_samples_generate), + dtype=y.dtype) + if sample_weight is not None: + sample_weight_resampled = np.hstack( + (sample_weight_resampled, + np.ones_like(y_new, dtype=sample_weight.dtype))) if sparse.issparse(X_new): X_resampled = sparse.vstack([X_resampled, X_new]) @@ -191,4 +196,6 @@ def _fit_resample(self, X, y): X_resampled = np.vstack((X_resampled, X_new)) y_resampled = np.hstack((y_resampled, y_new)) + if sample_weight is not None: + return X_resampled, y_resampled, sample_weight_resampled return X_resampled, y_resampled diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index b01a95a8e..0407db009 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -88,7 +88,7 @@ def _check_X_y(X, y): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None) return X, y, binarize_y - def _fit_resample(self, X, y): + def _fit_resample(self, X, y, sample_weight=None): random_state = check_random_state(self.random_state) target_stats = Counter(y) @@ -102,9 +102,10 @@ def _fit_resample(self, X, y): sample_indices = np.append(sample_indices, target_class_indices[indices]) + resampled_arrays = [safe_indexing(arr, sample_indices) + for arr in (X, y, sample_weight) + if arr is not None] + if self.return_indices: - return (safe_indexing(X, sample_indices), safe_indexing( - y, sample_indices), sample_indices) - else: - return (safe_indexing(X, sample_indices), safe_indexing( - y, sample_indices)) + return tuple(resampled_arrays + [sample_indices]) + return tuple(resampled_arrays) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 34c56a560..b0367e802 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -280,14 +280,16 @@ def _validate_estimator(self): 'Got {} instead.'.format(self.kind)) # FIXME: rename _sample -> _fit_resample in 0.6 - def _fit_resample(self, X, y): - return self._sample(X, y) + def _fit_resample(self, X, y, sample_weight=None): + return self._sample(X, y, sample_weight) - def _sample(self, X, y): + def _sample(self, X, y, sample_weight=None): self._validate_estimator() X_resampled = X.copy() y_resampled = y.copy() + if sample_weight is not None: + sample_weight_resampled = sample_weight.copy() for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: @@ -317,6 +319,10 @@ def _sample(self, X, y): else: X_resampled = np.vstack((X_resampled, X_new)) y_resampled = np.hstack((y_resampled, y_new)) + if sample_weight is not None: + sample_weight_resampled = np.hstack( + (sample_weight_resampled, + np.ones_like(y_new, dtype=sample_weight.dtype))) elif self.kind == 'borderline-2': random_state = check_random_state(self.random_state) @@ -350,7 +356,14 @@ def _sample(self, X, y): else: X_resampled = np.vstack((X_resampled, X_new_1, X_new_2)) y_resampled = np.hstack((y_resampled, y_new_1, y_new_2)) - + if sample_weight is not None: + sample_weight_resampled = np.hstack( + (sample_weight_resampled, + np.ones_like(y_new_1, dtype=sample_weight.dtype), + np.ones_like(y_new_2, dtype=sample_weight.dtype))) + + if sample_weight is not None: + return X_resampled, y_resampled, 
sample_weight_resampled return X_resampled, y_resampled @@ -466,14 +479,16 @@ def _validate_estimator(self): self.svm_estimator) # FIXME: rename _sample -> _fit_resample in 0.6 - def _fit_resample(self, X, y): - return self._sample(X, y) + def _fit_resample(self, X, y, sample_weight=None): + return self._sample(X, y, sample_weight) - def _sample(self, X, y): + def _sample(self, X, y, sample_weight=None): self._validate_estimator() random_state = check_random_state(self.random_state) X_resampled = X.copy() y_resampled = y.copy() + if sample_weight is not None: + sample_weight_resampled = sample_weight.copy() for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: @@ -535,19 +550,34 @@ def _sample(self, X, y): X_resampled = np.vstack((X_resampled, X_new_1, X_new_2)) y_resampled = np.concatenate( (y_resampled, y_new_1, y_new_2), axis=0) + if sample_weight is not None: + sample_weight_resampled = np.hstack( + (sample_weight_resampled, + np.ones_like(y_new_1, dtype=sample_weight.dtype), + np.ones_like(y_new_2, dtype=sample_weight.dtype))) elif np.count_nonzero(danger_bool) == 0: if sparse.issparse(X_resampled): X_resampled = sparse.vstack([X_resampled, X_new_2]) else: X_resampled = np.vstack((X_resampled, X_new_2)) y_resampled = np.concatenate((y_resampled, y_new_2), axis=0) + if sample_weight is not None: + sample_weight_resampled = np.hstack( + (sample_weight_resampled, + np.ones_like(y_new_2, dtype=sample_weight.dtype))) elif np.count_nonzero(safety_bool) == 0: if sparse.issparse(X_resampled): X_resampled = sparse.vstack([X_resampled, X_new_1]) else: X_resampled = np.vstack((X_resampled, X_new_1)) y_resampled = np.concatenate((y_resampled, y_new_1), axis=0) + if sample_weight is not None: + sample_weight_resampled = np.hstack( + (sample_weight_resampled, + np.ones_like(y_new_1, dtype=sample_weight.dtype))) + if sample_weight is not None: + return X_resampled, y_resampled, sample_weight_resampled return X_resampled, y_resampled @@ -735,16 +765,18 @@ def _validate_estimator(self): self.nn_m_.set_params(**{'n_jobs': self.n_jobs}) # FIXME: to be removed in 0.6 - def _fit_resample(self, X, y): + def _fit_resample(self, X, y, sample_weight=None): self._validate_estimator() - return self._sample(X, y) + return self._sample(X, y, sample_weight) - def _sample(self, X, y): + def _sample(self, X, y, sample_weight=None): # FIXME: uncomment in version 0.6 # self._validate_estimator() X_resampled = X.copy() y_resampled = y.copy() + if sample_weight is not None: + sample_weight_resampled = sample_weight.copy() for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: @@ -762,5 +794,11 @@ def _sample(self, X, y): else: X_resampled = np.vstack((X_resampled, X_new)) y_resampled = np.hstack((y_resampled, y_new)) + if sample_weight is not None: + sample_weight_resampled = np.hstack( + (sample_weight_resampled, + np.ones_like(y_new, dtype=sample_weight.dtype))) + if sample_weight is not None: + return X_resampled, y_resampled, sample_weight_resampled return X_resampled, y_resampled diff --git a/imblearn/pipeline.py b/imblearn/pipeline.py index 18cea4059..019ff6898 100644 --- a/imblearn/pipeline.py +++ b/imblearn/pipeline.py @@ -562,7 +562,6 @@ def _fit_transform_one(transformer, weight, X, y, **fit_params): def _fit_resample_one(sampler, X, y, **fit_params): X_res, y_res = sampler.fit_resample(X, y, **fit_params) - return X_res, y_res, sampler diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py 
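Both ADASYN and the SMOTE variants above adopt the same convention for the weights of synthetic points: the original weights are kept and every generated sample receives a unit weight. The append step in isolation:

    import numpy as np

    sample_weight = np.array([0.2, 0.8, 0.5])   # weights of the real samples
    y_new = np.array([1, 1, 1, 1])              # four synthetic samples
    sample_weight_resampled = np.hstack(
        (sample_weight, np.ones_like(y_new, dtype=sample_weight.dtype)))
    print(sample_weight_resampled)  # [0.2 0.8 0.5 1.  1.  1.  1. ]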
b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py
index 190d8f9e5..6c6af6522 100644
--- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py
+++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py
@@ -135,14 +135,11 @@ def _generate_sample(self, X, y, centroids, target_class):

        return X_new, y_new

-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
        self._validate_estimator()

        if self.voting == 'auto':
-            if sparse.issparse(X):
-                self.voting_ = 'hard'
-            else:
-                self.voting_ = 'soft'
+            self.voting_ = 'hard' if sparse.issparse(X) else 'soft'
        else:
            if self.voting in VOTING_KIND:
                self.voting_ = self.voting
diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py
index e20243aaf..52b495463 100644
--- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py
+++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py
@@ -8,7 +8,6 @@
 from __future__ import division

 from collections import Counter
-from itertools import chain

 import numpy as np

diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py
index 180d96850..61950b71a 100644
--- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py
+++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py
@@ -172,12 +172,15 @@ def _fit_resample(self, X, y, sample_weight=None):
        # apply Tomek cleaning
        tl = TomekLinks(
            sampling_strategy=self.sampling_strategy_, return_indices=True)
-        X_res, y_res, idx_cleaned = tl.fit_resample(X_res, y_res,
-                                                    sample_weight_res)
+        resampled_arrays = tl.fit_resample(X_res, y_res, sample_weight_res)
+        if sample_weight_res is not None:
+            X_res, y_res, sample_weight_res, idx_cleaned = resampled_arrays
+        else:
+            X_res, y_res, idx_cleaned = resampled_arrays
        idx_under = safe_indexing(idx_under, idx_cleaned)

-        sample_weight_res = (safe_indexing(sample_weight_res, idx_cleaned)
-                             if sample_weight_res is not None else None)
+        # ``tl.fit_resample`` already returned the cleaned weights above, so
+        # no further re-indexing of ``sample_weight_res`` is needed.

        resampled_arrays = [arr for arr in (X_res, y_res, sample_weight_res)
                            if arr is not None]
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index f7bab535f..83063c106 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -10,7 +10,6 @@
 import traceback

 from collections import Counter
-from inspect import signature

 import pytest

@@ -38,6 +37,8 @@ from imblearn.utils.testing import warns
 DONT_SUPPORT_RATIO = ['SVMSMOTE', 'BorderlineSMOTE']
+DONT_SUPPORT_SAMPLE_WEIGHT = ['EasyEnsemble', 'BalanceCascade',
+                              'ClusterCentroids', 'FunctionTransformer']
 SUPPORT_STRING = ['RandomUnderSampler', 'RandomOverSampler']

@@ -301,7 +302,7 @@ def check_samplers_pandas(name, Sampler):
        n_informative=4,
        weights=[0.2, 0.3, 0.5],
        random_state=0)
-    X_pd, y_pd = pd.DataFrame(X), pd.Series(y)
+    X_pd = pd.DataFrame(X)
    sampler = Sampler()
    if isinstance(Sampler(), SMOTE):
        samplers = [
@@ -317,7 +318,7 @@ def check_samplers_pandas(name, Sampler):

    for sampler in samplers:
        set_random_state(sampler)
-        X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y_pd)
+        X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y)
        X_res, y_res = sampler.fit_resample(X, y)
        assert_allclose(X_res_pd, X_res)
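A sketch mirroring the common-test logic above, runnable against any sampler once the series is applied; ``check_consistent_length`` raises on misaligned lengths and returns ``None`` otherwise:

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.utils import check_consistent_length
    from imblearn.over_sampling import RandomOverSampler

    X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4,
                               weights=[0.2, 0.3, 0.5], random_state=0)
    sample_weight = np.ones_like(y, dtype=float)
    X_res, y_res, sw_res = RandomOverSampler(random_state=0).fit_resample(
        X, y, sample_weight)
    assert check_consistent_length(X_res, y_res, sw_res) is None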
assert_allclose(y_res_pd, y_res) @@ -365,14 +366,15 @@ def check_samplers_preserve_dtype(name, Sampler): def check_samplers_resample_sample_weight(name, Sampler): # check that X, y, and an additional sample_weight array can be resampled. - X, y = make_classification( - n_samples=1000, - n_classes=3, - n_informative=4, - weights=[0.2, 0.3, 0.5], - random_state=0) - sample_weight = np.ones_like(y) - sampler = Sampler() - set_random_state(sampler) - X_res, y_res, sw_res = sampler.fit_resample(X, y, sample_weight) - assert check_consistent_length(X_res, y_res, sw_res) + if name not in DONT_SUPPORT_SAMPLE_WEIGHT: + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0) + sample_weight = np.ones_like(y) + sampler = Sampler() + set_random_state(sampler) + X_res, y_res, sw_res = sampler.fit_resample(X, y, sample_weight) + assert check_consistent_length(X_res, y_res, sw_res) is None From 2b8ab83dd4f258f609525bf9ac2d9d9c96e60dcc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 29 Aug 2018 11:44:06 +0200 Subject: [PATCH 14/15] iter --- .../_cluster_centroids.py | 45 +++++++++++++------ imblearn/utils/estimator_checks.py | 2 +- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py index 6c6af6522..e92963046 100644 --- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py @@ -119,20 +119,28 @@ def _validate_estimator(self): raise ValueError('`estimator` has to be a KMeans clustering.' ' Got {} instead.'.format(type(self.estimator))) - def _generate_sample(self, X, y, centroids, target_class): + def _generate_sample(self, X, y, sample_weight, centroids, target_class): if self.voting_ == 'hard': nearest_neighbors = NearestNeighbors(n_neighbors=1) nearest_neighbors.fit(X, y) indices = nearest_neighbors.kneighbors( centroids, return_distance=False) X_new = safe_indexing(X, np.squeeze(indices)) + if sample_weight is not None: + sample_weight_new = safe_indexing(sample_weight, + np.squeeze(indices)) else: if sparse.issparse(X): X_new = sparse.csr_matrix(centroids, dtype=X.dtype) else: X_new = centroids + if sample_weight is not None: + sample_weight_new = np.ones(centroids.shape[0], + dtype=sample_weight.dtype) y_new = np.array([target_class] * centroids.shape[0], dtype=y.dtype) + if sample_weight is not None: + return X_new, y_new, sample_weight_new return X_new, y_new def _fit_resample(self, X, y, sample_weight=None): @@ -148,24 +156,35 @@ def _fit_resample(self, X, y, sample_weight=None): " instead.".format(VOTING_KIND, self.voting)) X_resampled, y_resampled = [], [] + if sample_weight is not None: + sample_weight_resampled = [] for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): n_samples = self.sampling_strategy_[target_class] self.estimator_.set_params(**{'n_clusters': n_samples}) self.estimator_.fit(X[y == target_class]) - X_new, y_new = self._generate_sample( - X, y, self.estimator_.cluster_centers_, target_class) - X_resampled.append(X_new) - y_resampled.append(y_new) + new_arrays = self._generate_sample( + X, y, sample_weight, self.estimator_.cluster_centers_, + target_class) + X_resampled.append(new_arrays[0]) + y_resampled.append(new_arrays[1]) + if sample_weight is not None: + sample_weight_resampled.append(new_arrays[2]) else: target_class_indices 
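Under hard voting, ``_generate_sample`` above keeps the weight of whichever real sample lies closest to each centroid; the nearest-neighbour step in isolation (a sketch with toy data):

    import numpy as np
    from sklearn.neighbors import NearestNeighbors
    from sklearn.utils import safe_indexing

    X = np.array([[0., 0.], [1., 1.], [4., 4.], [5., 5.]])
    sample_weight = np.array([0.1, 0.2, 0.3, 0.4])
    centroids = np.array([[0.4, 0.4], [4.6, 4.6]])

    nn = NearestNeighbors(n_neighbors=1).fit(X)
    indices = nn.kneighbors(centroids, return_distance=False)
    # each centroid is replaced by its closest real sample and inherits
    # that sample's weight
    X_new = safe_indexing(X, np.squeeze(indices))
    sw_new = safe_indexing(sample_weight, np.squeeze(indices))
    print(sw_new)  # [0.1 0.4]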
= np.flatnonzero(y == target_class) X_resampled.append(safe_indexing(X, target_class_indices)) y_resampled.append(safe_indexing(y, target_class_indices)) - - if sparse.issparse(X): - X_resampled = sparse.vstack(X_resampled) - else: - X_resampled = np.vstack(X_resampled) - y_resampled = np.hstack(y_resampled) - - return X_resampled, np.array(y_resampled, dtype=y.dtype) + if sample_weight is not None: + sample_weight_resampled.append( + safe_indexing(sample_weight, target_class_indices)) + + X_resampled = (sparse.vstack(X_resampled) + if sparse.issparse(X) else np.vstack(X_resampled)) + y_resampled = np.array(np.hstack(y_resampled), dtype=y.dtype) + if sample_weight is not None: + sample_weight_resampled = np.array( + np.hstack(sample_weight_resampled), dtype=sample_weight.dtype) + + if sample_weight is not None: + return X_resampled, y_resampled, sample_weight_resampled + return X_resampled, y_resampled diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 83063c106..37fa69e37 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -38,7 +38,7 @@ DONT_SUPPORT_RATIO = ['SVMSMOTE', 'BorderlineSMOTE'] DONT_SUPPORT_SAMPLE_WEIGHT = ['EasyEnsemble', 'BalanceCascade', - 'ClusterCentroids', 'FunctionTransformer'] + 'FunctionSampler'] SUPPORT_STRING = ['RandomUnderSampler', 'RandomOverSampler'] From 7a8fad08a22da86735f80b20ade58236ae1206b7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 29 Aug 2018 11:59:11 +0200 Subject: [PATCH 15/15] EXA fix fake sampler in example --- examples/over-sampling/plot_comparison_over_sampling.py | 2 +- imblearn/over_sampling/_smote.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/over-sampling/plot_comparison_over_sampling.py b/examples/over-sampling/plot_comparison_over_sampling.py index 587a258d0..9614f79a1 100644 --- a/examples/over-sampling/plot_comparison_over_sampling.py +++ b/examples/over-sampling/plot_comparison_over_sampling.py @@ -134,7 +134,7 @@ class FakeSampler(BaseSampler): _sampling_type = 'bypass' - def _fit_resample(self, X, y): + def _fit_resample(self, X, y, sample_weight=None): return X, y diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index b0367e802..e4a7c61fe 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -123,8 +123,7 @@ def _make_samples(self, [len(samples_indices), X.shape[1]], dtype=X.dtype), y_new) - else: - return X_new, y_new + return X_new, y_new def _in_danger_noise(self, nn_estimator, samples, target_class, y, kind='danger'):
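Taken together, the series makes the end-to-end flow below possible; a closing sketch assuming the patched ``fit_resample`` signature (the classifier choice and weight values are arbitrary):

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.tree import DecisionTreeClassifier
    from imblearn.over_sampling import SMOTE

    X, y = make_classification(n_samples=300, weights=[0.9, 0.1],
                               random_state=0)
    sample_weight = np.linspace(0.1, 1.0, num=y.shape[0])

    # resample data and weights together, then forward the weights to fit
    X_res, y_res, sw_res = SMOTE(random_state=0).fit_resample(
        X, y, sample_weight)
    DecisionTreeClassifier().fit(X_res, y_res, sample_weight=sw_res)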