-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[WIP] ENH: Resample additional arrays apart from X and y #463
base: master
Are you sure you want to change the base?
Changes from all commits
0ac2c92
c872679
bbecd30
f7120d8
00f8e44
9046477
8b3aa50
24fd62d
59725c7
6189206
61f53a7
8f86d98
38526bc
9f600fd
2b8ab83
7a8fad0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,8 @@ | |
from sklearn.externals import six | ||
from sklearn.preprocessing import label_binarize | ||
from sklearn.utils import check_X_y | ||
from sklearn.utils import check_consistent_length | ||
from sklearn.utils import check_array | ||
|
||
from .utils import check_sampling_strategy, check_target_type | ||
from .utils.deprecation import deprecate_parameter | ||
|
@@ -55,7 +57,7 @@ def fit(self, X, y): | |
self.sampling_strategy, y, self._sampling_type) | ||
return self | ||
|
||
def fit_resample(self, X, y): | ||
def fit_resample(self, X, y, sample_weight=None): | ||
"""Resample the dataset. | ||
|
||
Parameters | ||
|
@@ -66,24 +68,39 @@ def fit_resample(self, X, y): | |
y : array-like, shape (n_samples,) | ||
Corresponding label for each sample in X. | ||
|
||
sample_weight : array-like, shape (n_samples,) or None | ||
Sample weights. | ||
|
||
|
||
Returns | ||
------- | ||
X_resampled : {array-like, sparse matrix}, shape \ | ||
X_resampled : {ndarray, sparse matrix}, shape \ | ||
(n_samples_new, n_features) | ||
The array containing the resampled data. | ||
|
||
y_resampled : array-like, shape (n_samples_new,) | ||
y_resampled : ndarray, shape (n_samples_new,) | ||
The corresponding label of `X_resampled`. | ||
|
||
sample_weight_resampled : ndarray, shape (n_samples_new,) | ||
Resampled sample weights. This output is returned only if | ||
``sample_weight`` was not ``None``. | ||
|
||
idx_resampled : ndarray, shape (n_samples_new,) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you explain why this should be returned from There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think that it was some original design (before it was in scikit-learn). But actually it would be better to keep it as an attribute with the single |
||
Indices of the selected samples. This output is optional and only | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you mean the selected samples? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. yes |
||
available for some samplers if ``return_indices=True``. | ||
|
||
""" | ||
self._deprecate_ratio() | ||
|
||
X, y, binarize_y = self._check_X_y(X, y) | ||
if sample_weight is not None: | ||
sample_weight = check_array(sample_weight, ensure_2d=False) | ||
check_consistent_length(X, y, sample_weight) | ||
|
||
self.sampling_strategy_ = check_sampling_strategy( | ||
self.sampling_strategy, y, self._sampling_type) | ||
|
||
output = self._fit_resample(X, y) | ||
output = self._fit_resample(X, y, sample_weight) | ||
|
||
if binarize_y: | ||
y_sampled = label_binarize(output[1], np.unique(y)) | ||
|
@@ -96,7 +113,7 @@ def fit_resample(self, X, y): | |
fit_sample = fit_resample | ||
|
||
@abstractmethod | ||
def _fit_resample(self, X, y): | ||
def _fit_resample(self, X, y, sample_weight=None): | ||
"""Base method defined in each sampler to define the sampling | ||
strategy. | ||
|
||
|
@@ -108,14 +125,25 @@ def _fit_resample(self, X, y): | |
y : array-like, shape (n_samples,) | ||
Corresponding label for each sample in X. | ||
|
||
sample_weight : array-like, shape (n_samples,) or None | ||
Sample weights. | ||
|
||
Returns | ||
------- | ||
X_resampled : {ndarray, sparse matrix}, shape \ | ||
(n_samples_new, n_features) | ||
The array containing the resampled data. | ||
|
||
y_resampled : ndarray, shape (n_samples_new,) | ||
The corresponding label of `X_resampled` | ||
The corresponding label of `X_resampled`. | ||
|
||
sample_weight_resampled : ndarray, shape (n_samples_new,) | ||
Resampled sample weights. This output is returned only if | ||
``sample_weight`` was not ``None``. | ||
|
||
idx_resampled : ndarray, shape (n_samples_new,) | ||
Indices of the selected samples. This output is optional and only | ||
available for some samplers if ``return_indices=True``. | ||
|
||
""" | ||
pass | ||
|
@@ -243,7 +271,7 @@ def __init__(self, func=None, accept_sparse=True, kw_args=None): | |
self.kw_args = kw_args | ||
self.logger = logging.getLogger(__name__) | ||
|
||
def _fit_resample(self, X, y): | ||
def _fit_resample(self, X, y, sample_weight=None): | ||
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'] | ||
if self.accept_sparse else False) | ||
func = _identity if self.func is None else self.func | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would rather have a dict of non-X,y returned. (Optionally? In scikit-learn I would rather this be mandatory so we don't need to handle both cases.)