From 58f4a11eab0fc0e58b19ce6e9cfad36069fd4920 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 20 Oct 2021 23:34:38 +0200 Subject: [PATCH] DOC add test for numpydoc validation and documented param/attributes (#869) --- build_tools/azure/install.cmd | 2 +- build_tools/azure/install.sh | 4 +- imblearn/base.py | 31 ++- imblearn/combine/_smote_enn.py | 19 ++ imblearn/combine/_smote_tomek.py | 18 ++ imblearn/datasets/_imbalance.py | 7 +- imblearn/datasets/_zenodo.py | 9 +- imblearn/ensemble/_bagging.py | 8 + imblearn/ensemble/_easy_ensemble.py | 11 + imblearn/ensemble/_forest.py | 18 +- imblearn/ensemble/_weight_boosting.py | 10 +- imblearn/exceptions.py | 18 ++ imblearn/keras/_generator.py | 9 +- imblearn/metrics/_classification.py | 18 +- imblearn/metrics/pairwise.py | 7 +- imblearn/over_sampling/_adasyn.py | 15 ++ .../over_sampling/_random_over_sampler.py | 10 + imblearn/over_sampling/_smote/base.py | 30 +++ imblearn/over_sampling/_smote/cluster.py | 10 + imblearn/over_sampling/_smote/filter.py | 40 +++ imblearn/pipeline.py | 7 + imblearn/tests/test_docstring_parameters.py | 253 ++++++++++++++++++ .../_cluster_centroids.py | 18 ++ .../_condensed_nearest_neighbour.py | 13 + .../_edited_nearest_neighbours.py | 47 ++++ .../_instance_hardness_threshold.py | 13 + .../_prototype_selection/_nearmiss.py | 13 + .../_neighbourhood_cleaning_rule.py | 13 + .../_one_sided_selection.py | 13 + .../_random_under_sampler.py | 10 + .../_prototype_selection/_tomek_links.py | 12 +- imblearn/utils/_validation.py | 3 +- imblearn/utils/deprecation.py | 7 +- imblearn/utils/testing.py | 19 +- maint_tools/test_docstring.py | 115 ++++++-- 35 files changed, 774 insertions(+), 76 deletions(-) create mode 100644 imblearn/tests/test_docstring_parameters.py diff --git a/build_tools/azure/install.cmd b/build_tools/azure/install.cmd index 73fd8e63c..3c1eda82a 100644 --- a/build_tools/azure/install.cmd +++ b/build_tools/azure/install.cmd @@ -11,7 +11,7 @@ IF "%PYTHON_ARCH%"=="64" ( call 
deactivate @rem Clean up any left-over from a previous build conda remove --all -q -y -n %VIRTUALENV% - conda create -n %VIRTUALENV% -q -y python=%PYTHON_VERSION% numpy scipy cython wheel joblib git + conda create -n %VIRTUALENV% -q -y python=%PYTHON_VERSION% numpy scipy cython wheel joblib git -c conda-forge call activate %VIRTUALENV% diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 65764090e..82521d577 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -7,7 +7,7 @@ UNAMESTR=`uname` make_conda() { TO_INSTALL="$@" - conda create -n $VIRTUALENV --yes $TO_INSTALL + conda create -n $VIRTUALENV --yes $TO_INSTALL -c conda-forge source activate $VIRTUALENV } @@ -65,7 +65,7 @@ if [[ "$DISTRIB" == "conda" ]]; then fi if [[ -n "$TO_INSTALL" ]]; then - conda install --yes $TO_INSTALL + conda install --yes $TO_INSTALL -c conda-forge fi if [[ -n "$KERAS_VERSION" ]]; then diff --git a/imblearn/base.py b/imblearn/base.py index e35288af1..df4702236 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -140,6 +140,24 @@ def _identity(X, y): return X, y +def is_sampler(estimator): + """Return True if the given estimator is a sampler, False otherwise. + + Parameters + ---------- + estimator : object + Estimator to test. + + Returns + ------- + is_sampler : bool + True if estimator is a sampler, otherwise False. + """ + if estimator._estimator_type == "sampler": + return True + return False + + class FunctionSampler(BaseSampler): """Construct a sampler from calling an arbitrary callable. @@ -166,9 +184,20 @@ class FunctionSampler(BaseSampler): .. versionadded:: 0.6 + Attributes + ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + corresponds to the class labels from which to sample and the values + are the number of samples to sample. + + n_features_in_ : int + Number of features in the input dataset. + + .. 
versionadded:: 0.9 + See Also -------- - sklearn.preprocessing.FunctionTransfomer : Stateless transformer. Notes diff --git a/imblearn/combine/_smote_enn.py b/imblearn/combine/_smote_enn.py index 45c96adb2..4d2e411e6 100644 --- a/imblearn/combine/_smote_enn.py +++ b/imblearn/combine/_smote_enn.py @@ -49,6 +49,25 @@ class SMOTEENN(BaseSampler): {n_jobs} + Attributes + ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + corresponds to the class labels from which to sample and the values + are the number of samples to sample. + + smote_ : sampler object + The validated :class:`~imblearn.over_sampling.SMOTE` instance. + + enn_ : sampler object + The validated :class:`~imblearn.under_sampling.EditedNearestNeighbours` + instance. + + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- SMOTETomek : Over-sample using SMOTE followed by under-sampling removing diff --git a/imblearn/combine/_smote_tomek.py b/imblearn/combine/_smote_tomek.py index ef1f8daf3..f4d655a42 100644 --- a/imblearn/combine/_smote_tomek.py +++ b/imblearn/combine/_smote_tomek.py @@ -49,6 +49,24 @@ class SMOTETomek(BaseSampler): {n_jobs} + Attributes + ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + corresponds to the class labels from which to sample and the values + are the number of samples to sample. + + smote_ : sampler object + The validated :class:`~imblearn.over_sampling.SMOTE` instance. + + tomek_ : sampler object + The validated :class:`~imblearn.under_sampling.TomekLinks` instance. + + n_features_in_ : int + Number of features in the input dataset. + + .. 
versionadded:: 0.9 + See Also -------- SMOTEENN : Over-sample using SMOTE followed by under-sampling using Edited diff --git a/imblearn/datasets/_imbalance.py b/imblearn/datasets/_imbalance.py index 44154884b..4ee0c8816 100644 --- a/imblearn/datasets/_imbalance.py +++ b/imblearn/datasets/_imbalance.py @@ -16,8 +16,7 @@ def make_imbalance( X, y, *, sampling_strategy=None, random_state=None, verbose=False, **kwargs ): - """Turns a dataset into an imbalanced dataset with a specific sampling - strategy. + """Turn a dataset into an imbalanced dataset with a specific sampling strategy. A simple toy dataset to visualize clustering and classification algorithms. @@ -52,7 +51,7 @@ def make_imbalance( verbose : bool, default=False Show information regarding the sampling. - kwargs : dict + **kwargs : dict Dictionary of additional keyword arguments to pass to ``sampling_strategy``. @@ -62,7 +61,7 @@ def make_imbalance( The array containing the imbalanced data. y_resampled : ndarray of shape (n_samples_new) - The corresponding label of `X_resampled` + The corresponding label of `X_resampled`. Notes ----- diff --git a/imblearn/datasets/_zenodo.py b/imblearn/datasets/_zenodo.py index 6fc0d6eaf..345580252 100644 --- a/imblearn/datasets/_zenodo.py +++ b/imblearn/datasets/_zenodo.py @@ -38,7 +38,6 @@ .. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly Imbalanced Data Learning and their Application in Bioinformatics." Dissertation, Georgia State University, (2011). - """ # Author: Guillaume Lemaitre @@ -147,12 +146,12 @@ def fetch_datasets( The ordered is defined by ``filter_data``. Each Bunch object --- referred as dataset --- have the following attributes: - dataset.data : ndarray of shape (n_samples, n_features) + dataset.data : ndarray of shape (n_samples, n_features) - dataset.target : ndarray of shape (n_samples,) + dataset.target : ndarray of shape (n_samples,) - dataset.DESCR : str - Description of the each dataset. 
+ dataset.DESCR : str + Description of each dataset. Notes ----- diff --git a/imblearn/ensemble/_bagging.py b/imblearn/ensemble/_bagging.py index d773fd127..23bec1793 100644 --- a/imblearn/ensemble/_bagging.py +++ b/imblearn/ensemble/_bagging.py @@ -111,6 +111,9 @@ class BalancedBaggingClassifier(BaggingClassifier): estimators_ : list of estimators The collection of fitted base estimators. + sampler_ : sampler object + The validated sampler created from the `sampler` parameter. + estimators_samples_ : list of ndarray The subset of drawn samples (i.e., the in-bag samples) for each base estimator. Each subset is defined by a boolean mask. @@ -133,6 +136,11 @@ class BalancedBaggingClassifier(BaggingClassifier): was never left out during the bootstrap. In this case, ``oob_decision_function_`` might contain NaN. + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- BalancedRandomForestClassifier : Random forest applying random-under diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py index 328b4a9ed..dcef08c5d 100644 --- a/imblearn/ensemble/_easy_ensemble.py +++ b/imblearn/ensemble/_easy_ensemble.py @@ -73,12 +73,23 @@ class EasyEnsembleClassifier(BaggingClassifier): estimators_ : list of estimators The collection of fitted base estimators. + estimators_samples_ : list of arrays + The subset of drawn samples for each base estimator. + + estimators_features_ : list of arrays + The subset of drawn features for each base estimator. + classes_ : array, shape (n_classes,) The classes labels. n_classes_ : int or list The number of classes. + n_features_in_ : int + Number of features in the input dataset. + + ..
versionadded:: 0.9 + See Also -------- BalancedBaggingClassifier : Bagging classifier for which each base diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py index 7346e74d2..eec749227 100644 --- a/imblearn/ensemble/_forest.py +++ b/imblearn/ensemble/_forest.py @@ -230,10 +230,17 @@ class BalancedRandomForestClassifier(RandomForestClassifier): Attributes ---------- - estimators_ : list of DecisionTreeClassifier + base_estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier` instance + The child estimator template used to create the collection of fitted + sub-estimators. + + estimators_ : list of :class:`~sklearn.tree.DecisionTreeClassifier` The collection of fitted sub-estimators. - samplers_ : list of RandomUnderSampler + base_sampler_ : :class:`~imblearn.under_sampling.RandomUnderSampler` + The base sampler used to construct the subsequent list of samplers. + + samplers_ : list of :class:`~imblearn.under_sampling.RandomUnderSampler` The collection of fitted samplers. pipelines_ : list of Pipeline. @@ -250,6 +257,11 @@ class labels (multi-output problem). n_features_ : int The number of features when ``fit`` is performed. + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -628,7 +640,7 @@ def _set_oob_score(self, X, y): @property def n_features_(self): """Number of features when fitting the estimator.""" - return getattr(self.n_features_in_, self._n_features) + return getattr(self.n_features_in_, "n_features_", self._n_features) def _more_tags(self): return { diff --git a/imblearn/ensemble/_weight_boosting.py b/imblearn/ensemble/_weight_boosting.py index 5c4f80ecc..6376c3865 100644 --- a/imblearn/ensemble/_weight_boosting.py +++ b/imblearn/ensemble/_weight_boosting.py @@ -68,7 +68,10 @@ class RUSBoostClassifier(AdaBoostClassifier): estimators_ : list of classifiers The collection of fitted sub-estimators. 
- samplers_ : list of RandomUnderSampler + base_sampler_ : :class:`~imblearn.under_sampling.RandomUnderSampler` + The base sampler used to generate the subsequent samplers. + + samplers_ : list of :class:`~imblearn.under_sampling.RandomUnderSampler` The collection of fitted samplers. pipelines_ : list of Pipeline @@ -90,6 +93,11 @@ class RUSBoostClassifier(AdaBoostClassifier): feature_importances_ : ndarray of shape (n_features,) The feature importances if supported by the ``base_estimator``. + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- BalancedBaggingClassifier : Bagging classifier for which each base diff --git a/imblearn/exceptions.py b/imblearn/exceptions.py index b127de986..1011d142b 100644 --- a/imblearn/exceptions.py +++ b/imblearn/exceptions.py @@ -8,6 +8,24 @@ def raise_isinstance_error(variable_name, possible_type, variable): + """Raise consistent error message for isinstance() function. + + Parameters + ---------- + variable_name : str + The name of the variable. + + possible_type : type + The possible type of the variable. + + variable : object + The variable to check. + + Raises + ------ + ValueError + If the instance is not of the possible type. + """ raise ValueError( f"{variable_name} has to be one of {possible_type}. " f"Got {type(variable)} instead." 
diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py index 309ab6cb9..4c0707498 100644 --- a/imblearn/keras/_generator.py +++ b/imblearn/keras/_generator.py @@ -15,9 +15,9 @@ def import_keras(): def import_from_keras(): try: - import keras + import keras # noqa - return (keras.utils.Sequence,), True + return (keras.utils.data_utils.Sequence,), True except ImportError: return tuple(), False @@ -33,7 +33,10 @@ def import_from_tensforflow(): ParentClassTensorflow, has_keras_tf = import_from_tensforflow() has_keras = has_keras_k or has_keras_tf if has_keras: - ParentClass = ParentClassKeras + ParentClassTensorflow + if has_keras_tf: + ParentClass = ParentClassTensorflow + else: + ParentClass = ParentClassKeras else: ParentClass = (object,) return ParentClass, has_keras diff --git a/imblearn/metrics/_classification.py b/imblearn/metrics/_classification.py index 6f450598a..5b7c25a7b 100644 --- a/imblearn/metrics/_classification.py +++ b/imblearn/metrics/_classification.py @@ -50,7 +50,7 @@ def sensitivity_specificity_support( warn_for=("sensitivity", "specificity"), sample_weight=None, ): - """Compute sensitivity, specificity, and support for each class + """Compute sensitivity, specificity, and support for each class. The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of true positives and ``fn`` the number of false negatives. The sensitivity @@ -297,7 +297,7 @@ def sensitivity_score( average="binary", sample_weight=None, ): - """Compute the sensitivity + """Compute the sensitivity. The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of true positives and ``fn`` the number of false negatives. The sensitivity @@ -398,7 +398,7 @@ def specificity_score( average="binary", sample_weight=None, ): - """Compute the specificity + """Compute the specificity. The specificity is the ratio ``tn / (tn + fp)`` where ``tn`` is the number of true negatives and ``fp`` the number of false positives. 
The specificity @@ -568,13 +568,14 @@ class is unrecognized by the classifier, G-mean resolves to zero. To sample_weight : ndarray of shape (n_samples,), default=None Sample weights. - correction: float, default=0.0 + correction : float, default=0.0 Substitutes sensitivity of unrecognized classes from zero to a given value. Returns ------- geometric_mean : float + Returns the geometric mean. Notes ----- @@ -675,7 +676,7 @@ class is unrecognized by the classifier, G-mean resolves to zero. To @_deprecate_positional_args def make_index_balanced_accuracy(*, alpha=0.1, squared=True): - """Balance any scoring function using the index balanced accuracy + """Balance any scoring function using the index balanced accuracy. This factory function wraps scoring function to express it as the index balanced accuracy (IBA). You need to use this function to @@ -792,8 +793,7 @@ def classification_report_imbalanced( output_dict=False, zero_division="warn", ): - """Build a classification report based on metrics used with imbalanced - dataset + """Build a classification report based on metrics used with imbalanced dataset. Specific metrics have been proposed to evaluate the classification performed on imbalanced dataset. This report compiles the @@ -878,7 +878,6 @@ class 2 1.00 0.67 1.00 0.80 0.82 0.64\ avg / total 0.70 0.60 0.90 0.61 0.66 0.54\ 5 - """ if labels is None: @@ -991,8 +990,7 @@ class 2 1.00 0.67 1.00 0.80 0.82 0.64\ def macro_averaged_mean_absolute_error(y_true, y_pred, *, sample_weight=None): - """Compute Macro-Averaged Mean Absolute Error (MA-MAE) - for imbalanced ordinal classification. + """Compute Macro-Averaged MAE for imbalanced ordinal classification. This function computes each MAE for each class and average them, giving an equal weight to each class. 
diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index ff23b9004..8d406da3c 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -66,6 +66,10 @@ class ValueDifferenceMetric(BaseEstimator): List of length `n_features` containing the conditional probabilities for each category given a class. + See Also + -------- + sklearn.neighbors.DistanceMetric : Interface for fast metric computation. + Notes ----- The input data `X` are expected to be encoded by an @@ -118,7 +122,8 @@ def fit(self, X, y): Returns ------- - self + self : object + Return the instance itself. """ check_consistent_length(X, y) X, y = self._validate_data(X, y, reset=True, dtype=np.int32) diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py index c7edd7d57..cbfeeda22 100644 --- a/imblearn/over_sampling/_adasyn.py +++ b/imblearn/over_sampling/_adasyn.py @@ -46,6 +46,21 @@ class ADASYN(BaseOverSampler): {n_jobs} + Attributes + ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + corresponds to the class labels from which to sample and the values + are the number of samples to sample. + + nn_ : estimator object + Validated K-nearest Neighbours estimator linked to the parameter `n_neighbors`. + + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- SMOTE : Over-sample using SMOTE. diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index 8d5472fbb..8df574c20 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -57,6 +57,11 @@ class RandomOverSampler(BaseOverSampler): Attributes ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + corresponds to the class labels from which to sample and the values + are the number of samples to sample. 
+ sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. @@ -68,6 +73,11 @@ class RandomOverSampler(BaseOverSampler): .. versionadded:: 0.8 + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- BorderlineSMOTE : Over-sample using the borderline-SMOTE variant. diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index 5e311460a..23bff84c1 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -231,6 +231,21 @@ class SMOTE(BaseSMOTE): {n_jobs} + Attributes + ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + corresponds to the class labels from which to sample and the values + are the number of samples to sample. + + nn_k_ : estimator object + Validated k-nearest neighbours created from the `k_neighbors` parameter. + + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- SMOTENC : Over-sample using SMOTE for continuous and categorical features. @@ -628,6 +643,21 @@ class SMOTEN(SMOTE): {n_jobs} + Attributes + ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + corresponds to the class labels from which to sample and the values + are the number of samples to sample. + + nn_k_ : estimator object + Validated k-nearest neighbours created from the `k_neighbors` parameter. + + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- SMOTE : Over-sample using SMOTE. 
diff --git a/imblearn/over_sampling/_smote/cluster.py b/imblearn/over_sampling/_smote/cluster.py index b5074c5e2..c18e9b7db 100644 --- a/imblearn/over_sampling/_smote/cluster.py +++ b/imblearn/over_sampling/_smote/cluster.py @@ -69,6 +69,11 @@ class KMeansSMOTE(BaseSMOTE): Attributes ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + corresponds to the class labels from which to sample and the values + are the number of samples to sample. + kmeans_estimator_ : estimator The fitted clustering method used before to apply SMOTE. @@ -78,6 +83,11 @@ class KMeansSMOTE(BaseSMOTE): cluster_balance_threshold_ : float The threshold used during ``fit`` for calling a cluster balanced. + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- SMOTE : Over-sample using SMOTE. diff --git a/imblearn/over_sampling/_smote/filter.py b/imblearn/over_sampling/_smote/filter.py index 7a37b2c17..3c49458ef 100644 --- a/imblearn/over_sampling/_smote/filter.py +++ b/imblearn/over_sampling/_smote/filter.py @@ -65,6 +65,24 @@ class BorderlineSMOTE(BaseSMOTE): The type of SMOTE algorithm to use one of the following options: ``'borderline-1'``, ``'borderline-2'``. + Attributes + ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + corresponds to the class labels from which to sample and the values + are the number of samples to sample. + + nn_k_ : estimator object + Validated k-nearest neighbours created from the `k_neighbors` parameter. + + nn_m_ : estimator object + Validated m-nearest neighbours created from the `m_neighbors` parameter. + + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- SMOTE : Over-sample using SMOTE. @@ -264,6 +282,28 @@ class SVMSMOTE(BaseSMOTE): out_step : float, default=0.5 Step size when extrapolating. 
+ Attributes + ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + corresponds to the class labels from which to sample and the values + are the number of samples to sample. + + nn_k_ : estimator object + Validated k-nearest neighbours created from the `k_neighbors` parameter. + + nn_m_ : estimator object + Validated m-nearest neighbours created from the `m_neighbors` parameter. + + svm_estimator_ : estimator object + The validated SVM classifier used to detect samples from which to + generate new synthetic samples. + + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- SMOTE : Over-sample using SMOTE. diff --git a/imblearn/pipeline.py b/imblearn/pipeline.py index b793b5aba..5e8e2ad13 100644 --- a/imblearn/pipeline.py +++ b/imblearn/pipeline.py @@ -67,6 +67,12 @@ class Pipeline(pipeline.Pipeline): Read-only attribute to access any step parameter by user given name. Keys are step names and values are steps parameters. + classes_ : ndarray of shape (n_classes,) + The classes labels. + + n_features_in_ : int + Number of features seen during first step `fit` method. + See Also -------- make_pipeline : Helper function to make pipeline. @@ -419,6 +425,7 @@ def make_pipeline(*steps, memory=None, verbose=False): Returns ------- p : Pipeline + Returns an imbalanced-learn `Pipeline` instance that handles samplers. 
See Also -------- diff --git a/imblearn/tests/test_docstring_parameters.py b/imblearn/tests/test_docstring_parameters.py new file mode 100644 index 000000000..acc06c8b2 --- /dev/null +++ b/imblearn/tests/test_docstring_parameters.py @@ -0,0 +1,253 @@ +# Authors: Alexandre Gramfort +# Raghav RV +# License: BSD 3 clause + +import inspect +import warnings +import importlib +from pkgutil import walk_packages +from inspect import signature + +import pytest + +from sklearn.datasets import make_classification +from sklearn.linear_model import LogisticRegression +from sklearn.utils import IS_PYPY +from sklearn.utils._testing import check_docstring_parameters +from sklearn.utils._testing import _get_func_name +from sklearn.utils._testing import ignore_warnings +from sklearn.utils.estimator_checks import _enforce_estimator_tags_y +from sklearn.utils.estimator_checks import _enforce_estimator_tags_x +from sklearn.utils.estimator_checks import _construct_instance +from sklearn.utils.deprecation import _is_deprecated + +import imblearn +from imblearn.base import is_sampler +from imblearn.utils.testing import all_estimators + + +# walk_packages() ignores DeprecationWarnings, now we need to ignore +# FutureWarnings +with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + # mypy error: Module has no attribute "__path__" + imblearn_path = imblearn.__path__ # type: ignore # mypy issue #1422 + PUBLIC_MODULES = set( + [ + pckg[1] + for pckg in walk_packages(prefix="imblearn.", path=imblearn_path) + if not ("._" in pckg[1] or ".tests." 
in pckg[1]) + ] + ) + +# functions to ignore args / docstring of +_DOCSTRING_IGNORES = [ + "RUSBoostClassifier", # TODO remove after releasing scikit-learn 1.0.1 + "ValueDifferenceMetric", +] + +# Methods where y param should be ignored if y=None by default +_METHODS_IGNORE_NONE_Y = [ + "fit", + "score", + "fit_predict", + "fit_transform", + "partial_fit", + "predict", +] + + +# numpydoc 0.8.0's docscrape tool raises because of collections.abc under +# Python 3.7 +@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +@pytest.mark.skipif(IS_PYPY, reason="test segfaults on PyPy") +def test_docstring_parameters(): + # Test module docstring formatting + + # Skip test if numpydoc is not found + pytest.importorskip( + "numpydoc", reason="numpydoc is required to test the docstrings" + ) + + # XXX unreached code as of v0.22 + from numpydoc import docscrape + + incorrect = [] + for name in PUBLIC_MODULES: + if name.endswith(".conftest"): + # pytest tooling, not part of the scikit-learn API + continue + with warnings.catch_warnings(record=True): + module = importlib.import_module(name) + classes = inspect.getmembers(module, inspect.isclass) + # Exclude non-scikit-learn classes + classes = [cls for cls in classes if cls[1].__module__.startswith("imblearn")] + for cname, cls in classes: + this_incorrect = [] + if cname in _DOCSTRING_IGNORES or cname.startswith("_"): + continue + if inspect.isabstract(cls): + continue + with warnings.catch_warnings(record=True) as w: + cdoc = docscrape.ClassDoc(cls) + if len(w): + raise RuntimeError( + "Error for __init__ of %s in %s:\n%s" % (cls, name, w[0]) + ) + + cls_init = getattr(cls, "__init__", None) + + if _is_deprecated(cls_init): + continue + elif cls_init is not None: + this_incorrect += check_docstring_parameters(cls.__init__, cdoc) + + for method_name in cdoc.methods: + method = getattr(cls, method_name) + if _is_deprecated(method): + continue + param_ignore = None + # Now 
skip docstring test for y when y is None + # by default for API reason + if method_name in _METHODS_IGNORE_NONE_Y: + sig = signature(method) + if "y" in sig.parameters and sig.parameters["y"].default is None: + param_ignore = ["y"] # ignore y for fit and score + result = check_docstring_parameters(method, ignore=param_ignore) + this_incorrect += result + + incorrect += this_incorrect + + functions = inspect.getmembers(module, inspect.isfunction) + # Exclude imported functions + functions = [fn for fn in functions if fn[1].__module__ == name] + for fname, func in functions: + # Don't test private methods / functions + if fname.startswith("_"): + continue + if fname == "configuration" and name.endswith("setup"): + continue + name_ = _get_func_name(func) + if not any(d in name_ for d in _DOCSTRING_IGNORES) and not _is_deprecated( + func + ): + incorrect += check_docstring_parameters(func) + + msg = "\n".join(incorrect) + if len(incorrect) > 0: + raise AssertionError("Docstring Error:\n" + msg) + + +@ignore_warnings(category=FutureWarning) +def test_tabs(): + # Test that there are no tabs in our source files + for importer, modname, ispkg in walk_packages( + imblearn.__path__, prefix="imblearn." + ): + + if IS_PYPY: + continue + + # because we don't import + mod = importlib.import_module(modname) + + try: + source = inspect.getsource(mod) + except IOError: # user probably should have run "make clean" + continue + assert "\t" not in source, ( + '"%s" has tabs, please remove them ', + "or add it to the ignore list" % modname, + ) + + +def _construct_compose_pipeline_instance(Estimator): + # Minimal / degenerate instances: only useful to test the docstrings. 
+ if Estimator.__name__ == "Pipeline": + return Estimator(steps=[("clf", LogisticRegression())]) + + +@pytest.mark.parametrize("name, Estimator", all_estimators()) +def test_fit_docstring_attributes(name, Estimator): + pytest.importorskip("numpydoc") + from numpydoc import docscrape + + if Estimator.__name__ in _DOCSTRING_IGNORES: + return + + doc = docscrape.ClassDoc(Estimator) + attributes = doc["Attributes"] + + if Estimator.__name__ == "Pipeline": + est = _construct_compose_pipeline_instance(Estimator) + else: + est = _construct_instance(Estimator) + + X, y = make_classification( + n_samples=20, + n_features=3, + n_redundant=0, + n_classes=2, + random_state=2, + ) + + y = _enforce_estimator_tags_y(est, y) + X = _enforce_estimator_tags_x(est, X) + + if "oob_score" in est.get_params(): + est.set_params(oob_score=True) + + if is_sampler(est): + est.fit_resample(X, y) + else: + est.fit(X, y) + + skipped_attributes = set([]) + + for attr in attributes: + if attr.name in skipped_attributes: + continue + desc = " ".join(attr.desc).lower() + # As certain attributes are present "only" if a certain parameter is + # provided, this checks if the word "only" is present in the attribute + # description, and if not the attribute is required to be present. 
+ if "only " in desc: + continue + # ignore deprecation warnings + with ignore_warnings(category=FutureWarning): + assert hasattr(est, attr.name) + + fit_attr = _get_all_fitted_attributes(est) + fit_attr_names = [attr.name for attr in attributes] + undocumented_attrs = set(fit_attr).difference(fit_attr_names) + undocumented_attrs = set(undocumented_attrs).difference(skipped_attributes) + if undocumented_attrs: + raise AssertionError( + f"Undocumented attributes for {Estimator.__name__}: {undocumented_attrs}" + ) + + +def _get_all_fitted_attributes(estimator): + "Get all the fitted attributes of an estimator including properties" + # attributes + fit_attr = list(estimator.__dict__.keys()) + + # properties + with warnings.catch_warnings(): + warnings.filterwarnings("error", category=FutureWarning) + + for name in dir(estimator.__class__): + obj = getattr(estimator.__class__, name) + if not isinstance(obj, property): + continue + + # ignore properties that raises an AttributeError and deprecated + # properties + try: + getattr(estimator, name) + except (AttributeError, FutureWarning): + continue + fit_attr.append(name) + + return [k for k in fit_attr if k.endswith("_") and not k.startswith("_")] diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py index 99a8e470f..4ffe7f76b 100644 --- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py @@ -69,6 +69,24 @@ class ClusterCentroids(BaseUnderSampler): .. deprecated:: 0.7 `n_jobs` was deprecated in 0.7 and will be removed in 0.9. + Attributes + ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + corresponds to the class labels from which to sample and the values + are the number of samples to sample. 
+ + estimator_ : estimator object + The validated estimator created from the `estimator` parameter. + + voting_ : str + The validated voting strategy. + + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- EditedNearestNeighbours : Under-sampling by editing samples. diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py index 738110cae..df4afce76 100644 --- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py @@ -53,11 +53,24 @@ class CondensedNearestNeighbour(BaseCleaningSampler): Attributes ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + corresponds to the class labels from which to sample and the values + are the number of samples to sample. + + estimator_ : estimator object + The validated K-nearest neighbor estimator created from `n_neighbors` parameter. + sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. .. versionadded:: 0.4 + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- EditedNearestNeighbours : Undersample by editing samples. diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index e0eb866a7..d06a867be 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -59,11 +59,24 @@ class EditedNearestNeighbours(BaseCleaningSampler): Attributes ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. 
The keys + correspond to the class labels from which to sample and the values + are the number of samples to sample. + + nn_ : estimator object + Validated K-nearest Neighbours instance created from `n_neighbors` parameter. + sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. .. versionadded:: 0.4 + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- CondensedNearestNeighbour : Undersample by condensing samples. @@ -207,6 +220,18 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): Attributes ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + correspond to the class labels from which to sample and the values + are the number of samples to sample. + + nn_ : estimator object + Validated K-nearest Neighbours estimator linked to the parameter `n_neighbors`. + + enn_ : sampler object + The validated :class:`~imblearn.under_sampling.EditedNearestNeighbours` + instance. + sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. @@ -217,6 +242,11 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): .. versionadded:: 0.6 + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- CondensedNearestNeighbour : Undersample by condensing samples. @@ -395,11 +425,28 @@ class without early stopping. Attributes ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + correspond to the class labels from which to sample and the values + are the number of samples to sample. + + nn_ : estimator object + Validated K-nearest Neighbours estimator linked to the parameter `n_neighbors`. + + enn_ : sampler object + The validated :class:`~imblearn.under_sampling.EditedNearestNeighbours` + instance. + sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected.
.. versionadded:: 0.4 + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- CondensedNearestNeighbour: Under-sampling by condensing samples. diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 9b82215ec..30385c861 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -57,11 +57,24 @@ class InstanceHardnessThreshold(BaseUnderSampler): Attributes ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + correspond to the class labels from which to sample and the values + are the number of samples to sample. + + estimator_ : estimator object + The validated classifier used to estimate the instance hardness of the samples. + sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. .. versionadded:: 0.4 + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- NearMiss : Undersample based on near-miss search. diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py index ec3f33cfe..5246e54f0 100644 --- a/imblearn/under_sampling/_prototype_selection/_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/_nearmiss.py @@ -55,11 +55,24 @@ class NearMiss(BaseUnderSampler): Attributes ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + correspond to the class labels from which to sample and the values + are the number of samples to sample. + + nn_ : estimator object + Validated K-nearest Neighbours object created from `n_neighbors` parameter.
+ sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. .. versionadded:: 0.4 + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- RandomUnderSampler : Random undersample the dataset. diff --git a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py index 702a022ac..3cb3d5320 100644 --- a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py @@ -66,11 +66,24 @@ class NeighbourhoodCleaningRule(BaseCleaningSampler): Attributes ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + correspond to the class labels from which to sample and the values + are the number of samples to sample. + + nn_ : estimator object + Validated K-nearest Neighbours object created from `n_neighbors` parameter. + sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. .. versionadded:: 0.4 + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- EditedNearestNeighbours : Undersample by editing noisy samples. diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index 305abec0b..abe9484d9 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -51,11 +51,24 @@ class OneSidedSelection(BaseCleaningSampler): Attributes ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + correspond to the class labels from which to sample and the values + are the number of samples to sample.
+ + estimator_ : estimator object + Validated K-nearest neighbors estimator created from parameter `n_neighbors`. + sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. .. versionadded:: 0.4 + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- EditedNearestNeighbours : Undersample by editing noisy samples. diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index 6a57659fb..fe2b9b7a2 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -39,11 +39,21 @@ class RandomUnderSampler(BaseUnderSampler): Attributes ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + correspond to the class labels from which to sample and the values + are the number of samples to sample. + sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. .. versionadded:: 0.4 + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- NearMiss : Undersample using near-miss samples. diff --git a/imblearn/under_sampling/_prototype_selection/_tomek_links.py b/imblearn/under_sampling/_prototype_selection/_tomek_links.py index c3d84b61a..7ee8d31d9 100644 --- a/imblearn/under_sampling/_prototype_selection/_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/_tomek_links.py @@ -32,18 +32,28 @@ class TomekLinks(BaseCleaningSampler): Attributes ---------- + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + correspond to the class labels from which to sample and the values + are the number of samples to sample. + sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. ..
versionadded:: 0.4 + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.9 + See Also -------- EditedNearestNeighbours : Undersample by samples edition. CondensedNearestNeighbour : Undersample by samples condensation. - RandomUnderSampling : Randomly under-sample the dataset. + RandomUnderSampler : Randomly under-sample the dataset. Notes ----- diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 7eb4099ea..23dc3b3ab 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -479,7 +479,7 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): The type of sampling. Can be either ``'over-sampling'``, ``'under-sampling'``, or ``'clean-sampling'``. - kwargs : dict + **kwargs : dict Dictionary of additional keyword arguments to pass to ``sampling_strategy`` when this is a callable. @@ -489,7 +489,6 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): The converted and validated sampling target. Returns a dictionary with the key being the class target and the value being the desired number of samples. - """ if sampling_type not in SAMPLING_KIND: raise ValueError( diff --git a/imblearn/utils/deprecation.py b/imblearn/utils/deprecation.py index 2faa35154..1120dae0f 100644 --- a/imblearn/utils/deprecation.py +++ b/imblearn/utils/deprecation.py @@ -16,7 +16,7 @@ def deprecate_parameter(sampler, version_deprecation, param_deprecated, new_para version_deprecation : str, The version from which the parameter will be deprecated. The format - should be ``'x.y'`` + should be ``'x.y'``. param_deprecated : str, The parameter being deprecated. @@ -24,11 +24,6 @@ def deprecate_parameter(sampler, version_deprecation, param_deprecated, new_para new_param : str, The parameter used instead of the deprecated parameter. By default, no parameter is expected. - - Returns - ------- - None - """ x, y = version_deprecation.split(".") version_removed = x + "." 
+ str(int(y) + 2) diff --git a/imblearn/utils/testing.py b/imblearn/utils/testing.py index e163d31ea..eaad65efb 100644 --- a/imblearn/utils/testing.py +++ b/imblearn/utils/testing.py @@ -32,7 +32,7 @@ def all_estimators( Parameters ---------- - type_filter : string, list of string, or None, default=None + type_filter : str, list of str, or None, default=None Which kind of estimators should be returned. If None, no filter is applied and all estimators are returned. Possible values are 'sampler' to get estimators only of these specific @@ -44,7 +44,6 @@ def all_estimators( estimators : list of tuples List of (name, class), where ``name`` is the class name as string and ``class`` is the actual type of the class. - """ from ..base import SamplerMixin @@ -117,16 +116,16 @@ def is_abstract(c): @contextmanager def warns(expected_warning, match=None): - r"""Assert that a warning is raised with an optional matching pattern - - Assert that a code block/function call warns ``expected_warning`` - and raise a failure exception otherwise. It can be used within a context - manager ``with``. + r"""Assert that a warning is raised with an optional matching pattern. .. deprecated:: 0.8 This function is deprecated in 0.8 and will be removed in 0.10. Use `pytest.warns()` instead. + Assert that a code block/function call warns ``expected_warning`` + and raise a failure exception otherwise. It can be used within a context + manager ``with``. + Parameters ---------- expected_warning : Warning @@ -135,9 +134,9 @@ def warns(expected_warning, match=None): match : regex str or None, optional The pattern to be matched. By default, no check is done. - Returns - ------- - None + Yields + ------ + Nothing. 
Examples -------- diff --git a/maint_tools/test_docstring.py b/maint_tools/test_docstring.py index c9f802c4a..990525b56 100644 --- a/maint_tools/test_docstring.py +++ b/maint_tools/test_docstring.py @@ -1,8 +1,13 @@ +import inspect +import importlib import re +import pkgutil from inspect import signature from typing import Optional import pytest + +import imblearn from imblearn.utils.testing import all_estimators numpydoc_validation = pytest.importorskip("numpydoc.validate") @@ -15,30 +20,15 @@ "AllKNN$", "AllKNN.", "BalancedBaggingClassifier$", - "BalancedBaggingClassifier.estimators_samples_", - "BalancedBaggingClassifier.fit", - "BalancedBaggingClassifier.get_params", - "BalancedBaggingClassifier.predict", - "BalancedBaggingClassifier.score", - "BalancedBaggingClassifier.set_params", + "BalancedBaggingClassifier.", "BalancedRandomForestClassifier$", - "BalancedRandomForestClassifier.apply", - "BalancedRandomForestClassifier.feature_importances_", - "BalancedRandomForestClassifier.fit", - "BalancedRandomForestClassifier.predict$", - "BalancedRandomForestClassifier.score", - "BalancedRandomForestClassifier.set_params", + "BalancedRandomForestClassifier.", "ClusterCentroids$", "ClusterCentroids.", "CondensedNearestNeighbour$", "CondensedNearestNeighbour.", "EasyEnsembleClassifier$", - "EasyEnsembleClassifier.estimators_samples_", - "EasyEnsembleClassifier.fit", - "EasyEnsembleClassifier.get_params", - "EasyEnsembleClassifier.predict", - "EasyEnsembleClassifier.score", - "EasyEnsembleClassifier.set_params", + "EasyEnsembleClassifier.", "EditedNearestNeighbours$", "EditedNearestNeighbours.", "FunctionSampler$", @@ -54,10 +44,7 @@ "OneSidedSelection$", "OneSidedSelection.", "Pipeline$", - "Pipeline.fit$", - "Pipeline.fit_transform", - "Pipeline.fit_resample", - "Pipeline.fit_predict", + "Pipeline.", "RUSBoostClassifier$", "RUSBoostClassifier.", "RandomOverSampler$", @@ -66,7 +53,14 @@ "RandomUnderSampler.", "TomekLinks$", "TomekLinks", + "ValueDifferenceMetric$", + 
"ValueDifferenceMetric.", +] + +FUNCTION_DOCSTRING_IGNORE_LIST = [ + "imblearn.tensorflow._generator.balanced_batch_generator", ] +FUNCTION_DOCSTRING_IGNORE_LIST = set(FUNCTION_DOCSTRING_IGNORE_LIST) def get_all_methods(): @@ -88,7 +82,48 @@ def get_all_methods(): yield Estimator, method -def filter_errors(errors, method): +def _is_checked_function(item): + if not inspect.isfunction(item): + return False + + if item.__name__.startswith("_"): + return False + + mod = item.__module__ + if not mod.startswith("imblearn.") or mod.endswith("estimator_checks"): + return False + + return True + + +def get_all_functions_names(): + """Get all public functions defined in the imblearn module""" + modules_to_ignore = { + "tests", + "estimator_checks", + } + + all_functions_names = set() + for module_finder, module_name, ispkg in pkgutil.walk_packages( + path=imblearn.__path__, prefix="imblearn." + ): + module_parts = module_name.split(".") + if ( + any(part in modules_to_ignore for part in module_parts) + or "._" in module_name + ): + continue + + module = importlib.import_module(module_name) + functions = inspect.getmembers(module, _is_checked_function) + for name, func in functions: + full_name = f"{func.__module__}.{func.__name__}" + all_functions_names.add(full_name) + + return sorted(all_functions_names) + + +def filter_errors(errors, method, Estimator=None): """ Ignore some errors based on the method type. @@ -100,12 +135,22 @@ def filter_errors(errors, method): # (as we may need refer to the name of the returned # object) # - GL01: Docstring text (summary) should start in the line - # immediately after the opening quotes (not in the same line, - # or leaving a blank line in between) + # immediately after the opening quotes (not in the same line, + # or leaving a blank line in between) + # - GL02: If there's a blank line, it should be before the + # first line of the Returns section, not after (it allows having + # short docstrings for properties).
- if code in ["RT02", "GL01"]: + if code in ["RT02", "GL01", "GL02"]: continue + # Ignore PR02: Unknown parameters for properties. We sometimes use + # properties for ducktyping, e.g. SGDClassifier.predict_proba + if code == "PR02" and Estimator is not None and method is not None: + method_obj = getattr(Estimator, method) + if isinstance(method_obj, property): + continue + # Following codes are only taken into account for the # top level class docstrings: # - ES01: No extended summary found @@ -172,6 +217,24 @@ def repr_errors(res, estimator=None, method: Optional[str] = None) -> str: return msg +@pytest.mark.parametrize("function_name", get_all_functions_names()) +def test_function_docstring(function_name, request): + """Check function docstrings using numpydoc.""" + if function_name in FUNCTION_DOCSTRING_IGNORE_LIST: + request.applymarker( + pytest.mark.xfail(run=False, reason="TODO pass numpydoc validation") + ) + + res = numpydoc_validation.validate(function_name) + + res["errors"] = list(filter_errors(res["errors"], method="function")) + + if res["errors"]: + msg = repr_errors(res, method=f"Tested function: {function_name}") + + raise ValueError(msg) + + @pytest.mark.parametrize("Estimator, method", get_all_methods()) def test_docstring(Estimator, method, request): base_import_path = Estimator.__module__