Merge pull request #260 from makrobios/count_encoder
Count encoder
wdm0006 authored Jul 17, 2020
2 parents 6e5ed04 + cafd264 commit fdd06bc
Showing 6 changed files with 89 additions and 30 deletions.
10 changes: 9 additions & 1 deletion .gitignore
@@ -60,4 +60,12 @@ docs/_build/
# PyBuilder
target/

.pytest_cache/
.pytest_cache/
.tmp/
checkcommits.sh
runtest.py


*~
*.swp
*.swo
1 change: 1 addition & 0 deletions category_encoders/__init__.py
@@ -31,6 +31,7 @@
__all__ = [
'BackwardDifferenceEncoder',
'BinaryEncoder',
'CountEncoder',
'HashingEncoder',
'HelmertEncoder',
'OneHotEncoder',
99 changes: 71 additions & 28 deletions category_encoders/count.py
@@ -11,11 +11,11 @@

__author__ = 'joshua t. dunn'


# COUNT_ENCODER BRANCH
class CountEncoder(BaseEstimator, TransformerMixin):
def __init__(self, verbose=0, cols=None, drop_invariant=False,
return_df=True, handle_unknown=None,
handle_missing='count',
return_df=True, handle_unknown='value',
handle_missing='value',
min_group_size=None, combine_min_nan_groups=None,
min_group_name=None, normalize=False):
"""Count encoding for categorical features.
@@ -38,31 +38,32 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False,
(otherwise it will be a numpy array).
handle_missing: str
how to handle missing values at fit time. Options are 'error', 'return_nan',
and 'count'. Default 'count', which treat NaNs as a countable category at
and 'value'. Default 'value', which treats NaNs as a countable category at
fit time.
handle_unknown: str, int or dict of.
handle_unknown: str, int or dict of {column : option, ...}.
how to handle unknown labels at transform time. Options are 'error'
'return_nan' and an int. Defaults to None which uses NaN behaviour
'return_nan', 'value' and int. Defaults to None which uses NaN behaviour
specified at fit time. Passing an int will fill with this int value.
normalize: bool or dict of.
normalize: bool or dict of {column : bool, ...}.
whether to normalize the counts to the range (0, 1). See Pandas `value_counts`
for more details.
min_group_size: int, float or dict of.
min_group_size: int, float or dict of {column : option, ...}.
the minimal count threshold of a group needed to ensure it is not
combined into a "leftovers" group. If float in the range (0, 1),
combined into a "leftovers" group. Default value is 0.01.
If float in the range (0, 1),
`min_group_size` is calculated as int(X.shape[0] * min_group_size).
Note: This value may change type based on the `normalize` variable. If True
this will become a float. If False, it will be an int.
min_group_name: None, str or dict of.
min_group_name: None, str or dict of {column : option, ...}.
Set the name of the combined minimum groups when the defaults become
too long. Default None. In this case the category names will be joined
alphabetically with a `_` delimiter.
Note: The default name can be long ae may keep changing, for example,
Note: The default name can be long and may keep changing, for example,
in cross-validation.
combine_min_nan_groups: bool or dict of.
combine_min_nan_groups: bool or dict of {column : bool, ...}.
whether to combine the leftovers group with NaN group. Default True. Can
also be forced to combine with 'force' meaning small groups are effectively
counted as NaNs. Force can only used when 'handle_missing' is 'count' or 'error'.
counted as NaNs. Force can only be used when 'handle_missing' is 'value' or 'error'.
Note: Will not force if it creates a binary or invariant column.
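
For illustration, a minimal usage sketch of the options documented above, with hypothetical data and column names (not taken from the repository's tests). Per the docstring, groups smaller than min_group_size are pooled into a "leftovers" group, and an int passed as handle_unknown fills unseen labels at transform time:

import pandas as pd
import category_encoders as ce

# Hypothetical data: 'green' is rare at fit time, 'purple' is unseen until transform.
train = pd.DataFrame({'color': ['blue'] * 5 + ['red'] * 4 + ['green']})
test = pd.DataFrame({'color': ['blue', 'red', 'purple']})

# min_group_size=2 pools 'green' (count 1) into the leftovers group;
# handle_unknown=0 fills the unseen 'purple' with 0 at transform time.
enc = ce.CountEncoder(cols=['color'], min_group_size=2, handle_unknown=0)
enc.fit(train)
print(enc.transform(test))  # expected roughly: blue -> 5, red -> 4, purple -> 0

With normalize=True the mapping holds relative frequencies instead of raw counts, and a float min_group_size in (0, 1) is read as a fraction of X.shape[0]; for example 0.01 of 1,000 rows means groups with fewer than 10 occurrences are pooled.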
@@ -116,6 +117,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False,
self.min_group_size = min_group_size
self.min_group_name = min_group_name
self.combine_min_nan_groups = combine_min_nan_groups
self.feature_names = None

self._check_set_create_attrs()

@@ -159,17 +161,26 @@ def fit(self, X, y=None, **kwargs):

self._fit_count_encode(X, y)

X_temp = self.transform(X, override_return_df=True)
self.feature_names = list(X_temp.columns)

if self.drop_invariant:
self.drop_cols = []
X_temp = self.transform(X)
generated_cols = util.get_generated_cols(X, X_temp, self.cols)
self.drop_cols = [
x for x in generated_cols if X_temp[x].var() <= 10e-5
]

try:
[self.feature_names.remove(x) for x in self.drop_cols]
except KeyError as e:
if self.verbose > 0:
print("Could not remove column from feature names."
"Not found in generated cols.\n{}".format(e))

return self

def transform(self, X, y=None):
def transform(self, X, y=None, override_return_df=False):
"""Perform the transformation to new categorical data.
Parameters
@@ -182,6 +193,9 @@ def transform(self, X, y=None):
p : array, shape = [n_samples, n_numeric + N]
Transformed values with encoding applied.
"""
if self.handle_missing == 'error':
if X[self.cols].isnull().any().any():
raise ValueError('Columns to be encoded can not contain null')

if self._dim is None:
raise ValueError(
@@ -207,7 +221,7 @@ def transform(self, X, y=None):
for col in self.drop_cols:
X.drop(col, 1, inplace=True)

if self.return_df:
if self.return_df or override_return_df:
return X
else:
return X.values
@@ -229,10 +243,10 @@ def _fit_count_encode(self, X_in, y):
% (col,)
)

elif self._handle_missing[col] not in ['count', 'return_nan', 'error', None]:
elif self._handle_missing[col] not in ['value', 'return_nan', 'error', None]:
raise ValueError(
'%s key in `handle_missing` should be one of: '
' `count`, `return_nan` and `error` not `%s`.'
' `value`, `return_nan` and `error` not `%s`.'
% (col, str(self._handle_missing[col]))
)

@@ -243,16 +257,21 @@ def _fit_count_encode(self, X_in, y):

self.mapping[col].index = self.mapping[col].index.astype(object)



if self._handle_missing[col] == 'return_nan':
self.mapping[col][np.NaN] = np.NaN

# elif self._handle_missing[col] == 'value':
#test_count.py failing self.mapping[col].loc[-2] = 0

if any([val is not None for val in self._min_group_size.values()]):
self.combine_min_categories(X)

def _transform_count_encode(self, X_in, y):
"""Perform the transform count encoding."""
X = X_in.copy(deep=True)
X.loc[:, self.cols] = X.fillna(value=np.nan)
X.fillna(value=np.nan, inplace=True)

for col in self.cols:
if self._min_group_size is not None:
@@ -261,11 +280,17 @@ def _transform_count_encode(self, X_in, y):
X[col].map(self._min_group_categories[col])
.fillna(X[col])
)

X[col] = X[col].map(self.mapping[col])


X[col] = X[col].astype(object).map(self.mapping[col])
if isinstance(self._handle_unknown[col], (int, np.integer)):
X[col] = X[col].fillna(self._handle_unknown[col])

elif (self._handle_unknown[col] == 'value'
and X[col].isna().any()
and self._handle_missing[col] != 'return_nan'
):
X[col].replace(np.nan, 0, inplace=True)

elif (
self._handle_unknown[col] == 'error'
and X[col].isnull().any()
@@ -348,14 +373,16 @@ def _check_set_create_attrs(self):
"'combine_min_nan_groups' == 'force' for all columns."
)


if (
self.combine_min_nan_groups is not None
and self.min_group_size is None
):
raise ValueError(
"`combine_min_nan_groups` only works when `min_group_size` "
"is set for all columns."
)
pass
# raise ValueError(
# "`combine_min_nan_groups` only works when `min_group_size` "
# "is set for all columns."
# )

if (
self.min_group_name is not None
@@ -376,8 +403,8 @@ def _check_set_create_dict_attrs(self):
'min_group_name': None,
'combine_min_nan_groups': True,
'min_group_size': None,
'handle_unknown': 'count',
'handle_missing': None,
'handle_unknown': 'value',
'handle_missing': 'value',
}

for attr_name, attr_default in dict_attrs.items():
@@ -423,3 +450,19 @@ def _check_set_create_dict_attrs(self):
"is set for column %s."
% (col,)
)

def get_feature_names(self):
"""
Returns the names of all transformed / added columns.
Returns
-------
feature_names: list
A list with all feature names transformed or added.
Note: potentially dropped features are not included!
"""
if not isinstance(self.feature_names, list):
raise ValueError("CountEncoder has to be fitted to return feature names.")
else:
return self.feature_names
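
The feature_names attribute and get_feature_names() introduced above are populated by fit(); a short sketch of the intended call order, again with made-up data and column names:

import pandas as pd
import category_encoders as ce

X = pd.DataFrame({'fuel': ['gas', 'gas', 'diesel'], 'doors': ['2', '4', '4']})

enc = ce.CountEncoder(cols=['fuel', 'doors'], normalize=True)
# Calling get_feature_names() before fit() raises ValueError.
enc.fit(X)
print(enc.get_feature_names())  # e.g. ['fuel', 'doors']
print(enc.transform(X))         # per-column relative frequencies because normalize=True

Columns removed by drop_invariant are also dropped from the returned list, matching the docstring note that dropped features are not included.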
6 changes: 6 additions & 0 deletions docs/source/count.rst
@@ -0,0 +1,6 @@
Count Encoder
==============

.. autoclass:: category_encoders.count.CountEncoder
:members:

2 changes: 2 additions & 0 deletions docs/source/index.rst
@@ -42,6 +42,7 @@ To use:
encoder = ce.BaseNEncoder(cols=[...])
encoder = ce.BinaryEncoder(cols=[...])
encoder = ce.CatBoostEncoder(cols=[...])
encoder = ce.CountEncoder(cols=[...])
encoder = ce.GLMMEncoder(cols=[...])
encoder = ce.HashingEncoder(cols=[...])
encoder = ce.HelmertEncoder(cols=[...])
@@ -70,6 +71,7 @@ Contents:
basen
binary
catboost
count
glmm
hashing
helmert
1 change: 0 additions & 1 deletion tests/test_encoders.py
@@ -232,7 +232,6 @@ def test_handle_unknown_value(self):

for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded
with self.subTest(encoder_name=encoder_name):

enc = getattr(encoders, encoder_name)(handle_unknown='value')
enc.fit(train, y)
result = enc.transform(test)
