Merge pull request #260 from makrobios/count_encoder
Count encoder
wdm0006 authored Jul 17, 2020
2 parents 6e5ed04 + cafd264 commit fdd06bc
Showing 6 changed files with 89 additions and 30 deletions.
10 changes: 9 additions & 1 deletion .gitignore
@@ -60,4 +60,12 @@ docs/_build/
# PyBuilder
target/

.pytest_cache/
.pytest_cache/
.tmp/
checkcommits.sh
runtest.py


*~
*.swp
*.swo
1 change: 1 addition & 0 deletions category_encoders/__init__.py
@@ -31,6 +31,7 @@
__all__ = [
'BackwardDifferenceEncoder',
'BinaryEncoder',
'CountEncoder',
'HashingEncoder',
'HelmertEncoder',
'OneHotEncoder',
99 changes: 71 additions & 28 deletions category_encoders/count.py
@@ -11,11 +11,11 @@

__author__ = 'joshua t. dunn'


# COUNT_ENCODER BRANCH
class CountEncoder(BaseEstimator, TransformerMixin):
def __init__(self, verbose=0, cols=None, drop_invariant=False,
return_df=True, handle_unknown=None,
handle_missing='count',
return_df=True, handle_unknown='value',
handle_missing='value',
min_group_size=None, combine_min_nan_groups=None,
min_group_name=None, normalize=False):
"""Count encoding for categorical features.
@@ -38,31 +38,32 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False,
(otherwise it will be a numpy array).
handle_missing: str
how to handle missing values at fit time. Options are 'error', 'return_nan',
and 'count'. Default 'count', which treat NaNs as a countable category at
and 'value'. Default 'value', which treats NaNs as a countable category at
fit time.
handle_unknown: str, int or dict of.
handle_unknown: str, int or dict of {column : option, ...}.
how to handle unknown labels at transform time. Options are 'error'
'return_nan' and an int. Defaults to None which uses NaN behaviour
'return_nan', 'value' and int. Defaults to None which uses NaN behaviour
specified at fit time. Passing an int will fill with this int value.
normalize: bool or dict of.
normalize: bool or dict of {column : bool, ...}.
whether to normalize the counts to the range (0, 1). See Pandas `value_counts`
for more details.
min_group_size: int, float or dict of.
min_group_size: int, float or dict of {column : option, ...}.
the minimal count threshold of a group needed to ensure it is not
combined into a "leftovers" group. If float in the range (0, 1),
combined into a "leftovers" group. Default value is 0.01.
If float in the range (0, 1),
`min_group_size` is calculated as int(X.shape[0] * min_group_size).
Note: This value may change type based on the `normalize` variable. If True
this will become a float. If False, it will be an int.
min_group_name: None, str or dict of.
min_group_name: None, str or dict of {column : option, ...}.
Set the name of the combined minimum groups when the defaults become
too long. Default None. In this case the category names will be joined
alphabetically with a `_` delimiter.
Note: The default name can be long ae may keep changing, for example,
Note: The default name can be long and may keep changing, for example,
in cross-validation.
combine_min_nan_groups: bool or dict of.
combine_min_nan_groups: bool or dict of {column : bool, ...}.
whether to combine the leftovers group with NaN group. Default True. Can
also be forced to combine with 'force' meaning small groups are effectively
counted as NaNs. Force can only used when 'handle_missing' is 'count' or 'error'.
counted as NaNs. Force can only be used when 'handle_missing' is 'value' or 'error'.
Note: Will not force if it creates a binary or invariant column.
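
For illustration, a minimal usage sketch of the options documented above, with hypothetical data and column names (not taken from the repository's tests). Per the docstring, groups smaller than min_group_size are pooled into a "leftovers" group, and an int passed as handle_unknown fills unseen labels at transform time:

import pandas as pd
import category_encoders as ce

# Hypothetical data: 'green' is rare at fit time, 'purple' is unseen until transform.
train = pd.DataFrame({'color': ['blue'] * 5 + ['red'] * 4 + ['green']})
test = pd.DataFrame({'color': ['blue', 'red', 'purple']})

# min_group_size=2 pools 'green' (count 1) into the leftovers group;
# handle_unknown=0 fills the unseen 'purple' with 0 at transform time.
enc = ce.CountEncoder(cols=['color'], min_group_size=2, handle_unknown=0)
enc.fit(train)
print(enc.transform(test))  # expected roughly: blue -> 5, red -> 4, purple -> 0

With normalize=True the mapping holds relative frequencies instead of raw counts, and a float min_group_size in (0, 1) is read as a fraction of X.shape[0]; for example 0.01 of 1,000 rows means groups with fewer than 10 occurrences are pooled.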
@@ -116,6 +117,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False,
self.min_group_size = min_group_size
self.min_group_name = min_group_name
self.combine_min_nan_groups = combine_min_nan_groups
self.feature_names = None

self._check_set_create_attrs()

@@ -159,17 +161,26 @@ def fit(self, X, y=None, **kwargs):

self._fit_count_encode(X, y)

X_temp = self.transform(X, override_return_df=True)
self.feature_names = list(X_temp.columns)

if self.drop_invariant:
self.drop_cols = []
X_temp = self.transform(X)
generated_cols = util.get_generated_cols(X, X_temp, self.cols)
self.drop_cols = [
x for x in generated_cols if X_temp[x].var() <= 10e-5
]

try:
[self.feature_names.remove(x) for x in self.drop_cols]
except KeyError as e:
if self.verbose > 0:
print("Could not remove column from feature names."
"Not found in generated cols.\n{}".format(e))

return self

def transform(self, X, y=None):
def transform(self, X, y=None, override_return_df=False):
"""Perform the transformation to new categorical data.
Parameters
@@ -182,6 +193,9 @@ def transform(self, X, y=None):
p : array, shape = [n_samples, n_numeric + N]
Transformed values with encoding applied.
"""
if self.handle_missing == 'error':
if X[self.cols].isnull().any().any():
raise ValueError('Columns to be encoded can not contain null')

if self._dim is None:
raise ValueError(
@@ -207,7 +221,7 @@ def transform(self, X, y=None):
for col in self.drop_cols:
X.drop(col, 1, inplace=True)

if self.return_df:
if self.return_df or override_return_df:
return X
else:
return X.values
@@ -229,10 +243,10 @@ def _fit_count_encode(self, X_in, y):
% (col,)
)

elif self._handle_missing[col] not in ['count', 'return_nan', 'error', None]:
elif self._handle_missing[col] not in ['value', 'return_nan', 'error', None]:
raise ValueError(
'%s key in `handle_missing` should be one of: '
' `count`, `return_nan` and `error` not `%s`.'
' `value`, `return_nan` and `error` not `%s`.'
% (col, str(self._handle_missing[col]))
)

@@ -243,16 +257,21 @@ def _fit_count_encode(self, X_in, y):

self.mapping[col].index = self.mapping[col].index.astype(object)



if self._handle_missing[col] == 'return_nan':
self.mapping[col][np.NaN] = np.NaN

# elif self._handle_missing[col] == 'value':
#test_count.py failing self.mapping[col].loc[-2] = 0

if any([val is not None for val in self._min_group_size.values()]):
self.combine_min_categories(X)

def _transform_count_encode(self, X_in, y):
"""Perform the transform count encoding."""
X = X_in.copy(deep=True)
X.loc[:, self.cols] = X.fillna(value=np.nan)
X.fillna(value=np.nan, inplace=True)

for col in self.cols:
if self._min_group_size is not None:
@@ -261,11 +280,17 @@ def _transform_count_encode(self, X_in, y):
X[col].map(self._min_group_categories[col])
.fillna(X[col])
)

X[col] = X[col].map(self.mapping[col])


X[col] = X[col].astype(object).map(self.mapping[col])
if isinstance(self._handle_unknown[col], (int, np.integer)):
X[col] = X[col].fillna(self._handle_unknown[col])

elif (self._handle_unknown[col] == 'value'
and X[col].isna().any()
and self._handle_missing[col] != 'return_nan'
):
X[col].replace(np.nan, 0, inplace=True)

elif (
self._handle_unknown[col] == 'error'
and X[col].isnull().any()
@@ -348,14 +373,16 @@ def _check_set_create_attrs(self):
"'combine_min_nan_groups' == 'force' for all columns."
)


if (
self.combine_min_nan_groups is not None
and self.min_group_size is None
):
raise ValueError(
"`combine_min_nan_groups` only works when `min_group_size` "
"is set for all columns."
)
pass
# raise ValueError(
# "`combine_min_nan_groups` only works when `min_group_size` "
# "is set for all columns."
# )

if (
self.min_group_name is not None
@@ -376,8 +403,8 @@ def _check_set_create_dict_attrs(self):
'min_group_name': None,
'combine_min_nan_groups': True,
'min_group_size': None,
'handle_unknown': 'count',
'handle_missing': None,
'handle_unknown': 'value',
'handle_missing': 'value',
}

for attr_name, attr_default in dict_attrs.items():
@@ -423,3 +450,19 @@ def _check_set_create_dict_attrs(self):
"is set for column %s."
% (col,)
)

def get_feature_names(self):
"""
Returns the names of all transformed / added columns.
Returns
-------
feature_names: list
A list with all feature names transformed or added.
Note: potentially dropped features are not included!
"""
if not isinstance(self.feature_names, list):
raise ValueError("CountEncoder has to be fitted to return feature names.")
else:
return self.feature_names
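
The feature_names attribute and get_feature_names() introduced above are populated by fit(); a short sketch of the intended call order, again with made-up data and column names:

import pandas as pd
import category_encoders as ce

X = pd.DataFrame({'fuel': ['gas', 'gas', 'diesel'], 'doors': ['2', '4', '4']})

enc = ce.CountEncoder(cols=['fuel', 'doors'], normalize=True)
# Calling get_feature_names() before fit() raises ValueError.
enc.fit(X)
print(enc.get_feature_names())  # e.g. ['fuel', 'doors']
print(enc.transform(X))         # per-column relative frequencies because normalize=True

Columns removed by drop_invariant are also dropped from the returned list, matching the docstring note that dropped features are not included.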
6 changes: 6 additions & 0 deletions docs/source/count.rst
@@ -0,0 +1,6 @@
Count Encoder
==============

.. autoclass:: category_encoders.count.CountEncoder
:members:

2 changes: 2 additions & 0 deletions docs/source/index.rst
@@ -42,6 +42,7 @@ To use:
encoder = ce.BaseNEncoder(cols=[...])
encoder = ce.BinaryEncoder(cols=[...])
encoder = ce.CatBoostEncoder(cols=[...])
encoder = ce.CountEncoder(cols=[...])
encoder = ce.GLMMEncoder(cols=[...])
encoder = ce.HashingEncoder(cols=[...])
encoder = ce.HelmertEncoder(cols=[...])
@@ -70,6 +71,7 @@ Contents:
basen
binary
catboost
count
glmm
hashing
helmert
1 change: 0 additions & 1 deletion tests/test_encoders.py
@@ -232,7 +232,6 @@ def test_handle_unknown_value(self):

for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded
with self.subTest(encoder_name=encoder_name):

enc = getattr(encoders, encoder_name)(handle_unknown='value')
enc.fit(train, y)
result = enc.transform(test)
