From 6fbe34e876c905a8cdc89ffef2999124285ca0a5 Mon Sep 17 00:00:00 2001 From: makrobios Date: Thu, 9 Jul 2020 17:20:28 +0200 Subject: [PATCH 01/14] added count_encoder to docs --- category_encoders/__init__.py | 1 + docs/source/count.rst | 6 ++++++ docs/source/index.rst | 2 ++ 3 files changed, 9 insertions(+) create mode 100755 docs/source/count.rst diff --git a/category_encoders/__init__.py b/category_encoders/__init__.py index bc705eab..790bb467 100644 --- a/category_encoders/__init__.py +++ b/category_encoders/__init__.py @@ -31,6 +31,7 @@ __all__ = [ 'BackwardDifferenceEncoder', 'BinaryEncoder', + 'CountEncoder', 'HashingEncoder', 'HelmertEncoder', 'OneHotEncoder', diff --git a/docs/source/count.rst b/docs/source/count.rst new file mode 100755 index 00000000..bbd0653f --- /dev/null +++ b/docs/source/count.rst @@ -0,0 +1,6 @@ +Count Encoder +============== + +.. autoclass:: category_encoders.count.CountEncoder + :members: + diff --git a/docs/source/index.rst b/docs/source/index.rst index 05f7312d..02f4b702 100755 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -42,6 +42,7 @@ To use: encoder = ce.BaseNEncoder(cols=[...]) encoder = ce.BinaryEncoder(cols=[...]) encoder = ce.CatBoostEncoder(cols=[...]) + encoder = ce.CountEncoder(cols=[...]) encoder = ce.GLMMEncoder(cols=[...]) encoder = ce.HashingEncoder(cols=[...]) encoder = ce.HelmertEncoder(cols=[...]) @@ -70,6 +71,7 @@ Contents: basen binary catboost + count glmm hashing helmert From 12999ec8684d7a2c138580250e8107bf42cc322f Mon Sep 17 00:00:00 2001 From: makrobios Date: Thu, 9 Jul 2020 17:32:44 +0200 Subject: [PATCH 02/14] added method get_feature_names to CountEncoder --- category_encoders/.count.py.swp | Bin 0 -> 28672 bytes category_encoders/count.py | 50 ++++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 12 deletions(-) create mode 100644 category_encoders/.count.py.swp diff --git a/category_encoders/.count.py.swp b/category_encoders/.count.py.swp new file mode 100644 index 
0000000000000000000000000000000000000000..16dbc48b1e4ac392a74bf8e133fc0fec76ebaa8f GIT binary patch literal 28672 zcmeI4dyFJkb%%>rOk!*UAtD3^U(d#LkLm5*#cM(a&%%0l4J+{4tk-y881>Y2SI^XL zPgSR?dUwahfP{%0;~1nwfy5Z%h$spW6h$T>Kn&!CAT|L^5Rr&@gg+950$2tjghRe_ zANA<2p51joilQ#*H{D%zALpKX?m73IdurBpzv}R{>V@q+9@kSn@6M~w>nuO~Yu?3o zdtP@#Zl3wd_U_$%=2D*b`tHWAVG>rb>u14woNRT1s2lf!r1nlcin8`_>uegI zm6>s1#)0V^7-jvPFTB9J;`w{@THP^ssrvb+-#ESe?8%G+GY-r+Fyp|C12YcHI56YD zi~}t_ z9P(#S?~USbbN=uA|8HI7c@Kbp1RnzL0k?wJfj-y?THtx$0`L@YKKSB=`0ii@>;>Pw z!1MkJycygEUIZ=%&jwEh-+PMZJp#T0-VN>qzY2Z{JbJ$8eGuFOUI{J%PXO=zIp_rY zz|+7de%A96a4C57$)5Lz;5Kk0I1haIJkL80o(}$rK*YV^dhjSgjR(LMSON|3c>)mYSOzWdJn%U15Ip@8AiRBp@VDkt_B!Zi zzUsz<(PosYW)^4uK<&B0-I-tTT*0G$v>qIedco-hb>;3Cs36LcFi2Z!x5{GfT$z`= zsyp!0R5gZv)brDJH%@|fKk+w%wsdxld~58>OYhsaPYshW%9_z=Gf2X2r$?zY$l77l zkDK%JjM-jNbgYnCgph7KOZ+J9$H``MX}(gNhZH#=`5cE#zDJY6ri;ss125ZuL!+e{ zNB19WtjxFjVb)w~sjX`DB`Obl1>McHQPkaN3xzGmqeXMCANbiQ2|5wOORKGH(TXXQ z`N?{aRVdUCC@_fqUS};%vp8ywjk%x1n`$C&MTMKgILRu5tF)}Er6&e~pG2Bun3}p; zrS2+~Y@=rGl%F#E{P|9E<-ZS|_BV%v0O?Yt>qwjh3+kqDFi=sPDQ2Y`sIafXOm!2O z6r{?J)LIz%$(D)}6-FohB=jR5YcXtBTIMI@O`lP%C-G>=Ad)}@{XR)W;mKgIRiPS!vW*}qH^^@_2>MxklAI&chUtW1 znV)TSFzEZEL8gu+qoA!0_)*@dC5ypDNLbV{G%TpKI3CF8dSN$HalbmcFoNT8n(jR5 z4?<)%jH7A+GgZf+tY>=`=|$9-AE=EPvLC2Te@mSRf}!fd;dS`X5`L+`%zjJdL@F+S zFx>Fhf-LO9sx31@U#)gl>0%IWhFOqQX~^PO_f(AjNE|Wx!;#81n8Gf)fNw&TN!(I; zh0!uo0hPyNrmK)qq2Y!g(>3&9;=ZYE=C!BbV7NKjbhD^MN1C@BNNE#h2c9$}mvCHj zTd(W|CDgT4nk7@&uF1CKNmI&KeU@9snEL7(M2`9_1~&khjrOZD#Z3(3XYNL34mA7W z#nv!TYjCB<3eaO)jnI5d@y)?1#aT;h17Toa%aNx?VgIZaqe*pf&EP?IG=RB!k%=N+ zYNs3iFj(HbqAo9uUuw)(>9k_mlM%#>b1s+gIVaPQp+`>4Ld${b9cG?C~{ zEERH)`Qad~jkJDy9R}U7 zAIc1&P#JF)VohmWYdTSp;_#WmK{|l=17|rF)y+WVlrj%6uQPQ zgi2bi4%Av2!(ctooEB9xL^c~GwimKrwOe|`o^jWu@#PqGE7K@DxiTM_p zuvSFOldI-QV{*yPa6vcf>EHGb!WXP#%)VpSc)c(CVyt6Y6tl}Xu!_%UkD_gMLZTGeePHK}an!@IJJFU( zg5kht^Chau7)@&o&F-P-Xl`!qfXKe=rQ@CqP}E^fn`3HGGpJCu(#N8)kGW=B)CRBQ zM_8to4XYRyTd#hfV;<(b%A2UJoJF_1Cky6+yeM0%meo`pt7J;evkOoiL^8}eon{&g 
z`Yk)x!6Ft*yAEc6Rnax0Y(Y$o-#?@HrLn#W|&tf#$F{cWHva54 zwhEVk4$L?(ec0xWPtl$$g55Y>;{UPhi_*-;)ze$(Dmg9fV3iyK z*$w723RNcfH6)B2WucBD%4T~A3ASSsIM6*s-N6XQ@X|^H`;B9$-6wJXD)bh?2pSX* zs#kF=E36xH2FK&yM|BXTgnv<4m zmBC7lVg}@wyDn35*3qheI7i5g^8n-MX+K?EmbMK6t^xNsR|*@7kGE<<@2BGAL~3I1 zr2;yxIN8pGsClk+`7DWpyDiST7G@puHW659dzmqB+?YEWTSw`UZeoV=KD=}%O?>jz?ZS`21_X-3qm z)E1RW6jy3iCPPPgI6xvauUpdru*t=Mp766QvGI^br=gxlR5wfXiAc+Ql>?~yY1Ad< zu_aEI*Zx1p;@(H(d>a41Zw=(v@by0i?gwuH>)_`S5B?n71KtUagAg1B zbKraU{*Qpa1s?!^0B!?s0;&h5j}r%YH~39( zJD3NO5AYFu{|xK}&jG&-{vF@{L*Q=kTi{OcVz3W95qyrA$7g}$3A`FCfhKqaW;_Jm z1?~ZF07t;nz{BJWd;t6g7=jq=1TFAy-pnWIk?c?7az-Nl(+->T`e6XWUzM4HLthiND5 zohAUb>0|5@b=7qB@44>f^Q;KKUWi^I*jBD8-^VRRVPh7zd3L!f4#_G_YKDk1G41AY zP#t_8-x$@IHhWN2V-X)4rMsq4n)5BE(JW9gyk|!!F>T1qhi|n>%T1uuDl(tMl-r#B zZl=xBz;s;D9Jia3br*5wu!*+iKv7QeOXo4|@*b%ya79h4rIyN+8D*%*@6bH71QnEv z9LQC#z_zvejcZN1fs&jM49-ZnM?BtbscMeD8lLW~^k;H&R4GzonX4p|w4~>&9j4J} zFlf%V{b-BK5m&-}8mX~n!UntxGhvbOtCS#Y z6Eam(sim4se2b_u*+2T!WxhO*b#(Zt9EdmLHTm~0^KItV&}1iB4|RTWs->waiE_tX zqyy>EnXyhUXL*r#T3XPBL|PJD+vU%ksgwMMV+c`hnrmBbp9r=pD@xkV*LAK~JEGHM z%j`D5g%Uo)4q&N0h`Y-RcHPMKU_~Uru}7aQFEUwdj0z@6($o@XC}e&Ox0*dNNA#&n zQGi8!eUQ-gruLL_=ZFE951}fHy_1(Rr|9;~!mW0?ljN)7i z>CsPF+gW8HPukTrQ(YXzG&$15`;L5b#O1l0?lU(Yi$>Apzjc4bA-9yDAV&nn(4|>v z^COy7f?Q9Ngt;eHLCZ03RUg>{db0qh*Kep*EqSXgwOZ_(SGCDr)f#eDFCFbdOlogO zM&q)SJSmRaqDd!$ES3>zEJRZzsUqe;y}q4EOW;~}pQRmFov6qXhk#{w7Fx|YP%aIw zr5)jwF-dZLOlQ@$wos~{jXlh>Bpxdd&bU~NSu50>P(~#yY*mX{lsbblEQ>Fad&{j= zIn|FvMT%6T`*cdJ)e)W$AeWfBCY!sT;Nm2QhPPH;`oP)EJglBWk$ z8k1c*Ju)IljwACS341{&zgrrntGUz`-J+vZY7aRZkJjYr6 zsK_ZP=7`;&9IqQSHN~2k3}DaEC!dZn`As^KMJBl?M%xG-;}&3Y1+j9R%dls~8q`iN z+^mVuHExW8)1-V!)D@nLGSWLYNp|BbU0v2|k$Gt!Ue>Fwpy}E}gx_ZM?O9QmIyTU5 z!NDD3jU#sNCjzAt>#_~fqfB<4&Tb3WH}Gq)d?lvBQOOlR-Xmr_!wy4lISYFS@#BOv zo$f}^Jz?xU`!((V;|_b@v;Mz(zyC}4`F{p(2WwyfJRN)!zy5E4`1=`n1^5;4BYgNT zf)9cBf%k&91LyCb;`=es0Ox~m;m!0&;h;9~F{eEfd~UjTmx?gRILH-S^&B)AD& z3HE|t0M7st6Zjyw9o!5S!Bya?;3Du%VgUaE{uJB-ZU(Ob3*g(t06q)u0IvhLfa72X 
zcpUgLF@WCzN5Lz>BDf0tH*tWEf&0Pxz&pSy*b5rqnczI|IpPC%gSUdazz`e-PXym* zUS)0{DdGmqwk#-jO)infwIc+DrHKJE8D9w&leMoC-)vU0PWZ+*%vZ4XyxQlm5R3*u z^kuUr@!*nXa{i?w@>rg(({pKl`dA@Pl)|?ikeFFS@rP=`b5S#m+%^f^4SAeu{>d&RAAxatA9#y|YYFo3@qL z$>wro6a}X$iDcp?8)nHdM?Q&eAI~Pn5`8at_zQp;L!E2lzjLrwGayf%@iW$UL)h^F&m*zgDbk#z5 zA+$zZCEz$;$fd5+nb?($Z8=OSZ7e&)-8<%$88!;ttRs2{%U0UFjId{=crM%I$F%Fn2S2XokD8-g!_B;u(1HH6wkW?n zOXps&!+VzG2K?`U(45IO0*PIDJM=+4E4nndL;X{Xp7 z(!epJxgnZt$XX~*s>}7$HyqcnwRC3Q@A8_U&OF9uEENWVSbCCU*cB474pth(Ncw&WnXu0$R$se4#dFQSR2+}Nll z^u>g#(4()|4jQT)hU*ejhgScfN=8Ah9c}f-w8A<%$+1uVqLp?rm@$26vdUb_zk9ZX znc0b{uimIFTu6Z}Y;RNx+t6Aqzr=&9eV|q2qNu^N+M3%0EV{(<>f_vS)9P|z%rqCO zmW%?`8`Xw#zngOT_q>Sczx+qdUhinoM_-V^Q_60sNXsVMAQbp0_1Ag3;mSSjE7i-d z&RgtBOrCcs^gCBC*xTI0yI$-~GQK{)FsHA$;n@CTN?-CuaTG^84A9e(ZRzvxUbu`w*RGv&Z^weYhAJJM|*C-?pdjz)}9azqxALll=dkc!d8V{vqf8 z{M6UK;_JT$oCG~^6zl@u$JhTP_#n6soCG7#1N*@&7x=YStG zFETe@lKDB8D;E#@Wju5z8nNpANWnZw1Z`Ea+45*|pDK6HA}6I#G*Y2_NpJd4fZj~c z=JjWwZ^gz)`W=6H-oEFYmoAsFMzX;fhK@fss;4}Ssv)PC z7>=i4s%-4^Z1wxYhhd{e+<^NMxeM!^MD zHYc#FS!{21NYxTI%x}3@PJ)%Gon%X?>}tKto;hW+#2M4vbl&VJXTqAoB7tM$uH}y@ zy`40sTya;8{G4slWj!-p!*d7zclnmn`8Q3r>Zh)hD(X4fOPET zf#&6_q9(O#*tTpFQswnedHvM>69|#n6*(-^|I$IUMNRNY zY|}O(WKCA37p>X|mDMXIIf+K;kc?Y5_JUQgQC4vwiFN83Eu-JQ)@r&J>$uW@e@+sG zUH$>fIVfgP`Dv|A7QUI)W|G$2EA7m< zPg6t6>V9e?mPDviqUsUY?&yod0xt|DdHV&25y-glJv7uPnJAl2Xw}<*-RKR zN9Nlp|4pgma(X;zq3)6aGaX*tGr1>8C_JwxD`#+-8B9Al)ouAlHqC0JT-vMO{>qbS zjg9kC%%tX`C$-8K5;VT>OLLIdBm543svFg@pVF!hRkDb4x^zNsoh)0ya(NglMrge< zX|BPs##lg{I_y(>CJL7dQb^8NDn%ys^jL+;+ABSirPtQoP6}ZCBtKU*UvC9ZCt~Fp qn)St-rtEh4&XX0Z1R_C&<`mA=3(aIRQucMqQZP0v^`(DO)cy~*sZFH- literal 0 HcmV?d00001 diff --git a/category_encoders/count.py b/category_encoders/count.py index e18c69d3..70a3d639 100644 --- a/category_encoders/count.py +++ b/category_encoders/count.py @@ -116,6 +116,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, self.min_group_size = min_group_size self.min_group_name = min_group_name self.combine_min_nan_groups = 
combine_min_nan_groups + self.feature_names = None self._check_set_create_attrs() @@ -159,17 +160,26 @@ def fit(self, X, y=None, **kwargs): self._fit_count_encode(X, y) + X_temp = self.transform(X, override_return_df=True) + self.feature_names = list(X_temp.columns) + if self.drop_invariant: self.drop_cols = [] - X_temp = self.transform(X) generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] + try: + [self.feature_names.remove(x) for x in self.drop_cols] + except KeyError as e: + if self.verbose > 0: + print("Could not remove column from feature names." + "Not found in generated cols.\n{}".format(e)) + return self - def transform(self, X, y=None): + def transform(self, X, y=None, override_return_df=False): """Perform the transformation to new categorical data. Parameters @@ -207,7 +217,7 @@ def transform(self, X, y=None): for col in self.drop_cols: X.drop(col, 1, inplace=True) - if self.return_df: + if self.return_df or override_return_df: return X else: return X.values @@ -262,7 +272,7 @@ def _transform_count_encode(self, X_in, y): .fillna(X[col]) ) - X[col] = X[col].map(self.mapping[col]) + X[col] = X[col].astype(object).map(self.mapping[col]) if isinstance(self._handle_unknown[col], (int, np.integer)): X[col] = X[col].fillna(self._handle_unknown[col]) @@ -348,14 +358,14 @@ def _check_set_create_attrs(self): "'combine_min_nan_groups' == 'force' for all columns." ) - if ( - self.combine_min_nan_groups is not None - and self.min_group_size is None - ): - raise ValueError( - "`combine_min_nan_groups` only works when `min_group_size` " - "is set for all columns." - ) + # if ( + # self.combine_min_nan_groups is not None + # and self.min_group_size is None + # ): + # raise ValueError( + # "`combine_min_nan_groups` only works when `min_group_size` " + # "is set for all columns." 
+ # ) if ( self.min_group_name is not None @@ -423,3 +433,19 @@ def _check_set_create_dict_attrs(self): "is set for column %s." % (col,) ) + + def get_feature_names(self): + """ + Returns the names of all transformed / added columns. + + Returns + ------- + feature_names: list + A list with all feature names transformed or added. + Note: potentially dropped features are not included! + + """ + if not isinstance(self.feature_names, list): + raise ValueError("Estimator has to be fitted to return feature names.") + else: + return self.feature_names From e1fe86a8b4ab7b62f323548a17e9932da476a2e7 Mon Sep 17 00:00:00 2001 From: makrobios Date: Thu, 9 Jul 2020 18:35:51 +0200 Subject: [PATCH 03/14] [Fix] CountEncoder test_metamorphic test_encoders.py::TestEncoders::test_metamorphic --- .gitignore | 6 +++++- category_encoders/.count.py.swp | Bin 28672 -> 0 bytes category_encoders/count.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) delete mode 100644 category_encoders/.count.py.swp diff --git a/.gitignore b/.gitignore index dfcf8f8f..2e9e38fc 100644 --- a/.gitignore +++ b/.gitignore @@ -60,4 +60,8 @@ docs/_build/ # PyBuilder target/ -.pytest_cache/ \ No newline at end of file +.pytest_cache/ + +*~ +*.swp +*.swo \ No newline at end of file diff --git a/category_encoders/.count.py.swp b/category_encoders/.count.py.swp deleted file mode 100644 index 16dbc48b1e4ac392a74bf8e133fc0fec76ebaa8f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28672 zcmeI4dyFJkb%%>rOk!*UAtD3^U(d#LkLm5*#cM(a&%%0l4J+{4tk-y881>Y2SI^XL zPgSR?dUwahfP{%0;~1nwfy5Z%h$spW6h$T>Kn&!CAT|L^5Rr&@gg+950$2tjghRe_ zANA<2p51joilQ#*H{D%zALpKX?m73IdurBpzv}R{>V@q+9@kSn@6M~w>nuO~Yu?3o zdtP@#Zl3wd_U_$%=2D*b`tHWAVG>rb>u14woNRT1s2lf!r1nlcin8`_>uegI zm6>s1#)0V^7-jvPFTB9J;`w{@THP^ssrvb+-#ESe?8%G+GY-r+Fyp|C12YcHI56YD zi~}t_ z9P(#S?~USbbN=uA|8HI7c@Kbp1RnzL0k?wJfj-y?THtx$0`L@YKKSB=`0ii@>;>Pw z!1MkJycygEUIZ=%&jwEh-+PMZJp#T0-VN>qzY2Z{JbJ$8eGuFOUI{J%PXO=zIp_rY 
zz|+7de%A96a4C57$)5Lz;5Kk0I1haIJkL80o(}$rK*YV^dhjSgjR(LMSON|3c>)mYSOzWdJn%U15Ip@8AiRBp@VDkt_B!Zi zzUsz<(PosYW)^4uK<&B0-I-tTT*0G$v>qIedco-hb>;3Cs36LcFi2Z!x5{GfT$z`= zsyp!0R5gZv)brDJH%@|fKk+w%wsdxld~58>OYhsaPYshW%9_z=Gf2X2r$?zY$l77l zkDK%JjM-jNbgYnCgph7KOZ+J9$H``MX}(gNhZH#=`5cE#zDJY6ri;ss125ZuL!+e{ zNB19WtjxFjVb)w~sjX`DB`Obl1>McHQPkaN3xzGmqeXMCANbiQ2|5wOORKGH(TXXQ z`N?{aRVdUCC@_fqUS};%vp8ywjk%x1n`$C&MTMKgILRu5tF)}Er6&e~pG2Bun3}p; zrS2+~Y@=rGl%F#E{P|9E<-ZS|_BV%v0O?Yt>qwjh3+kqDFi=sPDQ2Y`sIafXOm!2O z6r{?J)LIz%$(D)}6-FohB=jR5YcXtBTIMI@O`lP%C-G>=Ad)}@{XR)W;mKgIRiPS!vW*}qH^^@_2>MxklAI&chUtW1 znV)TSFzEZEL8gu+qoA!0_)*@dC5ypDNLbV{G%TpKI3CF8dSN$HalbmcFoNT8n(jR5 z4?<)%jH7A+GgZf+tY>=`=|$9-AE=EPvLC2Te@mSRf}!fd;dS`X5`L+`%zjJdL@F+S zFx>Fhf-LO9sx31@U#)gl>0%IWhFOqQX~^PO_f(AjNE|Wx!;#81n8Gf)fNw&TN!(I; zh0!uo0hPyNrmK)qq2Y!g(>3&9;=ZYE=C!BbV7NKjbhD^MN1C@BNNE#h2c9$}mvCHj zTd(W|CDgT4nk7@&uF1CKNmI&KeU@9snEL7(M2`9_1~&khjrOZD#Z3(3XYNL34mA7W z#nv!TYjCB<3eaO)jnI5d@y)?1#aT;h17Toa%aNx?VgIZaqe*pf&EP?IG=RB!k%=N+ zYNs3iFj(HbqAo9uUuw)(>9k_mlM%#>b1s+gIVaPQp+`>4Ld${b9cG?C~{ zEERH)`Qad~jkJDy9R}U7 zAIc1&P#JF)VohmWYdTSp;_#WmK{|l=17|rF)y+WVlrj%6uQPQ zgi2bi4%Av2!(ctooEB9xL^c~GwimKrwOe|`o^jWu@#PqGE7K@DxiTM_p zuvSFOldI-QV{*yPa6vcf>EHGb!WXP#%)VpSc)c(CVyt6Y6tl}Xu!_%UkD_gMLZTGeePHK}an!@IJJFU( zg5kht^Chau7)@&o&F-P-Xl`!qfXKe=rQ@CqP}E^fn`3HGGpJCu(#N8)kGW=B)CRBQ zM_8to4XYRyTd#hfV;<(b%A2UJoJF_1Cky6+yeM0%meo`pt7J;evkOoiL^8}eon{&g z`Yk)x!6Ft*yAEc6Rnax0Y(Y$o-#?@HrLn#W|&tf#$F{cWHva54 zwhEVk4$L?(ec0xWPtl$$g55Y>;{UPhi_*-;)ze$(Dmg9fV3iyK z*$w723RNcfH6)B2WucBD%4T~A3ASSsIM6*s-N6XQ@X|^H`;B9$-6wJXD)bh?2pSX* zs#kF=E36xH2FK&yM|BXTgnv<4m zmBC7lVg}@wyDn35*3qheI7i5g^8n-MX+K?EmbMK6t^xNsR|*@7kGE<<@2BGAL~3I1 zr2;yxIN8pGsClk+`7DWpyDiST7G@puHW659dzmqB+?YEWTSw`UZeoV=KD=}%O?>jz?ZS`21_X-3qm z)E1RW6jy3iCPPPgI6xvauUpdru*t=Mp766QvGI^br=gxlR5wfXiAc+Ql>?~yY1Ad< zu_aEI*Zx1p;@(H(d>a41Zw=(v@by0i?gwuH>)_`S5B?n71KtUagAg1B zbKraU{*Qpa1s?!^0B!?s0;&h5j}r%YH~39( zJD3NO5AYFu{|xK}&jG&-{vF@{L*Q=kTi{OcVz3W95qyrA$7g}$3A`FCfhKqaW;_Jm 
z1?~ZF07t;nz{BJWd;t6g7=jq=1TFAy-pnWIk?c?7az-Nl(+->T`e6XWUzM4HLthiND5 zohAUb>0|5@b=7qB@44>f^Q;KKUWi^I*jBD8-^VRRVPh7zd3L!f4#_G_YKDk1G41AY zP#t_8-x$@IHhWN2V-X)4rMsq4n)5BE(JW9gyk|!!F>T1qhi|n>%T1uuDl(tMl-r#B zZl=xBz;s;D9Jia3br*5wu!*+iKv7QeOXo4|@*b%ya79h4rIyN+8D*%*@6bH71QnEv z9LQC#z_zvejcZN1fs&jM49-ZnM?BtbscMeD8lLW~^k;H&R4GzonX4p|w4~>&9j4J} zFlf%V{b-BK5m&-}8mX~n!UntxGhvbOtCS#Y z6Eam(sim4se2b_u*+2T!WxhO*b#(Zt9EdmLHTm~0^KItV&}1iB4|RTWs->waiE_tX zqyy>EnXyhUXL*r#T3XPBL|PJD+vU%ksgwMMV+c`hnrmBbp9r=pD@xkV*LAK~JEGHM z%j`D5g%Uo)4q&N0h`Y-RcHPMKU_~Uru}7aQFEUwdj0z@6($o@XC}e&Ox0*dNNA#&n zQGi8!eUQ-gruLL_=ZFE951}fHy_1(Rr|9;~!mW0?ljN)7i z>CsPF+gW8HPukTrQ(YXzG&$15`;L5b#O1l0?lU(Yi$>Apzjc4bA-9yDAV&nn(4|>v z^COy7f?Q9Ngt;eHLCZ03RUg>{db0qh*Kep*EqSXgwOZ_(SGCDr)f#eDFCFbdOlogO zM&q)SJSmRaqDd!$ES3>zEJRZzsUqe;y}q4EOW;~}pQRmFov6qXhk#{w7Fx|YP%aIw zr5)jwF-dZLOlQ@$wos~{jXlh>Bpxdd&bU~NSu50>P(~#yY*mX{lsbblEQ>Fad&{j= zIn|FvMT%6T`*cdJ)e)W$AeWfBCY!sT;Nm2QhPPH;`oP)EJglBWk$ z8k1c*Ju)IljwACS341{&zgrrntGUz`-J+vZY7aRZkJjYr6 zsK_ZP=7`;&9IqQSHN~2k3}DaEC!dZn`As^KMJBl?M%xG-;}&3Y1+j9R%dls~8q`iN z+^mVuHExW8)1-V!)D@nLGSWLYNp|BbU0v2|k$Gt!Ue>Fwpy}E}gx_ZM?O9QmIyTU5 z!NDD3jU#sNCjzAt>#_~fqfB<4&Tb3WH}Gq)d?lvBQOOlR-Xmr_!wy4lISYFS@#BOv zo$f}^Jz?xU`!((V;|_b@v;Mz(zyC}4`F{p(2WwyfJRN)!zy5E4`1=`n1^5;4BYgNT zf)9cBf%k&91LyCb;`=es0Ox~m;m!0&;h;9~F{eEfd~UjTmx?gRILH-S^&B)AD& z3HE|t0M7st6Zjyw9o!5S!Bya?;3Du%VgUaE{uJB-ZU(Ob3*g(t06q)u0IvhLfa72X zcpUgLF@WCzN5Lz>BDf0tH*tWEf&0Pxz&pSy*b5rqnczI|IpPC%gSUdazz`e-PXym* zUS)0{DdGmqwk#-jO)infwIc+DrHKJE8D9w&leMoC-)vU0PWZ+*%vZ4XyxQlm5R3*u z^kuUr@!*nXa{i?w@>rg(({pKl`dA@Pl)|?ikeFFS@rP=`b5S#m+%^f^4SAeu{>d&RAAxatA9#y|YYFo3@qL z$>wro6a}X$iDcp?8)nHdM?Q&eAI~Pn5`8at_zQp;L!E2lzjLrwGayf%@iW$UL)h^F&m*zgDbk#z5 zA+$zZCEz$;$fd5+nb?($Z8=OSZ7e&)-8<%$88!;ttRs2{%U0UFjId{=crM%I$F%Fn2S2XokD8-g!_B;u(1HH6wkW?n zOXps&!+VzG2K?`U(45IO0*PIDJM=+4E4nndL;X{Xp7 z(!epJxgnZt$XX~*s>}7$HyqcnwRC3Q@A8_U&OF9uEENWVSbCCU*cB474pth(Ncw&WnXu0$R$se4#dFQSR2+}Nll z^u>g#(4()|4jQT)hU*ejhgScfN=8Ah9c}f-w8A<%$+1uVqLp?rm@$26vdUb_zk9ZX 
znc0b{uimIFTu6Z}Y;RNx+t6Aqzr=&9eV|q2qNu^N+M3%0EV{(<>f_vS)9P|z%rqCO zmW%?`8`Xw#zngOT_q>Sczx+qdUhinoM_-V^Q_60sNXsVMAQbp0_1Ag3;mSSjE7i-d z&RgtBOrCcs^gCBC*xTI0yI$-~GQK{)FsHA$;n@CTN?-CuaTG^84A9e(ZRzvxUbu`w*RGv&Z^weYhAJJM|*C-?pdjz)}9azqxALll=dkc!d8V{vqf8 z{M6UK;_JT$oCG~^6zl@u$JhTP_#n6soCG7#1N*@&7x=YStG zFETe@lKDB8D;E#@Wju5z8nNpANWnZw1Z`Ea+45*|pDK6HA}6I#G*Y2_NpJd4fZj~c z=JjWwZ^gz)`W=6H-oEFYmoAsFMzX;fhK@fss;4}Ssv)PC z7>=i4s%-4^Z1wxYhhd{e+<^NMxeM!^MD zHYc#FS!{21NYxTI%x}3@PJ)%Gon%X?>}tKto;hW+#2M4vbl&VJXTqAoB7tM$uH}y@ zy`40sTya;8{G4slWj!-p!*d7zclnmn`8Q3r>Zh)hD(X4fOPET zf#&6_q9(O#*tTpFQswnedHvM>69|#n6*(-^|I$IUMNRNY zY|}O(WKCA37p>X|mDMXIIf+K;kc?Y5_JUQgQC4vwiFN83Eu-JQ)@r&J>$uW@e@+sG zUH$>fIVfgP`Dv|A7QUI)W|G$2EA7m< zPg6t6>V9e?mPDviqUsUY?&yod0xt|DdHV&25y-glJv7uPnJAl2Xw}<*-RKR zN9Nlp|4pgma(X;zq3)6aGaX*tGr1>8C_JwxD`#+-8B9Al)ouAlHqC0JT-vMO{>qbS zjg9kC%%tX`C$-8K5;VT>OLLIdBm543svFg@pVF!hRkDb4x^zNsoh)0ya(NglMrge< zX|BPs##lg{I_y(>CJL7dQb^8NDn%ys^jL+;+ABSirPtQoP6}ZCBtKU*UvC9ZCt~Fp qn)St-rtEh4&XX0Z1R_C&<`mA=3(aIRQucMqQZP0v^`(DO)cy~*sZFH- diff --git a/category_encoders/count.py b/category_encoders/count.py index 70a3d639..83531a10 100644 --- a/category_encoders/count.py +++ b/category_encoders/count.py @@ -262,7 +262,7 @@ def _fit_count_encode(self, X_in, y): def _transform_count_encode(self, X_in, y): """Perform the transform count encoding.""" X = X_in.copy(deep=True) - X.loc[:, self.cols] = X.fillna(value=np.nan) + X.fillna(value=np.nan, inplace=True) for col in self.cols: if self._min_group_size is not None: From 857fd581835ec1c03b7293ecdeea82e7dc53e590 Mon Sep 17 00:00:00 2001 From: makrobios Date: Thu, 9 Jul 2020 18:59:26 +0200 Subject: [PATCH 04/14] [Fix] CountEncoder test handle_missing test_encoders.py::TestEncoders::test_handle_missing_error test_encoders.py::TestEncoders::test_handle_missing_error_2cols --- category_encoders/count.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/category_encoders/count.py b/category_encoders/count.py index 
83531a10..20601861 100644 --- a/category_encoders/count.py +++ b/category_encoders/count.py @@ -15,7 +15,7 @@ class CountEncoder(BaseEstimator, TransformerMixin): def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, handle_unknown=None, - handle_missing='count', + handle_missing='value', min_group_size=None, combine_min_nan_groups=None, min_group_name=None, normalize=False): """Count encoding for categorical features. @@ -192,6 +192,9 @@ def transform(self, X, y=None, override_return_df=False): p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().any(): + raise ValueError('Columns to be encoded can not contain null') if self._dim is None: raise ValueError( @@ -239,10 +242,10 @@ def _fit_count_encode(self, X_in, y): % (col,) ) - elif self._handle_missing[col] not in ['count', 'return_nan', 'error', None]: + elif self._handle_missing[col] not in ['value', 'return_nan', 'error', None]: raise ValueError( '%s key in `handle_missing` should be one of: ' - ' `count`, `return_nan` and `error` not `%s`.' + ' `value`, `return_nan` and `error` not `%s`.' 
% (col, str(self._handle_missing[col])) ) From e7faa40223bff0c2a1590ff03692e2b92329a2d6 Mon Sep 17 00:00:00 2001 From: makrobios Date: Fri, 10 Jul 2020 14:44:47 +0200 Subject: [PATCH 05/14] Updated Docstring for CountEncoder --- category_encoders/count.py | 42 +++++++++++++++++++++----------------- tests/test_encoders.py | 9 ++++++-- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/category_encoders/count.py b/category_encoders/count.py index 20601861..bbef2377 100644 --- a/category_encoders/count.py +++ b/category_encoders/count.py @@ -14,7 +14,7 @@ class CountEncoder(BaseEstimator, TransformerMixin): def __init__(self, verbose=0, cols=None, drop_invariant=False, - return_df=True, handle_unknown=None, + return_df=True, handle_unknown='value', handle_missing='value', min_group_size=None, combine_min_nan_groups=None, min_group_name=None, normalize=False): @@ -38,31 +38,31 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, (otherwise it will be a numpy array). handle_missing: str how to handle missing values at fit time. Options are 'error', 'return_nan', - and 'count'. Default 'count', which treat NaNs as a countable category at + and 'value'. Default 'value', which treat NaNs as a countable category at fit time. - handle_unknown: str, int or dict of. + handle_unknown: str, int or dict of {column : option, ...}. how to handle unknown labels at transform time. Options are 'error' 'return_nan' and an int. Defaults to None which uses NaN behaviour specified at fit time. Passing an int will fill with this int value. - normalize: bool or dict of. + normalize: bool or dict of {column : bool, ...}. whether to normalize the counts to the range (0, 1). See Pandas `value_counts` for more details. - min_group_size: int, float or dict of. + min_group_size: int, float or dict of {column : option, ...}. the minimal count threshold of a group needed to ensure it is not combined into a "leftovers" group. 
If float in the range (0, 1), `min_group_size` is calculated as int(X.shape[0] * min_group_size). Note: This value may change type based on the `normalize` variable. If True this will become a float. If False, it will be an int. - min_group_name: None, str or dict of. + min_group_name: None, str or dict of {column : option, ...}. Set the name of the combined minimum groups when the defaults become too long. Default None. In this case the category names will be joined alphabetically with a `_` delimiter. Note: The default name can be long ae may keep changing, for example, in cross-validation. - combine_min_nan_groups: bool or dict of. + combine_min_nan_groups: bool or dict of {column : bool, ...}. whether to combine the leftovers group with NaN group. Default True. Can also be forced to combine with 'force' meaning small groups are effectively - counted as NaNs. Force can only used when 'handle_missing' is 'count' or 'error'. + counted as NaNs. Force can only used when 'handle_missing' is 'value' or 'error'. Note: Will not force if it creates an binary or invariant column. @@ -246,7 +246,6 @@ def _fit_count_encode(self, X_in, y): raise ValueError( '%s key in `handle_missing` should be one of: ' ' `value`, `return_nan` and `error` not `%s`.' - % (col, str(self._handle_missing[col])) ) self.mapping[col] = X[col].value_counts( @@ -256,8 +255,13 @@ def _fit_count_encode(self, X_in, y): self.mapping[col].index = self.mapping[col].index.astype(object) + + if self._handle_missing[col] == 'return_nan': self.mapping[col][np.NaN] = np.NaN + + elif self._handle_missing[col] == 'value': + self.mapping[col].loc[-2] = 0 if any([val is not None for val in self._min_group_size.values()]): self.combine_min_categories(X) @@ -361,14 +365,14 @@ def _check_set_create_attrs(self): "'combine_min_nan_groups' == 'force' for all columns." 
) - # if ( - # self.combine_min_nan_groups is not None - # and self.min_group_size is None - # ): - # raise ValueError( - # "`combine_min_nan_groups` only works when `min_group_size` " - # "is set for all columns." - # ) + if ( + self.combine_min_nan_groups is not None + and self.min_group_size is None + ): + raise ValueError( + "`combine_min_nan_groups` only works when `min_group_size` " + "is set for all columns." + ) if ( self.min_group_name is not None @@ -389,8 +393,8 @@ def _check_set_create_dict_attrs(self): 'min_group_name': None, 'combine_min_nan_groups': True, 'min_group_size': None, - 'handle_unknown': 'count', - 'handle_missing': None, + 'handle_unknown': 'value', + 'handle_missing': 'value', } for attr_name, attr_default in dict_attrs.items(): diff --git a/tests/test_encoders.py b/tests/test_encoders.py index 3c73780a..5507c210 100644 --- a/tests/test_encoders.py +++ b/tests/test_encoders.py @@ -185,6 +185,7 @@ def test_handle_unknown_return_nan(self): y = pd.Series([1, 0]) for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded + breakpoint() with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)(handle_unknown='return_nan') @@ -216,6 +217,7 @@ def test_handle_missing_return_nan_test(self): y = pd.Series([1, 0, 1]) for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded + breakpoint() with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)(handle_missing='return_nan') result = enc.fit(X, y).transform(X_t).iloc[2, :] @@ -229,13 +231,16 @@ def test_handle_unknown_value(self): train = pd.DataFrame({'city': ['chicago', 'los angeles']}) test = pd.DataFrame({'city': ['chicago', 'denver']}) y = pd.Series([1, 0]) - for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded with 
self.subTest(encoder_name=encoder_name): - + breakpoint() enc = getattr(encoders, encoder_name)(handle_unknown='value') enc.fit(train, y) + print(enc.__class__.__name__) + print(enc.mapping) result = enc.transform(test) + # print(enc.mapping) + print("result: ", result) self.assertFalse(result.iloc[1, :].isnull().all()) def test_sklearn_compliance(self): From 2870455b4a6f3ef6929d79cfd58cf7c566d2c99f Mon Sep 17 00:00:00 2001 From: makrobios Date: Fri, 10 Jul 2020 14:44:47 +0200 Subject: [PATCH 06/14] Updated Docstring for CountEncoder --- category_encoders/count.py | 46 +++++++++++++++++++++----------------- tests/test_encoders.py | 9 ++++++-- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/category_encoders/count.py b/category_encoders/count.py index 20601861..3cbbca78 100644 --- a/category_encoders/count.py +++ b/category_encoders/count.py @@ -14,7 +14,7 @@ class CountEncoder(BaseEstimator, TransformerMixin): def __init__(self, verbose=0, cols=None, drop_invariant=False, - return_df=True, handle_unknown=None, + return_df=True, handle_unknown='value', handle_missing='value', min_group_size=None, combine_min_nan_groups=None, min_group_name=None, normalize=False): @@ -38,31 +38,31 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, (otherwise it will be a numpy array). handle_missing: str how to handle missing values at fit time. Options are 'error', 'return_nan', - and 'count'. Default 'count', which treat NaNs as a countable category at + and 'value'. Default 'value', which treat NaNs as a countable category at fit time. - handle_unknown: str, int or dict of. + handle_unknown: str, int or dict of {column : option, ...}. how to handle unknown labels at transform time. Options are 'error' - 'return_nan' and an int. Defaults to None which uses NaN behaviour + 'return_nan', 'value' and int. Defaults to None which uses NaN behaviour specified at fit time. Passing an int will fill with this int value. - normalize: bool or dict of. 
+ normalize: bool or dict of {column : bool, ...}. whether to normalize the counts to the range (0, 1). See Pandas `value_counts` for more details. - min_group_size: int, float or dict of. + min_group_size: int, float or dict of {column : option, ...}. the minimal count threshold of a group needed to ensure it is not combined into a "leftovers" group. If float in the range (0, 1), `min_group_size` is calculated as int(X.shape[0] * min_group_size). Note: This value may change type based on the `normalize` variable. If True this will become a float. If False, it will be an int. - min_group_name: None, str or dict of. + min_group_name: None, str or dict of {column : option, ...}. Set the name of the combined minimum groups when the defaults become too long. Default None. In this case the category names will be joined alphabetically with a `_` delimiter. - Note: The default name can be long ae may keep changing, for example, + Note: The default name can be long and may keep changing, for example, in cross-validation. - combine_min_nan_groups: bool or dict of. + combine_min_nan_groups: bool or dict of {column : bool, ...}. whether to combine the leftovers group with NaN group. Default True. Can also be forced to combine with 'force' meaning small groups are effectively - counted as NaNs. Force can only used when 'handle_missing' is 'count' or 'error'. + counted as NaNs. Force can only used when 'handle_missing' is 'value' or 'error'. Note: Will not force if it creates an binary or invariant column. @@ -246,7 +246,6 @@ def _fit_count_encode(self, X_in, y): raise ValueError( '%s key in `handle_missing` should be one of: ' ' `value`, `return_nan` and `error` not `%s`.' 
- % (col, str(self._handle_missing[col])) ) self.mapping[col] = X[col].value_counts( @@ -256,8 +255,13 @@ def _fit_count_encode(self, X_in, y): self.mapping[col].index = self.mapping[col].index.astype(object) + + if self._handle_missing[col] == 'return_nan': self.mapping[col][np.NaN] = np.NaN + + elif self._handle_missing[col] == 'value': + self.mapping[col].loc[-2] = 0 if any([val is not None for val in self._min_group_size.values()]): self.combine_min_categories(X) @@ -361,14 +365,14 @@ def _check_set_create_attrs(self): "'combine_min_nan_groups' == 'force' for all columns." ) - # if ( - # self.combine_min_nan_groups is not None - # and self.min_group_size is None - # ): - # raise ValueError( - # "`combine_min_nan_groups` only works when `min_group_size` " - # "is set for all columns." - # ) + if ( + self.combine_min_nan_groups is not None + and self.min_group_size is None + ): + raise ValueError( + "`combine_min_nan_groups` only works when `min_group_size` " + "is set for all columns." 
+ ) if ( self.min_group_name is not None @@ -389,8 +393,8 @@ def _check_set_create_dict_attrs(self): 'min_group_name': None, 'combine_min_nan_groups': True, 'min_group_size': None, - 'handle_unknown': 'count', - 'handle_missing': None, + 'handle_unknown': 'value', + 'handle_missing': 'value', } for attr_name, attr_default in dict_attrs.items(): diff --git a/tests/test_encoders.py b/tests/test_encoders.py index 3c73780a..5507c210 100644 --- a/tests/test_encoders.py +++ b/tests/test_encoders.py @@ -185,6 +185,7 @@ def test_handle_unknown_return_nan(self): y = pd.Series([1, 0]) for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded + breakpoint() with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)(handle_unknown='return_nan') @@ -216,6 +217,7 @@ def test_handle_missing_return_nan_test(self): y = pd.Series([1, 0, 1]) for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded + breakpoint() with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)(handle_missing='return_nan') result = enc.fit(X, y).transform(X_t).iloc[2, :] @@ -229,13 +231,16 @@ def test_handle_unknown_value(self): train = pd.DataFrame({'city': ['chicago', 'los angeles']}) test = pd.DataFrame({'city': ['chicago', 'denver']}) y = pd.Series([1, 0]) - for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded with self.subTest(encoder_name=encoder_name): - + breakpoint() enc = getattr(encoders, encoder_name)(handle_unknown='value') enc.fit(train, y) + print(enc.__class__.__name__) + print(enc.mapping) result = enc.transform(test) + # print(enc.mapping) + print("result: ", result) self.assertFalse(result.iloc[1, :].isnull().all()) def test_sklearn_compliance(self): From a2f81ada784d4dc5393d3a850ddef81b7b3020d6 Mon Sep 17 00:00:00 2001 From: makrobios 
Date: Fri, 10 Jul 2020 16:27:29 +0200 Subject: [PATCH 07/14] cleaning code --- category_encoders/count.py | 2 +- tests/test_encoders.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/category_encoders/count.py b/category_encoders/count.py index 3cbbca78..ba504475 100644 --- a/category_encoders/count.py +++ b/category_encoders/count.py @@ -453,6 +453,6 @@ def get_feature_names(self): """ if not isinstance(self.feature_names, list): - raise ValueError("Estimator has to be fitted to return feature names.") + raise ValueError("CountEncoder has to be fitted to return feature names.") else: return self.feature_names diff --git a/tests/test_encoders.py b/tests/test_encoders.py index 5507c210..b4bde6cd 100644 --- a/tests/test_encoders.py +++ b/tests/test_encoders.py @@ -185,7 +185,6 @@ def test_handle_unknown_return_nan(self): y = pd.Series([1, 0]) for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded - breakpoint() with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)(handle_unknown='return_nan') @@ -217,7 +216,6 @@ def test_handle_missing_return_nan_test(self): y = pd.Series([1, 0, 1]) for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded - breakpoint() with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)(handle_missing='return_nan') result = enc.fit(X, y).transform(X_t).iloc[2, :] @@ -233,7 +231,6 @@ def test_handle_unknown_value(self): y = pd.Series([1, 0]) for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded with self.subTest(encoder_name=encoder_name): - breakpoint() enc = getattr(encoders, encoder_name)(handle_unknown='value') enc.fit(train, y) print(enc.__class__.__name__) From ffcf88eda1f5a894370a939bd8529fb7144612a3 Mon Sep 17 00:00:00 2001 From: makrobios Date: Fri, 10 
Jul 2020 16:27:29 +0200 Subject: [PATCH 08/14] cleaning code --- category_encoders/count.py | 2 +- tests/test_encoders.py | 9 ++------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/category_encoders/count.py b/category_encoders/count.py index 3cbbca78..ba504475 100644 --- a/category_encoders/count.py +++ b/category_encoders/count.py @@ -453,6 +453,6 @@ def get_feature_names(self): """ if not isinstance(self.feature_names, list): - raise ValueError("Estimator has to be fitted to return feature names.") + raise ValueError("CountEncoder has to be fitted to return feature names.") else: return self.feature_names diff --git a/tests/test_encoders.py b/tests/test_encoders.py index 5507c210..3c73780a 100644 --- a/tests/test_encoders.py +++ b/tests/test_encoders.py @@ -185,7 +185,6 @@ def test_handle_unknown_return_nan(self): y = pd.Series([1, 0]) for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded - breakpoint() with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)(handle_unknown='return_nan') @@ -217,7 +216,6 @@ def test_handle_missing_return_nan_test(self): y = pd.Series([1, 0, 1]) for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded - breakpoint() with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)(handle_missing='return_nan') result = enc.fit(X, y).transform(X_t).iloc[2, :] @@ -231,16 +229,13 @@ def test_handle_unknown_value(self): train = pd.DataFrame({'city': ['chicago', 'los angeles']}) test = pd.DataFrame({'city': ['chicago', 'denver']}) y = pd.Series([1, 0]) + for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded with self.subTest(encoder_name=encoder_name): - breakpoint() + enc = getattr(encoders, encoder_name)(handle_unknown='value') enc.fit(train, y) - 
print(enc.__class__.__name__) - print(enc.mapping) result = enc.transform(test) - # print(enc.mapping) - print("result: ", result) self.assertFalse(result.iloc[1, :].isnull().all()) def test_sklearn_compliance(self): From ff57575d694e7c893a20c9000bb25b467d24225c Mon Sep 17 00:00:00 2001 From: makrobios Date: Mon, 13 Jul 2020 01:39:13 +0200 Subject: [PATCH 09/14] set default for min_group_size min_group_size is set to 0.01, setting of some value required for combine_min_nan. --- category_encoders/count.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/category_encoders/count.py b/category_encoders/count.py index ba504475..9d5164a2 100644 --- a/category_encoders/count.py +++ b/category_encoders/count.py @@ -49,7 +49,8 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, for more details. min_group_size: int, float or dict of {column : option, ...}. the minimal count threshold of a group needed to ensure it is not - combined into a "leftovers" group. If float in the range (0, 1), + combined into a "leftovers" group. Default value is 0.01. + If float in the range (0, 1), `min_group_size` is calculated as int(X.shape[0] * min_group_size). Note: This value may change type based on the `normalize` variable. If True this will become a float. If False, it will be an int. @@ -385,6 +386,7 @@ def _check_set_create_attrs(self): if self.combine_min_nan_groups is None: self.combine_min_nan_groups = True + self.min_group_size = 0.01 def _check_set_create_dict_attrs(self): """Check attributes that can be dicts and format for all `self.cols`.""" From 78df7b1cf4f3b85635296d9df5ccfd547428d501 Mon Sep 17 00:00:00 2001 From: makrobios Date: Mon, 13 Jul 2020 22:28:50 +0200 Subject: [PATCH 10/14] added handle_unknown handling to _transform_count_encode. 
The following tests now fail: FAILED test_encoders.py::TestEncoders::test_handle_missing_return_nan_test - AssertionError: False is not true FAILED test_encoders.py::TestEncoders::test_handle_missing_return_nan_train - AssertionError: False is not true --- category_encoders/count.py | 8 +++++++- tests/test_encoders.py | 4 ---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/category_encoders/count.py b/category_encoders/count.py index 9d5164a2..0a98d2d8 100644 --- a/category_encoders/count.py +++ b/category_encoders/count.py @@ -279,11 +279,17 @@ def _transform_count_encode(self, X_in, y): X[col].map(self._min_group_categories[col]) .fillna(X[col]) ) - + X[col] = X[col].astype(object).map(self.mapping[col]) if isinstance(self._handle_unknown[col], (int, np.integer)): X[col] = X[col].fillna(self._handle_unknown[col]) + + # elif (self._handle_unknown[col] == 'value' and + # X[col].isna().any() + # ): + # X[col].replace(np.nan, 0, inplace=True) + elif ( self._handle_unknown[col] == 'error' and X[col].isnull().any() diff --git a/tests/test_encoders.py b/tests/test_encoders.py index 484bc0da..0ec857c5 100644 --- a/tests/test_encoders.py +++ b/tests/test_encoders.py @@ -232,10 +232,6 @@ def test_handle_unknown_value(self): for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded with self.subTest(encoder_name=encoder_name): -<<<<<<< HEAD - -======= ->>>>>>> a2f81ada784d4dc5393d3a850ddef81b7b3020d6 enc = getattr(encoders, encoder_name)(handle_unknown='value') enc.fit(train, y) result = enc.transform(test) From 9ec7a8cd68a3a8fcba823610b653f5ff0791b097 Mon Sep 17 00:00:00 2001 From: makrobios Date: Tue, 14 Jul 2020 17:23:15 +0200 Subject: [PATCH 11/14] Add logic to _transform_count_encode needed for _handle_unknown == 'value' not to interfere with _handle_unknown = 'return_nan' --- category_encoders/count.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git
a/category_encoders/count.py b/category_encoders/count.py index 0a98d2d8..5d44e9b1 100644 --- a/category_encoders/count.py +++ b/category_encoders/count.py @@ -281,14 +281,14 @@ def _transform_count_encode(self, X_in, y): ) X[col] = X[col].astype(object).map(self.mapping[col]) - if isinstance(self._handle_unknown[col], (int, np.integer)): X[col] = X[col].fillna(self._handle_unknown[col]) - # elif (self._handle_unknown[col] == 'value' and - # X[col].isna().any() - # ): - # X[col].replace(np.nan, 0, inplace=True) + elif (self._handle_unknown[col] == 'value' + and X[col].isna().any() + and self._handle_missing[col] != 'return_nan' + ): + X[col].replace(np.nan, 0, inplace=True) elif ( self._handle_unknown[col] == 'error' From 40403fefd1b7be93c94595537b96ce8fe81489bc Mon Sep 17 00:00:00 2001 From: makrobios Date: Wed, 15 Jul 2020 21:41:27 +0200 Subject: [PATCH 12/14] updated .gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 2e9e38fc..63a1ec76 100644 --- a/.gitignore +++ b/.gitignore @@ -61,6 +61,10 @@ docs/_build/ target/ .pytest_cache/ +.tmp/ +checkcommits.sh +runtest.py + *~ *.swp From 1c94c2241fdd14ef693ab9701c765093ad198b9d Mon Sep 17 00:00:00 2001 From: makrobios Date: Wed, 15 Jul 2020 22:37:13 +0200 Subject: [PATCH 13/14] Troubleshooting combine_min_nan_groups FAILED test_encoders.py::TestEncoders::test_column_transformer FAILED test_encoders.py::TestEncoders::test_sklearn_compliance --- category_encoders/count.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/category_encoders/count.py b/category_encoders/count.py index 5d44e9b1..7a0cd7b8 100644 --- a/category_encoders/count.py +++ b/category_encoders/count.py @@ -11,7 +11,7 @@ __author__ = 'joshua t. 
dunn' - +# COUNT_ENCODER BRANCH class CountEncoder(BaseEstimator, TransformerMixin): def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', @@ -247,6 +247,7 @@ def _fit_count_encode(self, X_in, y): raise ValueError( '%s key in `handle_missing` should be one of: ' ' `value`, `return_nan` and `error` not `%s`.' + % (col, str(self._handle_missing[col])) ) self.mapping[col] = X[col].value_counts( @@ -261,8 +262,8 @@ def _fit_count_encode(self, X_in, y): if self._handle_missing[col] == 'return_nan': self.mapping[col][np.NaN] = np.NaN - elif self._handle_missing[col] == 'value': - self.mapping[col].loc[-2] = 0 + # elif self._handle_missing[col] == 'value': + #test_count.py failing self.mapping[col].loc[-2] = 0 if any([val is not None for val in self._min_group_size.values()]): self.combine_min_categories(X) @@ -289,7 +290,7 @@ def _transform_count_encode(self, X_in, y): and self._handle_missing[col] != 'return_nan' ): X[col].replace(np.nan, 0, inplace=True) - + elif ( self._handle_unknown[col] == 'error' and X[col].isnull().any() @@ -392,7 +393,7 @@ def _check_set_create_attrs(self): if self.combine_min_nan_groups is None: self.combine_min_nan_groups = True - self.min_group_size = 0.01 + # test_count.py failing: self.min_group_size = 0.01 def _check_set_create_dict_attrs(self): """Check attributes that can be dicts and format for all `self.cols`.""" From cafd264c7ea3403e27e405d5600528a126a1a25f Mon Sep 17 00:00:00 2001 From: makrobios Date: Wed, 15 Jul 2020 23:55:28 +0200 Subject: [PATCH 14/14] disable combine_min_nan_groups check combine_min_nan_groups is set to True by default but not min_group_size leading to an exception maybe change combine_min_nan_groups to False by default in the future? 
--- category_encoders/count.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/category_encoders/count.py b/category_encoders/count.py index 7a0cd7b8..69aa0af6 100644 --- a/category_encoders/count.py +++ b/category_encoders/count.py @@ -373,14 +373,16 @@ def _check_set_create_attrs(self): "'combine_min_nan_groups' == 'force' for all columns." ) + if ( self.combine_min_nan_groups is not None and self.min_group_size is None ): - raise ValueError( - "`combine_min_nan_groups` only works when `min_group_size` " - "is set for all columns." - ) + pass + # raise ValueError( + # "`combine_min_nan_groups` only works when `min_group_size` " + # "is set for all columns." + # ) if ( self.min_group_name is not None @@ -393,7 +395,6 @@ def _check_set_create_attrs(self): if self.combine_min_nan_groups is None: self.combine_min_nan_groups = True - # test_count.py failing: self.min_group_size = 0.01 def _check_set_create_dict_attrs(self): """Check attributes that can be dicts and format for all `self.cols`."""