-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcommon_functions.py
169 lines (135 loc) · 6.28 KB
/
common_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
import scipy.sparse as sps
def get_clean_data(url: str, drop_columns: list) -> pd.DataFrame:
    """Load the headerless CSV at ``url`` and apply the initial cleaning.

    The file is read without a header row and the fixed column names listed
    below are assigned. Leading/trailing whitespace is stripped from every
    text column, the ``?`` missing-value marker is converted to NaN, every
    row containing a missing value is dropped, the index is renumbered from
    zero and finally the requested columns are removed.

    Parameters
    ----------
    url : str
        Location of the data; anything ``pandas.read_csv`` accepts.
    drop_columns : list
        Names of the columns to remove from the dataframe. ``None`` is
        treated as "drop nothing".

    Returns
    -------
    df : pd.DataFrame
        Cleaned dataframe: no rows with missing values, no surrounding
        spaces in text values, ``drop_columns`` removed.
    """
    adult_columns = [
        "Age",
        "Workclass",
        "final weight",
        "Education",
        "Education-Num",
        "Marital Status",
        "Occupation",
        "Relationship",
        "Ethnic group",
        "Sex",
        "Capital Gain",
        "Capital Loss",
        "Hours per week",
        "Country",
        "Income",
    ]
    df = pd.read_csv(url, header=None, names=adult_columns)

    # Strip whitespace from text columns only; numeric columns have no
    # ``.str`` accessor. Both the classic object dtype and dedicated string
    # dtypes are treated as text.
    for column in df.columns:
        values = df[column]
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            df[column] = values.str.strip()

    # The marker must be matched in its stripped form: after the step above
    # no value can still carry the leading space of ' ?'.
    df = df.replace(to_replace="?", value=np.nan)
    df = df.dropna(how="any").reset_index(drop=True)

    if drop_columns is not None:
        df = df.drop(columns=drop_columns)
    return df
def preprocess_data(data: pd.DataFrame, numerical_features_list: list, categorical_features_list: list,
                    TARGET: str = 'Income', education: bool = True) -> pd.DataFrame:
    """Transform the raw data into the numeric layout fed to the model.

    Numerical features are standard-scaled and categorical features are
    one-hot encoded (first level dropped). Columns that are not listed are
    discarded. The target labels ``'<=50K'`` / ``'>50K'`` are encoded as
    0 / 1; any other target value is left unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe with variables in columns and instances in rows, in the
        original data types. Must contain ``TARGET``.
    numerical_features_list : list
        Names of the numerical features to scale.
    categorical_features_list : list
        Names of the categorical features to one-hot encode.
    TARGET : str
        Name of the target variable.
    education : bool
        If True, the ``Education`` column is ordinal-encoded with the fixed
        level order below. If False, ``Education`` is not transformed and is
        appended to the output as it appears in ``data``.

    Returns
    -------
    preprocessed_data : pandas.DataFrame
        ``TARGET`` as the first column followed by the transformed features,
        indexed 0..n-1 in the row order of ``data``.
    """
    education_levels = ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th',
                        '12th', 'HS-grad', 'Some-college', 'Assoc-voc', 'Assoc-acdm',
                        'Bachelors', 'Masters', 'Prof-school', 'Doctorate']

    transformers = [
        ('stand scaler', StandardScaler(), numerical_features_list),
        ('onehot', OneHotEncoder(dtype='int', drop='first'), categorical_features_list),
    ]
    if education:
        # The selector pattern is a regular expression searched inside each
        # column name, so it is anchored: a bare 'Education' would also pick
        # up 'Education-Num' and hand two columns to an encoder that only
        # has categories for one.
        transformers.insert(0, ('ordinal',
                                OrdinalEncoder(categories=[education_levels]),
                                make_column_selector(pattern='^Education$')))
    columntransformer = ColumnTransformer(transformers=transformers, remainder='drop')

    X = data.drop(columns=[TARGET])
    X_trans = columntransformer.fit_transform(X)
    if sps.issparse(X_trans):
        X_trans = X_trans.toarray()
    X_trans = pd.DataFrame(X_trans, columns=columntransformer.get_feature_names_out())

    if not education:
        # X_trans carries a fresh 0..n-1 index, so Education is attached by
        # position; joining on the original index of ``data`` would lose or
        # misalign rows whenever that index is not 0..n-1.
        X_trans["Education"] = data["Education"].reset_index(drop=True)

    y = list(data[TARGET])
    y_trans = pd.DataFrame(data=y, index=range(0, len(y)), columns=[TARGET])
    target_codes = {'<=50K': 0, '>50K': 1}
    # Unknown labels pass through unchanged, matching ``replace`` semantics
    # without depending on its implicit object -> int downcast.
    y_trans[TARGET] = y_trans[TARGET].map(lambda label: target_codes.get(label, label))

    preprocessed_data = pd.merge(left=y_trans, right=X_trans, left_index=True, right_index=True)
    return preprocessed_data
def cluster_education(df: pd.DataFrame) -> pd.DataFrame:
    """Cluster Education into four ordered groups and encode them as 0-3.

    The group is chosen from ``Education-Num``:

    * 0 to 8 (inclusive)   -> ``Under-grad``   -> 0
    * 9                    -> ``HS-grad``      -> 1
    * 10                   -> ``Some-college`` -> 2
    * 11 to 16 (inclusive) -> ``Above-grad``   -> 3

    Rows whose ``Education-Num`` falls outside 0-16 keep their current
    ``Education`` value, which is still encoded if it happens to be one of
    the four group labels.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe with the original ``Education`` and ``Education-Num``
        columns. It is modified in place.

    Returns
    -------
    df : pd.DataFrame
        The same dataframe object that was passed in, with the clustered and
        encoded ``Education`` column.
    """
    level = df["Education-Num"]
    # Masks are computed up front; Education-Num is never modified, so the
    # order of the assignments below does not matter.
    cluster_masks = (
        ("Under-grad", level.between(0, 8, "both")),
        ("HS-grad", level == 9),
        ("Some-college", level == 10),
        ("Above-grad", level.between(11, 16, "both")),
    )
    for label, mask in cluster_masks:
        df.loc[mask, "Education"] = label

    # Codes follow the Education-Num order (HS-grad = 9 comes before
    # Some-college = 10), so the encoded column is a true ordinal scale.
    scale_mapper = {"Under-grad": 0, "HS-grad": 1, "Some-college": 2, "Above-grad": 3}
    # Values missing from the mapper pass through unchanged.
    df["Education"] = df["Education"].map(lambda value: scale_mapper.get(value, value))
    return df
def cluster_categorical(data: pd.DataFrame) -> pd.DataFrame:
    """Merge categories that make sense together, e.g. countries into developed and developing.

    Parameters
    ----------
    data : pandas.DataFrame
        Original dataframe with variables in columns and instances in rows.
        It is modified in place.

    Returns
    -------
    data : pandas.DataFrame
        The same dataframe, with the Workclass, Marital Status, Relationship
        and Country columns regrouped.
    """
    # Workclass: only one level is renamed, every other value is kept.
    workclass_merge = {'Never-worked': 'Without-pay'}
    data['Workclass'] = data['Workclass'].replace(workclass_merge)

    # Two-way groupings: (column, members, label for members, label for the rest).
    binary_groupings = (
        ('Marital Status',
         ['Married-AF-spouse', 'Married-civ-spouse'],
         'Married', 'Single'),
        ('Relationship',
         ['Husband', 'Wife', 'Own-child'],
         'Family', 'Not-in-Family'),
        ('Country',
         ['Hungary', 'Greece', 'Portugal', 'Poland', 'Holand-Netherlands', 'Scotland', 'Italy',
          'England', 'Ireland', 'Germany', 'Hong', 'France', 'Japan', 'Canada', 'United-States'],
         'Developed', 'Developing'),
    )
    for column, members, inside_label, outside_label in binary_groupings:
        is_member = data[column].isin(members)
        data[column] = np.where(is_member, inside_label, outside_label)

    return data