#-------------Logistic Regression------------------------------
#Import Libraries
import pandas as pd
import seaborn as sns
#Import data
titanic_data = pd.read_csv("F:/WORK/pyWork/pyData/titanic.csv")
titanic_data.head(5)
titanic_data.tail(5)
print("No. of passengers in original dataset:" +str(len(titanic_data.index)))
#Analyzing Data
sns.countplot(x="survived",data=titanic_data)
sns.countplot(x="survived",hue="sex",data=titanic_data)
sns.countplot(x="survived",hue="pclass",data=titanic_data)
#CHECKING DATA TYPE OF A VARIABLE AND CONVERTING IT INTO ANOTHER TYPE-----
titanic_data.info()
titanic_data["age"].plot.hist()
#Converting var "age" from object type to float type
titanic_data["age"] = pd.to_numeric(titanic_data.age, errors='coerce')
titanic_data.info()
#The errors='coerce' parameter above replaces any non-numeric entries (like "?")
#in the "age" column with NaN values.
titanic_data["age"].plot.hist()
#Converting var "fare" from object type to float type
titanic_data["fare"] = pd.to_numeric(titanic_data.fare, errors='coerce')
titanic_data.info()
#The errors='coerce' parameter above replaces any non-numeric entries (like "?")
#in the "fare" column with NaN values.
titanic_data["fare"].plot.hist()
#Identifying/Finding missing values if any----
titanic_data.isnull()
titanic_data.isnull().sum()
sns.heatmap(titanic_data.isnull(),yticklabels=False, cmap="viridis")
#Note:
#Since there are only a few missing values in "fare", we can simply drop those rows.
#Since "age" has many missing values, it is better to impute them.
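#To back the note above with numbers, we can check the share of missing values per
#column (a quick check, not part of the original walkthrough):
print((titanic_data.isnull().mean() * 100).round(2))  #% of NaN per column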
sns.boxplot(x="age",data=titanic_data)
sns.boxplot(x="fare",data=titanic_data)
#From the boxplot we observe that "age" has relatively few outliers, so if we
#impute "age", mean imputation is a reasonable choice.
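#A quick skew check before committing to mean imputation: if the mean and median of
#"age" were far apart, median imputation would be the safer choice.
print(titanic_data["age"].mean(), titanic_data["age"].median())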
#Handling Missing Values------------
titanic_data.head(5)
#Dropping all the rows which have a missing value in the "fare" column
#(drop NaN in a specific column)
titanic_data.dropna(subset=['fare'],inplace=True)
sns.heatmap(titanic_data.isnull(),yticklabels=False)
#Imputing missing values in column (Age) with mean imputation
titanic_data["age"].fillna(titanic_data["age"].mean(), inplace=True)
sns.heatmap(titanic_data.isnull(),yticklabels=False)
#Hence, we do not have any missing values in the dataset now.
titanic_data.isnull().sum()
#Note:
#A heat map is usually drawn for either continuous or categorical variables.
#To draw one for a few continuous columns, something like:
#Cont = titanic_data.iloc[:, [5, 6, 7]]
#sns.heatmap(Cont)
#There are a lot of string-valued variables in the dataset which have to be converted
#to numerical values before applying a machine learning algorithm. Hence, we will now
#convert the string variables to numerical ones.
titanic_data.info()
pd.get_dummies(titanic_data["sex"])
pd.get_dummies(titanic_data["sex"],drop_first=True)
Sex_Dummy = pd.get_dummies(titanic_data["sex"],drop_first=True)
Sex_Dummy.head(5)
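#Note: drop_first=True drops one dummy level to avoid the dummy variable trap
#(perfect multicollinearity): with both "female" and "male" columns present, each is
#fully determined by the other, so keeping only "male" carries the same information.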
pd.get_dummies(titanic_data["embarked"])
Embarked_Dummy = pd.get_dummies(titanic_data["embarked"],drop_first=True)
Embarked_Dummy.head(5)
pd.get_dummies(titanic_data["pclass"])
PClass_Dummy = pd.get_dummies(titanic_data["pclass"],drop_first=True)
PClass_Dummy.head(5)
#Now, let's concatenate these dummy variable columns into our dataset.
titanic_data = pd.concat([titanic_data,Sex_Dummy,PClass_Dummy,Embarked_Dummy],axis=1)
titanic_data.head(5)
#Dropping the original columns whose dummy variables have been created
titanic_data.drop(["sex","embarked","pclass","Passenger_id","name","ticket"],axis=1,inplace=True)
titanic_data.head(5)
#Splitting the dataset into Train & Test dataset
x=titanic_data.drop("survived",axis=1)
y=titanic_data["survived"]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)
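#A quick shape check (not in the original walkthrough) to confirm the 75/25 split:
print(X_train.shape, X_test.shape)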
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)
predictions = logmodel.predict(X_test)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,predictions)
#Hence, accuracy = (165+84)/(165+84+30+44) = 249/323 ≈ 77.1%
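#The same accuracy can be computed directly with sklearn's standard metric:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))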
#Calculating the coefficients:
print(logmodel.coef_)
#Calculating the intercept:
print(logmodel.intercept_)
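#To make the coefficients readable, we can pair each one with its feature name
#(a small convenience, assuming x.columns still matches the training matrix):
for name, coef in zip(x.columns, logmodel.coef_[0]):
    print(name, round(coef, 4))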
#----To improve the accuracy of the model, let's go with the Backward Elimination method &
#rebuild the logistic model again with fewer independent variables--------
titanic_data_1 = titanic_data.copy()  #copy, so the original frame is not aliased
titanic_data_1.head(5)
#--------------------------Backward Elimination--------------------------------
#Backward elimination is a feature selection technique used while building a machine learning
#model. It removes those features that do not have a significant effect on the dependent
#variable or the prediction of the output.
#Step: 1- Preparation of Backward Elimination:
#Importing the library:
import statsmodels.api as sm
#Adding a column in matrix of features:
x1=titanic_data_1.drop("survived",axis=1)
y1=titanic_data_1["survived"]
import numpy as np
#Prepend a column of ones as the intercept term; using the actual row count
#(x1.shape[0]) rather than a hard-coded 1291 keeps this robust to the rows
#dropped earlier.
x1 = np.append(arr=np.ones((x1.shape[0], 1)).astype(int), values=x1, axis=1)
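#Equivalently, statsmodels provides a helper for adding the intercept column;
#instead of the np.append above, one could write:
#x1 = sm.add_constant(x1)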
#Applying backward elimination process now
#First we create a new feature matrix x_opt, which will contain only the set of
#independent features that significantly affect the dependent variable.
x_opt= x1[:, [0,1,2,3,4,5,6,7,8,9,10]]
#To fit the model, we create a regressor_OLS object from the OLS class of the
#statsmodels library, then fit it using the fit() method.
regressor_OLS=sm.OLS(endog = y1, exog=x_opt).fit()
#We will use the summary() method to get the summary table of all the variables.
regressor_OLS.summary()
#In the summary table above we can see the p-values of all the variables.
#We iteratively remove the independent variable with the highest p-value above 0.05.
x_opt= x1[:, [0,1,2,4,5,6,7,8,9,10]]
regressor_OLS=sm.OLS(endog = y1, exog=x_opt).fit()
regressor_OLS.summary()
x_opt= x1[:, [0,1,2,4,5,6,7,9,10]]
regressor_OLS=sm.OLS(endog = y1, exog=x_opt).fit()
regressor_OLS.summary()
x_opt= x1[:, [0,1,2,5,6,7,9,10]]
regressor_OLS=sm.OLS(endog = y1, exog=x_opt).fit()
regressor_OLS.summary()
x_opt= x1[:, [0,1,2,5,6,7,10]]
regressor_OLS=sm.OLS(endog = y1, exog=x_opt).fit()
regressor_OLS.summary()
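#The manual elimination steps above can be automated. A minimal sketch (assuming the
#same x1/y1 as above and a 0.05 significance level; column 0 is the intercept and is
#always kept):
def backward_elimination(X, y_target, sl=0.05):
    cols = list(range(X.shape[1]))
    while len(cols) > 1:
        model = sm.OLS(endog=y_target, exog=X[:, cols]).fit()
        pvals = np.asarray(model.pvalues)[1:]   #skip the intercept's p-value
        worst = int(pvals.argmax())
        if pvals[worst] <= sl:
            break
        del cols[worst + 1]                     #+1 offsets the intercept at index 0
    return cols, sm.OLS(endog=y_target, exog=X[:, cols]).fit()

#kept_cols, auto_model = backward_elimination(x1, y1)
#auto_model.summary()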
#Hence, the independent variables age, sibsp, sex, pclass & embarked are the significant
#variables for predicting the value of the dependent variable "survived".
#So we can now predict efficiently using these variables.
#-------Building Logistic Regression model using ind var: age, sibsp, sex, pclass & embarked--------
# Splitting the dataset into training and test set.
from sklearn.model_selection import train_test_split
x_BE_train, x_BE_test, y_BE_train, y_BE_test= train_test_split(x_opt, y1, test_size= 0.25, random_state=0)
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(x_BE_train, y_BE_train)
predictions = logmodel.predict(x_BE_test)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_BE_test,predictions)
#Accuracy = (170+87)/(170+87+25+41) = 257/323 ≈ 79.6%
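#Beyond raw accuracy, per-class precision and recall can be checked with sklearn's
#standard report:
from sklearn.metrics import classification_report
print(classification_report(y_BE_test, predictions))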
#Calculating the coefficients:
print(logmodel.coef_)
#Calculating the intercept:
print(logmodel.intercept_)
#So, our final predictive modelling equation becomes:
#P(Survived) = exp(z) / (1 + exp(z)), where
#z = 3.74 - 0.03*age - 0.27*sibsp - 2.52*sex(male) - 1.03*pclass(2) - 2.1*pclass(3) - 0.33*embd(S)
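#As a sanity check on the equation above, predict_proba returns the same probability
#directly. A hypothetical passenger (made-up values, ordered as x_opt's columns:
#intercept, age, sibsp, sex(male), pclass(2), pclass(3), embarked(S)):
example = [[1, 30, 0, 1, 0, 1, 1]]
print(logmodel.predict_proba(example))  #columns: P(not survived), P(survived)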