-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhouse_price_prediction.py
160 lines (98 loc) · 4.39 KB
/
house_price_prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# -*- coding: utf-8 -*-
"""House_Price_Prediction.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1fYvaKoktUzBtI16ApT8I5gNdSCXiG_OJ
##Importing the dataset
"""
# Import the libraries used for data handling, plotting and modelling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.datasets
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics
# Load the California housing dataset. The returned object exposes the feature
# matrix as `.data` and the column names as `.feature_names`.
house_price = sklearn.datasets.fetch_california_housing()
print(house_price)

# Wrap the feature matrix in a DataFrame so the columns can be addressed by name.
house_price_df = pd.DataFrame(house_price.data, columns=house_price.feature_names)

# In a notebook a bare expression is echoed automatically; in a plain script the
# value is silently discarded, so each inspection result is printed explicitly.
print(house_price_df.head())                   # first rows of the dataset
print(house_price_df.describe(include='all'))  # summary statistics for every column
print(house_price_df.shape)                    # (rows, columns)
house_price_df.info()                          # info() writes its report to stdout itself
# float64 columns hold continuous variables.
# Reading the info() report: a column whose non-null count is lower than the
# number of rows has missing values (for example, 21609 non-null entries out
# of 21613 rows would mean 4 values are missing).

# Descriptive statistics for a single column. The results are printed because
# a bare expression produces no output when this file runs as a script.
print(house_price_df['MedInc'].mean())
print(house_price_df['MedInc'].std())          # pandas default is the sample std (ddof=1)
print(house_price_df['MedInc'].min())
print(house_price_df['MedInc'].quantile(.25))  # first quartile

# unique() lists the distinct values; MedInc is numeric, so this is mostly
# useful to see how many different values occur.
print(house_price_df["MedInc"].unique())

# Standard deviation via NumPy: library_name.function_name(arguments).
# NumPy defaults to the population std (ddof=0), so this differs slightly from
# the pandas value printed above.
print(np.std(house_price_df['MedInc']))
"""##Plotting and Graphs(Line, Pie and bar Graph)"""
# Plots help to discover patterns and information in the data.
# Each plot below opens its own figure and is shown before the next one starts;
# without that, pyplot keeps drawing onto the same current axes and the plots
# end up overlaid on each other when the file runs as a script.
import matplotlib.pyplot as plt
"""####1.) Line graph"""
# plt.plot(series) draws the series values (y-axis) against its index (x-axis).
plt.figure()
plt.plot(house_price_df['MedInc'], color='red')  # in red color
plt.xlabel("Row index")  # the x-axis is the DataFrame index, not a population count
plt.ylabel("Median Income")
plt.title("Graphical Representation")
plt.show()

# The same series again, with explicit marker and line styling.
plt.figure()
plt.plot(house_price_df['MedInc'], marker='o', markerfacecolor='blue', markersize=5,
         color='red', linewidth=5, linestyle='dashed')
plt.show()

# Scatter plot: median income against the average number of rooms.
plt.figure()
plt.scatter(x=house_price_df['AveRooms'], y=house_price_df['MedInc'], color='green')
plt.xlabel('Average Rooms')
plt.ylabel("Median Income")
plt.title("Median Income Vs Average Rooms")
plt.show()

# Box plot of the house age distribution.
# NOTE (original author's observation, not verified here): the data looks
# skewed / centred around high values.
plt.figure()
plt.boxplot(house_price_df['HouseAge'])
plt.show()
"""##Positive and Negative Correlation"""
# Fill missing values with the column mean BEFORE computing the correlation
# matrix, so the matrix describes the same data that is used afterwards.
house_price_df.fillna(house_price_df.mean(), inplace=True)
correlation = house_price_df.corr()

# Annotated heatmap of the pairwise correlations. fmt='.1f' prints one decimal
# per cell; the previous '1f' was a width-1 spec that rendered six decimals.
plt.figure(figsize=(20, 15))
sns.heatmap(correlation, cbar=True, square=True, fmt='.1f', annot=True,
            annot_kws={'size': 8}, cmap='Reds')
plt.show()  # display the heatmap on its own so later plots do not draw onto it
# Features: every column of the feature DataFrame.
# Target: the dataset's own target array (per the scikit-learn documentation
# this is the median house value). The script previously regressed on the
# 'MedInc' feature, i.e. it predicted income rather than house price.
X = house_price_df
Y = pd.Series(house_price.target, index=house_price_df.index, name='MedHouseVal')
print(X)
print(Y)
"""#Spliting the data into training and test data"""
# 80% / 20% train/test split; random_state fixes the shuffle so runs are reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)
print(X.shape, X_train.shape, X_test.shape)
# Train an XGBoost regressor with its default hyper-parameters. XGBoost is a
# decision-tree based ensemble algorithm, i.e. it combines many trees rather
# than fitting a single model.
# (A discarded `house_price_df.loc[:, 'MedInc'].tolist()` expression used to sit
# here; it had no effect in a script and was removed.)
model = XGBRegressor()
model.fit(X_train, Y_train)

# Predictions on the rows the model was trained on.
training_data_prediction = model.predict(X_train)
print(training_data_prediction)
"""#Evaluation Phase"""
# Training-set metrics.
# R^2 is a goodness-of-fit score (higher is better), not an error, so it is
# labelled as a score; the labels also say which split each number refers to.
score_1 = metrics.r2_score(Y_train, training_data_prediction)
score_2 = metrics.mean_absolute_error(Y_train, training_data_prediction)
print("Training R^2 score: ", score_1)
print("Training Mean Absolute Error: ", score_2)

# Test-set metrics on the held-out rows.
testing_data_prediction = model.predict(X_test)
score1_test = metrics.r2_score(Y_test, testing_data_prediction)
score2_test = metrics.mean_absolute_error(Y_test, testing_data_prediction)
print("Test R^2 score: ", score1_test)
print("Test Mean Absolute Error: ", score2_test)
# Scatter plot of actual vs predicted values on the training data.
# A fresh figure is opened first; otherwise the points are drawn onto whatever
# figure is still current (the correlation heatmap) when run as a script.
# NOTE (original author's observation): both values fall in a similar range.
plt.figure()
plt.scatter(Y_train, training_data_prediction)
plt.xlabel('Actual Outcome')
plt.ylabel('Predicted Outcome')
plt.title('Actual Vs Predicted Outcome')
plt.show()