# house_price_prediction.py

# -*- coding: utf-8 -*-
"""House_Price_Prediction.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1fYvaKoktUzBtI16ApT8I5gNdSCXiG_OJ

##Importing the dataset
"""

# Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.datasets
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics

# Load the California housing dataset. The returned object exposes the feature
# matrix as .data and the matching column names as .feature_names.
house_price = sklearn.datasets.fetch_california_housing()

# Call pattern used throughout: module_name.function_name(input_arguments).

print(house_price)

# Build a DataFrame of the features, one column per feature name.
house_price_df = pd.DataFrame(house_price.data, columns=house_price.feature_names)

# The inspection calls below are wrapped in print(): a bare expression is only
# echoed inside a notebook cell, and its result is discarded in a script.
print(house_price_df.head())  # top rows of the dataset

# include='all' makes describe() summarise string/categorical columns as well
# as numeric ones.
print(house_price_df.describe(include='all'))

print(house_price_df.shape)  # (number of rows, number of columns)

house_price_df.info()  # info() prints its own report, so no print() is needed

# Float columns are continuous variables.
# Reading the info() report: when a column's non-null count is smaller than the
# number of rows, the difference is the number of rows with a missing value
# (e.g. 21613 rows with 21609 non-null entries means 4 values are missing).
#
# Descriptive statistics for a single column. Each result is passed to print()
# because a bare expression is not echoed when the file runs as a script.

print(house_price_df['MedInc'].mean())

print(house_price_df['MedInc'].std())  # standard deviation of the MedInc column

print(house_price_df['MedInc'].min())

print(house_price_df['MedInc'].quantile(.25))  # 25th percentile

# unique() returns the distinct values of a column. It is most informative for
# string/categorical columns; MedInc is used here only to show the call.
print(house_price_df["MedInc"].unique())

# The same standard deviation through NumPy: library_name.function_name(arguments).
# NOTE(review): NumPy's default is ddof=0 while pandas' .std() default is
# ddof=1, so this value is expected to differ slightly from the one above.
print(np.std(house_price_df['MedInc']))

"""##Plotting and Graphs(Line, Pie and bar Graph)"""

# Plots help to discover patterns and information in the data

import matplotlib.pyplot as plt

"""####1.) Line graph"""

# plt.plot(data_set_name['variable_name']) draws the column's values on the
# y-axis against the row index on the x-axis.

# Line plot of MedInc in red. Only one Series is passed to plt.plot, so the
# x-axis is the DataFrame's row index, not the Population column.
plt.plot(house_price_df['MedInc'], color='red')
plt.xlabel("Row index")  # previously "Population", which is not what is plotted
plt.ylabel("Median Income")
plt.title("Graphical Representation")
plt.show()

# The same MedInc line plot with explicit styling: blue-filled circle markers
# on a thick, dashed red line.
# Each plot gets its own plt.figure() / plt.show() pair. Without them, a script
# draws every following pyplot call onto the same axes (a notebook separates
# them per cell).
plt.figure()
plt.plot(house_price_df['MedInc'], marker='o', markerfacecolor='blue', markersize=5,
         color='red', linewidth=5, linestyle='dashed')
plt.show()

# Scatter plot of median income against the average number of rooms.
plt.figure()
plt.scatter(x=house_price_df['AveRooms'], y=house_price_df['MedInc'], color='green')
plt.xlabel('Average Rooms')
plt.ylabel("Median Income")
plt.title("Median Income Vs Average Rooms")
plt.show()

# Background: a histogram plots the frequency count of records over the range
# of values a variable can take, by dividing that range into equal-width bins
# (for example 100 of them). No histogram is drawn here; the distribution of
# HouseAge is shown with a box plot instead.

# A fresh figure keeps the box plot off the axes used by earlier plots when the
# file runs as a script, and plt.show() displays it on its own.
plt.figure()
plt.boxplot(house_price_df['HouseAge'])
plt.show()

# Notebook observation (not verified by this code): the data is skewed, or
# centred around the high values.

"""##Positive and Negative Correlation"""

# Pairwise correlation between all columns of the feature DataFrame.
correlation = house_price_df.corr()

# Replace missing values with the mean of their column, modifying the DataFrame
# in place. This runs after the correlation matrix above was computed, so the
# heatmap reflects the data as loaded.
house_price_df.fillna(house_price_df.mean(), inplace=True)

plt.figure(figsize=(20, 15))

# fmt is a format spec applied to every annotated cell. '.1f' prints one
# decimal place; the previous '1f' meant "minimum width 1, six decimals" and
# produced unreadably long labels.
sns.heatmap(correlation, cbar=True, square=True, fmt='.1f', annot=True,
            annot_kws={'size': 8}, cmap='Reds')

# Show the heatmap now so that later pyplot calls do not draw onto its axes.
plt.show()

# Features: every column of the feature DataFrame (copied so X stays
# independent of later changes to house_price_df).
# Target: the dataset's own target array (the median house value). The
# previous code never used it and predicted the 'MedInc' feature from the other
# features, so the model was not predicting house prices at all.
# NOTE(review): confirm that predicting house value, not median income, is the
# intended task before merging.
X = house_price_df.copy()
Y = pd.Series(house_price.target, index=house_price_df.index, name='MedHouseVal')

print(X)
print(Y)

"""#Spliting the data into training and test data"""

# Hold out 20% of the rows for testing (80/20 split); random_state=2 makes the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

print(X.shape, X_train.shape, X_test.shape)

# Train with XGBoost, a decision-tree based ensemble learning algorithm: the
# final prediction combines many trees rather than a single model.
model = XGBRegressor()

# The statement that used to sit here built a Python list of the whole MedInc
# column and discarded it; it had no effect in a script, so it was removed.

model.fit(X_train, Y_train)

# Predictions on the data the model was trained on, used below as a baseline
# for comparison with the test-set predictions.
training_data_prediction = model.predict(X_train)

print(training_data_prediction)

"""#Evaluation Phase"""


# --- Training-set metrics ---
# R^2 (R-squared) is a goodness-of-fit score, not an error: higher is better.
score_1 = metrics.r2_score(Y_train, training_data_prediction)
# Mean Absolute Error: average absolute difference between actual and predicted.
score_2 = metrics.mean_absolute_error(Y_train, training_data_prediction)

# The labels name the data split so the training and test lines can be told
# apart in the output (both pairs were previously printed with the same text).
print("Training R^2 score: ", score_1)
print("Training Mean Absolute Error: ", score_2)

# --- Test-set metrics (rows held out from training) ---
testing_data_prediction = model.predict(X_test)

score1_test = metrics.r2_score(Y_test, testing_data_prediction)
score2_test = metrics.mean_absolute_error(Y_test, testing_data_prediction)

print("Test R^2 score: ", score1_test)
print("Test Mean Absolute Error: ", score2_test)

# Scatter plot of actual vs predicted values on the training data.
# plt.figure() opens a new figure first; otherwise, when the file runs as a
# script, the points would be drawn onto whichever axes were used last.
plt.figure()
plt.scatter(Y_train, training_data_prediction)
plt.xlabel('Actual Outcome')
plt.ylabel('Predicted Outcome')
plt.title('Actual Vs Predicted Outcome')
plt.show()  # points close to the diagonal mean actual and predicted values agree