-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcode.py
163 lines (118 loc) · 5.87 KB
/
code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# --------------
# Import the required Libraries
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import calendar
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
# Generate a line chart that visualizes the readings in the months
def line_chart(df, period, col, title='Temperature Trend, 2012'):
    """Plot the mean of one feature aggregated over a calendar period.

    The rows of ``df`` are grouped by the month, day or year attribute of the
    index, the mean of ``col`` is computed for every group, and the result is
    drawn as a line chart. Month groups are labelled with calendar month
    names; day and year groups are labelled with their numeric value.

    Keyword arguments:
    df - Pandas dataframe which has the data. Its index must expose the
         ``month`` / ``day`` / ``year`` attributes (e.g. a DatetimeIndex).
    period - Period of time over which to aggregate: "Month", "Day" or "Year".
    col - Feature of the dataframe to average and plot.
    title - Title of the chart. Defaults to the title that used to be
            hard-coded, so existing callers see the same figure.

    Raises:
    ValueError - if ``period`` is not one of the supported values.
    """
    period_attrs = {"Month": "month", "Day": "day", "Year": "year"}
    if period not in period_attrs:
        # Fail early with a clear message instead of hitting an unbound
        # local variable further down.
        raise ValueError(
            "period must be one of %s, got %r" % (sorted(period_attrs), period)
        )

    # Aggregate only the requested column: averaging the whole frame raises
    # on non-numeric columns in pandas 2.x and does needless work anyway.
    group_keys = getattr(df.index, period_attrs[period])
    data = df[col].groupby(group_keys).mean()

    # Derive the tick labels from the groups that are actually present so the
    # x and y series always have the same length (partial years, "Day" and
    # "Year" periods included).
    if period == "Month":
        x_series = [calendar.month_name[int(month)] for month in data.index]
    else:
        x_series = [str(value) for value in data.index]

    plt.plot(x_series, data.values)
    plt.title(title)
    plt.xlabel(period)
    plt.xticks(rotation=90)
    plt.ylabel(col)
    plt.show()
# Function to perform univariate analysis of categorical columns
def plot_categorical_columns(df):
    """Draw normalized value-count bar plots for every object-typed column.

    Columns are taken two at a time and shown side by side in one 10x4
    figure; when the number of columns is odd, the last one is drawn on its
    own.

    Keyword arguments:
    df - Pandas dataframe which has the data.
    """
    object_cols = df.select_dtypes(include=['object']).columns
    total = len(object_cols)

    for start in range(0, total, 2):
        if start + 1 >= total:
            # Unpaired trailing column: plot it alone.
            lone = object_cols[start]
            shares = df[lone].value_counts(normalize=True)
            shares.plot(kind='bar')
            plt.title(lone)
            plt.show()
            continue

        # Two columns available: left and right halves of one figure.
        plt.figure(figsize=(10, 4))
        for slot, name in zip((121, 122), (object_cols[start], object_cols[start + 1])):
            plt.subplot(slot)
            shares = df[name].value_counts(normalize=True)
            shares.plot(kind='bar')
            plt.title(name)
        plt.tight_layout()
        plt.show()
# Function to plot the continuous (numerical) columns
def plot_cont(df, plt_typ):
    """Univariate analysis of the numerical columns.

    Every numeric column of ``df`` is drawn with the seaborn function named
    by ``plt_typ``. Columns are taken two at a time and shown side by side in
    one 10x4 figure; when the number of numeric columns is odd, the last one
    is drawn in a figure of its own instead of being skipped.

    Keyword arguments:
    df - Pandas dataframe which has the data.
    plt_typ - Type of plot used to visualize the data: "boxplot" or
              "distplot". Any other value prints a hint and plots nothing.
    """
    if plt_typ not in ("boxplot", "distplot"):
        # Report the problem once and stop, rather than opening one empty
        # figure per column pair.
        print("Pass either distplot/boxplot")
        return

    # Resolve the seaborn plotting function by name only after validation, so
    # just the requested function has to exist in the installed seaborn.
    draw = getattr(sns, plt_typ)
    numeric_columns = df.select_dtypes(include=['number']).columns.tolist()

    for i in range(0, len(numeric_columns), 2):
        pair = numeric_columns[i:i + 2]
        if len(pair) == 2:
            plt.figure(figsize=(10, 4))
            for position, column in zip((121, 122), pair):
                plt.subplot(position)
                draw(df[column])
            plt.tight_layout()
        else:
            # Odd column count: the trailing column gets its own figure.
            plt.figure()
            draw(df[pair[0]])
        plt.show()
# Function to plot grouped values based on the feature
def group_values(df, col1, agg1, col2):
    """Aggregate values by grouping and plot one aggregated feature.

    The dataframe is grouped by ``col1``, every remaining column is reduced
    with the aggregate named by ``agg1``, and the aggregated ``col2`` is drawn
    as a bar plot with one bar per group.

    Keyword arguments:
    df - Pandas dataframe which has the data.
    col1 - Feature of the dataframe on which values will be grouped.
    agg1 - Name of the aggregate function: 'mean', 'max' or 'min'.
    col2 - Feature of the dataframe to be plotted against the grouped data.

    Returns:
    grouping - Dataframe indexed by the groups of ``col1`` holding the
               aggregated value of every other column.

    Raises:
    KeyError - if ``agg1`` is not a supported aggregate name, or ``col2`` is
               not a column of the aggregated frame.
    """
    supported = ('mean', 'max', 'min')
    if agg1 not in supported:
        # KeyError is kept because that is what the former dict lookup raised.
        raise KeyError(
            "agg1 must be one of %s, got %r" % (list(supported), agg1)
        )

    # Use the aggregate by name: recent pandas versions deprecate passing the
    # numpy callables (np.mean / np.max / np.min) to ``agg``.
    grouping = df.groupby(col1).agg(agg1)

    plt.figure(figsize=(10, 4))
    axes = grouping[col2].plot(kind="bar")
    axes.set_ylabel(col2)
    plt.show()

    # The docstring always promised the aggregated frame; actually return it.
    return grouping
# Load the weather readings, using the 'Date/Time' column as a parsed
# datetime index.
# NOTE(review): `path` is not defined in this file; presumably it is supplied
# by the environment that runs the script - confirm.
weather_df = pd.read_csv(path, index_col='Date/Time', parse_dates=True)
print(weather_df.head(n=5))
print(weather_df.shape)

# Line chart of the mean temperature per month.
line_chart(df=weather_df, period="Month", col="Temp (C)")

# Univariate analysis of the categorical features.
plot_categorical_columns(df=weather_df)

# Univariate analysis of the numerical features: distribution plots first,
# then box plots.
plot_cont(df=weather_df, plt_typ="distplot")
plot_cont(df=weather_df, plt_typ="boxplot")

# Mean visibility per weather condition. Other features and the 'max' / 'min'
# aggregates can be tried the same way.
group_values(df=weather_df, col1="Weather", agg1="mean", col2="Visibility (km)")