-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAirBeam2 data processing visualization.py
379 lines (286 loc) · 16 KB
/
AirBeam2 data processing visualization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
"""
@author: Ana MC Ilie
Citation: Ilie, A.M.C., McCarthy, N., Velasquez, L. et al. Air pollution exposure assessment at schools and playgrounds
in Williamsburg Brooklyn NYC, with a view to developing a set of policy solutions. J Environ Stud Sci (2022). https://doi.org/10.1007/s13412-022-00777-7
"""
import pandas as pd
from pandas.compat import StringIO
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import os
from datetime import datetime
from pandas import Series # To work on series
import statsmodels
import statsmodels.api as sm
from pylab import figure, axes, pie, title, show
import warnings # To ignore the warnings
warnings.filterwarnings("ignore")
import seaborn as sns
sns.set(color_codes=True)
sns.set_style("white")
plt.style.use("seaborn")
# Use seaborn style defaults and set the default figure size
sns.set(rc={'figure.figsize':(14, 8)})
# finding the current directory
abs_path = os.getcwd()
abs_path
# change to desired folder where .csv file is present - Use forward backslash
path = r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\cleaned data'
data = pd.read_csv(path + '/all airbeams PM2.5 CLEANED file.csv')
# Exploratory Data Analysis
data.dtypes # find the datatypes of all variables
data.columns # list of all column names - original list
data.shape # Dimensions of original dataset (rows, columns)
print(data.head(10)) # first 10 rows of dataset
data.index # index - number of rows and columns
data.info() # information on datatypes and number of elements
# Convert the datatype of certain columns to float type
data[['PM-2.5_W1','PM-2.5_W2A','PM-2.5_W2b','PM-2.5_W3','PM-2.5_W4','PM-2.5_W5','PM-2.5_W6a','PM-2.5_W6b','PM-2.5_W7','PM-2.5_W8','PM-2.5_W9','PM-2.5_W10a','PM-2.5_W10b','PM-2.5_W11a']] = data[['PM-2.5_W1','PM-2.5_W2A','PM-2.5_W2b','PM-2.5_W3','PM-2.5_W4','PM-2.5_W5','PM-2.5_W6a','PM-2.5_W6b','PM-2.5_W7','PM-2.5_W8','PM-2.5_W9','PM-2.5_W10a','PM-2.5_W10b','PM-2.5_W11a']].apply(pd.to_numeric)
data[['Timestamp_W1','Timestamp_W2a','Timestamp_W2b','Timestamp_W3','Timestamp_W4','Timestamp_W5','Timestamp_W6a','Timestamp_W6b','Timestamp_W7','Timestamp_W8','Timestamp_W9','Timestamp_W10a','Timestamp_W10b','Timestamp_W11a']] = data[['Timestamp_W1','Timestamp_W2a','Timestamp_W2b','Timestamp_W3','Timestamp_W4','Timestamp_W5','Timestamp_W6a','Timestamp_W6b','Timestamp_W7','Timestamp_W8','Timestamp_W9','Timestamp_W10a','Timestamp_W10b','Timestamp_W11a']].apply(pd.to_numeric)
# Timestamp needs to be converted to datetime object (for time series manipulations)
data['Timestamp_W1'] = pd.to_datetime(data['Timestamp_W1'])
data['Timestamp_W2a'] = pd.to_datetime(data['Timestamp_W2a'])
data['Timestamp_W2b'] = pd.to_datetime(data['Timestamp_W2b'])
data['Timestamp_W3'] = pd.to_datetime(data['Timestamp_W3'])
data['Timestamp_W4'] = pd.to_datetime(data['Timestamp_W4'])
data['Timestamp_W5'] = pd.to_datetime(data['Timestamp_W5'])
data['Timestamp_W6a'] = pd.to_datetime(data['Timestamp_W6a'])
data['Timestamp_W6b'] = pd.to_datetime(data['Timestamp_W6b'])
data['Timestamp_W7'] = pd.to_datetime(data['Timestamp_W7'])
data['Timestamp_W8'] = pd.to_datetime(data['Timestamp_W8'])
data['Timestamp_W9'] = pd.to_datetime(data['Timestamp_W9'])
data['Timestamp_W10a'] = pd.to_datetime(data['Timestamp_W10a'])
data['Timestamp_W10b'] = pd.to_datetime(data['Timestamp_W10b'])
data['Timestamp_W11a'] = pd.to_datetime(data['Timestamp_W11a'])
# Timestamp needs to be converted to datetime object (for time series manipulations)
data[['Timestamp_W1','Timestamp_W2a','Timestamp_W2b','Timestamp_W3','Timestamp_W4','Timestamp_W5','Timestamp_W6a','Timestamp_W6b','Timestamp_W7','Timestamp_W8','Timestamp_W9','Timestamp_W10a','Timestamp_W10b','Timestamp_W11a']] = data[['Timestamp_W1','Timestamp_W2a','Timestamp_W2b','Timestamp_W3','Timestamp_W4','Timestamp_W5','Timestamp_W6a','Timestamp_W6b','Timestamp_W7','Timestamp_W8','Timestamp_W9','Timestamp_W10a','Timestamp_W10b','Timestamp_W11a']].apply(pd.to_numeric)
# store descriptive statistics in a different dataframe and .csv file
desc_stat = data.describe()
desc_stat.to_csv(path+'\Descriptive_Stats.csv')
# see the results of desired column
print (desc_stat['PM-2.5_W1'])
print (desc_stat['PM-2.5_W2A'])
# Missing Value Analysis
# list number of missing values for each column
print (data.isnull().sum())
null_counts = data.isnull().sum()
null_counts.to_csv(path+ '\List number of missing values for each column.csv')
# Run following 6 lines together for plot
plt.figure(figsize=(14,8))
plt.xticks(np.arange(len(null_counts)), null_counts.index, rotation='vertical')
plt.xlabel('Columns')
plt.ylabel('Number of Attributes with Missing Data')
plt.bar(np.arange(len(null_counts)), null_counts)
plt.title('Missing Value Analysis')
# replacing all the zeros with NaN
data = data[data.columns].replace(0, np.nan)
# dropping all the rows with NaN in the columns
data.dropna(inplace=True)
# recheck the shape of dataframe if values changed
data.shape
# save the cleaned data file to another .csv, in the same folder
data.to_csv(path + "\Cleaned_Data_File_allairbeams.csv")
# Continous Variables in the dataset
data_cont = data.iloc[:, 0:42].reset_index()
data_cont = data_cont.drop(['index'], axis = 1)
data_cont.shape
data_cont.columns
# Boxplot Analysis to see presence of outliers and save the file.png
fig = plt.figure()
sns.boxplot(y=data_cont["PM-2.5_W1"], data=data)
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\boxplot_W1.png') # save the figure to file
plt.close(fig) # close the figure
fig = plt.figure()
sns.boxplot(y=data_cont["PM-2.5_W2A"], data=data)
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\boxplot_W2A.png') # save the figure to file
plt.close(fig) # close the figure
fig = plt.figure()
sns.boxplot(y=data_cont["PM-2.5_W2b"], data=data)
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\boxplot_W2b.png') # save the figure to file
plt.close(fig) # close the figure
fig = plt.figure()
sns.boxplot(y=data_cont["PM-2.5_W3"], data=data)
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\boxplot_W3.png') # save the figure to file
plt.close(fig) # close the figure
fig = plt.figure()
sns.boxplot(y=data_cont["PM-2.5_W4"], data=data)
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\boxplot_W4.png') # save the figure to file
plt.close(fig) # close the figure
fig = plt.figure()
sns.boxplot(y=data_cont["PM-2.5_W5"], data=data)
fig.savefig(r'C:\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\boxplot_W5.png') # save the figure to file
plt.close(fig) # close the figure
fig = plt.figure()
sns.boxplot(y=data_cont["PM-2.5_W6a"], data=data)
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\boxplot_W6a.png') # save the figure to file
plt.close(fig) # close the figure
fig = plt.figure()
sns.boxplot(y=data_cont["PM-2.5_W6b"], data=data)
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\boxplot_W6b.png') # save the figure to file
plt.close(fig) # close the figure
fig = plt.figure()
sns.boxplot(y=data_cont["PM-2.5_W7"], data=data)
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\boxplot_W7.png') # save the figure to file
plt.close(fig) # close the figure
fig = plt.figure()
sns.boxplot(y=data_cont["PM-2.5_W8"], data=data)
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\boxplot_W8.png') # save the figure to file
plt.close(fig) # close the figure
fig = plt.figure()
sns.boxplot(y=data_cont["PM-2.5_W9"], data=data)
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\boxplot_W9.png') # save the figure to file
plt.close(fig) # close the figure
fig = plt.figure()
sns.boxplot(y=data_cont["PM-2.5_W10a"], data=data)
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\boxplot_W10a.png') # save the figure to file
plt.close(fig) # close the figure
fig = plt.figure()
sns.boxplot(y=data_cont["PM-2.5_W10b"], data=data)
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\boxplot_W10b.png') # save the figure to file
plt.close(fig) # close the figure
fig = plt.figure()
sns.boxplot(y=data_cont["PM-2.5_W11a"], data=data)
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\boxplot_W11a.png') # save the figure to file
plt.close(fig) # close the figure
# Basic Plots boxplots all variables
fig = plt.figure()
data_cont.boxplot()
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\boxplots all variables.png') # save the figure to file
plt.close(fig) # close the figure
# Pair Plot
sns.pairplot(data_cont)
# Line Plot - continous variables
fig = plt.figure()
pm_conc = ['PM-2.5_W1','PM-2.5_W2A','PM-2.5_W2b','PM-2.5_W3','PM-2.5_W4','PM-2.5_W5','PM-2.5_W6a','PM-2.5_W6b','PM-2.5_W7','PM-2.5_W8','PM-2.5_W9','PM-2.5_W10a','PM-2.5_W10b','PM-2.5_W11a']
plt.plot(data_cont[pm_conc])
plt.xlabel('Timestamp')
plt.ylabel('PM Concentration')
plt.show()
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\lineplot.png') # save the figure to file
plt.close(fig) # close the figure
# Histogram
data_cont.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8)
# Distribution Plot - Kernel Density Estimate
plt.figure(figsize=(20,10))
sns.distplot(data_cont['PM-2.5_W1'], label='PM-2.5_W1')
sns.distplot(data_cont['PM-2.5_W2A'], label='PM-2.5_W2A')
sns.distplot(data_cont['PM-2.5_W2b'], label='PM-2.5_W2b')
sns.distplot(data_cont['PM-2.5_W3'], label='PM-2.5_W3')
sns.distplot(data_cont['PM-2.5_W4'], label='PM-2.5_W4')
sns.distplot(data_cont['PM-2.5_W5'], label='PM-2.5_W5')
sns.distplot(data_cont['PM-2.5_W6a'], label='PM-2.5_W6a')
sns.distplot(data_cont['PM-2.5_W6b'], label='PM-2.5_W6b')
sns.distplot(data_cont['PM-2.5_W7'], label='PM-2.5_W7')
sns.distplot(data_cont['PM-2.5_W8'], label='PM-2.5_W8')
sns.distplot(data_cont['PM-2.5_W9'], label='PM-2.5_W9')
sns.distplot(data_cont['PM-2.5_W10a'], label='PM-2.5_W10a')
sns.distplot(data_cont['PM-2.5_W10b'], label='PM-2.5_W10b')
sns.distplot(data_cont['PM-2.5_W11a'], label='PM-2.5_W11a')
plt.xlabel('Variable Estimate')
plt.title('Distribution Plot - Kernel Density Estimate')
plt.legend()
# Linear regression models
fig = plt.figure()
sns.regplot(x="PM-2.5_W2A", y="PM-2.5_W1", data=data);
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\linearreg_W1.png') # save the figure to file
plt.close(fig) # close the figure
fig = plt.figure()
sns.regplot(x="PM-2.5_W2A", y="PM-2.5_W2b", data=data);
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\linearreg_W2b.png') # save the figure to file
plt.close(fig)
fig = plt.figure()
sns.regplot(x="PM-2.5_W2A", y="PM-2.5_W3", data=data);
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\linearreg_W3.png') # save the figure to file
plt.close(fig)
fig = plt.figure()
sns.regplot(x="PM-2.5_W2A", y="PM-2.5_W4", data=data);
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\linearreg_W4.png') # save the figure to file
plt.close(fig)
fig = plt.figure()
sns.regplot(x="PM-2.5_W2A", y="PM-2.5_W5", data=data);
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\linearreg_W5.png') # save the figure to file
plt.close(fig)
fig = plt.figure()
sns.regplot(x="PM-2.5_W2A", y="PM-2.5_W6a", data=data);
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\linearreg_W6a.png') # save the figure to file
plt.close(fig)
fig = plt.figure()
sns.regplot(x="PM-2.5_W2A", y="PM-2.5_W6b", data=data);
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\linearreg_W6b.png') # save the figure to file
plt.close(fig)
fig = plt.figure()
sns.regplot(x="PM-2.5_W2A", y="PM-2.5_W7", data=data);
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\linearreg_W7.png') # save the figure to file
plt.close(fig)
fig = plt.figure()
sns.regplot(x="PM-2.5_W2A", y="PM-2.5_W8", data=data);
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\linearreg_W8.png') # save the figure to file
plt.close(fig)
fig = plt.figure()
sns.regplot(x="PM-2.5_W2A", y="PM-2.5_W9", data=data);
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\linearreg_W9.png') # save the figure to file
plt.close(fig)
fig = plt.figure()
sns.regplot(x="PM-2.5_W2A", y="PM-2.5_W10a", data=data);
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\linearreg_W10a.png') # save the figure to file
plt.close(fig)
fig = plt.figure()
sns.regplot(x="PM-2.5_W2A", y="PM-2.5_W10b", data=data);
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\linearreg_W10b.png') # save the figure to file
plt.close(fig)
fig = plt.figure()
sns.regplot(x="PM-2.5_W2A", y="PM-2.5_W11a", data=data);
fig.savefig(r'C:\Users\Data Analysis\Intercomparison\Fixed DEC site\March 18th 9am\linearreg_W11a.png') # save the figure to file
plt.close(fig)
# Conditioning on other variables
#sns.lmplot(x="PM-2.5_W2A", y="PM-2.5_W1", col="Timestamp_W1", data=data);
# Plotting a regression in other contexts
sns.jointplot(x="PM-2.5_W2A", y="PM-2.5_W1", data=data, kind="reg");
sns.jointplot(x="PM-2.5_W2A", y="PM-2.5_W2b", data=data, kind="reg");
sns.jointplot(x="PM-2.5_W2A", y="PM-2.5_W3", data=data, kind="reg");
sns.jointplot(x="PM-2.5_W2A", y="PM-2.5_W4", data=data, kind="reg");
sns.jointplot(x="PM-2.5_W2A", y="PM-2.5_W5", data=data, kind="reg");
sns.jointplot(x="PM-2.5_W2A", y="PM-2.5_W6a", data=data, kind="reg");
sns.jointplot(x="PM-2.5_W2A", y="PM-2.5_W6b", data=data, kind="reg");
sns.jointplot(x="PM-2.5_W2A", y="PM-2.5_W7", data=data, kind="reg");
sns.jointplot(x="PM-2.5_W2A", y="PM-2.5_W8", data=data, kind="reg");
sns.jointplot(x="PM-2.5_W2A", y="PM-2.5_W9", data=data, kind="reg");
sns.jointplot(x="PM-2.5_W2A", y="PM-2.5_W10a", data=data, kind="reg");
sns.jointplot(x="PM-2.5_W2A", y="PM-2.5_W10b", data=data, kind="reg");
sns.jointplot(x="PM-2.5_W2A", y="PM-2.5_W11a", data=data, kind="reg");
# other pairplot
sns.pairplot(data, x_vars=['PM-2.5_W1','PM-2.5_W2b','PM-2.5_W3','PM-2.5_W4','PM-2.5_W5','PM-2.5_W6a','PM-2.5_W6b','PM-2.5_W7','PM-2.5_W8','PM-2.5_W9','PM-2.5_W10a','PM-2.5_W10b','PM-2.5_W11a'], y_vars=["PM-2.5_W2A"],
height=5, aspect=.8, kind="reg");
# CREATE LINEAR REGRESSION
regr = linear_model.LinearRegression()
## X usually means our input variables (or independent variables)
## Y usually means our output/dependent variable
X = data["PM-2.5_W1"]
y = data["PM-2.5_W2A"]
## let's add an intercept (beta_0) to our model
X = sm.add_constant(X)
# Note the difference in argument order
model = sm.OLS(y, X).fit() ## sm.OLS(output, input)
predictions = model.predict(X)
# Print out the statistics
model.summary()
# to_csv(path + "\linear regression.csv")
# Time Series Analysis
data2 = data.set_index('Timestamp_W3')
data2.head(3)
data2['Month'] = data2.index.month
data2['Year'] = data2.index.year
data2['Weekday Name'] = data2.index.weekday_name
data2.sample(5, random_state = 0)
# time slicing
data_March = data2.loc['2019-03-12':'2019-03-17'] # Data
data_March2 = data2.loc['2019-03'] # Data
data_March['PM-2.5_W1'].plot(linewidth = 0.5)
cols_plot = ['PM-2.5_W1']
axes = data_March[cols_plot].plot(marker='.', alpha=0.5, linestyle='None', figsize=(11, 9), subplots=True)
for ax in axes:
ax.set_ylabel('PM Concentration')
# slicing and plotting at the same time
data2.loc['2019-03-7 15:00:00':'2019-03-17 20:30:00', 'PM-2.5_W1'].plot()