# 01-eda-plots.R
#Load packages
library(tidyverse)
library(vroom)
library(here)
library(janitor)
# Load the 4 data files ----
# `pattern` is a regular expression, not a glob: escape the dot and anchor the
# end so only names ending in ".txt" match. A bare ".txt" means "any character
# followed by txt" and would also pick up a name such as "notes_txt_old.csv".
files <- list.files(here("data_raw"), pattern = "\\.txt$")

# Read all files into one tibble. `id = "file"` adds a column holding the
# source file of each row; the first 6 lines of every file are skipped.
ds <- vroom(here("data_raw", files), id = "file", delim = " ", skip = 6)

ds <- ds %>%
  # Give the two awkwardly named time columns usable names
  rename(
    scene_time = `sceneQTtime(d:h:m:s.tv/ts)`,
    por_time = `porQTtime(d:h:m:s.tv/ts)`
  ) %>%
  # Standardize the remaining column names
  rename_with(make_clean_names) %>%
  # Keep only the "<3 digits>_<search|walk>" part of the file path ...
  mutate(file = str_extract(file, "\\d\\d\\d_(search|walk)")) %>%
  # ... and split it into participant id and condition
  separate(file, into = c("id", "cond")) %>%
  mutate(cond = as.factor(cond))
# Start with the DataExplorer package for a snapshot of the entire dataset
library(DataExplorer)
plot_bar(data = ds) # frequencies of all categorical variables
plot_histogram(data = ds) # histograms of all continuous variables

# vis_dat() and vis_miss() from visdat also give useful snapshots
library(visdat)
ds %>% vis_dat() # snapshot of each column
ds %>% vis_miss() # visualize missing values

# vis_expect() is a little more customizable:
# it visualizes both expected values and missing values at the same time.
# The `~ .x` part is like what we used in map(): it specifies a formula where
# `.x` stands in for the column we are visualizing.
ds %>%
  select(por_x) %>%
  vis_expect(~ .x > 0 & .x < 640)
# Let's plot our own histogram with ggplot ----
# Simplest histogram: hand the data straight to ggplot() ...
ggplot(ds, aes(x = por_x)) +
  geom_histogram()

# ... or pipe the data in
ds %>%
  ggplot(aes(x = por_x)) +
  geom_histogram()

# Piping the data in lets us filter first
ds %>%
  filter(por_x > 0, por_x < 640) %>%
  ggplot(aes(x = por_x)) +
  geom_histogram()

# Or we can adjust the graph axes to ignore the out-of-range data:
# xlim(lower_bound, upper_bound)
ds %>%
  ggplot(aes(x = por_x)) +
  geom_histogram() +
  xlim(0, 640)

# A boxplot might be a better first glance
ds %>%
  filter(por_x > 0, por_x < 640) %>%
  ggplot(aes(y = por_x)) +
  geom_boxplot()
# What if we wanted to change the orientation of the boxplot?
# At this point, I'm convinced -- let's set the impossible por_x and por_y
# values to NA so that we can stop filtering.
# Anything below 0 or at/above the upper bound (640 for x, 480 for y) becomes
# NA; values that are already NA stay NA.
# NOTE(review): these bounds keep exact zeros (>= 0), whereas the exploratory
# filters on por_x earlier in the script use a strict > 0 -- confirm which one
# is intended.
ds_cleaned <- ds %>%
  mutate(
    por_x = ifelse(por_x < 0 | por_x >= 640, NA, por_x),
    por_y = ifelse(por_y < 0 | por_y >= 480, NA, por_y)
  )
# It might be better to check by participant and condition.
# Adding aes elements for id (x) and cond (fill) changes how the data are grouped.
ds_cleaned %>%
  ggplot(aes(x = cond, y = por_x)) +
  geom_boxplot()
# Now we can better see the outliers and distributions in the remaining data

# Reference lines can also help.
# When your plot commands start to get long, make sure the + ENDS each line.
# Just like with the pipe, starting a line with + breaks the chain: R treats
# the previous line as a complete statement, and the dangling line errors.
ds_cleaned %>%
  ggplot(aes(x = id, y = por_x, fill = cond)) +
  geom_hline(yintercept = 320) +
  geom_boxplot() +
  ylim(0, 640)
# For categorical data, geom_bar() automatically plots counts by category
ds_cleaned %>%
  mutate(avg_fps = factor(avg_fps, levels = c(29.96, 29.97))) %>%
  ggplot(aes(x = avg_fps)) +
  geom_bar()

# Scatter plot: use geom_point()
ds_cleaned %>%
  ggplot(aes(x = por_x, y = por_y)) +
  geom_point()

# For overlapping or dense data, try geom_bin2d()
ds_cleaned %>%
  ggplot(aes(x = por_x, y = por_y)) +
  geom_bin2d()
# Facets are incredibly useful for EDA: one panel per id/cond combination
xy_by_ppt <- ds_cleaned %>%
  ggplot(aes(x = por_x, y = por_y)) +
  geom_point() +
  facet_wrap(c("id", "cond")) +
  xlim(0, 640) +
  ylim(0, 480)
xy_by_ppt # display the plot

# Saving a ggplot to disk is easy.
# The plot is passed explicitly instead of relying on ggsave()'s default of
# last_plot(), so inserting another plot above this line later cannot silently
# change what gets written.
# The output folder is created first (a no-op if it already exists) because
# ggsave() stops or prompts, depending on the ggplot2 version, when the target
# directory is missing.
dir.create(here("eda"), recursive = TRUE, showWarnings = FALSE)
ggsave(here("eda", "all_ppts_xy_plots.pdf"), plot = xy_by_ppt)
# But if you have more than 5-6 individual graphs,
# it might be better to save each one to its own file to inspect one at a time
ids <- unique(ds_cleaned$id) # What does this do?

# Create the output folder once, up front (a no-op if it already exists), so
# ggsave() does not fail on a fresh checkout.
xy_dir <- here("eda", "individual_xy_plots")
dir.create(xy_dir, recursive = TRUE, showWarnings = FALSE)

for (i in ids) {
  # Plots are not auto-printed inside a for loop, so keep the plot in an object
  # and hand it to ggsave() explicitly rather than depending on last_plot().
  p <- ds_cleaned %>%
    filter(id == i) %>%
    ggplot(aes(x = por_x, y = por_y)) +
    geom_bin2d() +
    facet_wrap("cond") +
    xlim(0, 640) +
    ylim(0, 480)

  # One PNG per participant, named after the participant id
  ggsave(file.path(xy_dir, paste0(i, ".png")), plot = p)
}
### TOO LONG OF AN EXAMPLE FOR CLASS, BUT LEAVING IT FOR REFERENCE
# Faceting is a nice tool if you want to panel the same figure together.
# If you want to panel different types of figures together, I suggest patchwork.
# Writing a lot of individual files can get to be annoying,
# but it's easy to put together different graphs with the patchwork package.
library(patchwork)
ids <- unique(ds_cleaned$id)

# Create the output folder once, up front (a no-op if it already exists), so
# ggsave() does not fail on a fresh checkout.
composite_dir <- here("eda", "individual_composite")
dir.create(composite_dir, recursive = TRUE, showWarnings = FALSE)

for (i in ids) {
  # Subset this participant's rows once and reuse them for all three plots
  ds_i <- ds_cleaned %>% filter(id == i)

  # Plots can be stored to objects
  # Plot 1: 2D-binned gaze positions, one panel per condition
  p1 <- ds_i %>%
    ggplot(aes(x = por_x, y = por_y)) +
    geom_bin2d() +
    facet_wrap(~ cond) +
    xlim(0, 640) +
    ylim(0, 480)

  # Plot 2: histogram of por_x by condition
  p2 <- ds_i %>%
    ggplot(aes(x = por_x)) +
    geom_histogram() +
    facet_wrap(~ cond) +
    xlim(0, 640)

  # Plot 3: histogram of por_y by condition
  p3 <- ds_i %>%
    ggplot(aes(x = por_y)) +
    geom_histogram() +
    facet_wrap(~ cond) +
    xlim(0, 480)

  # Divide (/) means put plots one on top of another
  # Add (+) means put them side by side
  # Any guesses what this does?
  # p1 + p2 + p3 means panel horizontally
  # p1 / p2 / p3 means panel vertically
  composite <- p1 / (p2 + p3)

  # Save the stored composite explicitly instead of relying on last_plot()
  ggsave(file.path(composite_dir, paste0(i, ".png")), plot = composite)
}