-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathgroup_exercise.R
118 lines (86 loc) · 5.25 KB
/
group_exercise.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
library(tidyverse)
library(here)
#This project has two raw data files at different scales from a study of
#infants, children, and adults watching a series of 7 video clips
#I wrote Steps 1 and 2 to import and merge the data, and kept them here for your reference
#Skip down to Step 3 to work on EDA
#SOURCE DESCRIPTION
#FILE 1: auc.csv
#Columns = stim (stimulus video, levels/labels provided below)
# id (unique participant identifier)
# age (in days)
# AUC_sal (area-under-the-curve for a saliency model)
# AUC_dist (area-under-the-curve for a distance model)
#AUC values indicate how well each model predicted where participants looked
#when watching a video
#AUC values can range from 0-1 where .5 is chance and 1 is perfect prediction
#FILE 2: participants_info_full_headers.csv
#Columns = id (unique participant identifier, matches auc.csv)
# age_group (a categorical age variable with levels:
# ".5-1 y" "1-1.5 y" "1.5-2 y" "2-4 y" "4-6 y" "8-10 y" "adult"
# precision (a quality measure of the eye data, smaller is better)
# 7 columns of "Seen X" the stimulus video before the study
# coded as SEEN (1), NOT SEEN (2), NOT SURE (3)
#STEP 1: READ IN THE AUC DATA AND CODE STIM AS A FACTOR
auc <- read_csv(here("data_raw", "auc_bystim.csv"))
stim_levels <- 1:7
stim_labels <- c("Fallon","Feist","Pentatonix","Science","Rube","Plane","Dogs")
auc <- auc %>% mutate(stim = factor(stim, levels = stim_levels, labels = stim_labels))
#STEP 2: READ IN THE PPT INFO DATA
#Wrangle the ppt info data so that you can merge it into the auc data
#Drop any data where the AUC values are missing
#In the final, merged data, make the "watched" variable is coded as a factor with
# levels "seen" (1), "not seen" (2), "not sure" (3)
#Write the cleaned file to data_cleaned/
#Read in the ppt data and rename columns to be easier to work with
ppt <- read_csv(here("data_raw","participants_info_full_headers.csv")) %>%
rename(id = `participant ID`,
age_group = `Age group`,
precision = "Precision")
#Each question about watching each video is a column, so pivot_longer
#Use separate to get just the video name into it's own column
ppt_long <- ppt %>% pivot_longer(cols = starts_with("Seen"), names_to = "stim", values_to = "watched")
ppt_long <- ppt_long %>% separate(stim, into = c(NA, "stim"))
#Code stim and watched as factors
ppt_long <- ppt_long %>% mutate(
stim = factor(stim, levels = stim_labels, labels = stim_labels),
watched = factor(watched, levels = 1:3, labels = c("Yes","No","Not Sure")))
#Join the ppt data to the AUC data (by id and by stim since each participant has observations for each stim)
ds <- left_join(auc, ppt_long, by = c("id", "stim"))
ds <- ds %>% drop_na(AUC_sal:AUC_dist) #Drop participants for whom we don't have data for the DV
#Write the data to file
ds %>% write_csv(here("data_cleaned","cleaned.csv"))
#STEP 3: EXPLORATORY DATA ANALYSIS
#3A PRECISION: Is the precision acceptable (< 2.5) for each participant?
#Visualize the distribution of precision to see if there are values above 2.5
#Create a summary to figure out which participants would we need to exclude if > 2.5 meant the data are unuseable?
#Use a summary table and plots to investigate whether data equally precise for participants of different ages
#Visualize the distribution of precision to see if there are values above 2.5
ds %>% ggplot(aes(x = precision)) + geom_histogram() + geom_vline(xintercept = 2.5)
#Create a summary to figure out which participants would we need to exclude if > 2.5 meant the data are unuseable?
ds %>% group_by(id, age_group) %>%
summarize(precision = mean(precision, na.RM = T)) %>%
filter(precision > 2.5)
ds %>% filter(precision > 2.5)
#Use a summary table and plots to investigate whether data equally precise for participants of different ages
ds %>% group_by(age_group) %>% summarize(across(precision, list(M = mean, MIN = min, MAX = max)))
ds %>% ggplot(aes(x = age_group, y = precision)) + geom_boxplot() + geom_hline(yintercept = 2.5)
ds %>% ggplot(aes(x = age, y = precision)) + geom_point() + geom_hline(yintercept = 2.5)
#3B AGE: Are there any errors in age?
#Convert age to years so that it can be more easily compared to age_group
#Visualize age in years by age_group to see whether participants are the correct age for their group
#Make a summary table of age in years by age group to check whether all participants' ages are correct
#Convert age to years so that it can be more easily compared to age_group
ds <- ds %>% mutate(age_years = age/365.25)
#Visualize age in years by age_group to see whether participants are the correct age for their group
ds %>% group_by(id, age_group) %>%
summarize(age_years = mean(age_years)) %>%
ggplot(aes(y = age_group, x = age_years)) + geom_boxplot()
#Another option would be to facet by age group and to let the scales be "free" to get a better look
ds %>% group_by(id, age_group) %>%
summarize(age_years = mean(age_years)) %>%
ggplot(aes(y = age_years)) +
geom_boxplot() +
facet_wrap("age_group", scales = "free")
#Make a summary table of age in years by age group to check whether all participants' ages are correct
ds %>% group_by(age_group) %>% summarize(min_age = min(age_years), max_age = max(age_years))