-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathOmnibus.R
497 lines (377 loc) · 18.8 KB
/
Omnibus.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
# Omnibus tests on mgx and mtx data from the HMP2. Baseline only data.
dir.create("R_Metagenomic_Statistics") # Create a new directory
dir.create("R_Metagenomic_Statistics/Data") # Create a new directory
setwd("R_Metagenomic_Statistics") # Change the current working directory
# Load the packages needed
library(vegan)
library(plyr)
#####
# MGX species
#####
# You can either download the file from bitbucket page or do it with the below code:
download.file("https://raw.githubusercontent.com/biobakery/omnibus-and-maaslin2-rscripts-and-hmp2-data/master/taxonomic_profiles_pcl_week0.csv", "./Data/taxonomic_profiles_pcl_week0.csv") # Download the mgx species data and put it into the data directory
# Read the taxonomic data into R environment
tax = read.csv(file = "./Data/taxonomic_profiles_pcl_week0.csv", header = T, row.names = 1, check.names = FALSE)
head(tax) # Inspect the tax
dim(tax) # dimensions of the tax
str(tax) # structure of the tax
names(tax) # column names of tax
row.names(tax) # row.names of tax
# Prepare the data
# Extract the metadata
metadata = data.frame(tax[1:5])
metadata[1:5,] # check the output
str(metadata) #structure of metadata
is.na(metadata) # Check for NAs that will mess with the PERMANOVAS: Age has some
count(is.na(metadata$consent_age)) # Check for how many there are: 6/96
# If this was a discrete variable we could just classify the NAs as Unknown and keep them in the model,
# but since Age is a continuous variable typically we would either remove those from the data or impute the median.
# In this case let's impute the median in order to keep samples.
unique(metadata$consent_age)
metadata$consent_age[is.na(metadata$consent_age)] = median(metadata$consent_age, na.rm = T)
unique(metadata$consent_age) # Check the output: good to go
# Extract species data and transpose the df
species = data.frame(t(tax[6:ncol(tax)]))
str(species) # everything is numeric, good to go
row.names(species)
species[1:8,1:4] # check the output
# subset to species only
# which don't have "t__"
tmp.ind = grep("\\|t__", rownames(species), invert = T) # grep the rows that do not include strain stratifications
tmp.ind # check the output
tmp = species[tmp.ind,] # Create a new dataframe with only those row numbers
tmp.ind = grep("\\|s__", rownames(tmp)) # grep the rows that only include down to species stratifications
tmp.ind # check the output
species = tmp[tmp.ind,] # Create a new dataframe with only those row numbers
rm(tmp,tmp.ind) # remove temp files to clear up space
row.names(species) # Check the output to make sure that we only have species level stratifications
# trim species names
rownames(species) = gsub(".*\\|", "", rownames(species))
row.names(species) # Check the output, looks great
colSums(species) # Check the sample sums to make sure they are in proportion format (0-1) and are all ~1
# filter for beta div (we will keep species as is for alpha diversity)
dim(species)
dim(species[apply(species, 1, function(x) sum(x > 0.0001) > 0.1 * ncol(species)), ])
species_filt = species[apply(species, 1, function(x) sum(x > 0.0001) > 0.1 * ncol(species)), ]
#Let's transpose it for easier use downstream
species_filt = data.frame(t(species_filt), check.names = F)
species = data.frame(t(species), check.names = F)
bray_species = vegdist(species_filt, method = "bray") # Calculate Bray-Curtis dissimilarity
#####
# Alpha diversity
# use unfiltered data for alpha div
shannon = data.frame(diversity(species, index = "shannon"))
simpson = data.frame(diversity(species, index = "simpson"))
invsimpson = data.frame(diversity(species, index = "invsimpson"))
# merge all into one dataframe
alphadiv = cbind(shannon, simpson, invsimpson)
head(alphadiv) # check the output
# rename the columns to shorter names
names(alphadiv) = c("Shannon", "Simpson", "InvSimpson")
head(alphadiv) # check the output
# append metadata
alpha_met_df <- merge(alphadiv, metadata, by = "row.names")
row.names(alpha_met_df) = alpha_met_df$Row.names # Make the samples the rownames again
alpha_met_df$Row.names = NULL # Get rid of the Row.names column
head(alpha_met_df)
### Shannon alpha diversity
#### Univariate
# Can do this with a for loop
for (col in names(alpha_met_df)[-c(2:3)]){
alpha_shannon_univ = anova(lm(as.formula(paste("Shannon ~ ", col)), data = alpha_met_df[-c(2:3)]))
print(alpha_shannon_univ)
}
# Alternatively, can do it one-by-one
shannon_site = anova(lm(Shannon ~ site_name, data = alpha_met_df))
shannon_site
shannon_sex = anova(lm(Shannon ~ sex, data = alpha_met_df))
shannon_sex
shannon_race = anova(lm(Shannon ~ race, data = alpha_met_df))
shannon_race
shannon_age = anova(lm(Shannon ~ consent_age, data = alpha_met_df))
shannon_age
shannon_diagnosis = anova(lm(Shannon ~ diagnosis, data = alpha_met_df))
shannon_diagnosis
#### Multivariate
# Can do it without being verbose
shannon_multi = anova(lm(Shannon ~ ., data = alpha_met_df[-c(2:3)]))
shannon_multi
# Alternatively, can write out each variable in the model
shannon_multi = anova(lm(Shannon ~ site_name + sex + race + consent_age + diagnosis, data = alpha_met_df))
shannon_multi
### Simpson alpha diversity
#### Univariate
# Can do this with a for loop
for (col in names(alpha_met_df)[-c(1,3)]){
alpha_simpson_univ = anova(lm(as.formula(paste("Simpson ~ ", col)), data = alpha_met_df[-c(1,3)]))
print(alpha_simpson_univ)
}
# Alternatively, can do it one-by-one
simpson_site = anova(lm(Simpson ~ site_name, data = alpha_met_df))
simpson_site
simpson_sex = anova(lm(Simpson ~ sex, data = alpha_met_df))
simpson_sex
simpson_race = anova(lm(Simpson ~ race, data = alpha_met_df))
simpson_race
simpson_age = anova(lm(Simpson ~ consent_age, data = alpha_met_df))
simpson_age
simpson_diagnosis = anova(lm(Simpson ~ diagnosis, data = alpha_met_df))
simpson_diagnosis
#### Multivariate
# Can do it without being verbose
simpson_multi = anova(lm(Simpson ~ ., data = alpha_met_df[-c(1,3)]))
simpson_multi
# Alternatively, can write out each variable in the model
simpson_multi = anova(lm(Simpson ~ site_name + sex + race + consent_age + diagnosis, data = alpha_met_df))
simpson_multi
### InvSimpson alpha diversity
#### Univariate
# Can do this with a for loop
for (col in names(alpha_met_df)[-c(1:2)]){
alpha_invsimpson_univ = anova(lm(as.formula(paste("InvSimpson ~ ", col)), data = alpha_met_df[-c(1:2)]))
print(alpha_invsimpson_univ)
}
# Alternatively, can do it one-by-one
invsimpson_site = anova(lm(InvSimpson ~ site_name, data = alpha_met_df))
invsimpson_site
invsimpson_sex = anova(lm(InvSimpson ~ sex, data = alpha_met_df))
invsimpson_sex
invsimpson_race = anova(lm(InvSimpson ~ race, data = alpha_met_df))
invsimpson_race
invsimpson_age = anova(lm(InvSimpson ~ consent_age, data = alpha_met_df))
invsimpson_age
invsimpson_diagnosis = anova(lm(InvSimpson ~ diagnosis, data = alpha_met_df))
invsimpson_diagnosis
#### Multivariate
# Can do it without being verbose
invsimpson_multi = anova(lm(InvSimpson ~ ., data = alpha_met_df[-c(1:2)]))
invsimpson_multi
# Alternatively, can write out each variable in the model
invsimpson_multi = anova(lm(InvSimpson ~ site_name + sex + race + consent_age + diagnosis, data = alpha_met_df))
invsimpson_multi
# Now to look at all 3 measures together for diagnosis (univariate)
rbind(shannon_diagnosis, simpson_diagnosis, invsimpson_diagnosis)
#####
#Beta Diversity: PERMANOVA Tests using Bray-Curtis Dissimilarity
### Univariate
# Can do this with a for loop
for (col in names(metadata)){
adonis_univ <- adonis(as.formula(paste("bray_species ~ ", col)), data = metadata)
print(adonis_univ)
}
# Alternatively, can do it one-by-one
adonis_site_tax = adonis(bray_species ~ site_name, data = metadata)
adonis_site_tax
adonis_sex_tax = adonis(bray_species ~ sex, data = metadata)
adonis_sex_tax
adonis_age_tax = adonis(bray_species ~ consent_age, data = metadata)
adonis_age_tax
adonis_race_tax = adonis(bray_species ~ race, data = metadata)
adonis_race_tax
adonis_diagnosis_tax = adonis(bray_species ~ diagnosis, data = metadata)
adonis_diagnosis_tax
#### Multivariate
# Can do it without being verbose
adonis_multi_tax = adonis(bray_species ~ ., data = metadata)
adonis_multi_tax
# Alternatively, can write out each variable in the model
adonis_multi_tax = adonis(bray_species ~ site_name + sex + race + consent_age + diagnosis, data = metadata)
adonis_multi_tax
#####
# MGX pathways
#####
# You can either download the file from bitbucket page or do it with the below code:
download.file("https://raw.githubusercontent.com/biobakery/omnibus-and-maaslin2-rscripts-and-hmp2-data/master/dna_pathabundance_relab_pcl_week0.csv", "./Data/dna_pathabundance_relab_pcl_week0.csv")
# Read the dna pathway data into R environment
dna_path = read.csv(file = "./Data/dna_pathabundance_relab_pcl_week0.csv", header = T, row.names = 1, check.names = FALSE)
head(dna_path) # Inspect the data
dim(dna_path) # dimensions of the data
str(dna_path) # structure of the data
names(dna_path) # column names of data
row.names(dna_path) # row.names of data
# Remove metadata and keep only pathways and transpose the data
dna_path = data.frame(t(dna_path[6:ncol(dna_path)]))
str(dna_path) # everything is numeric, good to go
row.names(dna_path)
# Remove species stratifications
tmp.ind = grep("\\|.*", rownames(dna_path), invert = T) # grep the rows that do not include species stratifications
tmp.ind # check the output
dna_path_unstratified = dna_path[tmp.ind,] # Create a new dataframe with only those unstratified rows
rm(tmp.ind) # Remove tmp.ind to clear space
row.names(dna_path_unstratified) # check the output: looks great
colSums(dna_path_unstratified) # Check the sample sums to make sure they are in proportion format (0-1) and are all ~1
# filter for beta div
dim(dna_path_unstratified)
dim(dna_path_unstratified[apply(dna_path_unstratified, 1, function(x) sum(x > 0.0001) > 0.1 * ncol(dna_path_unstratified)), ])
dna_path_unstratified_filt = dna_path_unstratified[apply(dna_path_unstratified, 1, function(x) sum(x > 0.0001) > 0.1 * ncol(dna_path_unstratified)), ]
#Let's transpose it for easier use downstream
dna_path_unstratified_filt = data.frame(t(dna_path_unstratified_filt), check.names = F)
dna_path_unstratified = data.frame(t(dna_path_unstratified), check.names = F)
bray_dna_path_unstratified = vegdist(dna_path_unstratified_filt, method = "bray") # Calculate Bray-Curtis dissimilarity
#####
#Beta Diversity: PERMANOVA Tests using Bray-Curtis Dissimilarity
### Univariate
# Can do this with a for loop
for (col in names(metadata)){
adonis_univ <- adonis(as.formula(paste("bray_dna_path_unstratified ~ ", col)), data = metadata)
print(adonis_univ)
}
# Alternatively, can do it one-by-one
adonis_site_dna_path = adonis(bray_dna_path_unstratified ~ site_name, data = metadata)
adonis_site_dna_path
adonis_sex_dna_path = adonis(bray_dna_path_unstratified ~ sex, data = metadata)
adonis_sex_dna_path
adonis_age_dna_path = adonis(bray_dna_path_unstratified ~ consent_age, data = metadata)
adonis_age_dna_path
adonis_race_dna_path = adonis(bray_dna_path_unstratified ~ race, data = metadata)
adonis_race_dna_path
adonis_diagnosis_dna_path = adonis(bray_dna_path_unstratified ~ diagnosis, data = metadata)
adonis_diagnosis_dna_path
#### Multivariate
# Can do it without being verbose
adonis_multi_dna_path = adonis(bray_dna_path_unstratified ~ ., data = metadata)
adonis_multi_dna_path
# Alternatively, can write out each variable in the model
adonis_multi_dna_path = adonis(bray_dna_path_unstratified ~ site_name + sex + race + consent_age + diagnosis, data = metadata)
adonis_multi_dna_path
#####
# MTX pathways
#####
# You can either download the file from bitbucket page or do it with the below code:
download.file("https://raw.githubusercontent.com/biobakery/omnibus-and-maaslin2-rscripts-and-hmp2-data/master/rna_pathabundance_relab_pcl_week0.csv", "./Data/rna_pathabundance_relab_pcl_week0.csv")
# Read the rna pathway data into R environment
rna_path = read.csv(file = "./Data/rna_pathabundance_relab_pcl_week0.csv", header = T, row.names = 1, check.names = FALSE)
head(rna_path) # Inspect the data
dim(rna_path) # dimensions of the data
str(rna_path) # structure of the data
names(rna_path) # column names of data
row.names(rna_path) # row.names of data
# Remove metadata and keep only pathways and transpose the data
rna_path = data.frame(t(rna_path[6:ncol(rna_path)]))
str(rna_path) # everything is numeric, good to go
row.names(rna_path)
# minimize the metadata to just the samples available in these data
dim(metadata)
list = names(rna_path) # make a list of sample ids to subset on
list # check the output
metadata_rna = subset(metadata, row.names(metadata) %in% list)
dim(metadata_rna)
metadata_rna # check the output
# Remove species stratifications
tmp.ind = grep("\\|.*", rownames(rna_path), invert = T) # grep the rows that do not include species stratifications
tmp.ind # check the output
rna_path_unstratified = rna_path[tmp.ind,] # Create a new dataframe with only those unstratified rows
rm(tmp.ind) # Remove tmp.ind to clear space
row.names(rna_path_unstratified) # check the output: looks great
colSums(rna_path_unstratified) # Check the sample sums to make sure they are in proportion format (0-1) and are all ~1
# filter for beta div
dim(rna_path_unstratified)
dim(rna_path_unstratified[apply(rna_path_unstratified, 1, function(x) sum(x > 0.0001) > 0.1 * ncol(rna_path_unstratified)), ])
rna_path_unstratified_filt = rna_path_unstratified[apply(rna_path_unstratified, 1, function(x) sum(x > 0.0001) > 0.1 * ncol(rna_path_unstratified)), ]
#Let's transpose it for easier use downstream
rna_path_unstratified_filt = data.frame(t(rna_path_unstratified_filt), check.names = F)
rna_path_unstratified = data.frame(t(rna_path_unstratified), check.names = F)
bray_rna_path_unstratified = vegdist(rna_path_unstratified_filt, method = "bray") # Calculate Bray-Curtis dissimilarity
#####
#Beta Diversity: PERMANOVA Tests using Bray-Curtis Dissimilarity
### Univariate
# Can do this with a for loop
for (col in names(metadata_rna)){
adonis_univ <- adonis(as.formula(paste("bray_rna_path_unstratified ~ ", col)), data = metadata_rna)
print(adonis_univ)
}
# Alternatively, can do it one-by-one
adonis_site_rna_path = adonis(bray_rna_path_unstratified ~ site_name, data = metadata_rna)
adonis_site_rna_path
adonis_sex_rna_path = adonis(bray_rna_path_unstratified ~ sex, data = metadata_rna)
adonis_sex_rna_path
adonis_age_rna_path = adonis(bray_rna_path_unstratified ~ consent_age, data = metadata_rna)
adonis_age_rna_path
adonis_race_rna_path = adonis(bray_rna_path_unstratified ~ race, data = metadata_rna)
adonis_race_rna_path
adonis_diagnosis_rna_path = adonis(bray_rna_path_unstratified ~ diagnosis, data = metadata_rna)
adonis_diagnosis_rna_path
#### Multivariate
# Can do it without being verbose
adonis_multi_rna_path = adonis(bray_rna_path_unstratified ~ ., data = metadata_rna)
adonis_multi_rna_path
# Alternatively, can write out each variable in the model
adonis_multi_rna_path = adonis(bray_rna_path_unstratified ~ site_name + sex + race + consent_age + diagnosis, data = metadata_rna)
adonis_multi_rna_path
#####
# RNA/DNA pathway ratios
#####
# You can either download the file from bitbucket page or do it with the below code:
download.file("https://raw.githubusercontent.com/biobakery/omnibus-and-maaslin2-rscripts-and-hmp2-data/master/rna_dna_path_relative_expression_week0.csv", "./Data/rna_dna_path_relative_expression_week0.csv")
# Read the rna_dna pathway data into R environment
rna_dna_path = read.csv(file = "./Data/rna_dna_path_relative_expression_week0.csv", header = T, row.names = 1, check.names = FALSE)
head(rna_dna_path) # Inspect the data
dim(rna_dna_path) # dimensions of the data
str(rna_dna_path) # structure of the data
names(rna_dna_path) # column names of data
row.names(rna_dna_path) # row.names of data
# Transpose the data
rna_dna_path = data.frame(t(rna_dna_path))
str(rna_dna_path) # everything is numeric, good to go
row.names(rna_dna_path)
# minimize the metadata to just the samples available in these data
dim(metadata)
list = names(rna_dna_path) # make a list of sample ids to subset on
list # check the output
metadata_rna_dna = subset(metadata, row.names(metadata) %in% list)
dim(metadata_rna_dna)
metadata_rna_dna # check the output: we only have one race now, so let's get rid of that column
metadata_rna_dna$race = NULL
# Remove species stratifications
tmp.ind = grep("\\|.*", rownames(rna_dna_path), invert = T) # grep the rows that do not include species stratifications
tmp.ind # check the output
rna_dna_path_unstratified = rna_dna_path[tmp.ind,] # Create a new dataframe with only those unstratified rows
rm(tmp.ind) # Remove tmp.ind to clear space
row.names(rna_dna_path_unstratified) # check the output: looks great
# filter for beta div
#Only keep RNA/DNA pathways that passed filtering for DNA pathways
#Create a list of the pathway names (col names) for subsetting the data
list = names(dna_path_unstratified_filt)
list
#Check the dimensions to make sure it matches with the DNA numbers before subsetting
dim(dna_path_unstratified_filt)
dim(subset(rna_dna_path_unstratified, row.names(rna_dna_path_unstratified) %in% list))
#subset
rna_dna_path_unstratified_filt = rna_dna_path_unstratified[list,]
dim(rna_dna_path_unstratified_filt)
head(rna_dna_path_unstratified_filt)
#Let's transpose the dataframe for easier use downstream
rna_dna_path_unstratified_filt = data.frame(t(rna_dna_path_unstratified_filt), check.names = F)
# log transform the RNA/DNA ratio
rna_dna_path_unstratified_filt_log = log2(rna_dna_path_unstratified_filt + 1)
head(rna_dna_path_unstratified_filt_log)
euclidean_rna_dna_path_unstratified = vegdist(rna_dna_path_unstratified_filt_log, method = "euclidean") # Calculate Euclidean distance on the pathway ratios
#####
#Beta Diversity: PERMANOVA Tests using Euclidean distances
### Univariate
# Can do this with a for loop
for (col in names(metadata_rna_dna)){
adonis_univ <- adonis(as.formula(paste("euclidean_rna_dna_path_unstratified ~ ", col)), data = metadata_rna_dna)
print(adonis_univ)
}
# Alternatively, can do it one-by-one
adonis_site_rna_dna_path = adonis(euclidean_rna_dna_path_unstratified ~ site_name, data = metadata_rna_dna)
adonis_site_rna_dna_path
adonis_sex_rna_dna_path = adonis(euclidean_rna_dna_path_unstratified ~ sex, data = metadata_rna_dna)
adonis_sex_rna_dna_path
adonis_age_rna_dna_path = adonis(euclidean_rna_dna_path_unstratified ~ consent_age, data = metadata_rna_dna)
adonis_age_rna_dna_path
adonis_diagnosis_rna_dna_path = adonis(euclidean_rna_dna_path_unstratified ~ diagnosis, data = metadata_rna_dna)
adonis_diagnosis_rna_dna_path
#### Multivariate
# Can do it without being verbose
adonis_multi_rna_dna_path = adonis(euclidean_rna_dna_path_unstratified ~ ., data = metadata_rna_dna)
adonis_multi_rna_dna_path
# Alternatively, can write out each variable in the model
adonis_multi_rna_dna_path = adonis(euclidean_rna_dna_path_unstratified ~ site_name + sex + consent_age + diagnosis, data = metadata_rna_dna)
adonis_multi_rna_dna_path
### Now let's look at all of the PERMANOVA results together
adonis_multi_tax
adonis_multi_dna_path
adonis_multi_rna_path
adonis_multi_rna_dna_path