forked from microbiome/course_2022_oulu
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path08-5-ex-sol-ADHD.Rmd
174 lines (150 loc) · 7.23 KB
/
08-5-ex-sol-ADHD.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
---
output:
html_document: default
pdf_document: default
---
* **Importing and processing data**
```{r, message=FALSE, warning=FALSE}
# Defining file paths
biom_file_path <- "data/Aggregated_humanization2.biom"
sample_meta_file_path <- "data/Mapping_file_ADHD_aggregated.csv"
tree_file_path <- "data/Data_humanization_phylo_aggregation.tre"
library(mia)
# Imports the data
se <- loadFromBiom(biom_file_path)
names(rowData(se)) <- c("Kingdom", "Phylum", "Class", "Order",
"Family", "Genus")
# Goes through the whole DataFrame. Removes '.*[kpcofg]__' from strings, where [kpcofg]
# is any character from listed ones, and .* any character.
rowdata_modified <- BiocParallel::bplapply(rowData(se),
FUN = stringr::str_remove,
pattern = '.*[kpcofg]__')
# Genus level has additional '\"', so let's delete that also
rowdata_modified <- BiocParallel::bplapply(rowdata_modified,
FUN = stringr::str_remove,
pattern = '\"')
# rowdata_modified is a list, so it is converted back to DataFrame format.
rowdata_modified <- DataFrame(rowdata_modified)
# And then assigned back to the SE object
rowData(se) <- rowdata_modified
# We use this to check what type of data it is
# read.table(sample_meta_file_path)
# It seems like a comma separated file and it does not include headers
# Let us read it and then convert from data.frame to DataFrame
# (required for our purposes)
sample_meta <- DataFrame(read.table(sample_meta_file_path, sep = ",", header = FALSE))
# Add sample names to rownames
rownames(sample_meta) <- sample_meta[,1]
# Delete column that included sample names
sample_meta[,1] <- NULL
# We can add headers
colnames(sample_meta) <- c("patient_status", "cohort", "patient_status_vs_cohort", "sample_name")
# Then it can be added to colData
colData(se) <- sample_meta
# Convert to tse format
tse <- as(se, "TreeSummarizedExperiment")
# Reads the tree file
tree <- ape::read.tree(tree_file_path)
# Add tree to rowTree
rowTree(tse) <- tree
```
* Visualize community variation with different methods (PCA, MDS, t-SNE...) by using the options in the alternative method, plotReducedDim [OMA](https://microbiome.github.io/OMA/microbiome-diversity.html#estimating-beta-diversity). Compare results obtained with different dissimilarities (Euclidean, Bray-Curtis, Unifrac..) and transformations (CLR, compositional..) of your own choice."
``` {r, message=FALSE, warning=FALSE}
library(scater)
# Performing MDS ordination using Bray-Curtis dissimilarity
# Which is stored at the TSE object
tse <- runMDS(tse, FUN = vegan::vegdist, name = "MDS_BC", exprs_values = "counts")
tse <- transformCounts(tse, method = "log10", pseudocount = 1)
tse <- runTSNE(tse, name = "TSNE", exprs_values = "log10")
plotReducedDim(tse, "MDS_BC", colour_by = "patient_status")
plotReducedDim(tse, "TSNE", colour_by = "patient_status")
```
* Investigate the influence of the data transformations on
statistical analysis: Visualize community variation with PCoA with
the following options: 1) Bray-Curtis distances for compositional
data; 2) Euclidean distances for CLR-transformed data.
```{r}
# Computing relative abundance and its Bray-Curtis dissimilarity
tse <- transformCounts(tse, method = "relabundance")
relabundance_assay <- assays(tse)$relabundance
relabundance_assay <- t(relabundance_assay)
bray_dist_relA <- vegan::vegdist(relabundance_assay, method = "bray")
bray_dist_relA <- t(bray_dist_relA)
# Computing clr data and its euclidean distance
tse <- transformCounts(tse, method = "clr", pseudocount = 1)
clr_assay <- assays(tse)$clr
clr_assay <- t(clr_assay)
euclidean_dist_clr <- vegan::vegdist(clr_assay, method = "euclidean")
euclidean_dist_clr <- t(euclidean_dist_clr)
# Performing PCoA and making the dataframes
bray_dist_relA_pcoa <- ecodist::pco(bray_dist_relA)
euclidean_dist_clr_pcoa <- ecodist::pco(euclidean_dist_clr)
bray_dist_relA_pcoa_df <- data.frame(pcoa1 = bray_dist_relA_pcoa$vectors[,1],
pcoa2 = bray_dist_relA_pcoa$vectors[,2])
euclidean_dist_clr_pcoa_df <- data.frame(pcoa1 = euclidean_dist_clr_pcoa$vectors[,1],
pcoa2 = euclidean_dist_clr_pcoa$vectors[,2])
# Visualizing
ggplot(data = bray_dist_relA_pcoa_df, aes(x=pcoa1, y=pcoa2)) +
geom_point() +
labs(x = "PC 1",
y = "PC 2",
title = "Bray-Curtis PCoA with Relative Abundance transformation") +
theme(title = element_text(size = 12))
ggplot(data = euclidean_dist_clr_pcoa_df, aes(x=pcoa1, y=pcoa2)) +
geom_point() +
labs(x = "PC 1",
y = "PC 2",
title = "Euclidean PCoA with clr transformation") +
theme(title = element_text(size = 12))
```
* Community-level comparisons: Use PERMANOVA to investigate whether
the community composition differs between two groups of individuals
(e.g. males and females, or some other grouping of your
choice). You can also include covariates such as age and gender,
and see how this affects the results.
```{r}
permanova_cohort <- vegan::adonis(relabundance_assay ~ cohort,
data = colData(tse),
permutations = 99) # 999 or 9999 is recommended
permanova_patient_status <- vegan::adonis(relabundance_assay ~ patient_status,
data = colData(tse),
permutations = 99)
print(paste0("relabundance_assay ~ cohort => p-value: ",
as.data.frame(permanova_cohort$aov.tab)["cohort", "Pr(>F)"]))
print(paste0("relabundance_assay ~ patient_status => p-value: ",
as.data.frame(permanova_patient_status$aov.tab)["patient_status", "Pr(>F)"]))
```
* Perform community typing for the data using the DMM method [OMA](https://microbiome.github.io/OMA/microbiome-community.html#community-typing)
``` {r, message=FALSE, warning=FALSE}
library(miaViz)
# We run DMN for different values of k (number of possible communities)
tse_dmn <- runDMN(tse, name = "DMN", k = 1:7)
# Observing the fit
plotDMNFit(tse_dmn, type = "laplace")
```
```{r}
# Getting the best model
getBestDMNFit(tse_dmn, type = "laplace")
```
```{r}
# PCoA with Bray-Curtis; with DMM clusters shown with colors
# Calculating the DMN groups using "patient_status" for grouping
dmn_group <- calculateDMNgroup(tse_dmn, variable = "patient_status", exprs_values = "counts",
k = 3, seed=.Machine$integer.max)
# Getting the probabilities for which each sample was assigned
prob <- DirichletMultinomial::mixture(getBestDMNFit(tse_dmn))
colnames(prob) <- c("comp1", "comp2", "comp3")
vec <- colnames(prob)[max.col(prob,ties.method = "first")]
# Visualizing
euclidean_dmm_pcoa_df <- cbind(euclidean_dist_clr_pcoa_df,
dmm_component = vec)
euclidean_dmm_plot <- ggplot(data = euclidean_dmm_pcoa_df,
aes(x=pcoa1, y=pcoa2,
color = dmm_component)) +
geom_point() +
labs(x = "Coordinate 1",
y = "Coordinate 2",
title = "PCoA with Aitchison distances") +
theme(title = element_text(size = 12))
euclidean_dmm_plot
```