-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtaxa_summary.R
executable file
·147 lines (143 loc) · 5.41 KB
/
taxa_summary.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# Define some functions for quick taxa summary
fast_melt = function(physeq,
includeSampleVars = character(),
omitZero = FALSE){
require("phyloseq")
require("data.table")
# supports "naked" otu_table as `physeq` input.
otutab = as(otu_table(physeq), "matrix")
if(!taxa_are_rows(physeq)){otutab <- t(otutab)}
otudt = data.table(otutab, keep.rownames = TRUE)
setnames(otudt, "rn", "TaxaID")
# Enforce character TaxaID key
otudt[, TaxaIDchar := as.character(TaxaID)]
otudt[, TaxaID := NULL]
setnames(otudt, "TaxaIDchar", "TaxaID")
# Melt count table
mdt = melt.data.table(otudt,
id.vars = "TaxaID",
variable.name = "SampleID",
value.name = "count")
if(omitZero){
# Omit zeroes and negative numbers
mdt <- mdt[count > 0]
}
# Omit NAs
mdt <- mdt[!is.na(count)]
# Calculate relative abundance
mdt[, RelativeAbundance := count / sum(count), by = SampleID]
if(!is.null(tax_table(physeq, errorIfNULL = FALSE))){
# If there is a tax_table, join with it. Otherwise, skip this join.
taxdt = data.table(as(tax_table(physeq, errorIfNULL = TRUE), "matrix"), keep.rownames = TRUE)
setnames(taxdt, "rn", "TaxaID")
# Enforce character TaxaID key
taxdt[, TaxaIDchar := as.character(TaxaID)]
taxdt[, TaxaID := NULL]
setnames(taxdt, "TaxaIDchar", "TaxaID")
# Join with tax table
setkey(taxdt, "TaxaID")
setkey(mdt, "TaxaID")
mdt <- taxdt[mdt]
}
# includeSampleVars = c("DaysSinceExperimentStart", "SampleType")
# includeSampleVars = character()
# includeSampleVars = c()
# includeSampleVars = c("aksjdflkas")
wh.svars = which(sample_variables(physeq) %in% includeSampleVars)
if( length(wh.svars) > 0 ){
# Only attempt to include sample variables if there is at least one present in object
sdf = as(sample_data(physeq), "data.frame")[, wh.svars, drop = FALSE]
sdt = data.table(sdf, keep.rownames = TRUE)
setnames(sdt, "rn", "SampleID")
# Join with long table
setkey(sdt, "SampleID")
setkey(mdt, "SampleID")
mdt <- sdt[mdt]
}
setkey(mdt, "TaxaID")
return(mdt)
}
summarize_taxa = function(physeq, Rank, GroupBy = NULL){
require("phyloseq")
require("data.table")
Rank <- Rank[1]
if(!Rank %in% rank_names(physeq)){
message("The argument to `Rank` was:\n", Rank,
"\nBut it was not found among taxonomic ranks:\n",
paste0(rank_names(physeq), collapse = ", "), "\n",
"Please check the list shown above and try again.")
}
if(!is.null(GroupBy)){
GroupBy <- GroupBy[1]
if(!GroupBy %in% sample_variables(physeq)){
message("The argument to `GroupBy` was:\n", GroupBy,
"\nBut it was not found among sample variables:\n",
paste0(sample_variables(physeq), collapse = ", "), "\n",
"Please check the list shown above and try again.")
}
}
# Start with fast melt
mdt = fast_melt(physeq)
if(!is.null(GroupBy)){
# Add the variable indicated in `GroupBy`, if provided.
sdt = data.table(SampleID = sample_names(physeq),
var1 = get_variable(physeq, GroupBy))
setnames(sdt, "var1", GroupBy)
# Join
setkey(sdt, SampleID)
setkey(mdt, SampleID)
mdt <- sdt[mdt]
}
# Summarize
if(!is.null(GroupBy)){
summarydt = mdt[, list(meanRA = mean(RelativeAbundance),
sdRA = sd(RelativeAbundance),
minRA = min(RelativeAbundance),
maxRA = max(RelativeAbundance)),
by = c(Rank, GroupBy)]
} else {
Nsamples = nsamples(physeq)
# No GroupBy argument, can be more precise with the mean, sd, etc.
summarydt = mdt[, list(meanRA = sum(RelativeAbundance) / Nsamples,
sdRA = sd(c(RelativeAbundance, numeric(Nsamples - .N))),
minRA = ifelse(test = .N < Nsamples,
yes = 0L,
no = min(RelativeAbundance)),
maxRA = max(RelativeAbundance)),
by = c(Rank)]
}
return(summarydt)
}
plot_taxa_summary = function(physeq, Rank, GroupBy = NULL){
require("phyloseq")
require("data.table")
require("ggplot2")
# Get taxa summary table
dt1 = summarize_taxa(physeq, Rank = Rank, GroupBy = GroupBy)
# Set factor appropriately for plotting
RankCol = which(colnames(dt1) == Rank)
setorder(dt1, -meanRA)
dt1[, RankFac := factor(dt1[[Rank]],
levels = rev(dt1[[Rank]]))]
dt1[, ebarMax := max(c(0, min(meanRA + sdRA))), by = eval(Rank)]
dt1[, ebarMin := max(c(0, min(meanRA - sdRA))), by = eval(Rank)]
# Set zeroes to one-tenth the smallest value
ebarMinFloor = dt1[(ebarMin > 0), min(ebarMin)]
ebarMinFloor <- ebarMinFloor / 10
dt1[(ebarMin == 0), ebarMin := ebarMinFloor]
pRank = ggplot(dt1, aes(x = meanRA, y = RankFac)) +
scale_x_log10() +
xlab("Mean Relative Abundance") +
ylab(Rank) +
theme_bw()
if(!is.null(GroupBy)){
# pRank <- pRank + facet_wrap(facets = as.formula(paste("~", GroupBy)))
pRank <- pRank + geom_point(mapping = aes_string(colour = GroupBy),
size = 5)
} else {
# Don't include error bars for faceted version
pRank <- pRank + geom_errorbarh(aes(xmax = ebarMax,
xmin = ebarMin))
}
return(pRank)
}