-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathMAGScoT.R
298 lines (240 loc) · 14.8 KB
/
MAGScoT.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
### MAGScoT - Bin Scoring and Refinment Tool for Metagenome Assblies
library("optparse")
option_list = list(
make_option(c("-i", "--input"), type="character", default=NULL,
help="Tab-separated input file with three columns: bin, contig, set; no header!", metavar="character"),
make_option(c("--hmm"), type="character", default=NULL,
help="Tab-separated input file with marker mapping; three columns: gene id, marker, e-value. \n Gene IDs must represent contig IDs after removal of _[0-9] at the end of the name (prodigal default)", metavar="character"),
make_option(c("-p", "--profile"), type="character", default="default",
help="Profile used for scoring, all derived from GTDB release 207. default[=bac120+ar53], ar53, bac120. [default]", metavar="character"),
make_option(c("-o", "--out"), type="character", default="MAGScoT",
help="output file name base [default=%default]", metavar="character"),
make_option(c("--bin_separator"), type="character", default="cleanbin", help="Separator for refined output bin names [default=%default]"),
make_option(c("-a", "--score_a"), type="double", default=1, help="Scoring parameter a [default=%default]"),
make_option(c("-b", "--score_b"), type="double", default=0.5, help="Scoring parameter b [default=%default]"),
make_option(c("-c", "--score_c"), type="double", default=0.5, help="Scoring parameter c [default=%default]"),
make_option(c("--max_cont"), type="double", default=1, help="Maximum contamination; MAGs with values higher than this threshold are not considered [default=%default]"),
make_option(c("-t", "--threshold"), type="double", default=0.5, help="Scoring minimum completeness threshold [default=%default]"),
make_option(c("--score_only"), action="store_true", dest="score_only", help="Only do scoring, no refinement [false]"),
make_option(c("--skip_merge_bins"), action="store_true", dest="skip_merge_bins", help="Skip bin merging [false]"),
make_option(c("-m", "--min_markers"), type="double", default=25, help="Minimum number of unique markers in bins to be considered as seed for bin merging [default=%default]"),
make_option(c("-s", "--min_sharing"), type="double", default=0.8, help="Minimum percentage of shared markers for bin sharing. [default=%default]"),
make_option(c("-n", "--n_iterations"), type="double", default=2, help="Number of merging iterations to perform. [default=%default]")
);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
if (is.null(opt$input)){
print_help(opt_parser)
stop("At least one argument must be supplied (input file).n", call.=FALSE)
}
cat("Loading packages...\n")
suppressMessages(library(dplyr))
suppressMessages(library(readr))
suppressMessages(library(funr))
suppressMessages(library(digest))
### READ IN BINNiNG INFO
### FORMAT: 3 columns => bin, contig, set
contig_to_bin<-read_delim(file=opt$input, delim="\t", col_names=c("bin","contig","set"), show_col_types = FALSE)
contig_to_bin=contig_to_bin %>% filter(!is.na(set))
if(ncol(contig_to_bin) != 3){
cat("Contig to bin input file should have three tab-separated colums without a header: Bin ID, Contig ID, Binning Tool\n")
cat("Your file ",opt$input, " has ",ncol(contig_to_bin)," tab-separated columns.\n")
cat("Please check your input files and run MAGScoT again.\nAborting execution!\n")
quit(save="no")
}
if(length(unique(contig_to_bin$bin)) > length(unique(contig_to_bin$contig))){
cat("Contig to bin input file should have three tab-separated colums without a header: Bin ID, Contig ID, Binning Tool\n")
cat("The number of distinct items in the column 'Contig ID' is smaller than in the column 'Bin ID'.\n Is your input correctly formatted?\n")
cat("Please check your input files and run MAGScoT again.\nAborting execution!\n")
quit(save="no")
}
contig_to_bin_unique = contig_to_bin %>% dplyr::select(contig) %>% distinct %>% arrange(contig) %>% mutate(contig_idx = seq_along(contig))
num_binsets = length(unique(contig_to_bin$set))
if(num_binsets==1){
cat("Only output from one binning alogorithm provided. Skipping merging and refinement. Scoring bins only.\n")
opt$score_only = T
opt$n_iter = 0
}
if(opt$n_iter >= num_binsets & is.null(opt$skip_merge_bins) == T){
cat("Number of bin-merging iterations cannot be equal or larger than number or binning sofwares used.\n")
cat("Setting iterations to:", num_binsets-1,"\n")
opt$n_iter = num_binsets -1
}
id=opt$out
### Parameter setup a=1,b=0.5,c=0.5 is DAS Tool default; metawrap is much more stringent on contamination (b=5)
a=opt$score_a
b=opt$score_b
c=opt$score_c
cutoff=opt$threshold
### Parameters for bin merging
min_markers = opt$min_markers
min_sharing = opt$min_sharing
#### Read markers file
if(opt$profile=="default"){
markers=read.table(file.path(funr::get_script_path(),"profiles","gtdb_rel207_default_markers.tsv"), stringsAsFactors=F, head=T)
}else if(opt$profile=="bac120"){
markers=read.table(file.path(funr::get_script_path(),"profiles","gtdb_rel207_bac120_markers.tsv"), stringsAsFactors=F, head=T)
}else if(opt$profile=="ar53"){
markers=read.table(file.path(funr::get_script_path(),"profiles","gtdb_rel207_ar53_markers.tsv"), stringsAsFactors=F, head=T)
}else{
print_help(opt_parser)
stop("No valid profile selected.n", call.=FALSE)
}
markers = markers %>% mutate(marker=gsub("[.]HMM|[.]hmm","", marker))
### READ IN Singe-copy gene annotation
### only best annotation per gene is kept => sort by X3 (evalue) and remove duplicates
### if the same gene occurs twice in a contig it is only counted once
hmm<-read_delim(opt$hmm, delim="\t",col_names=FALSE, show_col_types = FALSE)
if(ncol(hmm) != 3){
cat("HMM input file should have three tab-separated colums without a header: Protein ID, Marker ID, e-value\n")
cat("Your file ",opt$hmm, " has ",ncol(hmm)," tab-separated columns.\n")
cat("Please check your input files and run MAGScoT again.\nAborting execution!\n")
quit(save="no")
}
gene_to_contig=hmm %>% arrange(X3) %>%
filter(!duplicated(X1)) %>%
dplyr::select(X1, X2) %>%
mutate(contig=gsub("_[0-9]+$","",X1), gene=X2) %>%
dplyr::select(contig, gene) %>%
distinct()
## calculate hashes for each bin based on their contigs
binhashes = contig_to_bin %>% left_join(contig_to_bin_unique) %>% group_by(bin) %>%
arrange(bin,contig) %>% summarise(xx=paste0(contig_idx, collapse="")) %>%
group_by(bin) %>% mutate(hash=digest::digest(xx))
#### Bin Mergeing Routine
### Bins from distinct binning tools are merged if they have sufficient overlap
### these newly created bins are added to the dataset if they are distinct from the original bins
if(is.null(opt$skip_merge_bins) == F | is.null(opt$score_only) == F | opt$n_iter == 0){
cat("Skipping bin merging...\n")
}else{
for(it in 1:opt$n_iter){
cat("Bin-Merging iteration: ", it, "\n")
if(it==1){
contig_to_bin_bisco_it = contig_to_bin
contig_to_bin_bisco = contig_to_bin
}
gene_to_bin = left_join(contig_to_bin_bisco_it %>% filter(contig %in% gene_to_contig$contig, bin %in% binhashes$bin), gene_to_contig, by="contig")
## subset to candidates meeting the inclusion criteria: n_markers >= min_markers and n_markers <= 150
merge_cand = gene_to_bin %>% distinct_at(.vars=c("bin","gene"), .keep_all=T) %>% group_by(bin) %>% summarise(n_markers = n()) %>%
left_join( gene_to_bin %>% group_by(bin) %>% summarise(n_markers_tot = n()), by="bin") %>% filter(n_markers >= min_markers, n_markers_tot <= 150) %>% pull(bin) %>% unique()
if(length(merge_cand)==0) break
## comparing the contigs carrying marker genes of all candidate bins
binmatch=sapply(seq_along(merge_cand), function(thisbin_id){
cat("Processing candidate bin ",thisbin_id, " of ", length(merge_cand),"\r")
thisbin = merge_cand[thisbin_id]
thisbin_contigs = contig_to_bin_bisco_it %>% filter(bin == thisbin, contig %in% gene_to_contig$contig) %>% pull(contig)
thisbin_genes = gene_to_contig %>% filter(contig %in% thisbin_contigs)
if(it>1){thisbin_blacklist = bisco_out %>% filter(bin==thisbin) %>% select(bin_a, bin_b) %>% unlist %>% as.vector()}else{thisbin_blacklist=c()}
return(contig_to_bin %>% filter(contig %in% thisbin_genes$contig) %>% left_join(thisbin_genes, by="contig") %>%
distinct_at(.vars=c("bin","gene"), .keep_all=T) %>% group_by(bin) %>% summarize(shared_markers=n()) %>%
mutate(shared_markers_rel = shared_markers/length(unique(thisbin_genes$gene))) %>% filter(bin != thisbin, !bin %in% thisbin_blacklist, shared_markers_rel >= min_sharing) %>%
mutate(seed=thisbin, bin_a=ifelse(seed>bin, seed, bin), bin_b=ifelse(seed>bin, bin, seed)) %>% select(seed, bin_a, bin_b, shared_markers, shared_markers_rel))
}, simplify=F)
## get contig information for merged bins and calculate their hashes
bisco_out = binmatch %>% do.call("rbind",.) %>% distinct_at(.vars=c("bin_a","bin_b"), .keep_all=T) %>% filter(shared_markers >= min_markers) %>% mutate(bin=paste0(paste0("MAGScoT_",it,"_"), seq_along(shared_markers)))
contig_to_bin_bisco_it = bisco_out %>% apply(., 1, function(x) contig_to_bin %>% filter(bin %in% c(x[2], x[3])) %>%
mutate(bin=x[6], set=paste0("MAGScoT_",it)) %>% distinct_at(.vars=c("bin","contig"), .keep_all=T)) %>% do.call("rbind", .) %>% data.frame(row.names=NULL)
binhashes_bisco = contig_to_bin_bisco_it %>% left_join(contig_to_bin_unique, by="contig") %>% group_by(bin) %>%
arrange(bin,contig) %>% summarise(xx=paste0(contig_idx, collapse="")) %>%
group_by(bin) %>% mutate(hash=digest::digest(xx))
## merge original bins and merged bins that are distinct from the original bins based on their hashes
contig_to_bin_bisco = rbind(
contig_to_bin_bisco,
contig_to_bin_bisco_it %>% filter(bin %in% (binhashes_bisco %>% filter(!hash %in% binhashes$hash) %>% pull(bin))))
## re-calculate bin hashes for the new set of bins
binhashes = contig_to_bin_bisco %>% left_join(contig_to_bin_unique, by="contig") %>% group_by(bin) %>%
arrange(bin,contig) %>% summarise(xx=paste0(contig_idx, collapse="")) %>%
group_by(bin) %>% mutate(hash=digest::digest(xx))
if(it==1){bisco_out_all = bisco_out}else{bisco_out_all = rbind(bisco_out_all, bisco_out)}
if(nrow(contig_to_bin_bisco_it)==0){cat("Exiting bin merging, no (more) overlaps found\n"); break}
cat("\n")
}
contig_to_bin = contig_to_bin_bisco
}
#### Bin scorin and refinement
contig_to_bin.remain<-contig_to_bin
contig_to_bin.out<-contig_to_bin[0,]
mag_exclude = c()
### Initial values
i=1
allmax=100
recalc = contig_to_bin.remain$bin %>% unique
while(nrow(contig_to_bin.remain)>1){
cat("Refining bins, iteration:", i, "\n")
cat("Extracting SCG information for bins...\n")
### at first run (exist("zz") == F) all gene_to_bin statistics (zz) are calculated
### at later iterations, only bins with changed contents are re-calculated; this saves a lot of time
if(length(recalc) > 0){
gene_to_bin = left_join(contig_to_bin.remain %>% filter(contig %in% gene_to_contig$contig), gene_to_contig, by="contig")
if(exists("zz")){
zz = zz %>% filter(bin %in% scoreframe$bin, !bin %in% recalc, !bin %in% mag_exclude)
zz = rbind(zz, gene_to_bin %>% filter(bin %in% recalc, !bin %in% mag_exclude) %>% select(bin, gene) %>% group_by(bin,gene) %>% summarize(count=n(), .groups="drop") %>% left_join(markers, by=c("gene"="marker")))
}else{
zz=gene_to_bin %>% select(bin, gene) %>% group_by(bin,gene) %>% summarize(count=n(), .groups="drop") %>% left_join(markers, by=c("gene"="marker"))
}
} else {
zz = zz %>% filter(bin %in% scoreframe$bin)
}
df_list = zz %>% group_by(bin, set) %>% summarize(uniqueSCGs = n(), multipleSCGs = sum(count>1), sumSCGs = sum(count)) %>%
left_join(markers %>% group_by(set) %>% summarise(n_markers=n())) %>%
left_join(contig_to_bin.remain %>% group_by(bin) %>% summarise(n_contigs=n())) %>%
mutate(
Completeness = uniqueSCGs / n_markers,
additionalSCGs = sumSCGs - uniqueSCGs - multipleSCGs,
Contamination = multipleSCGs / uniqueSCGs,
MultiContam = additionalSCGs / n_markers,
Contamination_Score = b*Contamination + c*MultiContam,
score = a*Completeness - Contamination_Score) %>%
ungroup()
scoreframe = df_list %>% arrange(-score, n_contigs, grepl("MAGScoT",bin)) %>%
filter(!duplicated(bin)) %>%
left_join(contig_to_bin %>% dplyr::select(bin,bintool=set) %>% distinct, by="bin")
if(i==1){
write_tsv(scoreframe, paste0(id,".scores.out"))
cat("Scores for all initial bins written to:" , paste0(id,".scores.out"), "\n")
if(!is.null(opt$score_only)){
quit(save="no")
}
}
if(nrow(scoreframe)==0 | scoreframe[1,"score"] < cutoff){
cat("No bin surpasses the cutoff of:" , cutoff, "\n")
break
}
### bins where neither bac not arc markers are above the cutoff (50% by DAS tool default) are removed as they will never reach the scoring cutoff
mag_exclude = c(mag_exclude, scoreframe %>% filter(Completeness < cutoff) %>% pull(bin))
scoreframe = scoreframe %>% filter(!bin %in% mag_exclude, Contamination <= opt$max_cont)
### as long as consecutive best hits are from the same set, no high-scoring bins are affected by definition (each contig only occurs once per set)
### this iteration can mean big speedup
winnerset=scoreframe$bintool[1]
while(nrow(scoreframe)>0 & scoreframe$bintool[1] == winnerset){
winnerbin = scoreframe$bin[1]
if(exists("scoreframe.out")==F){scoreframe.out<-head(scoreframe, 1)}else{scoreframe.out<-rbind(scoreframe.out,head(scoreframe,1))}
cat("The highest scoring bin is:", winnerbin,"\n")
### add contigs for winning bin to output table
contig_to_bin.out <- rbind(contig_to_bin.out,contig_to_bin.remain %>% filter(bin==winnerbin))
#print(tail(scoreframe.out,1))
cat("Remaining:", length(unique(contig_to_bin.remain$contig)), "contigs and", nrow(scoreframe),"candidate bins\n")
scoreframe=scoreframe[-1,]
if(substr(winnerset, 1, 7)=="MAGScoT") break
}
### assess which bins were affected by the winning bin(s); store these for re-scoring in the next iteration
recalc = contig_to_bin.remain %>% filter(contig %in% contig_to_bin.out$contig) %>% pull(bin) %>% unique
### remove bins below treshold (already removed in df); remove contigs which have been assigned to a refined bin
contig_to_bin.remain <- contig_to_bin.remain %>% filter(bin %in% scoreframe$bin, !contig %in% contig_to_bin.out$contig)
i=i+1
}
if(exists("scoreframe.out")){
### rename refined bins
scoreframe.out = scoreframe.out %>% mutate(refined_id = paste0(id,"_",opt$bin_separator,"_",formatC(seq_along(bin), width = 6, format = "d", flag = "0")))
contig_to_bin.out$binnew<-scoreframe.out$refined_id[match(contig_to_bin.out$bin, scoreframe.out$bin)]
###
cat("Refinement lead to a total of", nrow(scoreframe.out)," bins with a score >=", cutoff,"\n")
stats_outfile = paste0(id,".refined.out")
binning_outfile = paste0(id,".refined.contig_to_bin.out")
cat("Refinement stats are written to:", stats_outfile,"\n")
cat("Contig-to-refined-bin mapping is written to:", binning_outfile, "\n")
write_tsv(scoreframe.out,stats_outfile)
write_tsv(contig_to_bin.out %>% dplyr::select(binnew,contig),binning_outfile)
} else {
cat("No bins with score >=", cutoff, "were found in the dataset.\n")
}