-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.nf
214 lines (169 loc) · 7.5 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#!/usr/bin/env nextflow
log.info """\
FASTA - TO - GWAS PIPELINE
==========================
genus: ${params.genus}
phenotypes: ${params.phenotypes}
antibiotic: ${params.antibiotic}
genotype_method: ${params.genotype_method}
"""
.stripIndent()
/*
========================================================================================
HELP
========================================================================================
*/
def printHelp() {
log.info """
Usage:
nextflow run main.nf
Options:
--manifest Manifest containing paths to FASTA files (mandatory)
--phenotypes A tab file containing phenotypes for all samples (mandatory)
--genus Genus name for samples (mandatory if starting from FASTA files)
--genotype_method Genotype method to run GWAS, from a choice of three (unitig|pa|snp) (mandatory)
Note: unitig is recommended.
--reference Manifest containing paths to reference FASTA and GFF files (mandatory for significant k-mer/unitig analysis)
--mygff Input a manifest containing paths to already annotated GFF files; must match sample_ids in manifest (optional)
--mytree Input user preferred phylogenetic tree (optional)
--mvcf Input already mergerd vcf.gz file (optional)
--fe Run GWAS using fixed model (SEER) (optional)
--help Print this help message (optional)
Alternative Options for Some Processes:
[PanarooAnalysis]
--panaroo_clean_mode Default: "strict"
--panaroo_alignment Default: "core"
--panaroo_aligner Default: "mafft"
--panaroo_core_threshold Default: 0.95
[PhylogeneticAnalysis]
--iqtree_model Default: "GTR"
[Pyseer]
--pyseer_min_af Default: 0.01
--pyseer_max_af Default: 0.99
""".stripIndent()
}
if (params.help) {
printHelp()
exit(0)
}
/*
========================================================================================
IMPORT MODULES/SUBWORKFLOWS
========================================================================================
*/
//
// MODULES
//
include { validate_parameters } from './modules/helper_functions'
include { ProkkaAnnotate } from './modules/ProkkaAnnotate'
include { PanarooAnalysis } from './modules/PanarooAnalysis'
include { PhylogeneticAnalysis } from './modules/PhylogeneticAnalysis'
include { PyseerKinshipMatrix } from './modules/PyseerKinshipMatrix'
include { UnitigCaller } from './modules/UnitigCaller'
include { MergeVCF } from './modules/MergeVCF'
include { PyseerGenotypeMatrix } from './modules/PyseerGenotypeMatrix'
include { PyseerUnitig; PyseerPreAbs; PyseerVariants } from './modules/Pyseer'
include { SignificantKmers; KmerMap; WriteReferenceText; AnnotateKmers; GeneHitPlot} from './modules/SignificantKmerAnalysis'
/*
========================================================================================
VALIDATE INPUTS
========================================================================================
*/
validate_parameters()
/*
========================================================================================
RUN MAIN WORKFLOW
========================================================================================
*/
workflow {
manifest_ch = Channel.fromPath(params.manifest)
genomes_ch = manifest_ch.splitCsv(header: true, sep: ',')
.map{ row -> tuple(row.sample_id, row.assembly_path) }
pheno = Channel.fromPath(params.phenotypes)
if (params.mytree) {
tree = Channel.fromPath(params.mytree)
gff_files = Channel.fromPath(params.mygff)
.splitCsv(header: true, sep: ',')
.map{ row -> tuple(row.sample_id, row.ann_genome_path) }
.map{ it -> it[1] }
.collect()
}
else {
if (params.mygff) {
gff_files = Channel.fromPath(params.mygff)
.splitCsv(header: true, sep: ',')
.map{ row -> tuple(row.sample_id, row.ann_genome_path) }
.map{ it -> it[1] }
.collect()
PanarooAnalysis(gff_files)
alignment = PanarooAnalysis.out.panaroo_output_core_aln
PhylogeneticAnalysis(alignment)
tree = PhylogeneticAnalysis.out.phylo_tree
}
else {
ProkkaAnnotate(genomes_ch)
gff_files = ProkkaAnnotate.out.prokka_output_gff
.map { it -> it[1] }
.collect()
PanarooAnalysis(gff_files)
alignment = PanarooAnalysis.out.panaroo_output_core_aln
PhylogeneticAnalysis(alignment)
tree = PhylogeneticAnalysis.out.phylo_tree
}
}
if (params.genotype_method == "pa"){
PyseerKinshipMatrix(tree)
k_matrix = PyseerKinshipMatrix.out.kinship_matrix
pre_abs = PanarooAnalysis.out.panaroo_output_pre_abs
PyseerPreAbs(pre_abs,pheno,k_matrix)
}
else if (params.genotype_method == "snp"){
if (params.mvcf){
// But currently this has a problem: Must use distance matrix with fixed effects (SEER)
// This is the classifical way of using MDS
// Distance matrix needs mash (1); or we could extract patristic distances from a phylogeny (2).
// 1>> mash sketch -s 10000 -o samples *.fa \ mash dist samples.msh samples.msh | square_mash > mash.tsv
// 2>> python scripts/phylogeny_distance.py core_genome.tree > phylogeny_distances.tsv
// But, we can use core_gene VCF to generate kinship matrix and use FaST-LMM model?
// similarity_pyseer --vcf core_gene_snps.vcf sample_list.txt > genotype_kinship.tsv
if (params.fe){
// fixed effect way of doing GWAS
}
else {
variants = Channel.fromPath(params.mvcf)
PyseerGenotypeMatrix(variants, manifest_ch)
k_matrix = PyseerGenotypeMatrix.out.kinship_matrix
PyseerVariants(variants,pheno,k_matrix)
}
}
else {
// Either use snippy to call variant from fasta, given a reference, then use Process: MergeVCF
// Or ask user to input another manifest containing directory of all vcf, then use Process: MergeVCF
}
}
else if (params.genotype_method == "unitig"){
PyseerKinshipMatrix(tree)
k_matrix = PyseerKinshipMatrix.out.kinship_matrix
UnitigCaller(manifest_ch)
unitig = UnitigCaller.out.unitig_out
PyseerUnitig(unitig,pheno,k_matrix)
pyseer_result = PyseerUnitig.out.pyseer_out
SignificantKmers(pyseer_result)
sig_kmer = SignificantKmers.out.sig_kmer_out
if (params.reference){
ref_manifest_ch = Channel.fromPath(params.reference)
ref_ch = ref_manifest_ch.splitCsv(header: false, sep:"\t")
.map{ row -> tuple(row[0], row[1]) }
.combine(sig_kmer)
KmerMap(ref_ch)
WriteReferenceText(manifest_ch,ref_manifest_ch)
reftxt = WriteReferenceText.out.write_ref_text_out
AnnotateKmers(sig_kmer,reftxt,gff_files)
genehit = AnnotateKmers.out.annotated_kmers_out
GeneHitPlot(genehit)
}
}
else {
println "Please use a correct genotype method (unitig|pa|snp)."
}
}