Skip to content

Commit

Permalink
Wrapper workflow to build a pangenome
Browse files Browse the repository at this point in the history
  • Loading branch information
sminot committed Jan 30, 2024
1 parent 4eea9f4 commit 23709f3
Show file tree
Hide file tree
Showing 2 changed files with 149 additions and 0 deletions.
1 change: 1 addition & 0 deletions bin/bin_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,7 @@ def summarize_genomes(self):
x=prop_in_bin.index.values,
y=prop_in_bin[kw].values,
hovertext=prop_in_bin[kw].values,
showlegend=False
),
row=ix+1,
col=1
Expand Down
148 changes: 148 additions & 0 deletions build_pangenome.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#!/usr/bin/env nextflow

// Using DSL-2
nextflow.enable.dsl=2

// Import helpers
GroovyShell shell = new GroovyShell()
def helpers = shell.parse(new File("${workflow.projectDir}/helpers.gvy"))

// Import sub-workflows
include { align_genomes } from './modules/align_genomes' addParams(output: "${params.output}/align")
include { download_genes } from './modules/download_genes'
include { download_genomes } from './modules/download_genomes'
include { deduplicate } from './modules/deduplicate' addParams(output: "${params.output}/gene_catalog")
include { bin_genes } from './modules/processes/bin_genes' addParams(output: "${params.output}/bin_pangenome")

// Standalone entrypoint
workflow {

// Show help message if the user specifies the --help flag at runtime
helpers.help_message(
"""
Build a pangenome dataset from a collection of genes and genomes
1. Genomes and genes will be downloaded from NCBI if needed
2. Genes will be deduplicated to yield a nonredundant gene catalog
3. Genomes will be aligned against the gene catalog
4. Genes and genomes will be binned by co-occurrence
Parameters:
--genome_tsv (optional) Table of genomes to download from NCBI (e.g. https://www.ncbi.nlm.nih.gov/datasets/genome/)
--genome_csv (optional) Table of genomes to download from legacy NCBI (e.g. https://www.ncbi.nlm.nih.gov/genome/browse#!/prokaryotes/)
--genes (optional) Folder containing the set of gene amino acid FASTAs to deduplicate
--genomes (optional) Folder containing the set of genome nucleotide FASTAs to align against
--output Folder where output files will be written
--min_coverage Minimum proportion of a gene which must align in order to retain the alignment
(default: ${params.min_coverage}, ranges 0-100)
--min_identity Minimum percent identity of the amino acid alignment required to retain the alignment
(default: ${params.min_identity}, ranges 0-100)
--max_evalue Maximum E-value threshold used to filter all alignments
(default: ${params.max_evalue})
--aligner Algorithm used for alignment (default: ${params.aligner}, options: diamond, blast)
--query_gencode Genetic code used for conceptual translation of genome sequences
(default: ${params.query_gencode})
--max_overlap Any alignment which overlaps a higher-scoring alignment by more than this
amount will be filtered out (default: ${params.max_overlap}, range: 0-100)
--min_genomes_per_gene Minimum number of genomes for a gene to be found in to be included (default: 1)
--max_dist_genes Maximum Jaccard distance threshold used to group genes into bins
--min_bin_size Minimum number of genes needed to retain a bin
--max_dist_genomes Maximum Euclidean distance threshold used to group genomes based on gene bin content
""",
params.help
)

// Make sure that the required parameters were provided
helpers.require_param(params.output, "output")

//////////////////////////////////////
// MANUALLY INPUT GENES AND GENOMES //
//////////////////////////////////////

// Remove any trailing slash from the genome and gene folder
if(params.genomes){params.genomes = params.genomes.replaceAll('/$', '')}
if(params.genes){params.genes = params.genes.replaceAll('/$', '')}

// Get all of the genomes
Channel
.fromPath("${params.genomes}/*")
.filter { it.exists() }
.set { genomes_ch }

// Get all of the genes
Channel
.fromPath("${params.genes}/*")
.filter { it.exists() }
.set { genes_ch }

////////////////////////////////
// DOWNLOAD GENOMES FROM NCBI //
////////////////////////////////

// Parse the CSV path(s), if provided
if(params.genome_csv.length() > 1){
Channel
.fromPath( params.genome_csv.split(',').toList() )
.filter { it.exists() }
.set { genome_manifest_csv }
}else{
Channel.empty().set { genome_manifest_csv }
}

// Parse the TSV path(s), if provided
if(params.genome_tsv.length() > 1){
Channel
.fromPath( params.genome_tsv.split(',').toList() )
.filter { it.exists() }
.set { genome_manifest_tsv }
}else{
Channel.empty().set { genome_manifest_tsv }
}

// Download the genomes for each genome
download_genomes(
genome_manifest_csv,
genome_manifest_tsv
)

//////////////////////////////
// DOWNLOAD GENES FROM NCBI //
//////////////////////////////

// Download the genes for each genome
download_genes(
genome_manifest_csv,
genome_manifest_tsv
)

//////////////////////////////
// DEDUPLICATE GENE CATALOG //
//////////////////////////////

// Combine genes from NCBI and from the folder input
deduplicate(genes_ch.mix(download_genes.out.genes))

//////////////////////
// ALIGN TO GENOMES //
//////////////////////

// Run the genome alignment sub-workflow
align_genomes(
genomes_ch.mix(download_genomes.out.genomes),
deduplicate.out.fasta
)

///////////////////////////
// BIN GENES AND GENOMES //
///////////////////////////

// Bin the genes
bin_genes(
align_genomes.out.concat_alignments,
deduplicate.out.annot
)

}

0 comments on commit 23709f3

Please sign in to comment.