Skip to content

Commit

Permalink
Group genes into bins based on co-occurrence across genomes
Browse files Browse the repository at this point in the history
  • Loading branch information
sminot committed Jan 16, 2024
1 parent 35ff197 commit 03326f3
Show file tree
Hide file tree
Showing 5 changed files with 694 additions and 0 deletions.
74 changes: 74 additions & 0 deletions bin_genes.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/bin/env nextflow

// Using DSL-2 (enables the `workflow {}` / `include {}` syntax used below)
nextflow.enable.dsl=2

// Import helpers
// helpers.gvy is parsed from the project directory at launch; this script uses
// the resulting object for help_message() and require_param() in the workflow below
GroovyShell shell = new GroovyShell()
def helpers = shell.parse(new File("${workflow.projectDir}/helpers.gvy"))

// Import sub-workflows
// NOTE(review): bin_genes is declared as a `process` in modules/processes/bin_genes.nf,
// not a sub-workflow — it is included and invoked the same way
include { bin_genes } from './modules/processes/bin_genes'

// Standalone entrypoint
workflow {

    // Show help message if the user specifies the --help flag at runtime.
    // Every default shown below is interpolated from `params` so that the help
    // text cannot drift from the values configured in nextflow.config.
    helpers.help_message(
        """
        Bin genes based on which genomes they are found within
        Processes the output of align_genomes.nf, grouping genes into bins and
        summarizing the genomes on the basis of those bins.
        Steps:
        1. Filter alignments by coverage and identity thresholds
        2. Filter genes by the number of genomes each is found in
        3. Group genes into bins
        4. Filter bins by minimum size (number of genes per bin)
        5. Group genomes by shared gene bin composition
        Parameters:
        --genome_aln Alignments from align_genomes.nf (e.g. genomes.aln.csv.gz)
        --gene_annot Annotations of each gene from deduplicate.nf (e.g. centroids.annot.csv.gz)
        --output Folder where output files will be written
        --min_coverage Minimum percentage of a gene which must align in order to retain each alignment
        (default: ${params.min_coverage}, ranges 0-100)
        --min_identity Minimum percent identity of the amino acid alignment required to retain each alignment
        (default: ${params.min_identity}, ranges 0-100)
        --min_genomes_per_gene Minimum number of genomes for a gene to be found in to be included
        (default: ${params.min_genomes_per_gene})
        --max_dist_genes Maximum Jaccard distance threshold used to group genes into bins
        (default: ${params.max_dist_genes})
        --min_bin_size Minimum number of genes needed to retain a bin
        (default: ${params.min_bin_size})
        --max_dist_genomes Maximum Euclidean distance threshold used to group genomes based on gene bin content
        (default: ${params.max_dist_genomes})
        Outputs:
        bins.csv.gz Table listing genes and bins (including optional gene annotations)
        """,
        params.help
    )

    // Make sure that the required parameters were provided
    helpers.require_param(params.output, "output")
    helpers.require_param(params.genome_aln, "genome_aln")

    // Get the genome alignments (fails early if the file does not exist)
    genome_aln = file(params.genome_aln, checkIfExists: true)

    // Get the gene annotations (fails early if the file does not exist).
    // params.gene_annot has a default in nextflow.config, so it is not
    // checked with require_param above.
    gene_annot = file(params.gene_annot, checkIfExists: true)

    // Bin the genes
    bin_genes(
        genome_aln,
        gene_annot
    )

}
15 changes: 15 additions & 0 deletions modules/processes/bin_genes.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Group genes into bins from a genome alignment table and a gene annotation table.
// All of the logic lives in the bin_genes.py template; this block only declares
// the execution environment, the staged inputs, and what gets published.
process bin_genes {
// Container image is taken from params.container__pandas
container "${params.container__pandas}"
// Resource label; the matching resource settings are defined outside this file
label 'io_limited'
// Copy outputs into the user-specified output folder, replacing existing files
publishDir "${params.output}", mode: 'copy', overwrite: true

input:
// Alignment table (the entrypoint passes the file given by --genome_aln)
path genome_aln
// Gene annotation table (the entrypoint passes the file given by --gene_annot)
path gene_annot

output:
// Capture every file matched by the glob in the task directory
// NOTE(review): presumably relies on Nextflow excluding staged inputs from
// glob outputs by default — confirm the inputs are not re-published
path "*"

script:
// Script body is loaded from the bin_genes.py template file
template "bin_genes.py"
}
7 changes: 7 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,13 @@ params {
save_sketches = true
kmer_size = 9
search_results = "search_results"

// Bin genes
gene_annot = "$projectDir/templates/centroids.annot.csv.gz"
min_genomes_per_gene = 1
max_dist_genes = 0.05
max_dist_genomes = 0.05
min_bin_size = 5

// Docker containers reused across processes
container__pandas = "quay.io/fhcrc-microbiome/python-pandas:9597001"
Expand Down
Loading

0 comments on commit 03326f3

Please sign in to comment.