Group genes into bins based on co-occurrence across genomes

FredHutch · Jan 16, 2024 · 03326f3 · 03326f3
1 parent 35ff197
commit 03326f3
Show file tree

Hide file tree

Showing 5 changed files with 694 additions and 0 deletions.
diff --git a/bin_genes.nf b/bin_genes.nf
@@ -0,0 +1,74 @@
+#!/usr/bin/env nextflow
+
+// Using DSL-2
+nextflow.enable.dsl=2
+
+// Import helpers
+GroovyShell shell = new GroovyShell()
+def helpers = shell.parse(new File("${workflow.projectDir}/helpers.gvy"))
+
+// Import sub-workflows
+include { bin_genes } from './modules/processes/bin_genes'
+
+// Standalone entrypoint
+workflow {
+
+    // Show help message if the user specifies the --help flag at runtime
+    helpers.help_message(
+        """
+        Bin genes based on which genomes they are found within
+        
+        Processes the output of align_genomes.nf, grouping genes into bins and
+        summarizing the genomes on the basis of those bins.
+
+        Steps:
+            1. Filter alignments by coverage and identity thresholds
+            2. Filter genes by the number of genomes each is found in
+            3. Group genes into bins
+            4. Filter bins by minimum size (number of genes per bin)
+            5. Group genomes by shared gene bin composition
+
+        Parameters:
+
+        --genome_aln        Alignments from align_genomes.nf (e.g. genomes.aln.csv.gz)
+        --gene_annot        Annotations of each gene from deduplicate.nf (e.g. centroids.annot.csv.gz)
+        --output            Folder where output files will be written
+
+        --min_coverage      Minimum proportion of a gene which must align in order to retain each alignment
+                            (default: ${params.min_coverage}, ranges 0-100)
+        --min_identity      Minimum percent identity of the amino acid alignment required to retain each alignment
+                            (default: ${params.min_identity}, ranges 0-100)
+
+        --min_genomes_per_gene  Minimum number of genomes for a gene to be found in to be included (default: 1)
+
+        --max_dist_genes    Maximum Jaccard distance threshold used to group genes into bins
+
+        --min_bin_size  Minimum number of genes needed to retain a bin
+
+        --max_dist_genomes  Maximum Euclidean distance threshold used to group genomes based on gene bin content
+
+
+        Outputs:
+
+        bins.csv.gz     Table listing genes and bins (including optional gene annotations)
+        """,
+        params.help
+    )
+
+    // Make sure that the required parameters were provided
+    helpers.require_param(params.output, "output")
+    helpers.require_param(params.genome_aln, "genome_aln")
+
+    // Get the genome alignments
+    genome_aln = file(params.genome_aln, checkIfExists: true)
+
+    // Get the gene annotations
+    gene_annot = file(params.gene_annot, checkIfExists: true)
+
+    // Bin the genes
+    bin_genes(
+        genome_aln,
+        gene_annot
+    )
+
+}
diff --git a/modules/processes/bin_genes.nf b/modules/processes/bin_genes.nf
@@ -0,0 +1,15 @@
+process bin_genes {
+    container "${params.container__pandas}"
+    label 'io_limited'
+    publishDir "${params.output}", mode: 'copy', overwrite: true
+
+    input:
+    path genome_aln
+    path gene_annot
+
+    output:
+    path "*"
+
+    script:
+    template "bin_genes.py"
+}
diff --git a/nextflow.config b/nextflow.config
@@ -164,6 +164,13 @@ params {
     save_sketches = true
     kmer_size = 9
     search_results = "search_results"
+
+    // Bin genes
+    gene_annot = "$projectDir/templates/centroids.annot.csv.gz"
+    min_genomes_per_gene = 1
+    max_dist_genes = 0.05
+    max_dist_genomes = 0.05
+    min_bin_size = 5
 
     // Docker containers reused across processes
     container__pandas = "quay.io/fhcrc-microbiome/python-pandas:9597001"