Align reads to pangenome

FredHutch · Feb 4, 2024 · 110f691 · 110f691
1 parent aa436f2
commit 110f691
Showing 1 changed file with 111 additions and 0 deletions.
diff --git a/align_pangenome.nf b/align_pangenome.nf
@@ -0,0 +1,111 @@
+#!/usr/bin/env nextflow
+
+// Using DSL-2
+nextflow.enable.dsl=2
+
+// Import helpers
+GroovyShell shell = new GroovyShell()
+def helpers = shell.parse(new File("${workflow.projectDir}/helpers.gvy"))
+
+// Import sub-workflows
+include { bin_metagenomes } from './modules/processes/bin_metagenomes'
+include { align_reads } from './modules/align_reads'
+include { find_reads } from './modules/find_reads'
+
+// Standalone entrypoint
+workflow {
+
+    // Show help message if the user specifies the --help flag at runtime
+    helpers.help_message(
+        """
+        Align a set of metagenomic sequencing data to a binned pangenome in
+        which the genes have been grouped by similar occurrence across genomes
+        (using the build_pangenome.nf workflow).
+
+        Those binned gene groups will be used to summarize a batch of metagenomic
+        data, and the similarity of gene abundance across metagenomes will be
+        tested for concordence with the similarity across genomes.
+
+        By applying the results of gene binning to the metagenome alignment
+        results using the same gene catalog, the diversity of a collection of
+        organisms can be collapsed into units of those co-occurrent genetic
+        elements.
+
+        Parameters:
+
+        --genes             Path of single deduplicated amino acid FASTA file to be used for alignment
+        --reads             Folder containing the set of FASTQ files to align against those genes
+        --reads_suffix      File ending for all reads (default: ${params.reads_suffix})
+        --paired            Set to true if input folder contains paired-end FASTQ files (default: false)
+        --samplesheet       Optional file listing the FASTQ inputs to process (headers: sample,R1,R2)
+
+        --min_score_reads   Minimum alignment score (default: ${params.min_score_reads})
+        --min_identity      Minimum percent identity of the amino acid alignment required to retain the alignment
+                            (default: ${params.min_identity}, ranges 0-100)
+        --max_evalue        Maximum E-value threshold used to filter all alignments
+                            (default: ${params.max_evalue})
+        --aligner           Algorithm used for alignment (default: ${params.aligner}, options: diamond, blast)
+        --query_gencode     Genetic code used for conceptual translation of genome sequences
+                            (default: ${params.query_gencode})
+        --max_overlap       Any alignment which overlaps a higher-scoring alignment by more than this
+                            amount will be filtered out (default: ${params.max_overlap}, range: 0-100)
+        --aln_fmt           Column headings used for alignment outputs (see DIAMOND documentation for details)
+                            (default: ${params.aln_fmt})
+
+        --gene_bins         Grouping of genes into bins (e.g. gene_bins.csv)
+        --group_profile     Gene content of genome groups (e.g. group_profile.csv)
+        --output            Folder where output files will be written
+
+        --metadata          Optional: Metadata table used to compare samples (CSV)
+        --category          Optional: Column from metadata table used for comparison
+
+        --min_n_reads       Exclude any samples with fewer than this number of reads aligned (default: 0)
+        --min_n_genes       Exclude any samples with fewer than this number of genes detected (default: 0)
+
+        """,
+        params.help
+    )
+
+    // Make sure that the required parameters were provided
+    helpers.require_param(params.genes, "genes")
+    helpers.require_param(params.gene_bins, "gene_bins")
+    helpers.require_param(params.group_profile, "group_profile")
+    helpers.require_param(params.output, "output")
+
+    // If there is no samplesheet
+    if ( "${params.samplesheet}" != "false" ){
+        // Requre the reads parameter
+        helpers.require_param(params.reads, "reads")
+    }
+
+    // Get the reads either from samplesheet or the input folder
+    find_reads()
+
+    // Align those reads against the centroids
+    align_reads(
+        file("${params.genes}"),
+        find_reads.out
+    )
+
+    // Get the gene bins
+    gene_bins = file(params.gene_bins, checkIfExists: true)
+
+    // Get the genome groups
+    genome_groups = file(params.genome_groups, checkIfExists: true)
+
+    // Get the genome groups
+    group_profile = file(params.group_profile, checkIfExists: true)
+
+    // Get the metadata table
+    metadata = file(params.metadata, checkIfExists: true)
+
+    // Bin the metagenomes
+    bin_metagenomes(
+        align_reads.out.csv,
+        gene_bins,
+        genome_groups,
+        group_profile,
+        metadata
+    )
+
+}