Skip to content

Commit

Permalink
Align reads to pangenome
Browse files Browse the repository at this point in the history
  • Loading branch information
sminot committed Feb 4, 2024
1 parent aa436f2 commit 110f691
Showing 1 changed file with 111 additions and 0 deletions.
111 changes: 111 additions & 0 deletions align_pangenome.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#!/usr/bin/env nextflow

// Using DSL-2
nextflow.enable.dsl=2

// Import helpers
GroovyShell shell = new GroovyShell()
def helpers = shell.parse(new File("${workflow.projectDir}/helpers.gvy"))

// Import sub-workflows
include { bin_metagenomes } from './modules/processes/bin_metagenomes'
include { align_reads } from './modules/align_reads'
include { find_reads } from './modules/find_reads'

// Standalone entrypoint
workflow {

// Show help message if the user specifies the --help flag at runtime
helpers.help_message(
"""
Align a set of metagenomic sequencing data to a binned pangenome in
which the genes have been grouped by similar occurrence across genomes
(using the build_pangenome.nf workflow).
Those binned gene groups will be used to summarize a batch of metagenomic
data, and the similarity of gene abundance across metagenomes will be
tested for concordence with the similarity across genomes.
By applying the results of gene binning to the metagenome alignment
results using the same gene catalog, the diversity of a collection of
organisms can be collapsed into units of those co-occurrent genetic
elements.
Parameters:
--genes Path of single deduplicated amino acid FASTA file to be used for alignment
--reads Folder containing the set of FASTQ files to align against those genes
--reads_suffix File ending for all reads (default: ${params.reads_suffix})
--paired Set to true if input folder contains paired-end FASTQ files (default: false)
--samplesheet Optional file listing the FASTQ inputs to process (headers: sample,R1,R2)
--min_score_reads Minimum alignment score (default: ${params.min_score_reads})
--min_identity Minimum percent identity of the amino acid alignment required to retain the alignment
(default: ${params.min_identity}, ranges 0-100)
--max_evalue Maximum E-value threshold used to filter all alignments
(default: ${params.max_evalue})
--aligner Algorithm used for alignment (default: ${params.aligner}, options: diamond, blast)
--query_gencode Genetic code used for conceptual translation of genome sequences
(default: ${params.query_gencode})
--max_overlap Any alignment which overlaps a higher-scoring alignment by more than this
amount will be filtered out (default: ${params.max_overlap}, range: 0-100)
--aln_fmt Column headings used for alignment outputs (see DIAMOND documentation for details)
(default: ${params.aln_fmt})
--gene_bins Grouping of genes into bins (e.g. gene_bins.csv)
--group_profile Gene content of genome groups (e.g. group_profile.csv)
--output Folder where output files will be written
--metadata Optional: Metadata table used to compare samples (CSV)
--category Optional: Column from metadata table used for comparison
--min_n_reads Exclude any samples with fewer than this number of reads aligned (default: 0)
--min_n_genes Exclude any samples with fewer than this number of genes detected (default: 0)
""",
params.help
)

// Make sure that the required parameters were provided
helpers.require_param(params.genes, "genes")
helpers.require_param(params.gene_bins, "gene_bins")
helpers.require_param(params.group_profile, "group_profile")
helpers.require_param(params.output, "output")

// If there is no samplesheet
if ( "${params.samplesheet}" != "false" ){
// Requre the reads parameter
helpers.require_param(params.reads, "reads")
}

// Get the reads either from samplesheet or the input folder
find_reads()

// Align those reads against the centroids
align_reads(
file("${params.genes}"),
find_reads.out
)

// Get the gene bins
gene_bins = file(params.gene_bins, checkIfExists: true)

// Get the genome groups
genome_groups = file(params.genome_groups, checkIfExists: true)

// Get the genome groups
group_profile = file(params.group_profile, checkIfExists: true)

// Get the metadata table
metadata = file(params.metadata, checkIfExists: true)

// Bin the metagenomes
bin_metagenomes(
align_reads.out.csv,
gene_bins,
genome_groups,
group_profile,
metadata
)

}

0 comments on commit 110f691

Please sign in to comment.