diff --git a/src/components/Menu.astro b/src/components/Menu.astro
index 4a80fe0a..e95cca47 100644
--- a/src/components/Menu.astro
+++ b/src/components/Menu.astro
@@ -37,16 +37,10 @@ const isHomepage = currentPath === "/" || currentPath === "/index.html";
             Examples
diff --git a/src/pages/example1.md b/src/pages/example1.md
index 7c68420e..c4df72a5 100644
--- a/src/pages/example1.md
+++ b/src/pages/example1.md
@@ -7,96 +7,86 @@ layout: "@layouts/MarkdownPage.astro"
- This example shows how to write a pipeline with two simple Bash processes, so that the results produced by the first process are consumed by the second process.
+ This example shows a simple Nextflow pipeline consisting of two Bash processes.
 ```groovy
 #!/usr/bin/env nextflow
 
-params.in = "$baseDir/data/sample.fa"
+/*
+ * Pipeline parameters
+ */
+
+// Primary input
+params.greeting = "Hello World!"
 
 /*
- * Split a fasta file into multiple files
+ * Redirect a string to a text file
  */
-process splitSequences {
+process sayHello {
 
     input:
-    path 'input.fa'
+    val x
 
     output:
-    path 'seq_*'
+    path 'output.txt'
 
+    script:
     """
-    awk '/^>/{f="seq_"++d} {print > f}' < input.fa
+    echo '$x' > output.txt
     """
 }
 
 /*
- * Reverse the sequences
+ * Convert lowercase letters to uppercase letters
  */
-process reverse {
+process convertToUpper {
 
     input:
-    path x
+    path y
 
     output:
    stdout
 
+    script:
     """
-    cat $x | rev
+    cat $y | tr '[a-z]' '[A-Z]'
     """
 }
 
 /*
- * Define the workflow
+ * Workflow definition
  */
 workflow {
-    splitSequences(params.in) \
-        | reverse \
-        | view
-}
-```
-
-
-
-### Synopsis
-
-- **Line 1** The script starts with a shebang declaration. This allows you to launch your pipeline just like any other Bash script.
-- **Line 3**: Declares a pipeline parameter named `params.in` that is initialized with the value `$HOME/sample.fa`. This value can be overridden when launching the pipeline, by simply adding the option `--in
- With Nextflow, you are not limited to Bash scripts -- you can use any scripting language! In other words, for each process you can use the language that best fits the specific task or that you simply prefer.
+ This example shows a simple Nextflow pipeline consisting of two processes written in different languages.
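As a quick aside on the new `sayHello` process above, the following is a minimal, self-contained sketch of the same `val`-input pattern; the `writeGreeting` process name and its greeting text are made up for illustration and are not part of the example:

```groovy
#!/usr/bin/env nextflow

// Hypothetical parameter, used only for this illustration
params.greeting = 'Hello from a value input'

/*
 * A `val` input is interpolated directly into the script string,
 * exactly like `$x` in the sayHello process above; the `path` output
 * is picked up from the task work directory.
 */
process writeGreeting {

    input:
    val message

    output:
    path 'greeting.txt'

    script:
    """
    echo '$message' > greeting.txt
    """
}

workflow {
    writeGreeting(params.greeting)
    writeGreeting.out.view { it.text.trim() }
}
```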
 ```groovy
 #!/usr/bin/env nextflow
 
+/*
+ * Pipeline parameters
+ */
+
+// Range
 params.range = 100
 
 /*
  * A trivial Perl script that produces a list of number pairs
  */
 process perlTask {
+
+    input:
+    val x
+
     output:
     stdout
@@ -29,7 +38,7 @@ process perlTask {
     use warnings;
 
     my $count;
-    my $range = !{params.range};
+    my $range = !{x};
     for ($count = 0; $count < 10; $count++) {
         print rand($range) . ', ' . rand($range) . "\n";
     }
@@ -40,12 +49,14 @@ process perlTask {
  * A Python script which parses the output of the previous script
  */
 process pyTask {
+
     input:
     stdin
 
     output:
     stdout
 
+    script:
     """
     #!/usr/bin/env python
     import sys
@@ -64,7 +75,18 @@ process pyTask {
 }
 
 workflow {
-    perlTask | pyTask | view
+
+    // Creates channel using the Channel.of() channel factory
+    range_ch = Channel.of(params.range)
+
+    // A Perl script that produces a list of number pairs
+    perlTask(range_ch)
+
+    // A Python script which parses the output of the previous script
+    pyTask(perlTask.out)
+
+    // View pyTask output
+    pyTask.out.view()
 }
 ```
@@ -72,9 +94,16 @@ workflow {
 
 ### Synopsis
 
-In the above example we define a simple pipeline with two processes.
+This example shows a simple Nextflow pipeline consisting of two processes written in different languages. The `perlTask` process starts with a Perl _shebang_ declaration and executes a Perl script that produces pairs of numbers. Since Perl uses the `$` character for variables, the special `shell` block is used instead of the normal `script` block to distinguish the Perl variables from Nextflow variables. Similarly, the `pyTask` process starts with a Python _shebang_ declaration. It takes the output from the Perl script and executes a Python script that averages the number pairs. The output from the `pyTask` process is then printed to screen.
+
+### Try it
+
+To try this pipeline:
+
+1. Follow the [Nextflow installation guide](https://www.nextflow.io/docs/latest/install.html#install-nextflow) to install Nextflow (if not already available).
+2. Copy the script above and save it as `mixed-languages.nf`.
+3. Launch the pipeline:
 
-The first process executes a Perl script, because the script block definition starts
-with a Perl _shebang_ declaration (line 14). Since Perl uses the `$` character for variables, we use the special `shell` block instead of the normal `script` block to easily distinguish the Perl variables from the Nextflow variables.
+    nextflow run mixed-languages.nf
 
-In the same way, the second process will execute a Python script, because the script block starts with a Python shebang (line 36).
+**NOTE**: To run this example with versions of Nextflow older than 22.04.0, you must include the `-dsl2` flag with `nextflow run`.
diff --git a/src/pages/example3.md b/src/pages/example3.md
index 89373cab..d32c2661 100644
--- a/src/pages/example3.md
+++ b/src/pages/example3.md
@@ -1,111 +1,152 @@
 ---
-title: BLAST pipeline
+title: RNA-Seq pipeline
 layout: "@layouts/MarkdownPage.astro"
 ---
- This example splits a FASTA file into chunks and executes a BLAST query for each chunk in parallel. Then, all the sequences for the top hits are collected and merged into a single result file.
+ This example shows a basic RNA-Seq pipeline.
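The `shell` block called out in the synopsis above is easier to picture with both variable styles side by side. Below is a small hedged sketch: the `shellDemo` process and its parameter are placeholders, and Bash stands in for Perl, but the `!{...}` versus `$` distinction is the same:

```groovy
#!/usr/bin/env nextflow

// Hypothetical parameter, used only for this illustration
params.name = 'world'

/*
 * In a `shell` block, Nextflow values are referenced with !{...},
 * so plain $variables are left untouched for the foreign interpreter
 * (Bash here, Perl in the perlTask process above).
 */
process shellDemo {

    input:
    val name

    output:
    stdout

    shell:
    '''
    greeting="Hello"
    echo "$greeting, !{name}!"
    '''
}

workflow {
    shellDemo(params.name) | view
}
```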
 ```groovy
 #!/usr/bin/env nextflow
 
 /*
- * Defines the pipeline input parameters (with a default value for each one).
- * Each of the following parameters can be specified as command line options.
+ * Pipeline parameters
  */
-params.query = "$baseDir/data/sample.fa"
-params.db = "$baseDir/blast-db/pdb/tiny"
-params.out = "result.txt"
-params.chunkSize = 100
-db_name = file(params.db).name
-db_dir = file(params.db).parent
+// Input data
+params.reads = "${workflow.projectDir}/data/ggal/ggal_gut_{1,2}.fq"
+// Reference file
+params.transcriptome = "${workflow.projectDir}/data/ggal/ggal_1_48850000_49020000.Ggal71.500bpflank.fa"
 
-workflow {
-    /*
-     * Create a channel emitting the given query fasta file(s).
-     * Split the file into chunks containing as many sequences as defined by the parameter 'chunkSize'.
-     * Finally, assign the resulting channel to the variable 'ch_fasta'
-     */
-    Channel
-        .fromPath(params.query)
-        .splitFasta(by: params.chunkSize, file:true)
-        .set { ch_fasta }
-
-    /*
-     * Execute a BLAST job for each chunk emitted by the 'ch_fasta' channel
-     * and emit the resulting BLAST matches.
-     */
-    ch_hits = blast(ch_fasta, db_dir)
-
-    /*
-     * Each time a file emitted by the 'blast' process, an extract job is executed,
-     * producing a file containing the matching sequences.
-     */
-    ch_sequences = extract(ch_hits, db_dir)
-
-    /*
-     * Collect all the sequences files into a single file
-     * and print the resulting file contents when complete.
-     */
-    ch_sequences
-        .collectFile(name: params.out)
-        .view { file -> "matching sequences:\n ${file.text}" }
+// Output directory
+params.outdir = "results"
+
+/*
+ * Index reference transcriptome file
+ */
+process INDEX {
+    tag "$transcriptome.simpleName"
+    container "community.wave.seqera.io/library/salmon:1.10.3--482593b6cd04c9b7"
+    conda "bioconda::salmon=1.10.3"
+
+    input:
+    path transcriptome
+
+    output:
+    path 'index'
+
+    script:
+    """
+    salmon index --threads $task.cpus -t $transcriptome -i index
+    """
+}
+
+/*
+ * Generate FastQC reports
+ */
+process FASTQC {
+    tag "FASTQC on $sample_id"
+    publishDir params.outdir, mode:'copy'
+    container "community.wave.seqera.io/library/fastqc:0.12.1--5cfd0f3cb6760c42"
+    conda "bioconda::fastqc=0.12.1"
+
+    input:
+    tuple val(sample_id), path(reads)
+
+    output:
+    path "fastqc_${sample_id}_logs"
+
+    script:
+    """
+    mkdir fastqc_${sample_id}_logs
+    fastqc -o fastqc_${sample_id}_logs -f fastq -q ${reads}
+    """
 }
 
+/*
+ * Quantify reads
+ */
+process QUANT {
+    tag "$pair_id"
+    publishDir params.outdir, mode:'copy'
+    container "community.wave.seqera.io/library/salmon:1.10.3--482593b6cd04c9b7"
+    conda "bioconda::salmon=1.10.3"
 
-process blast {
     input:
-    path 'query.fa'
-    path db
+    path index
+    tuple val(pair_id), path(reads)
 
     output:
-    path 'top_hits'
+    path pair_id
 
+    script:
     """
-    blastp -db $db/$db_name -query query.fa -outfmt 6 > blast_result
-    cat blast_result | head -n 10 | cut -f 2 > top_hits
+    salmon quant --threads $task.cpus --libType=U -i $index -1 ${reads[0]} -2 ${reads[1]} -o $pair_id
     """
 }
 
+/*
+ * Generate MultiQC report
+ */
+process MULTIQC {
+    publishDir params.outdir, mode:'copy'
+    container "community.wave.seqera.io/library/multiqc:1.24.1--789bc3917c8666da"
+    conda "bioconda::multiqc=1.24.1"
 
-process extract {
     input:
-    path 'top_hits'
-    path db
+    path '*'
 
     output:
-    path 'sequences'
+    path 'multiqc_report.html'
 
+    script:
     """
-    blastdbcmd -db $db/$db_name -entry_batch top_hits | head -n 10 > sequences
+    multiqc .
""" } + +workflow { + + // Paired reference data + read_pairs_ch = channel.fromFilePairs( params.reads, checkIfExists: true ) + + // Index reference transcriptome file + INDEX(params.transcriptome) + + // Generate FastQC reports + FASTQC(read_pairs_ch) + + // Quantify reads + QUANT(INDEX.out, read_pairs_ch) + + // Generate MultiQC report + MULTIQC(QUANT.out.mix(FASTQC.out).collect()) +} ```- This example shows how to put together a basic RNA-Seq pipeline. It maps a collection of read-pairs to a given reference genome and outputs the respective transcript model. + This example shows a simple variant calling pipeline using container technology.
 ```groovy
-#!/usr/bin/env nextflow
-
 /*
- * The following pipeline parameters specify the reference genomes
- * and read pairs and can be provided as command line options
+ * Pipeline parameters
  */
-params.reads = "$baseDir/data/ggal/ggal_gut_{1,2}.fq"
-params.transcriptome = "$baseDir/data/ggal/ggal_1_48850000_49020000.Ggal71.500bpflank.fa"
-params.outdir = "results"
 
-workflow {
-    read_pairs_ch = channel.fromFilePairs( params.reads, checkIfExists: true )
+// Primary input
+params.reads_bam = "${workflow.projectDir}/data/bam/*.bam"
 
-    INDEX(params.transcriptome)
-    FASTQC(read_pairs_ch)
-    QUANT(INDEX.out, read_pairs_ch)
-}
+// Accessory files
+params.reference = "${workflow.projectDir}/data/ref/ref.fasta"
+params.reference_index = "${workflow.projectDir}/data/ref/ref.fasta.fai"
+params.reference_dict = "${workflow.projectDir}/data/ref/ref.dict"
+params.calling_intervals = "${workflow.projectDir}/data/ref/intervals.bed"
+
+// Base name for final output file
+params.cohort_name = "family_trio"
+
+/*
+ * Generate BAM index file
+ */
+process SAMTOOLS_INDEX {
 
-process INDEX {
-    tag "$transcriptome.simpleName"
+    container 'community.wave.seqera.io/library/samtools:1.20--b5dfbd93de237464'
+    conda "bioconda::samtools=1.20"
 
     input:
-    path transcriptome
+    path input_bam
 
     output:
-    path 'index'
+    tuple path(input_bam), path("${input_bam}.bai")
 
-    script:
     """
-    salmon index --threads $task.cpus -t $transcriptome -i index
+    samtools index '$input_bam'
+
     """
 }
 
-process FASTQC {
-    tag "FASTQC on $sample_id"
-    publishDir params.outdir
+/*
+ * Call variants with GATK HaplotypeCaller in GVCF mode
+ */
+process GATK_HAPLOTYPECALLER {
+
+    container "community.wave.seqera.io/library/gatk4:4.5.0.0--730ee8817e436867"
+    conda "bioconda::gatk4=4.5.0.0"
 
     input:
-    tuple val(sample_id), path(reads)
+    tuple path(input_bam), path(input_bam_index)
+    path ref_fasta
+    path ref_index
+    path ref_dict
+    path interval_list
 
     output:
-    path "fastqc_${sample_id}_logs"
+    path "${input_bam}.g.vcf"
+    path "${input_bam}.g.vcf.idx"
 
-    script:
     """
-    fastqc.sh "$sample_id" "$reads"
+    gatk HaplotypeCaller \
+        -R ${ref_fasta} \
+        -I ${input_bam} \
+        -O ${input_bam}.g.vcf \
+        -L ${interval_list} \
+        -ERC GVCF
     """
 }
 
-process QUANT {
-    tag "$pair_id"
-    publishDir params.outdir
+/*
+ * Consolidate GVCFs and apply joint genotyping analysis
+ */
+process GATK_JOINTGENOTYPING {
+
+    container "community.wave.seqera.io/library/gatk4:4.5.0.0--730ee8817e436867"
+    conda "bioconda::gatk4=4.5.0.0"
 
     input:
-    path index
-    tuple val(pair_id), path(reads)
+    path vcfs
+    path idxs
+    val cohort_name
+    path ref_fasta
+    path ref_index
+    path ref_dict
+    path interval_list
 
     output:
-    path pair_id
+    path "${cohort_name}.joint.vcf"
+    path "${cohort_name}.joint.vcf.idx"
 
     script:
+    def input_vcfs = vcfs.collect { "-V ${it}" }.join(' ')
     """
-    salmon quant --threads $task.cpus --libType=U -i $index -1 ${reads[0]} -2 ${reads[1]} -o $pair_id
+    gatk GenomicsDBImport \
+        ${input_vcfs} \
+        --genomicsdb-workspace-path ${cohort_name}_gdb \
+        -L ${interval_list}
+
+    gatk GenotypeGVCFs \
+        -R ${ref_fasta} \
+        -V gendb://${cohort_name}_gdb \
+        -O ${cohort_name}.joint.vcf \
+        -L ${interval_list}
     """
 }
+
+workflow {
+
+    // Create input channel from BAM files
+    // We convert it to a tuple with the file name and the file path
+    // See https://www.nextflow.io/docs/latest/script.html#getting-file-attributes
+    bam_ch = Channel.fromPath(params.reads_bam, checkIfExists: true)
+
+    // Create reference channels using the fromPath channel factory
+    // The collect converts from a queue channel to a value channel
+    // See https://www.nextflow.io/docs/latest/channel.html#channel-types for details
+    ref_ch = Channel.fromPath(params.reference, checkIfExists: true).collect()
+    ref_index_ch = Channel.fromPath(params.reference_index, checkIfExists: true).collect()
+    ref_dict_ch = Channel.fromPath(params.reference_dict, checkIfExists: true).collect()
+    calling_intervals_ch = Channel.fromPath(params.calling_intervals, checkIfExists: true).collect()
+
+    // Create index file for input BAM file
+    SAMTOOLS_INDEX(bam_ch)
+
+    // Call variants from the indexed BAM file
+    GATK_HAPLOTYPECALLER(
+        SAMTOOLS_INDEX.out,
+        ref_ch,
+        ref_index_ch,
+        ref_dict_ch,
+        calling_intervals_ch
+    )
+
+    all_vcfs = GATK_HAPLOTYPECALLER.out[0].collect()
+    all_tbis = GATK_HAPLOTYPECALLER.out[1].collect()
+
+    // Consolidate GVCFs and apply joint genotyping analysis
+    GATK_JOINTGENOTYPING(
+        all_vcfs,
+        all_tbis,
+        params.cohort_name,
+        ref_ch,
+        ref_index_ch,
+        ref_dict_ch,
+        calling_intervals_ch
+    )
+}
 ```
- This example shows how to put together a basic Machine Learning pipeline. It fetches a dataset from OpenML, trains a variety of machine learning models on a prediction target, and selects the best model based on some evaluation criteria.
+ This example shows how to put together a basic Machine Learning pipeline.
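To make the queue-channel-versus-value-channel remark above concrete, here is a small self-contained sketch; the file patterns and the `CHECKSUM` process are placeholders, not part of the example:

```groovy
#!/usr/bin/env nextflow

// Hypothetical paths, used only for this illustration
params.samples   = "data/bam/*.bam"
params.reference = "data/ref/ref.fasta"

/*
 * A toy stand-in for a real tool: it just checksums a sample together
 * with the shared reference file.
 */
process CHECKSUM {

    input:
    path sample
    path reference

    output:
    stdout

    script:
    """
    md5sum '$sample' '$reference'
    """
}

workflow {
    samples_ch = Channel.fromPath(params.samples, checkIfExists: true)

    // Without collect() the reference would be a queue channel with a single
    // element, consumed after the first task; collect() turns it into a value
    // channel that is re-read for every sample, as in the workflow above.
    ref_ch = Channel.fromPath(params.reference, checkIfExists: true).collect()

    CHECKSUM(samples_ch, ref_ch) | view
}
```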
 ```groovy
@@ -48,22 +48,22 @@ workflow {