* Added HaplotypeCaller NIO and disk sizing within the task.
* Updated the GATK image to 4.0.6.0; minor correction to the Picard command in joint discovery.
* Added a conditional task to convert an input CRAM to BAM before running HaplotypeCaller.
* Updated the GATK version in the joint discovery JSON.
bshifaw authored Jul 23, 2018
1 parent 3087acc · commit b9bbbdc
Showing 6 changed files with 346 additions and 34 deletions.
@@ -0,0 +1,247 @@
## Copyright Broad Institute, 2017
##
## This WDL workflow runs HaplotypeCaller from GATK4 in GVCF mode on a single sample
## according to the GATK Best Practices (June 2016), scattered across intervals.
##
## Requirements/expectations :
## - One analysis-ready BAM file for a single sample (as identified in RG:SM)
## - Set of variant calling intervals lists for the scatter, provided in a file
##
## Outputs :
## - One GVCF file and its index
##
## Cromwell version support
## - Successfully tested on v31
## - Does not work on versions < v23 due to output syntax
##
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the dockers
## for detailed licensing information pertaining to the included programs.
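##
## For illustration only: the scattered_calling_intervals_list input is a plain-text file with
## one intervals file path per line, which read_lines() below turns into an array. The paths
## here are hypothetical:
##   gs://my-bucket/intervals/scatter-00/scattered.interval_list
##   gs://my-bucket/intervals/scatter-01/scattered.interval_list
##   gs://my-bucket/intervals/scatter-02/scattered.interval_list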
# WORKFLOW DEFINITION
workflow HaplotypeCallerGvcf_GATK4 {
  File input_bam
  File input_bam_index
  File ref_dict
  File ref_fasta
  File ref_fasta_index
  File scattered_calling_intervals_list

  Boolean? make_gvcf
  Boolean making_gvcf = select_first([make_gvcf, true])

  String? gatk_docker_override
  String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.0.6.0"])
  String? gatk_path_override
  String gatk_path = select_first([gatk_path_override, "/gatk/gatk"])
  String? gitc_docker_override
  String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"])
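
  # Each optional input above is resolved with the same select_first pattern: the supplied
  # override wins, otherwise the hard-coded default is used (e.g. the GATK 4.0.6.0 docker
  # image, or make_gvcf defaulting to true).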

  Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list)

  # Is the input a CRAM file?
  Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram"

  String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam")
  String vcf_basename = sample_basename
  String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz"
  String output_filename = vcf_basename + output_suffix
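
  # For illustration (hypothetical input): if input_bam = "gs://my-bucket/NA12878.cram", then
  #   is_cram         = true
  #   sample_basename = "NA12878"
  #   output_filename = "NA12878.g.vcf.gz"  (with the default making_gvcf = true)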

  # Disk is needed to localize the sharded input and output of the HaplotypeCaller scatter.
  # Dividing by (number of shards - 20) rather than the full shard count leaves enough headroom
  # to account for the fact that the data is quite uneven across the shards.
  Int potential_hc_divisor = length(scattered_calling_intervals) - 20
  Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1
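
  # For illustration: with 50 interval files, hc_divisor = 50 - 20 = 30; with 15 interval files
  # the subtraction would be non-positive, so hc_divisor falls back to 1.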

  if ( is_cram ) {
    call CramToBamTask {
      input:
        input_cram = input_bam,
        sample_name = sample_basename,
        ref_dict = ref_dict,
        ref_fasta = ref_fasta,
        ref_fasta_index = ref_fasta_index,
        docker = gitc_docker
    }
  }

  # Call variants in parallel over grouped calling intervals
  scatter (interval_file in scattered_calling_intervals) {

    # Generate GVCF by interval
    call HaplotypeCaller {
      input:
        # CramToBamTask only ran when the input was a CRAM, so its outputs are optional;
        # select_first falls back to the original BAM and index when the conversion was skipped.
        input_bam = select_first([CramToBamTask.output_bam, input_bam]),
        input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]),
        interval_list = interval_file,
        output_filename = output_filename,
        ref_dict = ref_dict,
        ref_fasta = ref_fasta,
        ref_fasta_index = ref_fasta_index,
        hc_scatter = hc_divisor,
        make_gvcf = making_gvcf,
        docker = gatk_docker,
        gatk_path = gatk_path
    }
  }

  # Merge per-interval GVCFs
  call MergeGVCFs {
    input:
      input_vcfs = HaplotypeCaller.output_vcf,
      input_vcfs_indexes = HaplotypeCaller.output_vcf_index,
      output_filename = output_filename,
      docker = gatk_docker,
      gatk_path = gatk_path
  }

  # Outputs that will be retained when execution is complete
  output {
    File output_vcf = MergeGVCFs.output_vcf
    File output_vcf_index = MergeGVCFs.output_vcf_index
  }
}

# TASK DEFINITIONS
task CramToBamTask {
  # Command parameters
  File ref_fasta
  File ref_fasta_index
  File ref_dict
  File input_cram
  String sample_name

  # Runtime parameters
  String docker
  Int? machine_mem_gb
  Int? disk_space_gb
  Boolean use_ssd = false
  Int? preemptible_attempts

  Float output_bam_size = size(input_cram, "GB") / 0.60
  Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
  Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20
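
  # For illustration (hypothetical sizes): a 15 GB CRAM with ~3 GB of reference files gives
  #   output_bam_size = 15 / 0.60 = 25 GB  (the BAM is assumed to be roughly 1.67x the CRAM)
  #   disk_size       = ceil(15 + 25 + 3) + 20 = 63 GB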

  command {
    set -e
    set -o pipefail

    samtools view -h -T ${ref_fasta} ${input_cram} |
      samtools view -b -o ${sample_name}.bam -
    samtools index -b ${sample_name}.bam
    mv ${sample_name}.bam.bai ${sample_name}.bai
  }
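  # The index produced by samtools (<sample>.bam.bai) is renamed to <sample>.bai above so that
  # it matches the output_bai file declared below.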
  runtime {
    docker: docker
    memory: select_first([machine_mem_gb, 15]) + " GB"
    disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
    preemptible: preemptible_attempts
  }
  output {
    File output_bam = "${sample_name}.bam"
    File output_bai = "${sample_name}.bai"
  }
}

# HaplotypeCaller per-sample in GVCF mode
task HaplotypeCaller {
  String input_bam
  String input_bam_index
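  # input_bam and input_bam_index are passed as String rather than File so Cromwell does not
  # localize them; GATK 4 can read the BAM directly (e.g. from a gs:// path) via NIO.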
  File interval_list
  String output_filename
  File ref_dict
  File ref_fasta
  File ref_fasta_index
  Float? contamination
  Boolean make_gvcf
  Int hc_scatter

  String gatk_path
  String? java_options
  String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"])
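  # For reference: these default HotSpot flags tighten the GC overhead limit, so the JVM throws
  # an OutOfMemoryError rather than thrashing when more than 50% of time is spent in GC while
  # freeing less than 10% of the heap.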

  # Runtime parameters
  String docker
  Int? mem_gb
  Int? disk_space_gb
  Boolean use_ssd = false
  Int? preemptible_attempts

  Int machine_mem_gb = select_first([mem_gb, 7])
  Int command_mem_gb = machine_mem_gb - 1

  Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
  Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20
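
  # For illustration (hypothetical sizes): a 30 GB BAM, hc_scatter = 30 and ~3 GB of reference
  # files give disk_size = ceil((30 + 30) / 30 + 3) + 20 = 25 GB per shard.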

  command <<<
    set -e

    ${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \
      HaplotypeCaller \
      -R ${ref_fasta} \
      -I ${input_bam} \
      -L ${interval_list} \
      -O ${output_filename} \
      -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf}
  >>>
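  # In the command above, the default=0 placeholder falls back to 0 when contamination is not
  # supplied, and the true/false placeholder adds "-ERC GVCF" only when make_gvcf is true.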

  runtime {
    docker: docker
    memory: machine_mem_gb + " GB"
    disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
    preemptible: select_first([preemptible_attempts, 3])
  }

  output {
    File output_vcf = "${output_filename}"
    File output_vcf_index = "${output_filename}.tbi"
  }
}

# Merge GVCFs generated per-interval for the same sample
task MergeGVCFs {
  Array[File] input_vcfs
  Array[File] input_vcfs_indexes
  String output_filename

  String gatk_path

  # Runtime parameters
  String docker
  Int? mem_gb
  Int? disk_space_gb
  Boolean use_ssd = false
  Int? preemptible_attempts

  Int machine_mem_gb = select_first([mem_gb, 3])
  Int command_mem_gb = machine_mem_gb - 1

  command <<<
    set -e

    ${gatk_path} --java-options "-Xmx${command_mem_gb}G" \
      MergeVcfs \
      --INPUT ${sep=' --INPUT ' input_vcfs} \
      --OUTPUT ${output_filename}
  >>>
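  # For illustration: the sep=' --INPUT ' placeholder joins the array elements, so with three
  # shards the rendered flags look like (hypothetical localized paths):
  #   --INPUT shard-0/out.g.vcf.gz --INPUT shard-1/out.g.vcf.gz --INPUT shard-2/out.g.vcf.gz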

  runtime {
    docker: docker
    memory: machine_mem_gb + " GB"
    disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD"
    preemptible: select_first([preemptible_attempts, 3])
  }

  output {
    File output_vcf = "${output_filename}"
    File output_vcf_index = "${output_filename}.tbi"
  }
}