From b9bbbdcfca7ece0d011ac1225ce6818b33720f48 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Mon, 23 Jul 2018 16:14:23 -0400 Subject: [PATCH] Wdlupdate (#23) * added haplotypcaller nio, added disk sizing within task. * update gatk image to 4.0.6.0 , minor correction to picard command in JD * Added conditional task to convert input CRAM to bam before running haplotypcaller. * updated gatk version in joint discovery json --- haplotypecaller-gvcf-gatk4-nio.wdl | 247 ++++++++++++++++++ ...typecaller-gvcf-gatk4.hg38.wgs.inputs.json | 46 ++-- haplotypecaller-gvcf-gatk4.wdl | 79 +++++- ...discovery-gatk4-local.hg38.wgs.inputs.json | 2 +- joint-discovery-gatk4-local.wdl | 4 +- joint-discovery-gatk4.hg38.wgs.inputs.json | 2 +- 6 files changed, 346 insertions(+), 34 deletions(-) create mode 100644 haplotypecaller-gvcf-gatk4-nio.wdl diff --git a/haplotypecaller-gvcf-gatk4-nio.wdl b/haplotypecaller-gvcf-gatk4-nio.wdl new file mode 100644 index 0000000..0d6faef --- /dev/null +++ b/haplotypecaller-gvcf-gatk4-nio.wdl @@ -0,0 +1,247 @@ +## Copyright Broad Institute, 2017 +## +## This WDL workflow runs HaplotypeCaller from GATK4 in GVCF mode on a single sample +## according to the GATK Best Practices (June 2016), scattered across intervals. +## +## Requirements/expectations : +## - One analysis-ready BAM file for a single sample (as identified in RG:SM) +## - Set of variant calling intervals lists for the scatter, provided in a file +## +## Outputs : +## - One GVCF file and its index +## +## Cromwell version support +## - Successfully tested on v31 +## - Does not work on versions < v23 due to output syntax +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. 
Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the dockers +## for detailed licensing information pertaining to the included programs. + +# WORKFLOW DEFINITION +workflow HaplotypeCallerGvcf_GATK4 { + File input_bam + File input_bam_index + File ref_dict + File ref_fasta + File ref_fasta_index + File scattered_calling_intervals_list + + Boolean? make_gvcf + Boolean making_gvcf = select_first([make_gvcf,true]) + + String? gatk_docker_override + String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.0.6.0"]) + String? gatk_path_override + String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) + String? gitc_docker_override + String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"]) + + Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) + + #is the input a cram file? + Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram" + + String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam") + String vcf_basename = sample_basename + String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz" + String output_filename = vcf_basename + output_suffix + + # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. + # If we take the number we are scattering by and reduce by 20 we will have enough disk space + # to account for the fact that the data is quite uneven across the shards. 
+ Int potential_hc_divisor = length(scattered_calling_intervals) - 20 + Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1 + + + if ( is_cram ) { + call CramToBamTask { + input: + input_cram = input_bam, + sample_name = sample_basename, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + docker = gitc_docker + } + } + + # Call variants in parallel over grouped calling intervals + scatter (interval_file in scattered_calling_intervals) { + + # Generate GVCF by interval + call HaplotypeCaller { + input: + input_bam = select_first([CramToBamTask.output_bam, input_bam]), + input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]), + interval_list = interval_file, + output_filename = output_filename, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + hc_scatter = hc_divisor, + make_gvcf = making_gvcf, + docker = gatk_docker, + gatk_path = gatk_path + } + } + + # Merge per-interval GVCFs + call MergeGVCFs { + input: + input_vcfs = HaplotypeCaller.output_vcf, + input_vcfs_indexes = HaplotypeCaller.output_vcf_index, + output_filename = output_filename, + docker = gatk_docker, + gatk_path = gatk_path + } + + # Outputs that will be retained when execution is complete + output { + File output_vcf = MergeGVCFs.output_vcf + File output_vcf_index = MergeGVCFs.output_vcf_index + } +} + +# TASK DEFINITIONS + +task CramToBamTask { + # Command parameters + File ref_fasta + File ref_fasta_index + File ref_dict + File input_cram + String sample_name + + # Runtime parameters + String docker + Int? machine_mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? 
preemptible_attempts + + Float output_bam_size = size(input_cram, "GB") / 0.60 + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20 + + command { + set -e + set -o pipefail + + samtools view -h -T ${ref_fasta} ${input_cram} | + samtools view -b -o ${sample_name}.bam - + samtools index -b ${sample_name}.bam + mv ${sample_name}.bam.bai ${sample_name}.bai + } + runtime { + docker: docker + memory: select_first([machine_mem_gb, 15]) + " GB" + disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" + preemptible: preemptible_attempts + } + output { + File output_bam = "${sample_name}.bam" + File output_bai = "${sample_name}.bai" + } +} + +# HaplotypeCaller per-sample in GVCF mode +task HaplotypeCaller { + String input_bam + String input_bam_index + File interval_list + String output_filename + File ref_dict + File ref_fasta + File ref_fasta_index + Float? contamination + Boolean make_gvcf + Int hc_scatter + + String gatk_path + String? java_options + String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"]) + + # Runtime parameters + String docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? 
preemptible_attempts + + Int machine_mem_gb = select_first([mem_gb, 7]) + Int command_mem_gb = machine_mem_gb - 1 + + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20 + + command <<< + set -e + + ${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \ + HaplotypeCaller \ + -R ${ref_fasta} \ + -I ${input_bam} \ + -L ${interval_list} \ + -O ${output_filename} \ + -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf} + >>> + + runtime { + docker: docker + memory: machine_mem_gb + " GB" + disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" + preemptible: select_first([preemptible_attempts, 3]) + } + + output { + File output_vcf = "${output_filename}" + File output_vcf_index = "${output_filename}.tbi" + } +} +# Merge GVCFs generated per-interval for the same sample +task MergeGVCFs { + Array[File] input_vcfs + Array[File] input_vcfs_indexes + String output_filename + + String gatk_path + + # Runtime parameters + String docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? 
preemptible_attempts + + Int machine_mem_gb = select_first([mem_gb, 3]) + Int command_mem_gb = machine_mem_gb - 1 + + command <<< + set -e + + ${gatk_path} --java-options "-Xmx${command_mem_gb}G" \ + MergeVcfs \ + --INPUT ${sep=' --INPUT ' input_vcfs} \ + --OUTPUT ${output_filename} + >>> + + runtime { + docker: docker + memory: machine_mem_gb + " GB" + disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD" + preemptible: select_first([preemptible_attempts, 3]) + } + + + output { + File output_vcf = "${output_filename}" + File output_vcf_index = "${output_filename}.tbi" + } +} + diff --git a/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json b/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json index 19f0add..0709c4e 100644 --- a/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json +++ b/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json @@ -1,7 +1,9 @@ { "##_COMMENT1": "INPUT BAM", - "HaplotypeCallerGvcf_GATK4.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bam", - "HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bai", + "#HaplotypeCallerGvcf_GATK4.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bam", + "#HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bai", + "HaplotypeCallerGvcf_GATK4.input_bam": "gs://broad-public-datasets/NA12878/NA12878.cram", + "HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://broad-public-datasets/NA12878/NA12878.cram.crai", "##_COMMENT2": "REFERENCE FILES", "HaplotypeCallerGvcf_GATK4.ref_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict", @@ -10,27 +12,33 @@ "##_COMMENT3": "INTERVALS", "HaplotypeCallerGvcf_GATK4.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/hg38_wgs_scattered_calling_intervals.txt", + + "##_COMMENT4": "MISCELLANEOUS PARAMETERS", + 
"#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.make_gvcf": "True", + "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.contamination": "Float? (optional)", - "##_COMMENT4": "DOCKERS", - "HaplotypeCallerGvcf_GATK4.gatk_docker": "broadinstitute/gatk:4.0.4.0", + "##_COMMENT5": "DOCKERS", + "#HaplotypeCallerGvcf_GATK4.gatk_docker_override": "String? (optional)", + "#HaplotypeCallerGvcf_GATK4.gitc_docker_override": "String? (optional)", - "##_COMMENT5": "PATHS", - "HaplotypeCallerGvcf_GATK4.gatk_path": "/gatk/gatk", - "HaplotypeCallerGvcf_GATK4.HaplotypeCaller.make_gvcf": "True", - "##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.contamination": "(optional) Float?", + "##_COMMENT6": "PATHS", + "#HaplotypeCallerGvcf_GATK4.gatk_path_override": "String? (optional)", - "##_COMMENT6": "JAVA OPTIONS", - "##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.java_options": "(optional) String?", + "##_COMMENT7": "JAVA OPTIONS", + "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.java_options": "String? (optional)", - "##_COMMENT7": "MEMORY ALLOCATION", - "##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.mem_gb": "(optional) Int?", - "##HaplotypeCallerGvcf_GATK4.MergeGVCFs.mem_gb": "(optional) Int?", + "##_COMMENT8": "MEMORY ALLOCATION", + "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.mem_gb": "Int? (optional)", + "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.mem_gb": "Int? (optional)", + "#HaplotypeCallerGvcf_GATK4.CramToBamTask.machine_mem_gb": "Int? (optional)", - "##_COMMENT8": "DISK SIZE ALLOCATION", - "##HaplotypeCallerGvcf_GATK4.MergeGVCFs.disk_space_gb": "(optional) Int?", - "##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.disk_space_gb": "(optional) Int?", + "##_COMMENT9": "DISK SIZE ALLOCATION", + "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.disk_space_gb": "Int? (optional)", + "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.disk_space_gb": "Int? (optional)", + "#HaplotypeCallerGvcf_GATK4.CramToBamTask.disk_space_gb": "Int? 
(optional)", - "##_COMMENT9": "PREEMPTION", - "##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.preemptible_attempts": "(optional) Int?", - "##HaplotypeCallerGvcf_GATK4.MergeGVCFs.preemptible_attempts": "(optional) Int?" + "##_COMMENT10": "PREEMPTION", + "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.preemptible_attempts": "Int? (optional)", + "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.preemptible_attempts": "Int? (optional)", + "#HaplotypeCallerGvcf_GATK4.CramToBamTask.preemptible_attempts": "Int? (optional)" } diff --git a/haplotypecaller-gvcf-gatk4.wdl b/haplotypecaller-gvcf-gatk4.wdl index 9acf202..f5eb32c 100644 --- a/haplotypecaller-gvcf-gatk4.wdl +++ b/haplotypecaller-gvcf-gatk4.wdl @@ -35,19 +35,34 @@ workflow HaplotypeCallerGvcf_GATK4 { Boolean? make_gvcf Boolean making_gvcf = select_first([make_gvcf,true]) - String gatk_docker - - String gatk_path + String? gatk_docker_override + String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.0.6.0"]) + String? gatk_path_override + String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) + String? gitc_docker_override + String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"]) Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) - String sample_basename = basename(input_bam, ".bam") - - String vcf_basename = sample_basename + #is the input a cram file? 
+ Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram" + String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam") + String vcf_basename = sample_basename String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz" String output_filename = vcf_basename + output_suffix + if ( is_cram ) { + call CramToBamTask { + input: + input_cram = input_bam, + sample_name = sample_basename, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + docker = gitc_docker + } + } # Call variants in parallel over grouped calling intervals scatter (interval_file in scattered_calling_intervals) { @@ -55,8 +70,8 @@ workflow HaplotypeCallerGvcf_GATK4 { # Generate GVCF by interval call HaplotypeCaller { input: - input_bam = input_bam, - input_bam_index = input_bam_index, + input_bam = select_first([CramToBamTask.output_bam, input_bam]), + input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]), interval_list = interval_file, output_filename = output_filename, ref_dict = ref_dict, @@ -87,12 +102,51 @@ workflow HaplotypeCallerGvcf_GATK4 { # TASK DEFINITIONS +task CramToBamTask { + # Command parameters + File ref_fasta + File ref_fasta_index + File ref_dict + File input_cram + String sample_name + + # Runtime parameters + String docker + Int? machine_mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? 
preemptible_attempts + + Float output_bam_size = size(input_cram, "GB") / 0.60 + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20 + + command { + set -e + set -o pipefail + + samtools view -h -T ${ref_fasta} ${input_cram} | + samtools view -b -o ${sample_name}.bam - + samtools index -b ${sample_name}.bam + mv ${sample_name}.bam.bai ${sample_name}.bai + } + runtime { + docker: docker + memory: select_first([machine_mem_gb, 15]) + " GB" + disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" + preemptible: preemptible_attempts + } + output { + File output_bam = "${sample_name}.bam" + File output_bai = "${sample_name}.bai" + } +} + # HaplotypeCaller per-sample in GVCF mode task HaplotypeCaller { File input_bam File input_bam_index File interval_list - String output_filename File ref_dict File ref_fasta @@ -114,6 +168,9 @@ task HaplotypeCaller { Int machine_mem_gb = select_first([mem_gb, 7]) Int command_mem_gb = machine_mem_gb - 1 + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20 + command <<< set -e @@ -129,7 +186,7 @@ task HaplotypeCaller { runtime { docker: docker memory: machine_mem_gb + " GB" - disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD" + disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 3]) } @@ -138,7 +195,6 @@ task HaplotypeCaller { File output_vcf = "${output_filename}" File output_vcf_index = "${output_filename}.tbi" } } - # Merge GVCFs generated per-interval for the same sample task MergeGVCFs { Array[File] input_vcfs Array[File] input_vcfs_indexes String output_filename @@ -179,3 +235,4 @@ task MergeGVCFs { File output_vcf_index = "${output_filename}.tbi" } } + diff --git a/joint-discovery-gatk4-local.hg38.wgs.inputs.json
b/joint-discovery-gatk4-local.hg38.wgs.inputs.json index 37ea1be..cf620c8 100644 --- a/joint-discovery-gatk4-local.hg38.wgs.inputs.json +++ b/joint-discovery-gatk4-local.hg38.wgs.inputs.json @@ -39,7 +39,7 @@ "##_COMMENT4": "DOCKERS", "JointGenotyping.python_docker": "python:2.7", - "JointGenotyping.gatk_docker": "broadinstitute/gatk:4.0.4.0", + "JointGenotyping.gatk_docker": "broadinstitute/gatk:4.0.6.0", "##_COMMENT5": "PATHS", "JointGenotyping.gatk_path": "/gatk/gatk", diff --git a/joint-discovery-gatk4-local.wdl b/joint-discovery-gatk4-local.wdl index 14d482e..06028fa 100644 --- a/joint-discovery-gatk4-local.wdl +++ b/joint-discovery-gatk4-local.wdl @@ -860,8 +860,8 @@ task GatherMetrics { ${gatk_path} --java-options "-Xmx2g -Xms2g" \ AccumulateVariantCallingMetrics \ - --INPUT ${sep=" -INPUT " input_details_fofn} \ - --O ${output_prefix} + --INPUT ${sep=" --INPUT " input_details_fofn} \ + --OUTPUT ${output_prefix} >>> runtime { docker: docker diff --git a/joint-discovery-gatk4.hg38.wgs.inputs.json b/joint-discovery-gatk4.hg38.wgs.inputs.json index 4c34849..8610b56 100644 --- a/joint-discovery-gatk4.hg38.wgs.inputs.json +++ b/joint-discovery-gatk4.hg38.wgs.inputs.json @@ -37,7 +37,7 @@ "##_COMMENT4": "DOCKERS", "JointGenotyping.python_docker": "python:2.7", - "JointGenotyping.gatk_docker": "broadinstitute/gatk:4.0.4.0", + "JointGenotyping.gatk_docker": "broadinstitute/gatk:4.0.6.0", "##_COMMENT5": "PATHS", "JointGenotyping.gatk_path": "/gatk/gatk",