From b9bbbdcfca7ece0d011ac1225ce6818b33720f48 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Mon, 23 Jul 2018 16:14:23 -0400 Subject: [PATCH] Wdlupdate (#23) * added haplotypcaller nio, added disk sizing within task. * update gatk image to 4.0.6.0 , minor correction to picard command in JD * Added conditional task to convert input CRAM to bam before running haplotypcaller. * updated gatk version in joint discovery json --- haplotypecaller-gvcf-gatk4-nio.wdl | 247 ++++++++++++++++++ ...typecaller-gvcf-gatk4.hg38.wgs.inputs.json | 46 ++-- haplotypecaller-gvcf-gatk4.wdl | 79 +++++- ...discovery-gatk4-local.hg38.wgs.inputs.json | 2 +- joint-discovery-gatk4-local.wdl | 4 +- joint-discovery-gatk4.hg38.wgs.inputs.json | 2 +- 6 files changed, 346 insertions(+), 34 deletions(-) create mode 100644 haplotypecaller-gvcf-gatk4-nio.wdl diff --git a/haplotypecaller-gvcf-gatk4-nio.wdl b/haplotypecaller-gvcf-gatk4-nio.wdl new file mode 100644 index 0000000..0d6faef --- /dev/null +++ b/haplotypecaller-gvcf-gatk4-nio.wdl @@ -0,0 +1,247 @@ +## Copyright Broad Institute, 2017 +## +## This WDL workflow runs HaplotypeCaller from GATK4 in GVCF mode on a single sample +## according to the GATK Best Practices (June 2016), scattered across intervals. +## +## Requirements/expectations : +## - One analysis-ready BAM file for a single sample (as identified in RG:SM) +## - Set of variant calling intervals lists for the scatter, provided in a file +## +## Outputs : +## - One GVCF file and its index +## +## Cromwell version support +## - Successfully tested on v31 +## - Does not work on versions < v23 due to output syntax +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. 
Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the dockers +## for detailed licensing information pertaining to the included programs. + +# WORKFLOW DEFINITION +workflow HaplotypeCallerGvcf_GATK4 { + File input_bam + File input_bam_index + File ref_dict + File ref_fasta + File ref_fasta_index + File scattered_calling_intervals_list + + Boolean? make_gvcf + Boolean making_gvcf = select_first([make_gvcf,true]) + + String? gatk_docker_override + String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.0.6.0"]) + String? gatk_path_override + String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) + String? gitc_docker_override + String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"]) + + Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) + + #is the input a cram file? + Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram" + + String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam") + String vcf_basename = sample_basename + String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz" + String output_filename = vcf_basename + output_suffix + + # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. + # If we take the number we are scattering by and reduce by 20 we will have enough disk space + # to account for the fact that the data is quite uneven across the shards. 
+ Int potential_hc_divisor = length(scattered_calling_intervals) - 20 + Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1 + + + if ( is_cram ) { + call CramToBamTask { + input: + input_cram = input_bam, + sample_name = sample_basename, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + docker = gitc_docker + } + } + + # Call variants in parallel over grouped calling intervals + scatter (interval_file in scattered_calling_intervals) { + + # Generate GVCF by interval + call HaplotypeCaller { + input: + input_bam = select_first([CramToBamTask.output_bam, input_bam]), + input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]), + interval_list = interval_file, + output_filename = output_filename, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + hc_scatter = hc_divisor, + make_gvcf = making_gvcf, + docker = gatk_docker, + gatk_path = gatk_path + } + } + + # Merge per-interval GVCFs + call MergeGVCFs { + input: + input_vcfs = HaplotypeCaller.output_vcf, + input_vcfs_indexes = HaplotypeCaller.output_vcf_index, + output_filename = output_filename, + docker = gatk_docker, + gatk_path = gatk_path + } + + # Outputs that will be retained when execution is complete + output { + File output_vcf = MergeGVCFs.output_vcf + File output_vcf_index = MergeGVCFs.output_vcf_index + } +} + +# TASK DEFINITIONS + +task CramToBamTask { + # Command parameters + File ref_fasta + File ref_fasta_index + File ref_dict + File input_cram + String sample_name + + # Runtime parameters + String docker + Int? machine_mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? 
preemptible_attempts + + Float output_bam_size = size(input_cram, "GB") / 0.60 + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20 + + command { + set -e + set -o pipefail + + samtools view -h -T ${ref_fasta} ${input_cram} | + samtools view -b -o ${sample_name}.bam - + samtools index -b ${sample_name}.bam + mv ${sample_name}.bam.bai ${sample_name}.bai + } + runtime { + docker: docker + memory: select_first([machine_mem_gb, 15]) + " GB" + disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" + preemptible: preemptible_attempts + } + output { + File output_bam = "${sample_name}.bam" + File output_bai = "${sample_name}.bai" + } +} + +# HaplotypeCaller per-sample in GVCF mode +task HaplotypeCaller { + String input_bam + String input_bam_index + File interval_list + String output_filename + File ref_dict + File ref_fasta + File ref_fasta_index + Float? contamination + Boolean make_gvcf + Int hc_scatter + + String gatk_path + String? java_options + String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"]) + + # Runtime parameters + String docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? 
preemptible_attempts + + Int machine_mem_gb = select_first([mem_gb, 7]) + Int command_mem_gb = machine_mem_gb - 1 + + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20 + + command <<< + set -e + + ${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \ + HaplotypeCaller \ + -R ${ref_fasta} \ + -I ${input_bam} \ + -L ${interval_list} \ + -O ${output_filename} \ + -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf} + >>> + + runtime { + docker: docker + memory: machine_mem_gb + " GB" + disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" + preemptible: select_first([preemptible_attempts, 3]) + } + + output { + File output_vcf = "${output_filename}" + File output_vcf_index = "${output_filename}.tbi" + } +} +# Merge GVCFs generated per-interval for the same sample +task MergeGVCFs { + Array[File] input_vcfs + Array[File] input_vcfs_indexes + String output_filename + + String gatk_path + + # Runtime parameters + String docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? 
preemptible_attempts + + Int machine_mem_gb = select_first([mem_gb, 3]) + Int command_mem_gb = machine_mem_gb - 1 + + command <<< + set -e + + ${gatk_path} --java-options "-Xmx${command_mem_gb}G" \ + MergeVcfs \ + --INPUT ${sep=' --INPUT ' input_vcfs} \ + --OUTPUT ${output_filename} + >>> + + runtime { + docker: docker + memory: machine_mem_gb + " GB" + disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD" + preemptible: select_first([preemptible_attempts, 3]) + } + + + output { + File output_vcf = "${output_filename}" + File output_vcf_index = "${output_filename}.tbi" + } +} + diff --git a/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json b/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json index 19f0add..0709c4e 100644 --- a/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json +++ b/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json @@ -1,7 +1,9 @@ { "##_COMMENT1": "INPUT BAM", - "HaplotypeCallerGvcf_GATK4.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bam", - "HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bai", + "#HaplotypeCallerGvcf_GATK4.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bam", + "#HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bai", + "HaplotypeCallerGvcf_GATK4.input_bam": "gs://broad-public-datasets/NA12878/NA12878.cram", + "HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://broad-public-datasets/NA12878/NA12878.cram.crai", "##_COMMENT2": "REFERENCE FILES", "HaplotypeCallerGvcf_GATK4.ref_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict", @@ -10,27 +12,33 @@ "##_COMMENT3": "INTERVALS", "HaplotypeCallerGvcf_GATK4.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/hg38_wgs_scattered_calling_intervals.txt", + + "##_COMMENT4": "MISCELLANEOUS PARAMETERS", + 
"#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.make_gvcf": "True", + "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.contamination": "Float? (optional)", - "##_COMMENT4": "DOCKERS", - "HaplotypeCallerGvcf_GATK4.gatk_docker": "broadinstitute/gatk:4.0.4.0", + "##_COMMENT5": "DOCKERS", + "#HaplotypeCallerGvcf_GATK4.gatk_docker_override": "String? (optional)", + "#HaplotypeCallerGvcf_GATK4.gitc_docker_override": "String? (optional)", - "##_COMMENT5": "PATHS", - "HaplotypeCallerGvcf_GATK4.gatk_path": "/gatk/gatk", - "HaplotypeCallerGvcf_GATK4.HaplotypeCaller.make_gvcf": "True", - "##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.contamination": "(optional) Float?", + "##_COMMENT6": "PATHS", + "#HaplotypeCallerGvcf_GATK4.gatk_path_override": "String? (optional)", - "##_COMMENT6": "JAVA OPTIONS", - "##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.java_options": "(optional) String?", + "##_COMMENT7": "JAVA OPTIONS", + "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.java_options": "String? (optional)", - "##_COMMENT7": "MEMORY ALLOCATION", - "##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.mem_gb": "(optional) Int?", - "##HaplotypeCallerGvcf_GATK4.MergeGVCFs.mem_gb": "(optional) Int?", + "##_COMMENT8": "MEMORY ALLOCATION", + "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.mem_gb": "Int? (optional)", + "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.mem_gb": "Int? (optional)", + "#HaplotypeCallerGvcf_GATK4.CramToBamTask.machine_mem_gb": "Int? (optional)", - "##_COMMENT8": "DISK SIZE ALLOCATION", - "##HaplotypeCallerGvcf_GATK4.MergeGVCFs.disk_space_gb": "(optional) Int?", - "##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.disk_space_gb": "(optional) Int?", + "##_COMMENT9": "DISK SIZE ALLOCATION", + "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.disk_space_gb": "Int? (optional)", + "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.disk_space_gb": "Int? (optional)", + "#HaplotypeCallerGvcf_GATK4.CramToBamTask.disk_space_gb": "Int? 
(optional)", - "##_COMMENT9": "PREEMPTION", - "##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.preemptible_attempts": "(optional) Int?", - "##HaplotypeCallerGvcf_GATK4.MergeGVCFs.preemptible_attempts": "(optional) Int?" + "##_COMMENT10": "PREEMPTION", + "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.preemptible_attempts": "Int? (optional)", + "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.preemptible_attempts": "Int? (optional)", + "#HaplotypeCallerGvcf_GATK4.CramToBamTask.preemptible_attempts": "Int? (optional)" } diff --git a/haplotypecaller-gvcf-gatk4.wdl b/haplotypecaller-gvcf-gatk4.wdl index 9acf202..f5eb32c 100644 --- a/haplotypecaller-gvcf-gatk4.wdl +++ b/haplotypecaller-gvcf-gatk4.wdl @@ -35,19 +35,34 @@ workflow HaplotypeCallerGvcf_GATK4 { Boolean? make_gvcf Boolean making_gvcf = select_first([make_gvcf,true]) - String gatk_docker - - String gatk_path + String? gatk_docker_override + String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.0.6.0"]) + String? gatk_path_override + String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) + String? gitc_docker_override + String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"]) Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) - String sample_basename = basename(input_bam, ".bam") - - String vcf_basename = sample_basename + #is the input a cram file? 
+ Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram" + String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam") + String vcf_basename = sample_basename String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz" String output_filename = vcf_basename + output_suffix + if ( is_cram ) { + call CramToBamTask { + input: + input_cram = input_bam, + sample_name = sample_basename, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + docker = gitc_docker + } + } # Call variants in parallel over grouped calling intervals scatter (interval_file in scattered_calling_intervals) { @@ -55,8 +70,8 @@ workflow HaplotypeCallerGvcf_GATK4 { # Generate GVCF by interval call HaplotypeCaller { input: - input_bam = input_bam, - input_bam_index = input_bam_index, + input_bam = select_first([CramToBamTask.output_bam, input_bam]), + input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]), interval_list = interval_file, output_filename = output_filename, ref_dict = ref_dict, @@ -87,12 +102,51 @@ workflow HaplotypeCallerGvcf_GATK4 { # TASK DEFINITIONS +task CramToBamTask { + # Command parameters + File ref_fasta + File ref_fasta_index + File ref_dict + File input_cram + String sample_name + + # Runtime parameters + String docker + Int? machine_mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? 
preemptible_attempts + + Float output_bam_size = size(input_cram, "GB") / 0.60 + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20 + + command { + set -e + set -o pipefail + + samtools view -h -T ${ref_fasta} ${input_cram} | + samtools view -b -o ${sample_name}.bam - + samtools index -b ${sample_name}.bam + mv ${sample_name}.bam.bai ${sample_name}.bai + } + runtime { + docker: docker + memory: select_first([machine_mem_gb, 15]) + " GB" + disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" + preemptible: preemptible_attempts + } + output { + File output_bam = "${sample_name}.bam" + File output_bai = "${sample_name}.bai" + } +} + # HaplotypeCaller per-sample in GVCF mode task HaplotypeCaller { File input_bam File input_bam_index File interval_list - String output_filename File ref_dict File ref_fasta @@ -114,6 +168,9 @@ task HaplotypeCaller { Int machine_mem_gb = select_first([mem_gb, 7]) Int command_mem_gb = machine_mem_gb - 1 + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20 + command <<< set -e @@ -129,7 +186,7 @@ task HaplotypeCaller { runtime { docker: docker memory: machine_mem_gb + " GB" - disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD" + disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 3]) } @@ -138,7 +195,6 @@ task HaplotypeCaller { File output_vcf = "${output_filename}" File output_vcf_index = "${output_filename}.tbi" } } - # Merge GVCFs generated per-interval for the same sample task MergeGVCFs { Array[File] input_vcfs Array[File] input_vcfs_indexes String output_filename @@ -179,3 +235,4 @@ task MergeGVCFs { File output_vcf_index = "${output_filename}.tbi" } } + diff --git a/joint-discovery-gatk4-local.hg38.wgs.inputs.json
b/joint-discovery-gatk4-local.hg38.wgs.inputs.json index 37ea1be..cf620c8 100644 --- a/joint-discovery-gatk4-local.hg38.wgs.inputs.json +++ b/joint-discovery-gatk4-local.hg38.wgs.inputs.json @@ -39,7 +39,7 @@ "##_COMMENT4": "DOCKERS", "JointGenotyping.python_docker": "python:2.7", - "JointGenotyping.gatk_docker": "broadinstitute/gatk:4.0.4.0", + "JointGenotyping.gatk_docker": "broadinstitute/gatk:4.0.6.0", "##_COMMENT5": "PATHS", "JointGenotyping.gatk_path": "/gatk/gatk", diff --git a/joint-discovery-gatk4-local.wdl b/joint-discovery-gatk4-local.wdl index 14d482e..06028fa 100644 --- a/joint-discovery-gatk4-local.wdl +++ b/joint-discovery-gatk4-local.wdl @@ -860,8 +860,8 @@ task GatherMetrics { ${gatk_path} --java-options "-Xmx2g -Xms2g" \ AccumulateVariantCallingMetrics \ - --INPUT ${sep=" -INPUT " input_details_fofn} \ - --O ${output_prefix} + --INPUT ${sep=" --INPUT " input_details_fofn} \ + --OUTPUT ${output_prefix} >>> runtime { docker: docker diff --git a/joint-discovery-gatk4.hg38.wgs.inputs.json b/joint-discovery-gatk4.hg38.wgs.inputs.json index 4c34849..8610b56 100644 --- a/joint-discovery-gatk4.hg38.wgs.inputs.json +++ b/joint-discovery-gatk4.hg38.wgs.inputs.json @@ -37,7 +37,7 @@ "##_COMMENT4": "DOCKERS", "JointGenotyping.python_docker": "python:2.7", - "JointGenotyping.gatk_docker": "broadinstitute/gatk:4.0.4.0", + "JointGenotyping.gatk_docker": "broadinstitute/gatk:4.0.6.0", "##_COMMENT5": "PATHS", "JointGenotyping.gatk_path": "/gatk/gatk",