Skip to content

Commit

Permalink
Wdlupdate (#23)
Browse files Browse the repository at this point in the history
* added haplotypcaller nio, added disk sizing within task.

* update gatk image to 4.0.6.0 , minor correction to picard command in JD

* Added conditional task to convert input CRAM to bam before running haplotypcaller.

* updated gatk version in joint discovery json
  • Loading branch information
bshifaw authored Jul 23, 2018
1 parent 3087acc commit b9bbbdc
Show file tree
Hide file tree
Showing 6 changed files with 346 additions and 34 deletions.
247 changes: 247 additions & 0 deletions haplotypecaller-gvcf-gatk4-nio.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
## Copyright Broad Institute, 2017
##
## This WDL workflow runs HaplotypeCaller from GATK4 in GVCF mode on a single sample
## according to the GATK Best Practices (June 2016), scattered across intervals.
##
## Requirements/expectations :
## - One analysis-ready BAM file for a single sample (as identified in RG:SM)
## - Set of variant calling intervals lists for the scatter, provided in a file
##
## Outputs :
## - One GVCF file and its index
##
## Cromwell version support
## - Successfully tested on v31
## - Does not work on versions < v23 due to output syntax
##
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the dockers
## for detailed licensing information pertaining to the included programs.
# WORKFLOW DEFINITION
workflow HaplotypeCallerGvcf_GATK4 {
File input_bam
File input_bam_index
File ref_dict
File ref_fasta
File ref_fasta_index
File scattered_calling_intervals_list

Boolean? make_gvcf
Boolean making_gvcf = select_first([make_gvcf,true])

String? gatk_docker_override
String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.0.6.0"])
String? gatk_path_override
String gatk_path = select_first([gatk_path_override, "/gatk/gatk"])
String? gitc_docker_override
String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"])

Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list)

#is the input a cram file?
Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram"

String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam")
String vcf_basename = sample_basename
String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz"
String output_filename = vcf_basename + output_suffix

# We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller.
# If we take the number we are scattering by and reduce by 20 we will have enough disk space
# to account for the fact that the data is quite uneven across the shards.
Int potential_hc_divisor = length(scattered_calling_intervals) - 20
Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1


if ( is_cram ) {
call CramToBamTask {
input:
input_cram = input_bam,
sample_name = sample_basename,
ref_dict = ref_dict,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
docker = gitc_docker
}
}

# Call variants in parallel over grouped calling intervals
scatter (interval_file in scattered_calling_intervals) {

# Generate GVCF by interval
call HaplotypeCaller {
input:
input_bam = select_first([CramToBamTask.output_bam, input_bam]),
input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]),
interval_list = interval_file,
output_filename = output_filename,
ref_dict = ref_dict,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
hc_scatter = hc_divisor,
make_gvcf = making_gvcf,
docker = gatk_docker,
gatk_path = gatk_path
}
}

# Merge per-interval GVCFs
call MergeGVCFs {
input:
input_vcfs = HaplotypeCaller.output_vcf,
input_vcfs_indexes = HaplotypeCaller.output_vcf_index,
output_filename = output_filename,
docker = gatk_docker,
gatk_path = gatk_path
}

# Outputs that will be retained when execution is complete
output {
File output_vcf = MergeGVCFs.output_vcf
File output_vcf_index = MergeGVCFs.output_vcf_index
}
}

# TASK DEFINITIONS
task CramToBamTask {
# Command parameters
File ref_fasta
File ref_fasta_index
File ref_dict
File input_cram
String sample_name

# Runtime parameters
String docker
Int? machine_mem_gb
Int? disk_space_gb
Boolean use_ssd = false
Int? preemptible_attempts

Float output_bam_size = size(input_cram, "GB") / 0.60
Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20

command {
set -e
set -o pipefail

samtools view -h -T ${ref_fasta} ${input_cram} |
samtools view -b -o ${sample_name}.bam -
samtools index -b ${sample_name}.bam
mv ${sample_name}.bam.bai ${sample_name}.bai
}
runtime {
docker: docker
memory: select_first([machine_mem_gb, 15]) + " GB"
disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
preemptible: preemptible_attempts
}
output {
File output_bam = "${sample_name}.bam"
File output_bai = "${sample_name}.bai"
}
}

# HaplotypeCaller per-sample in GVCF mode
task HaplotypeCaller {
String input_bam
String input_bam_index
File interval_list
String output_filename
File ref_dict
File ref_fasta
File ref_fasta_index
Float? contamination
Boolean make_gvcf
Int hc_scatter

String gatk_path
String? java_options
String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"])

# Runtime parameters
String docker
Int? mem_gb
Int? disk_space_gb
Boolean use_ssd = false
Int? preemptible_attempts

Int machine_mem_gb = select_first([mem_gb, 7])
Int command_mem_gb = machine_mem_gb - 1

Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20

command <<<
set -e

${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \
HaplotypeCaller \
-R ${ref_fasta} \
-I ${input_bam} \
-L ${interval_list} \
-O ${output_filename} \
-contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf}
>>>

runtime {
docker: docker
memory: machine_mem_gb + " GB"
disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
preemptible: select_first([preemptible_attempts, 3])
}

output {
File output_vcf = "${output_filename}"
File output_vcf_index = "${output_filename}.tbi"
}
}
# Merge GVCFs generated per-interval for the same sample
task MergeGVCFs {
Array[File] input_vcfs
Array[File] input_vcfs_indexes
String output_filename

String gatk_path

# Runtime parameters
String docker
Int? mem_gb
Int? disk_space_gb
Boolean use_ssd = false
Int? preemptible_attempts

Int machine_mem_gb = select_first([mem_gb, 3])
Int command_mem_gb = machine_mem_gb - 1

command <<<
set -e

${gatk_path} --java-options "-Xmx${command_mem_gb}G" \
MergeVcfs \
--INPUT ${sep=' --INPUT ' input_vcfs} \
--OUTPUT ${output_filename}
>>>

runtime {
docker: docker
memory: machine_mem_gb + " GB"
disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD"
preemptible: select_first([preemptible_attempts, 3])
}


output {
File output_vcf = "${output_filename}"
File output_vcf_index = "${output_filename}.tbi"
}
}

46 changes: 27 additions & 19 deletions haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
{
"##_COMMENT1": "INPUT BAM",
"HaplotypeCallerGvcf_GATK4.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bam",
"HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bai",
"#HaplotypeCallerGvcf_GATK4.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bam",
"#HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bai",
"HaplotypeCallerGvcf_GATK4.input_bam": "gs://broad-public-datasets/NA12878/NA12878.cram",
"HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://broad-public-datasets/NA12878/NA12878.cram.crai",

"##_COMMENT2": "REFERENCE FILES",
"HaplotypeCallerGvcf_GATK4.ref_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict",
Expand All @@ -10,27 +12,33 @@

"##_COMMENT3": "INTERVALS",
"HaplotypeCallerGvcf_GATK4.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/hg38_wgs_scattered_calling_intervals.txt",

"##_COMMENT4": "MISCELLANEOUS PARAMETERS",
"#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.make_gvcf": "True",
"#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.contamination": "Float? (optional)",

"##_COMMENT4": "DOCKERS",
"HaplotypeCallerGvcf_GATK4.gatk_docker": "broadinstitute/gatk:4.0.4.0",
"##_COMMENT5": "DOCKERS",
"#HaplotypeCallerGvcf_GATK4.gatk_docker_override": "String? (optional)",
"#HaplotypeCallerGvcf_GATK4.gitc_docker_override": "String? (optional)",

"##_COMMENT5": "PATHS",
"HaplotypeCallerGvcf_GATK4.gatk_path": "/gatk/gatk",
"HaplotypeCallerGvcf_GATK4.HaplotypeCaller.make_gvcf": "True",
"##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.contamination": "(optional) Float?",
"##_COMMENT6": "PATHS",
"#HaplotypeCallerGvcf_GATK4.gatk_path_override": "String? (optional)",

"##_COMMENT6": "JAVA OPTIONS",
"##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.java_options": "(optional) String?",
"##_COMMENT7": "JAVA OPTIONS",
"#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.java_options": "String? (optional)",

"##_COMMENT7": "MEMORY ALLOCATION",
"##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.mem_gb": "(optional) Int?",
"##HaplotypeCallerGvcf_GATK4.MergeGVCFs.mem_gb": "(optional) Int?",
"##_COMMENT8": "MEMORY ALLOCATION",
"#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.mem_gb": "Int? (optional)",
"#HaplotypeCallerGvcf_GATK4.MergeGVCFs.mem_gb": "Int? (optional)",
"#HaplotypeCallerGvcf_GATK4.CramToBamTask.machine_mem_gb": "Int? (optional)",

"##_COMMENT8": "DISK SIZE ALLOCATION",
"##HaplotypeCallerGvcf_GATK4.MergeGVCFs.disk_space_gb": "(optional) Int?",
"##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.disk_space_gb": "(optional) Int?",
"##_COMMENT9": "DISK SIZE ALLOCATION",
"#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.disk_space_gb": "Int? (optional)",
"#HaplotypeCallerGvcf_GATK4.MergeGVCFs.disk_space_gb": "Int? (optional)",
"#HaplotypeCallerGvcf_GATK4.CramToBamTask.disk_space_gb": "Int? (optional)",

"##_COMMENT9": "PREEMPTION",
"##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.preemptible_attempts": "(optional) Int?",
"##HaplotypeCallerGvcf_GATK4.MergeGVCFs.preemptible_attempts": "(optional) Int?"
"##_COMMENT10": "PREEMPTION",
"#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.preemptible_attempts": "Int? (optional)",
"#HaplotypeCallerGvcf_GATK4.MergeGVCFs.preemptible_attempts": "Int? (optional)",
"#HaplotypeCallerGvcf_GATK4.CramToBamTask.preemptible_attempts": "Int? (optional)"
}
Loading

0 comments on commit b9bbbdc

Please sign in to comment.