Comparing changes

base repository: mlin/spVCF, base: v0.0.1
head repository: mlin/spVCF, compare: master
Commits on Jun 27, 2018

  1. usage if stdin isatty (3f93769)
     mlin committed Jun 27, 2018
  2. initial readme (#1) (ecbfba7)
     mlin authored Jun 27, 2018
  3. (48e15dd)
  4. Checkpointing (a0506a4)
     Periodically emit a full dense row, to aid partial decoding.
     mlin committed Jun 27, 2018

Commits on Jun 28, 2018

  1. encoder speed optimization (4debd47)
     mlin committed Jun 28, 2018
  2. add WDL task (880d880)
     mlin committed Jun 28, 2018
  3. Update README.md (1e5aaf9)
     mlin committed Jun 28, 2018
  4. Update README.md (46da13e)
     mlin committed Jun 28, 2018

Commits on Jun 30, 2018

  1. Smart squeezing (1a088b8)
     Revise squeezing logic to truncate cells to GT:DP if there's no evidence for any alternate allele based on AD/VR.
     mlin committed Jun 30, 2018
  2. Update README.md (55f7079)
     mlin committed Jun 30, 2018

Commits on Jul 1, 2018

  1. speed up encoder (#5) (6b7afce)
     A bunch of gnarly tricks to speed up the encoder: avoiding stringstream, reducing copying & allocation, using a lookup table, etc.
     mlin authored Jul 1, 2018
  2. minor strlen tweak (6338583)
     mlin committed Jul 1, 2018
  3. speed up decoder with homebrew ostringstream (aa18e9f)
     It is absurd that in 2018 I have to write this myself to get good performance.
     mlin committed Jul 1, 2018

Commits on Jul 2, 2018

  1. Update README.md (dba1961)
     mlin committed Jul 2, 2018
  2. Update README.md (d2eda4b)
     mlin authored Jul 2, 2018

Commits on Jul 6, 2018

  1. add decode to WDL task (42946c8)
     mlin committed Jul 6, 2018

Commits on Jul 8, 2018

  1. add stat for 75% sparse lines (2360f24)
     mlin committed Jul 8, 2018

Commits on Jul 9, 2018

  1. Create compression_results.md (f55a7a4)
     mlin committed Jul 9, 2018
  2. Update README.md (903d77d)
     mlin authored Jul 9, 2018
  3. Update README.md (0b93509)
     mlin authored Jul 9, 2018
  4. Update compression_results.md (cf4589b)
     mlin authored Jul 9, 2018

Commits on Jul 10, 2018

  1. update compression figures (772c1fb)
     mlin committed Jul 10, 2018
  2. Update compression_results.md (71fb119)
     mlin authored Jul 10, 2018
  3. Update compression_results.md (35c7633)
     mlin authored Jul 10, 2018

Commits on Jul 15, 2018

  1. unbuffered output (9281f90)
     mlin committed Jul 15, 2018

Commits on Jul 21, 2018

  1. (7f24b52)
  2. On each sparse line, convey the POS of the last full dense row (checkpoint) (2eb8e4d)
     This added INFO field is useful for random access and partial decoding of the spVCF file.
     mlin committed Jul 21, 2018
  3. (4bc11fa)
  4. tabix slicing (624a29f)
     Subcommand 'spvcf tabix my.spvcf.gz chr21:1234-2345' can take a genomic range slice of a bgzipped, tabix-indexed spVCF file, producing spVCF that can be decoded back to pVCF standalone.
     mlin committed Jul 21, 2018
  5. Update README.md (f5257b5)
     mlin committed Jul 21, 2018

Commits on Jul 28, 2018

  1. Update spvcf.wdl (7cc96ec)
     mlin committed Jul 28, 2018
  2. cleanup CMakeLists (aef28ba)
     mlin committed Jul 28, 2018

Commits on Jul 29, 2018

  1. Add SPEC.md (#12) (800ddf0)
     mlin authored Jul 29, 2018
  2. Update README.md (d36c1f2)
     mlin authored Jul 29, 2018

Commits on Jul 31, 2018

  1. Update compression_results.md (5e95011)
     mlin authored Jul 31, 2018

Commits on Aug 1, 2018

  1. move SPEC.md into doc/ (b9050bc)
     mlin committed Aug 1, 2018

Commits on Aug 15, 2018

  1. (99062d7)

Commits on Aug 22, 2018

  1. squeeze: omit trailing run of missing fields even in cells that aren't lossily truncated (1b941ad)
     mlin committed Aug 22, 2018

Commits on Aug 23, 2018

  1. Fix - as input filename (b13820a)
     mlin committed Aug 23, 2018

Commits on Aug 27, 2018

  1. (ed56b3d)

Commits on Aug 29, 2018

  1. Multithreaded encoder (#13) (4b9e325)
     The encoder can run multithreaded by buffering batches of input lines and processing each batch in a worker thread. In favorable circumstances this yields higher throughput than the default single-threaded approach, at the cost of greatly increased memory usage and copying. Activated with -t,--threads.
     mlin authored Aug 29, 2018

Commits on Oct 2, 2018

  1. Spec change: never quote-encode cells whose genotypes aren't reference-identical or non-called (140334c)
     This ensures all non-reference genotypes on a row can be read without reference to any previous row, a significant convenience in exchange for a negligible size increase.
     mlin committed Oct 2, 2018

Commits on Oct 4, 2018

  1. Update README.md (87e55da)
     mlin authored Oct 4, 2018

Commits on Oct 14, 2018

  1. Update README.md (01c37d1)
     mlin authored Oct 14, 2018

Commits on Oct 16, 2018

  1. Update spvcf.wdl (3459ade)
     mlin authored Oct 16, 2018

Commits on Feb 5, 2019

  1. (1023cd0)
  2. Update SPEC.md (2d892c8)
     mlin authored Feb 5, 2019

Commits on Mar 13, 2019

  1. (b41d226)
  2. htslib 1.9 (1eac726)
     mlin committed Mar 13, 2019
  3. change ##fileformat to and fro (453b795)
     Unfortunate side effect: tabix needs to be given -p vcf.
     mlin committed Mar 13, 2019
1 change: 1 addition & 0 deletions .gitignore
@@ -9,3 +9,4 @@ compile_commands.json
CTestTestfile.cmake
DartConfiguration.tcl
spvcf
external/
4 changes: 2 additions & 2 deletions .travis.yml
@@ -12,9 +12,9 @@ script:
deploy:
provider: releases
api_key:
secure: AHG3d3SAeswsy5NyGoi5/yr8xZCbB8pjYcyOTSpB0yYjj4aGFRcEEuAYD8NOP1yU+kDXPPfJCnXDJfhhoM4Y6WEZOqrPc3PIiagdzBRWdzIJpF+NZkgekPPl4bp9eVXb+pn2AO7YC64VPnxqBk2nkMRquAVmeT3BuULGQB6UZliiIVNhiN7iOb99eEc7aeDtgJYRFfKLDEnL0dgpnOq6olhBHhCE+qWjjpnS/gUxsKJi6F+qKyZliNE3HPOzCEXN76UxDfxnoamBnbgD02qUmk3mLHoOI1Ow+qQVdzXPB4fYNsbaVkjS261nIFtJvnw2V8RHXnFSCdzz8DxbZxzXFQUuuZAAL8csTG4zHlk7W4DfzR+9d6qLvLrrp8Fx/p3ZZAFTC9PA9BiLuOicK1jr9Mnqzh0cuXlKi4CjOKCpko/bIKV4TE6RPZo2A2HVM84fL+sJQCmnlCRQIX1gPjW+aRsyZVMs/yDlgvjsR0lYTqGbFjJumZP4HPT1zpSpTiOwkZvQIrRqL8CvAi2owVYa4jWnn+5btaSsTSPYJ00gbjMmddqBj0s90rMvHGvuTvN9h6QILxUiAg06AfbeZr6k7lBX3hpSes3ayr7lAsCn2wVkOEebnEcdmQElygVM+yIa0aiIbddWLpb8xCt4W5VpTDl3Nd8Uw2HbqwqRzA7WIl0=
secure: dJnEEwlcPE6hcs8xZcFM9pMYVzrFlk11CRkhahgvR7hh9rUGFjPFIMpmLUUQigKvf0YxgwZ5ZXcyOQfb0riRovN6K6xqsPU+V5csIHPr629LZIvklRzA0/163fUVm3BoYAA80LwwCjd+ysSS/Ot1PLE/BVbN6amYplFT5KiQVjn2l/P196/+NMeX1O3keGESlPdTu1L783PL3TOKTvT/J+XiX3PSwvMaasDRDB9Bbq4PzT7JNnO/s4XkT6zxHsKnM6bPzhmJ2yvmXJj3gBLgEQUS+BLh5vy2BkHsQgC3C5amtSpqtc2neifubU12BUHxypmoWndN8n0qkdLTEGOr/fvFthlnJ37rBJL9KWvBuDbYLH5DgO0XDXO7dN/WGSSh+fBkTBeWPkc3Sqrxk+Wg9CH+WRa2LTz6e/hmFyiSahI/g/6h8e7dS9A2o2dV4A+NVG4Zmj5TDojpd1FuwgRpcVD2urXjwJniYbZ+Ca30KRZDYjSmaatfs+J8Y7+CdzyZQIFR5z05Sy9xkKOmbAtx4esFNzHf/UfJj/4iqyw1/ZovZ0xNKWHJQ43dT7+YU7E0/HhQh1bKGO+7cO81Pz3DmzL9jFunOtumohucAW1gQRopVmlL+kJ5mc8HD1b56w7KlJEgFp4WWsEei/UvZeGSwGkgSG7p0lrRKss0A9d0zxs=
file: spvcf
skip_cleanup: true
on:
repo: mlin/spVCF
tags: true
skip_cleanup: 'true'
17 changes: 17 additions & 0 deletions .vscode/c_cpp_properties.json
@@ -0,0 +1,17 @@
{
"configurations": [
{
"name": "Linux",
"includePath": [
"${workspaceFolder}/**"
],
"defines": [],
"compilerPath": "/usr/bin/gcc",
"cStandard": "c11",
"cppStandard": "c++17",
"intelliSenseMode": "clang-x64",
"compileCommands": "${workspaceFolder}/compile_commands.json"
}
],
"version": 4
}
16 changes: 15 additions & 1 deletion .vscode/settings.json
@@ -1,5 +1,19 @@
{
"files.associations": {
"algorithm": "cpp"
"algorithm": "cpp",
"array": "cpp",
"cmath": "cpp",
"*.tcc": "cpp",
"deque": "cpp",
"unordered_map": "cpp",
"vector": "cpp",
"system_error": "cpp",
"ostream": "cpp",
"stdexcept": "cpp",
"cerrno": "cpp",
"sstream": "cpp",
"iosfwd": "cpp",
"mutex": "cpp",
"chrono": "cpp"
}
}
35 changes: 29 additions & 6 deletions CMakeLists.txt
@@ -1,21 +1,44 @@
cmake_minimum_required(VERSION 3.5)
project(StrawmanSparseVCF LANGUAGES CXX)
include(ExternalProject)
project(spvcf LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
SET(CMAKE_EXPORT_COMPILE_COMMANDS ON)

execute_process(COMMAND git describe --tags --long --dirty --always
IF(NOT CMAKE_BUILD_TYPE)
SET(CMAKE_BUILD_TYPE Release)
ENDIF(NOT CMAKE_BUILD_TYPE)

ExternalProject_Add(htslib
URL https://github.com/samtools/htslib/releases/download/1.10.2/htslib-1.10.2.tar.bz2
PREFIX ${CMAKE_CURRENT_BINARY_DIR}/external
CONFIGURE_COMMAND bash -c "autoreconf && ./configure --disable-libcurl --disable-bz2 --disable-lzma --disable-s3 --disable-gcs"
PATCH_COMMAND sed -i "s/^CFLAGS .*$/CFLAGS = -O3 -DNDEBUG -march=ivybridge/" Makefile
BUILD_IN_SOURCE 1
BUILD_COMMAND bash -c "make -n && make -j$(nproc)"
INSTALL_COMMAND ""
LOG_DOWNLOAD ON
LOG_BUILD ON
)
ExternalProject_Get_Property(htslib source_dir)
set(HTSLIB_SOURCE_DIR ${source_dir})
ExternalProject_Get_Property(htslib binary_dir)
set(HTSLIB_BINARY_DIR ${binary_dir})

execute_process(COMMAND git describe --tags --long --always
OUTPUT_VARIABLE GIT_REVISION OUTPUT_STRIP_TRAILING_WHITESPACE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGIT_REVISION=\"\\\"${GIT_REVISION}\\\"\"")

add_executable(spvcf src/main.cc src/spVCF.cc src/spVCF.h)
target_include_directories(spvcf PRIVATE src)
add_executable(spvcf src/main.cc src/spVCF.cc src/spVCF.h src/strlcpy.h)
add_dependencies(spvcf htslib)
target_include_directories(spvcf PRIVATE src ${HTSLIB_SOURCE_DIR})
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU")
target_compile_options(spvcf PRIVATE -fdiagnostics-color=auto -march=ivybridge)
set_target_properties(spvcf PROPERTIES LINK_FLAGS -static-libstdc++)
target_compile_options(spvcf PRIVATE -fdiagnostics-color=auto -march=ivybridge -g)
set_target_properties(spvcf PROPERTIES LINK_FLAGS "-static-libgcc -static-libstdc++ -pthread")
endif()
target_link_libraries(spvcf ${HTSLIB_BINARY_DIR}/libhts.a libz.a libdeflate.a)

include(CTest)
add_test(NAME tests COMMAND prove -v test/spVCF.t)
9 changes: 7 additions & 2 deletions Dockerfile
@@ -3,8 +3,13 @@ ENV DEBIAN_FRONTEND noninteractive

RUN apt-get -qq update && \
apt-get -qq install -y --no-install-recommends --no-install-suggests \
curl wget ca-certificates git-core less netbase \
g++ cmake make pigz
curl wget ca-certificates git-core less netbase tabix \
g++ cmake make automake autoconf bash-completion pigz zlib1g-dev

ADD https://github.com/ebiggers/libdeflate/archive/v1.6.tar.gz /tmp
RUN tar xzf /tmp/v1.6.tar.gz -C /tmp
WORKDIR /tmp/libdeflate-1.6
RUN make -j $(nproc) && make install

ADD . /src
WORKDIR /src
86 changes: 85 additions & 1 deletion README.md
@@ -1,3 +1,87 @@
# Sparse Project VCF (spVCF)
Minimalist strawman for encoding cohort genotype matrices more efficiently

**Maintainer: Mike Lin [@DNAmlin](https://twitter.com/DNAmlin)**

Project VCF (pVCF; aka multi-sample VCF) is the prevailing file format for small genetic variants discovered by cohort sequencing. It encodes a two-dimensional matrix with variant sites down the rows and study participants across the columns, filled in with all the genotypes and associated QC measures (read depths, genotype likelihoods, etc.). Large cohorts harbor many rare variants, implying a sparse genotype matrix composed largely of reference-homozygous or non-called cells. But the dense pVCF format encodes this inefficiently, growing super-linearly with the cohort size.

spVCF is an evolution of VCF that keeps most aspects of its tab-delimited text format, but presents the genotype matrix sparsely, by selectively reducing QC measure entropy and run-length encoding repetitive information about reference coverage. This is less sophisticated than some other efforts to address VCF's density and other shortcomings, but perhaps more palatable to existing VCF consumers by virtue of simplicity.

Further resources:

* [bioRxiv preprint](https://www.biorxiv.org/content/10.1101/611954v2) for a short manuscript on the approach & tool
* [doc/SPEC.md](https://github.com/mlin/spVCF/blob/master/doc/SPEC.md) has complete details and a worked example
* [doc/compression_results.md](https://github.com/mlin/spVCF/blob/master/doc/compression_results.md) tests spVCF with *N*=50K exomes, observing up to 15X size reduction for bgzip-compressed pVCF, and scaling much more gently with *N*.
* [slide deck](https://docs.google.com/presentation/d/13lzEkdWAVwcsKofhsiYEdl92xMQgx5_dSOSIyZDggfM/edit?usp=sharing) presented at the GA4GH & MPEG-G Genome Compression Workshop, October 2018.
* [spVCF files for the resequenced 1000 Genomes Project cohort](https://github.com/mlin/spVCF/blob/master/doc/1000G_NYGC_GATK.md) (*N*=2,504 WGS)

## `spvcf` utility

[![Build Status](https://travis-ci.org/mlin/spVCF.svg?branch=master)](https://travis-ci.org/mlin/spVCF)

This repository has a command-line utility for encoding pVCF to spVCF and vice versa. The [Releases](https://github.com/mlin/spVCF/releases) page has pre-built executables compatible with most Linux x86-64 hosts, which you can download and `chmod +x spvcf`. To build and test it yourself, clone this repository and:

```
cmake . && make
ctest -V
```

The subcommands `spvcf encode` and `spvcf decode` encode existing pVCF to spVCF and vice versa. The input and output streams are uncompressed VCF text, so you may wish to arrange a pipe with [de]compression programs like `bgzip`.

```
spvcf encode [options] [in.vcf|-]
Reads VCF text from standard input if filename is empty or -
Options:
-o,--output out.spvcf Write to out.spvcf instead of standard output
-n,--no-squeeze Disable lossy QC squeezing transformation (lossless run-encoding only)
-p,--period P Ensure checkpoints (full dense rows) at this period or less (default: 1000)
-t,--threads N Use multithreaded encoder with this number of worker threads
-q,--quiet Suppress statistics printed to standard error
-h,--help Show this help message
```

```
spvcf decode [options] [in.spvcf|-]
Reads spVCF text from standard input if filename is empty or -
Options:
-o,--output out.vcf Write to out.vcf instead of standard output
-q,--quiet Suppress statistics printed to standard error
-h,--help Show this help message
```

Examples:

```
$ ./spvcf encode my.vcf > my.spvcf
$ bgzip -dc my.vcf.gz | ./spvcf encode | bgzip -c > my.spvcf.gz
$ bgzip -dc my.spvcf.gz | ./spvcf decode > my.decoded.vcf
```

There's also `spvcf squeeze` to apply the QC squeezing transformation to a pVCF, without the sparse quote-encoding. This produces valid pVCF that's typically much smaller, although not as small as spVCF.
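
For example, mirroring the invocations above (`my.vcf` is a hypothetical file name; by analogy with `encode` and `decode`, `spvcf squeeze` reads pVCF and writes squeezed pVCF on standard output):

```
$ ./spvcf squeeze my.vcf > my.squeezed.vcf
```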

The multithreaded encoder should be used only if the single-threaded version is a proven bottleneck. It's capable of higher throughput in favorable circumstances, but trades off memory usage and copying. The memory usage scales with threads and period.

### Tabix slicing

If the familiar `bgzip` and `tabix -p vcf` utilities are used to block-compress and index a spVCF file, then `spvcf tabix` can take a genomic range slice from it, extracting spVCF which decodes standalone. (The regular `tabix` utility generates the index, but using it to take the slice would yield a broken fragment.) Internally, this entails decoding a small bit of the spVCF, determined by the `spvcf encode --period` option.

```
spvcf tabix [options] in.spvcf.gz chr1:1000-2000 [chr2 ...]
Requires tabix index present e.g. in.spvcf.gz.tbi. Includes all header lines.
Options:
-o,--output out.spvcf Write to out.spvcf instead of standard output
-h,--help Show this help message
```

Example:

```
$ ./spvcf encode my.vcf | bgzip -c > my.spvcf.gz
$ tabix -p vcf my.spvcf.gz
$ ./spvcf tabix my.spvcf.gz chr21:5143000-5219900 | ./spvcf decode > slice.vcf
```
## Contributing

Any engagement with this open-source repository promotes the project's vitality. Pull requests, issues, comments, experience reports, ideas, and critiques are all welcome. Please don't be shy!
95 changes: 95 additions & 0 deletions doc/1000G_NYGC_GATK.md
@@ -0,0 +1,95 @@
# spVCF for the resequenced 1000 Genomes Project

### Updated: 15 May 2019

The 2,504 individuals from the 1000 Genomes Project phase 3 cohort were [recently resequenced](https://twitter.com/notSoJunkDNA/status/1125401248348495873) at the New York Genome Center (to high depth on modern instruments). Accompanying this awesome data drop were "working" versions of [project VCF](http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20190425_NYGC_GATK/) files [joint-called using GATK](https://software.broadinstitute.org/gatk/best-practices/workflow?id=11145), with a total size of roughly 1,250GB. We encoded and recompressed these using [spVCF](https://www.biorxiv.org/content/10.1101/611954v1) to produce files about 1/5 the size with (we contend) minimal loss of useful information.

* [chr21 spvcf.gz on figshare](https://figshare.com/articles/1000G_2504_high_coverage_20190425_NYGC_GATK_chr21_spvcf_gz/8132261) (3.7GB compared to 18GB original pvcf.gz)

**Help wanted:** please [get in touch](https://twitter.com/DNAmlin) if you could host & distribute the full dataset on our behalf (about 250GB). Right now we have it stored in a couple places which understandably are reluctant to provide free-for-all public downloads (the 80% reduced size notwithstanding!).

We're really grateful to all involved in the sponsorship & production of the new sequencing results, a tremendous resource for global collaboration amongst genome geeks everywhere!

### Example usage

The easiest way to start using the spVCF file is with [our command-line tool](https://github.com/mlin/spVCF/releases) to decode it on-the-fly for existing tools that consume project VCF files. Here are some example invocations, supposing you've just downloaded `1000G_2504_high_coverage.20190425_NYGC_GATK.chr21.spvcf.gz`.

```bash
# Fetch & enable spvcf utility
wget https://github.com/mlin/spVCF/releases/download/v1.0.0/spvcf
chmod +x spvcf

# For brevity below:
mv 1000G_2504_high_coverage.20190425_NYGC_GATK.chr21.spvcf.gz chr21.spvcf.gz

# Generate tabix index of the spVCF file
tabix -p vcf chr21.spvcf.gz

# Slice & decode a range for use with plain bcftools
./spvcf tabix chr21.spvcf.gz chr21:7000000-8000000 | bgzip -c > chr21.slice.spvcf.gz
gzip -dc chr21.slice.spvcf.gz | ./spvcf decode | bgzip -c > chr21.slice.vcf.gz
bcftools stats chr21.slice.vcf.gz

# Repeat with a much better streaming approach
bcftools stats <(./spvcf tabix chr21.spvcf.gz chr21:7000000-8000000 | ./spvcf decode)
```

These invocations report a transition/transversion ratio of 1.17, which is super low, suggesting that downstream analyses need to look at the filters carefully. Updated and additional variant call sets for these genomes are no doubt coming from numerous groups, and we hope spVCF might facilitate their exchange.

### Afterthought

spVCF reduces project VCF file size in two ways, first through selective removal & rounding of QC measures ("squeezing"), and second by run-length encoding the remainder. For this dataset, nearly all of the 80% size reduction arises from the lossy squeezing step; restated, the `vcf.gz` decoded from a given `spvcf.gz` is only slightly larger (about 10%). The run encoding is ineffective with "only" *N*=2,504 genomes because the variant sites are spaced far enough apart that there are not many long identical runs (in the way spVCF sees them) to encode. At *N*=50K the run-encoding halves the size after squeezing, and quarters it at *N*=100K. So, while this dataset didn't exhibit spVCF to its full potential, we hope it provides a useful familiarization exercise for things to come.

### Appendix: workflow

The following [WDL](http://openwdl.org/) workflow generated the spVCF files. The task downloads and encodes the pVCF for one chromosome. The workflow scatters this task across the chromosomes.

```wdl
version 1.0
task spvcf_encode_1000G_chrom {
input {
String chrom
}
command <<<
set -eux -o pipefail
apt-get update && apt-get -y install aria2 pigz tabix
aria2c https://github.com/mlin/spVCF/releases/download/v1.0.0/spvcf
chmod +x spvcf
aria2c -s 4 -j 4 -x 4 --file-allocation=none "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20190425_NYGC_GATK/CCDG_13607_B01_GRM_WGS_2019-02-19_~{chrom}.recalibrated_variants.vcf.gz"
pigz -dc *.vcf.gz | ./spvcf encode | bgzip -c > "1000G_2504_high_coverage.20190425_NYGC_GATK.~{chrom}.spvcf.gz"
>>>
output {
File spvcf_gz = glob("*.spvcf.gz")[0]
}
runtime {
docker: "ubuntu:18.04"
preemptible: 2
cpu: 2
disks: "local-disk 160 HDD"
}
}
workflow spvcf_encode_1000G {
input {
Array[String] chroms = [
"chr1", "chr2", "chr3", "chr4",
"chr5", "chr6", "chr7", "chr8",
"chr9","chr10","chr11","chr12",
"chr13","chr14","chr15","chr16",
"chr17","chr18","chr19","chr20",
"chr21","chr22", "chrX", "chrY",
"others"
]
}
scatter (chrom in chroms) {
call spvcf_encode_1000G_chrom as encode1 {
input:
chrom = chrom
}
}
output {
Array[File] spvcf_gz = encode1.spvcf_gz
}
}
```
80 changes: 80 additions & 0 deletions doc/SPEC.md
@@ -0,0 +1,80 @@
# Sparse Project VCF: format specification

**Maintainer: Mike Lin [@DNAmlin](https://twitter.com/DNAmlin)**

Project VCF (pVCF; aka multi-sample VCF) is the prevailing file format for small genetic variants discovered by cohort sequencing. It encodes a two-dimensional matrix with variant sites down the rows and study participants across the columns, filled in with all the genotypes and associated QC measures (read depths, genotype likelihoods, etc.). Large cohorts harbor many rare variants, implying a sparse genotype matrix composed largely of reference-homozygous or non-called cells. The dense pVCF format encodes this very inefficiently. See the [VCF specification](http://samtools.github.io/hts-specs/VCFv4.3.pdf) for full details of this format.

[Sparse Project VCF (spVCF)](https://github.com/mlin/spVCF) is a simple scheme to encode the pVCF matrix sparsely, by keeping most aspects of the VCF format while run-length encoding repetitive information about reference coverage. The encoding includes a checkpointing feature to facilitate random access within a block-compressed spVCF file, using familiar tools like `bgzip` and `tabix`. Lastly, spVCF suggests an optional convention to strip typically-unneeded QC details from the matrix.

### Sparse encoding

spVCF adopts from pVCF the tab-delimited text format with header, and the first nine columns providing all variant-level details. The sparse encoding concerns the genotype matrix `V[i,j]`, *i* indexing variant sites and *j* indexing the *N* samples, written across tab-delimited columns ten through 9+*N* of the pVCF text file. Each cell `V[i,j]` is a colon-delimited text string including the genotype and various QC measures (DP, AD, PL, etc.).

In the spVCF encoding, cells are first replaced with a double-quotation mark `"` if they're (i) identical to the cell *above*, and (ii) their GT field is reference-identical or non-called:

```
S[i,j] := " if i>0 and V[i,j] == V[i-1,j] and V[i,j]["GT"] in ["0/0","0|0","./.",".|."],
V[i,j] otherwise.
```

Here 'identical' covers all the QC measures exactly, assuming they have the same order from row to row. Such exact repetition is common in pVCF produced using tools like [GATK GenotypeGVCFs](https://software.broadinstitute.org/gatk/documentation/tooldocs/3.8-0/org_broadinstitute_gatk_tools_walkers_variantutils_GenotypeGVCFs.php) and [GLnexus](https://github.com/dnanexus-rnd/GLnexus), which merge gVCF or similar files summarizing reference coverage in lengthy bands.

For clarity, the list of "quotable" GTs enumerated above shows diploid genotypes only; in general, quotable GTs are those whose constituent allele calls are either all reference (0), or all non-called (.).

Second, within each row of `S`, consecutive runs of quotation marks are abbreviated with a text integer, so for example a horizontal run of 42 quotes is written `"42` and tab-delimited from adjacent cells. The result is a ragged, tab-delimited matrix.
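
To make both steps concrete, here is a minimal C++ sketch (an illustration under simplifying assumptions, not the `spvcf` implementation; `quotable_gt` and `encode_row` are hypothetical names, and each row is assumed pre-split into its tab-delimited cells):

```cpp
#include <string>
#include <vector>

// Quotable iff every allele call in GT (the first colon-delimited field)
// is reference "0", or every call is non-called "." -- but not a mixture.
static bool quotable_gt(const std::string& cell) {
    bool ref = false, nocall = false;
    for (char c : cell) {
        if (c == ':') break;                          // end of the GT field
        if (c == '0') ref = true;
        else if (c == '.') nocall = true;
        else if (c != '/' && c != '|') return false;  // some alternate allele
    }
    return !(ref && nocall);
}

// Encode row V[i,*] against row V[i-1,*]: identical quotable cells become
// quotes, and each consecutive run of quotes collapses to `"` or `"N`.
std::vector<std::string> encode_row(const std::vector<std::string>& prev,
                                    const std::vector<std::string>& cur) {
    std::vector<std::string> out;
    size_t run = 0;
    auto flush = [&]() {
        if (run == 1) out.push_back("\"");
        if (run > 1) out.push_back("\"" + std::to_string(run));
        run = 0;
    };
    for (size_t j = 0; j < cur.size(); ++j) {
        if (!prev.empty() && cur[j] == prev[j] && quotable_gt(cur[j])) {
            ++run;                 // S[i,j] := "
        } else {
            flush();
            out.push_back(cur[j]); // S[i,j] := V[i,j]
        }
    }
    flush();
    return out;
}
```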

The first header line of a spVCF file begins with `##fileformat=spVCF`, followed by a version/tag, semicolon, and finally the original format of the encoded VCF file (e.g. `VCF4.2`).

**Worked example**

```
#CHROM POS ID REF ALT ... FORMAT Alice Bob Carol
22 1000 . A G ... GT:DP:AD:PL 0/0:35:35,0:0,117,402 0/0:29:29,0:0,109,387 0/0:22:22,0:0,63,188
22 1012 . CT C ... GT:DP:AD:PL 0/0:35:35,0:0,117,402 0/0:31:31,0:0,117,396 0/1:28:17,11:74,0,188
22 1018 . G A ... GT:DP:AD:PL 0/0:35:35,0:0,117,402 0/0:31:31,0:0,117,396 1/1:27:0,27:312,87,0
22 1074 . T C,G ... GT:DP:AD:PL 0/0:33:33,0,0:0,48,62,52,71,94 ./.:0:0,0:.,.,.,.,.,. 1/2:42:4,20,18:93,83,76,87,0,77
```

encodes to

```
#CHROM POS ID REF ALT ... FORMAT Alice Bob Carol
22 1000 . A G ... GT:DP:AD:PL 0/0:35:35,0:0,117,402 0/0:29:29,0:0,109,387 0/0:22:22,0:0,63,188
22 1012 . CT C ... GT:DP:AD:PL " 0/0:31:31,0:0,117,396 0/1:28:17,11:74,0,188
22 1018 . G A ... GT:DP:AD:PL "2 1/1:27:0,27:312,87,0
22 1074 . T C,G ... GT:DP:AD:PL 0/0:33:33,0,0:0,48,62,52,71,94 ./.:0:0,0:.,.,.,.,.,. 1/2:42:4,20,18:93,83,76,87,0,77
```

Here some VCF features have been omitted for brevity, and for clarity the columns have been aligned artificially (for example, in the spVCF there would be only one tab delimiting `"2` and Carol's `1/1`).

Notice that a site with multiple alternate alleles usually breaks the columnar runs of repetition. We'll revisit this below.

### Decoding and checkpoints

spVCF is decoded back to pVCF by, first, copying out the header and the first line, which are identical to the original. On subsequent lines, the decoder copies out explicit cells and, upon encountering a quotation mark or an encoded run thereof, repeats the last-emitted cell from the respective column(s).
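
A minimal C++ sketch of this inner loop, under the same simplifying assumptions as the encoding sketch above (hypothetical names; not the `spvcf` implementation):

```cpp
#include <cstdlib>
#include <string>
#include <vector>

// Decode one spVCF row back to dense cells. `last` holds the last-emitted
// dense cell for each of the N columns and is updated as explicit cells
// arrive; quote runs repeat the remembered cells verbatim.
std::vector<std::string> decode_row(const std::vector<std::string>& sparse,
                                    std::vector<std::string>& last) {
    std::vector<std::string> out;
    size_t j = 0; // dense column cursor
    for (const std::string& item : sparse) {
        if (!item.empty() && item[0] == '"') {
            // `"` is a run of one; `"42` is a run of 42
            size_t run = item.size() == 1
                             ? 1
                             : std::strtoull(item.c_str() + 1, nullptr, 10);
            while (run--) out.push_back(last.at(j++));
        } else {
            last.at(j) = item; // explicit cell: remember it for later rows
            out.push_back(item);
            ++j;
        }
    }
    return out;
}
```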

Decoding a given line of spVCF requires context from previous lines, potentially back to the beginning of the file. To enable random access within a spVCF file, the encoder should generate periodic *checkpoints*, which are simply pVCF lines copied verbatim without any run-encoding. Subsequent spVCF lines can be decoded by looking back no farther than the last checkpoint.

To facilitate finding the last checkpoint, the encoder prepends an INFO field to the eighth column of each non-checkpoint line, `spVCF_checkpointPOS=12345`, giving the VCF `POS` of the last checkpoint line. The decoder must remove this extra field from the output pVCF. A spVCF line is a checkpoint if and only if the `spVCF_checkpointPOS` field does not appear first in its INFO column. The first line for each reference contig (chromosome) must be a checkpoint, naturally including the first line of the file.
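
Continuing the worked example above: the first row (POS 1000) is necessarily a checkpoint, so the second row (POS 1012) is not, and its INFO column (elided as `...` in the example) would begin with the prepended field:

```
spVCF_checkpointPOS=1000;<original INFO fields>
```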

With checkpoints, it's possible to reuse the familiar `bgzip` and `tabix` utilities with spVCF files. Compression and indexing use the original utilities as-is, while random access (genomic range slicing) requires specialized logic to construct self-contained spVCF from the whole original, locating a checkpoint and decoding from it as needed. The decoder seeking a checkpoint must accommodate the possibility that multiple VCF lines could share `POS` with the desired checkpoint.

### Optional: QC entropy reduction or "squeezing"

Lastly, spVCF suggests the following convention to remove typically-unneeded detail from the matrix, and increase the compressibility of what remains, prior to the sparse encoding. In any cell with QC measures indicating zero non-reference reads (typically `AD=d,0` for some *d*, but this depends on how the pVCF-generating pipeline expresses non-reference read depth), report only `GT` and `DP` and omit any other fields. Also, round `DP` down to a power of two (0, 1, 2, 4, 8, 16, ...).
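
A minimal sketch of the rounding step (illustrative; `squeeze_dp` is a hypothetical name, and the `spvcf` tool's `-r` option, exercised in the tests further down this diff, generalizes the rounding base):

```cpp
#include <cstdint>

// Round a read depth down to a power of two (0, 1, 2, 4, 8, 16, ...).
uint32_t squeeze_dp(uint32_t dp) {
    if (dp == 0) return 0;
    uint32_t p = 1;
    while (p <= dp / 2) p *= 2; // largest power of two <= dp
    return p;
}
// e.g. squeeze_dp(35) == 32 and squeeze_dp(29) == 16, matching the squeezed
// worked example below (Alice's DP 35 -> 32, Bob's 29 -> 16).
```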

This "squeezing" requires the encoder to reorder the colon-delimited fields in each cell so that `GT` and `DP` precede any other fields. Then it's valid for a subset of cells to omit remaining fields completely, as permitted by VCF. The FORMAT specification in column 9 of each line must reflect this reordering. Notice that not all reference-identical genotype calls are necessarily squeezed, namely if the QC data indicate even one non-reference read.

The optional squeezing transformation can be applied to any pVCF, usually to great benefit, whether or not the spVCF sparse encoding is also used.

Revisiting the worked example above reveals another benefit of squeezing: it extends repetitive runs through sites with multiple alternate alleles.

```
#CHROM POS ID REF ALT ... FORMAT Alice Bob Carol
22 1000 . A G ... GT:DP:AD:PL 0/0:32 0/0:16 0/0:16
22 1012 . CT C ... GT:DP:AD:PL "2 0/1:28:17,11:74,0,188
22 1018 . G A ... GT:DP:AD:PL "2 1/1:27:0,27:312,87,0
22 1074 . T C,G ... GT:DP:AD:PL " ./.:0 1/2:42:4,20,18:93,83,76,87,0,77
```

31 changes: 31 additions & 0 deletions doc/compression_results.md
@@ -0,0 +1,31 @@
# spVCF compression tests

To test [spVCF](https://github.com/mlin/spVCF)'s potential to bend the super-linear growth of pVCF files with cohort size *N*, we revisited chromosome 2 pVCF files for nested subsets of 50,000 exomes from the [DiscovEHR cohort](http://science.sciencemag.org/content/354/6319/aaf6814) sequenced at [Regeneron Genetics Center](https://www.regeneron.com/genetics-center). These pVCF files were generated from [GATK HaplotypeCaller](https://www.biorxiv.org/content/early/2017/11/14/201178.1) gVCF inputs using [GLnexus](https://github.com/dnanexus-rnd/GLnexus), as described in our [preprint](https://www.biorxiv.org/content/early/2018/06/11/343970). As shown in Table 2 of that manuscript, at *N*=50K, 96% of the ~270K pVCF sites have alternate allele frequency below 0.1%. Each pVCF cell complements the called genotype with a typical array of QC measures (`GT:DP:AD:SB:GQ:PL:RNC`) which account for a large majority of the file size.

spVCF has a default "Lossless" sparse encoding mode, and a "Squeeze" mode which discards most QC measures in cells reporting only reference-equivalent reads (`AD=*,0`), otherwise keeping them. We used spVCF v0.2.3 in both modes to encode the pVCF files for the nested cohorts numbering *N*=1K,5K,10K,25K,50K and compared the resulting file sizes. All file sizes are reported with generic `bgzip` compression, irrespective of the encoding. [Spreadsheet](https://docs.google.com/spreadsheets/d/1IwjbZzPpuYl9HroRCxqcM5bsvJ46ISn-vcOFi7pQxRo/edit?usp=sharing)

![](https://github.com/mlin/spVCF/raw/master/doc/media/DiscovEHR_file_size.png)

The "Squeeze&Decode" series show the squeezed spVCF decoded back to dense pVCF/BCF; this is to let us disentangle the effect of discarding QC measures from the sparse encoding.

We can also render these results as compression ratios:

![](https://github.com/mlin/spVCF/raw/master/doc/media/DiscovEHR_compression_ratio.png)

TODO: Weissman scores

## Analysis

* The lossless sparse encoding offers a fair ~2X compression alone. This ratio climbs gradually with *N*, which might become important in the future.
* The QC squeezing offers >5X size reduction by itself, with little loss of *useful* information. This seems like a no-brainer for future pVCF production, with or without sparse encoding.
* The sparse encoding of squeezed pVCF further ~doubles the compression, roughly consistent with the lossless ratio.
* There is evidence of synergy between the squeezing and sparse encoding, as the Squeeze compression ratio climbs more steeply with *N* compared to both Squeeze&Decode and Lossless. Squeezing makes the matrix more run-length encodable as *N* grows and sites become more closely spaced.
* At the end spVCF delivers 15X size reduction from 79 GiB to 5.2 GiB for *N*=50K. The file size scaling is still super-linear but far more gently, so the ratio is expected to climb farther with *N*.

Using our spreadsheet's regression (i.e. not to be taken seriously), the predicted file size for *N*=1,000,000 is 8 TiB with vcf.gz and 151 GiB with Squeezed spvcf.gz, a >50X reduction.

## Test with 23K WGS

We tested spVCF on a pVCF file representing ~960K SNV sites on an 18Mbp segment from ~23K WGS, joint-called with GLnexus from gVCF files generated by [BCM-HGSC](https://www.hgsc.bcm.edu/) using [xAtlas](https://www.biorxiv.org/content/early/2018/04/05/295071).

This 111.5 GiB vcf.gz compressed to 58.8 GiB Lossless spvcf.gz (1.9X), 9.9 GiB Squeeze spvcf.gz (11.2X), and 19.7 GiB Squeeze&Decode vcf.gz (5.7X). These ratios are roughly in line with the DiscovEHR WES trends for similar *N*, suggesting robustness to the WES/WGS setting and the different gVCF variant callers.
Binary file added doc/media/DiscovEHR_compression_ratio.png
Binary file added doc/media/DiscovEHR_file_size.png
120 changes: 120 additions & 0 deletions spvcf.wdl
@@ -0,0 +1,120 @@
version 1.0

task spvcf_encode {
input {
File vcf_gz
Boolean multithread = false
String release = "v1.1.0"
Int cpu = if multithread then 8 else 4
}

parameter_meta {
vcf_gz: "stream"
}

command <<<
set -euxo pipefail

apt-get -qq update && apt-get install -y wget tabix
wget -nv https://github.com/mlin/spVCF/releases/download/~{release}/spvcf
chmod +x spvcf

threads_arg=""
if [ "~{multithread}" == "true" ]; then
threads_arg="--threads 4"
fi

nm=$(basename "~{vcf_gz}" .vcf.gz)
nm="${nm}.spvcf.gz"
mkdir out
bgzip -dc "~{vcf_gz}" | ./spvcf encode $threads_arg | bgzip -@ 4 > "out/${nm}"
>>>

runtime {
docker: "ubuntu:20.04"
cpu: cpu
memory: "~{cpu} GB"
disks: "local-disk ~{ceil(size(vcf_gz,'GB'))} SSD"
}

output {
File spvcf_gz = glob("out/*.gz")[0]
}
}

task spvcf_decode {
input {
File spvcf_gz
String release = "v1.1.0"
}

parameter_meta {
spvcf_gz: "stream"
}

command <<<
set -euxo pipefail

apt-get -qq update && apt-get install -y wget tabix
wget -nv https://github.com/mlin/spVCF/releases/download/~{release}/spvcf
chmod +x spvcf

nm=$(basename "~{spvcf_gz}" .spvcf.gz)
nm="${nm}.vcf.gz"
mkdir out
bgzip -dc "~{spvcf_gz}" | ./spvcf decode | bgzip -@ 4 > "out/${nm}"
>>>

runtime {
docker: "ubuntu:20.04"
cpu: 4
memory: "4 GB"
disks: "local-disk ~{10*ceil(size(spvcf_gz,'GB'))} SSD"
}

output {
File vcf_gz = glob("out/*.gz")[0]
}
}

task spvcf_squeeze {
input {
File vcf_gz
Boolean multithread = false
String release = "v1.1.0"
Int cpu = if multithread then 8 else 4
}

parameter_meta {
vcf_gz: "stream"
}

command <<<
set -euxo pipefail

apt-get -qq update && apt-get install -y wget tabix
wget -nv https://github.com/mlin/spVCF/releases/download/~{release}/spvcf
chmod +x spvcf

threads_arg=""
if [ "~{multithread}" == "true" ]; then
threads_arg="--threads 4"
fi

nm=$(basename "~{vcf_gz}" .vcf.gz)
nm="${nm}.squeeze.vcf.gz"
mkdir out
bgzip -dc "~{vcf_gz}" | ./spvcf squeeze $threads_arg | bgzip -@ 4 > "out/${nm}"
>>>

runtime {
docker: "ubuntu:20.04"
cpu: cpu
memory: "~{cpu} GB"
disks: "local-disk ~{ceil(size(vcf_gz,'GB'))} SSD"
}

output {
File squeeze_vcf_gz = glob("out/*.gz")[0]
}
}
415 changes: 381 additions & 34 deletions src/main.cc

Large diffs are not rendered by default.

826 changes: 735 additions & 91 deletions src/spVCF.cc

Large diffs are not rendered by default.

27 changes: 23 additions & 4 deletions src/spVCF.h
@@ -1,23 +1,42 @@
#include <memory>
#include <string>
#include <vector>
#include <iostream>

namespace spVCF {

struct transcode_stats {
uint64_t N = 0; // samples in the project VCF
uint64_t lines = 0; // VCF lines (excluding header)
uint64_t sparse_cells = 0; // total 'cells' in the sparse representation
uint64_t sparse90_lines = 0; // lines encoded with <=10% the dense number of cells
uint64_t sparse75_lines = 0; // lines encoded with <=25% the dense number of cells
uint64_t sparse90_lines = 0; // " <=10% "
uint64_t sparse99_lines = 0; // " <=1% "
// dense_cells = lines*N

uint64_t squeezed_cells = 0; // cells whose QC measures were dropped
uint64_t checkpoints = 0; // checkpoints (purposely dense rows to aid partial decoding)

void operator+=(const transcode_stats& rhs) {
N = std::max(N, rhs.N);
lines += rhs.lines;
sparse_cells += rhs.sparse_cells;
sparse75_lines += rhs.sparse75_lines;
sparse90_lines += rhs.sparse90_lines;
sparse99_lines += rhs.sparse99_lines;
squeezed_cells += rhs.squeezed_cells;
checkpoints += rhs.checkpoints;
}
};

class Transcoder {
public:
virtual std::string ProcessLine(const std::string& input_line) = 0;
virtual void Stats(transcode_stats& stats) = 0;
virtual const char* ProcessLine(char* input_line) = 0; // input_line is consumed (damaged)
virtual transcode_stats Stats() = 0;
};
std::unique_ptr<Transcoder> NewEncoder();
std::unique_ptr<Transcoder> NewEncoder(uint64_t checkpoint_period, bool sparse, bool squeeze, double roundDP_base);
std::unique_ptr<Transcoder> NewDecoder();

void TabixSlice(const std::string& spvcf_gz, std::vector<std::string> regions, std::ostream& out);

}
48 changes: 48 additions & 0 deletions src/strlcpy.h
@@ -0,0 +1,48 @@
/*
* Copyright (c) 1998, 2015 Todd C. Miller <Todd.Miller@courtesan.com>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/

#include <sys/types.h>
#include <string.h>

/*
* Copy string src to buffer dst of size dsize. At most dsize-1
* chars will be copied. Always NUL terminates (unless dsize == 0).
* Returns strlen(src); if retval >= dsize, truncation occurred.
*/
size_t
strlcpy(char *dst, const char *src, size_t dsize)
{
const char *osrc = src;
size_t nleft = dsize;

/* Copy as many bytes as will fit. */
if (nleft != 0) {
while (--nleft != 0) {
if ((*dst++ = *src++) == '\0')
break;
}
}

/* Not enough room in dst, add NUL and traverse rest of src. */
if (nleft == 0) {
if (dsize != 0)
*dst = '\0'; /* NUL-terminate dst */
while (*src++)
;
}

return(src - osrc - 1); /* count does not include NUL */
}
81 changes: 75 additions & 6 deletions test/spVCF.t
@@ -10,23 +10,92 @@ D=/tmp/spVCFTests
rm -rf $D
mkdir -p $D

plan tests 7
plan tests 28

pigz -dc "$HERE/data/small.vcf.gz" > $D/small.vcf
"$EXE" -o $D/small.ssvcf $D/small.vcf
"$EXE" encode --no-squeeze -o $D/small.spvcf $D/small.vcf
is "$?" "0" "filename I/O"
is "$(cat $D/small.ssvcf | wc -c)" "36842025" "filename I/O output size"
is "$(cat $D/small.spvcf | wc -c)" "37097532" "filename I/O output size"

pigz -dc "$HERE/data/small.vcf.gz" | "$EXE" -q > $D/small.ssvcf
pigz -dc "$HERE/data/small.vcf.gz" | "$EXE" encode -n -q > $D/small.spvcf
is "$?" "0" "piped I/O"
is "$(cat $D/small.ssvcf | wc -c)" "36842025" "piped I/O output size"
is "$(cat $D/small.spvcf | wc -c)" "37097532" "piped I/O output size"

"$EXE" -d -o $D/small.roundtrip.vcf $D/small.ssvcf
"$EXE" decode -o $D/small.roundtrip.vcf $D/small.spvcf
is "$?" "0" "decode"
is "$(cat $D/small.roundtrip.vcf | wc -c)" "54007969" "roundtrip decode"

is "$(cat $D/small.vcf | grep -v ^# | sha256sum)" \
"$(cat $D/small.roundtrip.vcf | grep -v ^# | sha256sum)" \
"roundtrip fidelity"
is "$(egrep -o "spVCF_checkpointPOS=[0-9]+" $D/small.spvcf | uniq | cut -f2 -d = | tr '\n' ' ')" \
"5030088 5142698 5232868 5252604 5273770 " \
"checkpoint positions"
"$EXE" encode -p 500 -o $D/small.squeezed.spvcf $D/small.vcf
is "$?" "0" "squeeze"
is "$(cat $D/small.squeezed.spvcf | wc -c)" "17553214" "squeezed output size"
"$EXE" decode -q -o $D/small.squeezed.roundtrip.vcf $D/small.squeezed.spvcf
is "$?" "0" "squeezed roundtrip decode"
is "$(cat $D/small.vcf | grep -v ^# | sed -r 's/(\t[^:]+):[^\t]+/\1/g' | sha256sum)" \
"$(cat $D/small.squeezed.roundtrip.vcf | grep -v ^# | sed -r 's/(\t[^:]+):[^\t]+/\1/g' | sha256sum)" \
"squeezed roundtrip GT fidelity"
"$EXE" squeeze -q -o $D/small.squeezed_only.vcf $D/small.vcf
is "$?" "0" "squeeze (only)"
is "$(cat $D/small.squeezed_only.vcf | grep -v ^# | sha256sum)" \
"$(cat $D/small.squeezed.roundtrip.vcf | grep -v ^# | sha256sum)" \
"squeeze (only) fidelity"
is "$(egrep -o "spVCF_checkpointPOS=[0-9]+" $D/small.squeezed.spvcf | uniq | cut -f2 -d = | tr '\n' ' ')" \
"5030088 5085555 5142698 5225300 5232868 5243775 5252604 5264460 5273770 " \
"squeezed checkpoint positions"
is $(grep -o ":32" "$D/small.squeezed_only.vcf" | wc -l) "140477" "squeezed DP rounding, r=2"
is $("$EXE" squeeze -q -r 1.618 "$D/small.vcf" | grep -o ":29" | wc -l) "114001" "squeezed DP rounding, r=phi"
bgzip -c $D/small.squeezed.spvcf > $D/small.squeezed.spvcf.gz
tabix -p vcf $D/small.squeezed.spvcf.gz
"$EXE" tabix -o $D/small.squeezed.slice.spvcf $D/small.squeezed.spvcf.gz chr21:5143000-5226000
is "$?" "0" "tabix slice"
is "$(egrep -o "spVCF_checkpointPOS=[0-9]+" $D/small.squeezed.slice.spvcf | uniq -c | tr -d ' ' | tr '\n' ' ')" \
"494spVCF_checkpointPOS=5143363 31spVCF_checkpointPOS=5225300 " \
"slice checkpoint"
"$EXE" decode $D/small.squeezed.slice.spvcf > $D/small.squeezed.slice.vcf
is "$?" "0" "decode tabix slice"
bgzip -c $D/small.squeezed.roundtrip.vcf > $D/small.squeezed.roundtrip.vcf.gz
tabix -p vcf $D/small.squeezed.roundtrip.vcf.gz
tabix -p vcf $D/small.squeezed.roundtrip.vcf.gz chr21:5143000-5226000 > $D/small.squeezed.roundtrip.slice.vcf
is "$(cat $D/small.squeezed.slice.vcf | grep -v ^# | sha256sum)" \
"$(cat $D/small.squeezed.roundtrip.slice.vcf | grep -v ^# | sha256sum)" \
"slice fidelity"
"$EXE" tabix -o $D/small.squeezed.slice_chr21.spvcf $D/small.squeezed.spvcf.gz chr21
is "$(cat $D/small.squeezed.slice_chr21.spvcf | sha256sum)" \
"$(cat $D/small.squeezed.spvcf | sha256sum)" \
"chromosome slice"
pigz -dc "$HERE/data/small.vcf.gz" | "$EXE" encode -n -t $(nproc) - > $D/small.mt.spvcf
is "$?" "0" "multithreaded encode"
is "$(cat $D/small.mt.spvcf | wc -c)" "37097532" "multithreaded output size"
time "$EXE" decode -o $D/small.mt.roundtrip.vcf $D/small.mt.spvcf
is "$?" "0" "decode from multithreaded"
is "$(cat $D/small.vcf | grep -v ^# | sha256sum)" \
"$(cat $D/small.mt.roundtrip.vcf | grep -v ^# | sha256sum)" \
"multithreaded roundtrip fidelity"
is "$(egrep -o "spVCF_checkpointPOS=[0-9]+" $D/small.mt.spvcf | uniq | cut -f2 -d = | tr '\n' ' ')" \
"5030088 5142698 5232868 5252604 5273770 " \
"multithreaded checkpoint positions"
is $("$EXE" encode -r 1.618 -t $(nproc) $D/small.vcf | "$EXE" decode | grep -o ":29" | wc -l) "114001" \
"multithreaded encode DP rounding, r=phi"
rm -rf $D
1 change: 1 addition & 0 deletions test/spvcf.wdl
61 changes: 61 additions & 0 deletions test/test_spvcf.wdl
@@ -0,0 +1,61 @@
version 1.0

import "spvcf.wdl" as tasks

workflow test_spvcf {
input {
File vcf_gz # pVCF
}

# spVCF-encode the pVCF
call tasks.spvcf_encode {
input:
vcf_gz = vcf_gz
}

# also squeeze the pVCF (without run-encoding)
call tasks.spvcf_squeeze {
input:
vcf_gz = vcf_gz
}

# decode the spVCF back to squeezed pVCF
call tasks.spvcf_decode {
input:
spvcf_gz = spvcf_encode.spvcf_gz
}

# verify decoded pVCF is identical to the squeezed pVCF
# (modulo arbitrary differences in compression block framing)
call verify_identical_gz_content {
input:
gz1 = spvcf_squeeze.squeeze_vcf_gz,
gz2 = spvcf_decode.vcf_gz
}

output {
File spvcf_gz = spvcf_encode.spvcf_gz
File squeeze_vcf_gz = spvcf_squeeze.squeeze_vcf_gz
File decoded_vcf_gz = spvcf_decode.vcf_gz
}
}

task verify_identical_gz_content {
input {
File gz1
File gz2
}

command <<<
set -euxo pipefail
apt-get -qq update && apt-get install -y tabix
cmp --silent <(bgzip -dc "~{gz1}") <(bgzip -dc "~{gz2}")
>>>

runtime {
docker: "ubuntu:20.04"
cpu: 4
memory: "4 GB"
disks: "local-disk ~{ceil(size(gz2,'GB')+size(gz1,'GiB'))+4} SSD"
}
}