Skip to content

Commit

Permalink
add --with-missing-fields decoder option (#28)
Browse files Browse the repository at this point in the history
In each cell, appends `:.` for each omitted, trailing FORMAT field. Faster than piping through `bcftools view` to achieve the same.
  • Loading branch information
mlin authored Jun 4, 2023

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent 58f5311 commit 53ecc90
Showing 5 changed files with 88 additions and 30 deletions.
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -16,7 +16,7 @@ Further resources:

## `spvcf` utility

[![Build Status](https://travis-ci.org/mlin/spVCF.svg?branch=master)](https://travis-ci.org/mlin/spVCF)
[![build](https://github.com/mlin/spVCF/actions/workflows/build.yml/badge.svg?branch=main)](https://github.com/mlin/spVCF/actions/workflows/build.yml)

This repository has a command-line utility for encoding pVCF to spVCF and vice versa. The [Releases](https://github.com/mlin/spVCF/releases) page has pre-built executables compatible with most Linux x86-64 hosts, which you can download and `chmod +x spvcf`.

@@ -55,9 +55,10 @@ spvcf decode [options] [in.spvcf|-]
Reads spVCF text from standard input if filename is empty or -
Options:
-o,--output out.vcf Write to out.vcf instead of standard output
-q,--quiet Suppress statistics printed to standard error
-h,--help Show this help message
--with-missing-fields Include trailing FORMAT fields with missing values
-o,--output out.vcf Write to out.vcf instead of standard output
-q,--quiet Suppress statistics printed to standard error
-h,--help Show this help message
```

There's also `spvcf squeeze` to apply the QC squeezing transformation to a pVCF, without the sparse quote-encoding. This produces valid pVCF that's typically much smaller, although not as small as spVCF.
11 changes: 7 additions & 4 deletions spvcf.wdl
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@ task spvcf_encode {
input {
File vcf_gz
Boolean multithread = false
String docker = "ghcr.io/mlin/spvcf:v1.2.0"
String docker = "ghcr.io/mlin/spvcf:v1.3.0"
Int cpu = if multithread then 8 else 4
}

@@ -41,7 +41,8 @@ task spvcf_encode {
task spvcf_decode {
input {
File spvcf_gz
String docker = "ghcr.io/mlin/spvcf:v1.2.0"
Boolean with_missing_fields = false
String docker = "ghcr.io/mlin/spvcf:v1.3.0"
}

parameter_meta {
@@ -54,7 +55,9 @@ task spvcf_decode {
nm=$(basename "~{spvcf_gz}" .spvcf.gz)
nm="${nm}.vcf.gz"
mkdir out
bgzip -dc "~{spvcf_gz}" | spvcf decode | bgzip -@ 4 > "out/${nm}"
bgzip -dc "~{spvcf_gz}" \
| spvcf decode ~{if with_missing_fields then "--with-missing-fields" else ""} \
| bgzip -@ 4 > "out/${nm}"
>>>

runtime {
@@ -73,7 +76,7 @@ task spvcf_squeeze {
input {
File vcf_gz
Boolean multithread = false
String docker = "ghcr.io/mlin/spvcf:v1.2.0"
String docker = "ghcr.io/mlin/spvcf:v1.3.0"
Int cpu = if multithread then 8 else 4
}

34 changes: 24 additions & 10 deletions src/main.cc
Original file line number Diff line number Diff line change
@@ -195,9 +195,11 @@ void help_codec(CodecMode mode) {
<< "Reads spVCF text from standard input if filename is empty or -" << endl
<< endl
<< "Options:" << endl
<< " -o,--output out.vcf Write to out.vcf instead of standard output" << endl
<< " -q,--quiet Suppress statistics printed to standard error" << endl
<< " -h,--help Show this help message" << endl
<< " --with-missing-fields Include trailing FORMAT fields with missing values"
<< endl
<< " -o,--output out.vcf Write to out.vcf instead of standard output" << endl
<< " -q,--quiet Suppress statistics printed to standard error" << endl
<< " -h,--help Show this help message" << endl
<< endl;
break;
}
@@ -206,16 +208,21 @@ void help_codec(CodecMode mode) {
int main_codec(int argc, char *argv[], CodecMode mode) {
bool squeeze = true;
bool quiet = false;
bool with_missing_fields = false;
string output_filename;
uint64_t checkpoint_period = 1000;
size_t thread_count = 1;
double roundDP_base = 2.0;

static struct option long_options[] = {
{"help", no_argument, 0, 'h'}, {"no-squeeze", no_argument, 0, 'n'},
{"period", required_argument, 0, 'p'}, {"resolution", required_argument, 0, 'r'},
{"threads", required_argument, 0, 't'}, {"quiet", no_argument, 0, 'q'},
{"output", required_argument, 0, 'o'}, {0, 0, 0, 0}};
static struct option long_options[] = {{"help", no_argument, 0, 'h'},
{"no-squeeze", no_argument, 0, 'n'},
{"period", required_argument, 0, 'p'},
{"resolution", required_argument, 0, 'r'},
{"with-missing-fields", no_argument, 0, 'm'},
{"threads", required_argument, 0, 't'},
{"quiet", no_argument, 0, 'q'},
{"output", required_argument, 0, 'o'},
{0, 0, 0, 0}};

int c;
while (-1 != (c = getopt_long(argc, argv, "hnp:r:qo:t:", long_options, nullptr))) {
@@ -231,7 +238,7 @@ int main_codec(int argc, char *argv[], CodecMode mode) {
squeeze = false;
break;
case 'p':
if (mode == CodecMode::decode) {
if (mode != CodecMode::encode) {
help_codec(mode);
return -1;
}
@@ -254,6 +261,13 @@ int main_codec(int argc, char *argv[], CodecMode mode) {
return -1;
}
break;
case 'm':
if (mode != CodecMode::decode) {
help_codec(mode);
return -1;
}
with_missing_fields = true;
break;
case 't':
if (mode == CodecMode::decode) {
help_codec(mode);
@@ -322,7 +336,7 @@ int main_codec(int argc, char *argv[], CodecMode mode) {
if (thread_count <= 1) {
unique_ptr<spVCF::Transcoder> tc;
if (mode == CodecMode::decode) {
tc = spVCF::NewDecoder();
tc = spVCF::NewDecoder(with_missing_fields);
} else {
tc = spVCF::NewEncoder(checkpoint_period, (mode == CodecMode::encode), squeeze,
roundDP_base);
62 changes: 51 additions & 11 deletions src/spVCF.cc
Original file line number Diff line number Diff line change
@@ -282,7 +282,7 @@ const char *EncoderImpl::ProcessLine(char *input_line) {
++sparse_cells;
}

// CHECKPOINT -- return a densely-encode row -- if we've switched to a new
// CHECKPOINT -- return a densely-encoded row -- if we've switched to a new
// chromosome OR we've hit the specified period
++since_checkpoint_;
if (chrom_ != tokens[0] ||
@@ -527,13 +527,18 @@ unique_ptr<Transcoder> NewEncoder(uint64_t checkpoint_period, bool sparse, bool

class DecoderImpl : public TranscoderBase {
public:
DecoderImpl() = default;
DecoderImpl(bool with_missing_fields) : with_missing_fields_(with_missing_fields) {}
DecoderImpl(const DecoderImpl &) = delete;
const char *ProcessLine(char *input_line) override;

private:
void add_missing_fields(const char *entry, int field_count, string &ans);

vector<string> dense_entries_;
OStringStream buffer_;

bool with_missing_fields_;
int format_field_count_ = 0;
};

const char *DecoderImpl::ProcessLine(char *input_line) {
@@ -569,13 +574,12 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
assert(dense_entries_.size() == N);

// Pass through first nine columns
int format_field_count = 1;
buffer_.Clear();
buffer_ << tokens[0];
for (int i = 1; i < 9; i++) {
buffer_ << '\t';
if (i != 7) {
buffer_ << tokens[i];
} else {
if (i == 7) {
// Strip the spVCF_checkpointPOS INFO field if present
string INFO = tokens[7];
if (INFO.substr(0, 20) == "spVCF_checkpointPOS=") {
@@ -585,10 +589,24 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
} else {
buffer_ << '.';
}
} else {
buffer_ << INFO;
continue;
}
} else if (i == 8 && with_missing_fields_) {
// Count FORMAT fields for --with-missing-fields
for (char *FORMAT = tokens[8]; *FORMAT != 0; FORMAT++) {
if (*FORMAT == ':') {
format_field_count++;
}
}
if (format_field_count_ <= 0) {
format_field_count_ = format_field_count;
} else if (format_field_count != format_field_count_) {
fail(
"--with-missing-fields is unsuitable when pVCF lines have varying field FORMATs"
"; try piping output through bcftools instead");
}
}
buffer_ << tokens[i];
}

// Iterate over the sparse columns
@@ -604,8 +622,13 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
if (dense_cursor >= N) {
fail("Greater-than-expected number of columns implied by sparse encoding");
}
dense_entries_[dense_cursor++] = t;
buffer_ << '\t' << t;
string &dense_entry = dense_entries_[dense_cursor++];
if (with_missing_fields_) {
add_missing_fields(t, format_field_count, dense_entry);
} else {
dense_entry = t;
}
buffer_ << '\t' << dense_entry;
} else {
// Sparse entry - determine the run length
uint64_t r = 1;
@@ -655,7 +678,24 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
return buffer_.Get();
}

unique_ptr<Transcoder> NewDecoder() { return make_unique<DecoderImpl>(); }
// Add trailing missing fields to entry (--with-missing-fields); return tmpstr.c_str() or entry
// itself (if already complete).
void DecoderImpl::add_missing_fields(const char *entry, int field_count, string &ans) {
int entry_field_count = 1;
for (char *cursor = const_cast<char *>(entry); *cursor != 0; cursor++) {
if (*cursor == ':') {
entry_field_count++;
}
}
ans = entry;
while (entry_field_count++ < field_count) {
ans += ":.";
}
}

unique_ptr<Transcoder> NewDecoder(bool with_missing_fields) {
return make_unique<DecoderImpl>(with_missing_fields);
}

class TabixIterator {
htsFile *fp_;
@@ -831,7 +871,7 @@ void TabixSlice(const std::string &spvcf_gz, std::vector<std::string> regions, s

// From the checkpoint, run spVCF decoder until we see a line with POS >= region_lo
// TODO: the correct condition may be END >= region_lo. Needs careful testing.
auto decoder = NewDecoder();
auto decoder = NewDecoder(false);
while (true) {
linecpy = itr->Line();

2 changes: 1 addition & 1 deletion src/spVCF.h
Original file line number Diff line number Diff line change
@@ -36,7 +36,7 @@ class Transcoder {
};
std::unique_ptr<Transcoder> NewEncoder(uint64_t checkpoint_period, bool sparse, bool squeeze,
double roundDP_base);
std::unique_ptr<Transcoder> NewDecoder();
std::unique_ptr<Transcoder> NewDecoder(bool with_missing_fields);

void TabixSlice(const std::string &spvcf_gz, std::vector<std::string> regions, std::ostream &out);

0 comments on commit 53ecc90

Please sign in to comment.