Skip to content

Commit

Permalink
--with-missing-fields: pad AD & PL to expected vector length (#29)
Browse files Browse the repository at this point in the history
For now all sites must be biallelic. (Can be generalized in the future.)
  • Loading branch information
mlin authored Oct 2, 2023
1 parent 53ecc90 commit b25626d
Showing 1 changed file with 67 additions and 24 deletions.
91 changes: 67 additions & 24 deletions src/spVCF.cc
Original file line number Diff line number Diff line change
Expand Up @@ -532,13 +532,20 @@ class DecoderImpl : public TranscoderBase {
const char *ProcessLine(char *input_line) override;

private:
void add_missing_fields(const char *entry, int field_count, string &ans);
void add_missing_fields(const char *entry, int n_alt, string &ans);

// temp buffers used in ProcessLine (to reduce allocations)
vector<string> dense_entries_;
OStringStream buffer_;

bool with_missing_fields_;
int format_field_count_ = 0;
string format_;
vector<string> format_split_;
int iAD_ = -1, iPL_ = -1;
// temp buffers used in add_missing_fields (to reduce allocations)
OStringStream format_buffer_;
string entry_copy_;
vector<char *> entry_fields_;
};

const char *DecoderImpl::ProcessLine(char *input_line) {
Expand All @@ -565,6 +572,23 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
fail("Invalid project VCF: fewer than 10 columns");
}

int n_alt = -1;
if (with_missing_fields_) {
// count n_alt for use in missing fields with Number={A,G,R}
n_alt = 1;
for (char *alt = tokens[4]; *alt; alt++) {
if (*alt == ',') {
n_alt++;
}
}
if (n_alt != 1) {
// FIXME: handling multiallelic sites will require add_missing_fields() even on sparse
// entries
fail("--with-missing-fields only works with biallelic pVCF for now"
"; try piping output through bcftools instead");
}
}

// Figure out number of dense columns, the number of columns on the first line
uint64_t N = dense_entries_.empty() ? (tokens.size() - 9) : dense_entries_.size();
if (dense_entries_.empty()) {
Expand All @@ -574,7 +598,6 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
assert(dense_entries_.size() == N);

// Pass through first nine columns
int format_field_count = 1;
buffer_.Clear();
buffer_ << tokens[0];
for (int i = 1; i < 9; i++) {
Expand All @@ -592,15 +615,23 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
continue;
}
} else if (i == 8 && with_missing_fields_) {
// Count FORMAT fields for --with-missing-fields
for (char *FORMAT = tokens[8]; *FORMAT != 0; FORMAT++) {
if (*FORMAT == ':') {
format_field_count++;
if (format_.empty()) {
// one-time initialization
format_ = tokens[8];
string format_copy = format_;
vector<char *> format_split;
split(format_copy, ':', back_inserter(format_split));
for (int j = 0; j < format_split.size(); j++) {
char *s = format_split[j];
if (!strcmp(s, "AD")) {
iAD_ = j;
} else if (!strcmp(s, "PL")) {
iPL_ = j;
}
format_split_.push_back(s);
}
}
if (format_field_count_ <= 0) {
format_field_count_ = format_field_count;
} else if (format_field_count != format_field_count_) {
if (format_ != tokens[8]) {
fail(
"--with-missing-fields is unsuitable when pVCF lines have varying field FORMATs"
"; try piping output through bcftools instead");
Expand All @@ -617,14 +648,12 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
fail("empty cell");
} else if (*t != '"') {
// Dense entry - remember it and copy it to the output
// TODO: Perhaps fill QC fields with missing values (.) if they were squeezed out.
// The VCF spec does however say "Trailing fields can be dropped"
if (dense_cursor >= N) {
fail("Greater-than-expected number of columns implied by sparse encoding");
}
string &dense_entry = dense_entries_[dense_cursor++];
if (with_missing_fields_) {
add_missing_fields(t, format_field_count, dense_entry);
add_missing_fields(t, n_alt, dense_entry);
} else {
dense_entry = t;
}
Expand Down Expand Up @@ -678,19 +707,33 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
return buffer_.Get();
}

// Add trailing missing fields to entry (--with-missing-fields); return tmpstr.c_str() or entry
// itself (if already complete).
void DecoderImpl::add_missing_fields(const char *entry, int field_count, string &ans) {
int entry_field_count = 1;
for (char *cursor = const_cast<char *>(entry); *cursor != 0; cursor++) {
if (*cursor == ':') {
entry_field_count++;
// Add trailing missing fields to entry (--with-missing-fields)
// Most missing fields are just represented by . except for AD and PL, which we pad with . to the
// correct vector length. (In principle we should do that for any Number={A,G,R} field, but this
// suffices for our practical need for this feature.)
void DecoderImpl::add_missing_fields(const char *entry, int n_alt, string &ans) {
format_buffer_.Clear();
entry_copy_ = entry;
entry_fields_.clear();
split(entry_copy_, ':', back_inserter(entry_fields_));
for (int i = 0; i < format_split_.size(); i++) {
bool present = i < entry_fields_.size();
const char *field = present ? entry_fields_[i] : nullptr;
if (i > 0) {
format_buffer_ << ':';
}
if (i == iAD_ && (!present || !strcmp(field, "."))) {
// FIXME: handle multiallelic. Meaning we need to run this on sparse entries too.
// const int nAD = n_alt + 1;
format_buffer_ << ".,.";
} else if (i == iPL_ && (!present || !strcmp(field, "."))) {
// const int nPL = (n_alt + 1) * (n_alt + 2) / 2;
format_buffer_ << ".,.,.";
} else {
format_buffer_ << (present ? field : ".");
}
}
ans = entry;
while (entry_field_count++ < field_count) {
ans += ":.";
}
ans = format_buffer_.Get();
}

unique_ptr<Transcoder> NewDecoder(bool with_missing_fields) {
Expand Down

0 comments on commit b25626d

Please sign in to comment.