Fixed memory leak in isBimera

benjjneb · May 22, 2016 · 9e7f9c9 · 9e7f9c9
1 parent 26daf96
commit 9e7f9c9
Show file tree

Hide file tree

Showing 6 changed files with 203 additions and 199 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -7,8 +7,8 @@ Description: The dada2 package provides "OTU picking" functionality, but instead
     sequences and associated abundances after removing substitution and chimeric
     errors. Taxonomic classification is also available via a native implementation
     of the RDP classifier method.
-Version: 1.0.0
-Date: 2016-02-18
+Version: 1.0.3
+Date: 2016-05-22
 Maintainer: Benjamin Callahan <benjamin.j.callahan@gmail.com>
 Author: Benjamin Callahan <benjamin.j.callahan@gmail.com>, Paul McMurdie, Susan
     Holmes

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -1,12 +1,8 @@
 # This file was generated by Rcpp::compileAttributes
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-#' @useDynLib dada2
-#' @importFrom Rcpp evalCpp
-NULL
-
-dada_uniques <- function(seqs, abundances, err, quals, score, gap, use_kmers, kdist_cutoff, band_size, omegaA, max_clust, min_fold, min_hamming, use_quals, qmax, final_consensus, vectorized_alignment, homo_gap, verbose) {
-    .Call('dada2_dada_uniques', PACKAGE = 'dada2', seqs, abundances, err, quals, score, gap, use_kmers, kdist_cutoff, band_size, omegaA, max_clust, min_fold, min_hamming, use_quals, qmax, final_consensus, vectorized_alignment, homo_gap, verbose)
+C_is_bimera <- function(sq, pars, allow_one_off, min_one_off_par_dist, score, gap_p, max_shift) {
+    .Call('dada2_C_is_bimera', PACKAGE = 'dada2', sq, pars, allow_one_off, min_one_off_par_dist, score, gap_p, max_shift)
 }
 
 C_nwalign <- function(s1, s2, score, gap_p, homo_gap_p, band, endsfree) {
@@ -17,10 +13,6 @@ C_eval_pair <- function(s1, s2) {
     .Call('dada2_C_eval_pair', PACKAGE = 'dada2', s1, s2)
 }
 
-C_is_bimera <- function(sq, pars, allow_one_off, min_one_off_par_dist, score, gap_p, max_shift) {
-    .Call('dada2_C_is_bimera', PACKAGE = 'dada2', sq, pars, allow_one_off, min_one_off_par_dist, score, gap_p, max_shift)
-}
-
 C_pair_consensus <- function(s1, s2, prefer) {
     .Call('dada2_C_pair_consensus', PACKAGE = 'dada2', s1, s2, prefer)
 }
@@ -72,6 +64,14 @@ C_nwvec <- function(s1, s2, match, mismatch, gap_p, band) {
     .Call('dada2_C_nwvec', PACKAGE = 'dada2', s1, s2, match, mismatch, gap_p, band)
 }
 
+#' @useDynLib dada2
+#' @importFrom Rcpp evalCpp
+NULL
+
+dada_uniques <- function(seqs, abundances, err, quals, score, gap, use_kmers, kdist_cutoff, band_size, omegaA, max_clust, min_fold, min_hamming, use_quals, qmax, final_consensus, vectorized_alignment, homo_gap, verbose) {
+    .Call('dada2_dada_uniques', PACKAGE = 'dada2', seqs, abundances, err, quals, score, gap, use_kmers, kdist_cutoff, band_size, omegaA, max_clust, min_fold, min_hamming, use_quals, qmax, final_consensus, vectorized_alignment, homo_gap, verbose)
+}
+
 C_assign_taxonomy <- function(seqs, refs, ref_to_genus, genusmat, verbose) {
     .Call('dada2_C_assign_taxonomy', PACKAGE = 'dada2', seqs, refs, ref_to_genus, genusmat, verbose)
 }

diff --git a/R/multiSample.R b/R/multiSample.R
@@ -32,7 +32,7 @@ makeSequenceTable <- function(samples, orderBy = "abundance") {
   unqs <- lapply(samples, getUniques)
   unqsqs <- unique(do.call(c, lapply(unqs, names)))
   if(length(unique(nchar(unqsqs)))>1) { message("The sequences being tabled vary in length.") }
-  rval <- matrix(0, nrow=length(unqs), ncol=length(unqsqs))
+  rval <- matrix(0L, nrow=length(unqs), ncol=length(unqsqs))
   # Samples are rows, columns are sequences
   colnames(rval) <- unqsqs
   for(i in seq_along(unqs)) {

diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -8,32 +8,20 @@
 
 using namespace Rcpp;
 
-// dada_uniques
-Rcpp::List dada_uniques(std::vector< std::string > seqs, std::vector<int> abundances, Rcpp::NumericMatrix err, Rcpp::NumericMatrix quals, Rcpp::NumericMatrix score, int gap, bool use_kmers, double kdist_cutoff, int band_size, double omegaA, int max_clust, double min_fold, int min_hamming, bool use_quals, int qmax, bool final_consensus, bool vectorized_alignment, int homo_gap, bool verbose);
-RcppExport SEXP dada2_dada_uniques(SEXP seqsSEXP, SEXP abundancesSEXP, SEXP errSEXP, SEXP qualsSEXP, SEXP scoreSEXP, SEXP gapSEXP, SEXP use_kmersSEXP, SEXP kdist_cutoffSEXP, SEXP band_sizeSEXP, SEXP omegaASEXP, SEXP max_clustSEXP, SEXP min_foldSEXP, SEXP min_hammingSEXP, SEXP use_qualsSEXP, SEXP qmaxSEXP, SEXP final_consensusSEXP, SEXP vectorized_alignmentSEXP, SEXP homo_gapSEXP, SEXP verboseSEXP) {
+// C_is_bimera
+bool C_is_bimera(std::string sq, std::vector<std::string> pars, bool allow_one_off, int min_one_off_par_dist, Rcpp::NumericMatrix score, int gap_p, int max_shift);
+RcppExport SEXP dada2_C_is_bimera(SEXP sqSEXP, SEXP parsSEXP, SEXP allow_one_offSEXP, SEXP min_one_off_par_distSEXP, SEXP scoreSEXP, SEXP gap_pSEXP, SEXP max_shiftSEXP) {
 BEGIN_RCPP
     Rcpp::RObject __result;
     Rcpp::RNGScope __rngScope;
-    Rcpp::traits::input_parameter< std::vector< std::string > >::type seqs(seqsSEXP);
-    Rcpp::traits::input_parameter< std::vector<int> >::type abundances(abundancesSEXP);
-    Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type err(errSEXP);
-    Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type quals(qualsSEXP);
+    Rcpp::traits::input_parameter< std::string >::type sq(sqSEXP);
+    Rcpp::traits::input_parameter< std::vector<std::string> >::type pars(parsSEXP);
+    Rcpp::traits::input_parameter< bool >::type allow_one_off(allow_one_offSEXP);
+    Rcpp::traits::input_parameter< int >::type min_one_off_par_dist(min_one_off_par_distSEXP);
     Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type score(scoreSEXP);
-    Rcpp::traits::input_parameter< int >::type gap(gapSEXP);
-    Rcpp::traits::input_parameter< bool >::type use_kmers(use_kmersSEXP);
-    Rcpp::traits::input_parameter< double >::type kdist_cutoff(kdist_cutoffSEXP);
-    Rcpp::traits::input_parameter< int >::type band_size(band_sizeSEXP);
-    Rcpp::traits::input_parameter< double >::type omegaA(omegaASEXP);
-    Rcpp::traits::input_parameter< int >::type max_clust(max_clustSEXP);
-    Rcpp::traits::input_parameter< double >::type min_fold(min_foldSEXP);
-    Rcpp::traits::input_parameter< int >::type min_hamming(min_hammingSEXP);
-    Rcpp::traits::input_parameter< bool >::type use_quals(use_qualsSEXP);
-    Rcpp::traits::input_parameter< int >::type qmax(qmaxSEXP);
-    Rcpp::traits::input_parameter< bool >::type final_consensus(final_consensusSEXP);
-    Rcpp::traits::input_parameter< bool >::type vectorized_alignment(vectorized_alignmentSEXP);
-    Rcpp::traits::input_parameter< int >::type homo_gap(homo_gapSEXP);
-    Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
-    __result = Rcpp::wrap(dada_uniques(seqs, abundances, err, quals, score, gap, use_kmers, kdist_cutoff, band_size, omegaA, max_clust, min_fold, min_hamming, use_quals, qmax, final_consensus, vectorized_alignment, homo_gap, verbose));
+    Rcpp::traits::input_parameter< int >::type gap_p(gap_pSEXP);
+    Rcpp::traits::input_parameter< int >::type max_shift(max_shiftSEXP);
+    __result = Rcpp::wrap(C_is_bimera(sq, pars, allow_one_off, min_one_off_par_dist, score, gap_p, max_shift));
     return __result;
 END_RCPP
 }
@@ -66,23 +54,6 @@ BEGIN_RCPP
     return __result;
 END_RCPP
 }
-// C_is_bimera
-bool C_is_bimera(std::string sq, std::vector<std::string> pars, bool allow_one_off, int min_one_off_par_dist, Rcpp::NumericMatrix score, int gap_p, int max_shift);
-RcppExport SEXP dada2_C_is_bimera(SEXP sqSEXP, SEXP parsSEXP, SEXP allow_one_offSEXP, SEXP min_one_off_par_distSEXP, SEXP scoreSEXP, SEXP gap_pSEXP, SEXP max_shiftSEXP) {
-BEGIN_RCPP
-    Rcpp::RObject __result;
-    Rcpp::RNGScope __rngScope;
-    Rcpp::traits::input_parameter< std::string >::type sq(sqSEXP);
-    Rcpp::traits::input_parameter< std::vector<std::string> >::type pars(parsSEXP);
-    Rcpp::traits::input_parameter< bool >::type allow_one_off(allow_one_offSEXP);
-    Rcpp::traits::input_parameter< int >::type min_one_off_par_dist(min_one_off_par_distSEXP);
-    Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type score(scoreSEXP);
-    Rcpp::traits::input_parameter< int >::type gap_p(gap_pSEXP);
-    Rcpp::traits::input_parameter< int >::type max_shift(max_shiftSEXP);
-    __result = Rcpp::wrap(C_is_bimera(sq, pars, allow_one_off, min_one_off_par_dist, score, gap_p, max_shift));
-    return __result;
-END_RCPP
-}
 // C_pair_consensus
 Rcpp::CharacterVector C_pair_consensus(std::string s1, std::string s2, int prefer);
 RcppExport SEXP dada2_C_pair_consensus(SEXP s1SEXP, SEXP s2SEXP, SEXP preferSEXP) {
@@ -165,6 +136,35 @@ BEGIN_RCPP
     return __result;
 END_RCPP
 }
+// dada_uniques
+Rcpp::List dada_uniques(std::vector< std::string > seqs, std::vector<int> abundances, Rcpp::NumericMatrix err, Rcpp::NumericMatrix quals, Rcpp::NumericMatrix score, int gap, bool use_kmers, double kdist_cutoff, int band_size, double omegaA, int max_clust, double min_fold, int min_hamming, bool use_quals, int qmax, bool final_consensus, bool vectorized_alignment, int homo_gap, bool verbose);
+RcppExport SEXP dada2_dada_uniques(SEXP seqsSEXP, SEXP abundancesSEXP, SEXP errSEXP, SEXP qualsSEXP, SEXP scoreSEXP, SEXP gapSEXP, SEXP use_kmersSEXP, SEXP kdist_cutoffSEXP, SEXP band_sizeSEXP, SEXP omegaASEXP, SEXP max_clustSEXP, SEXP min_foldSEXP, SEXP min_hammingSEXP, SEXP use_qualsSEXP, SEXP qmaxSEXP, SEXP final_consensusSEXP, SEXP vectorized_alignmentSEXP, SEXP homo_gapSEXP, SEXP verboseSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject __result;
+    Rcpp::RNGScope __rngScope;
+    Rcpp::traits::input_parameter< std::vector< std::string > >::type seqs(seqsSEXP);
+    Rcpp::traits::input_parameter< std::vector<int> >::type abundances(abundancesSEXP);
+    Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type err(errSEXP);
+    Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type quals(qualsSEXP);
+    Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type score(scoreSEXP);
+    Rcpp::traits::input_parameter< int >::type gap(gapSEXP);
+    Rcpp::traits::input_parameter< bool >::type use_kmers(use_kmersSEXP);
+    Rcpp::traits::input_parameter< double >::type kdist_cutoff(kdist_cutoffSEXP);
+    Rcpp::traits::input_parameter< int >::type band_size(band_sizeSEXP);
+    Rcpp::traits::input_parameter< double >::type omegaA(omegaASEXP);
+    Rcpp::traits::input_parameter< int >::type max_clust(max_clustSEXP);
+    Rcpp::traits::input_parameter< double >::type min_fold(min_foldSEXP);
+    Rcpp::traits::input_parameter< int >::type min_hamming(min_hammingSEXP);
+    Rcpp::traits::input_parameter< bool >::type use_quals(use_qualsSEXP);
+    Rcpp::traits::input_parameter< int >::type qmax(qmaxSEXP);
+    Rcpp::traits::input_parameter< bool >::type final_consensus(final_consensusSEXP);
+    Rcpp::traits::input_parameter< bool >::type vectorized_alignment(vectorized_alignmentSEXP);
+    Rcpp::traits::input_parameter< int >::type homo_gap(homo_gapSEXP);
+    Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
+    __result = Rcpp::wrap(dada_uniques(seqs, abundances, err, quals, score, gap, use_kmers, kdist_cutoff, band_size, omegaA, max_clust, min_fold, min_hamming, use_quals, qmax, final_consensus, vectorized_alignment, homo_gap, verbose));
+    return __result;
+END_RCPP
+}
 // C_assign_taxonomy
 Rcpp::List C_assign_taxonomy(std::vector<std::string> seqs, std::vector<std::string> refs, std::vector<int> ref_to_genus, Rcpp::IntegerMatrix genusmat, bool verbose);
 RcppExport SEXP dada2_C_assign_taxonomy(SEXP seqsSEXP, SEXP refsSEXP, SEXP ref_to_genusSEXP, SEXP genusmatSEXP, SEXP verboseSEXP) {

diff --git a/src/chimera.cpp b/src/chimera.cpp
@@ -0,0 +1,151 @@
+#include "dada.h"
+#include <Rcpp.h>
+using namespace Rcpp;
+
+int get_ham_endsfree(const char *seq1, const char *seq2, int len);
+
+//------------------------------------------------------------------
+// Determines whether sq is a perfect bimera of some combination from pars.
+// 
+// @param sq The query DNA sequence.
+// @param pars Potential bimeric "parents".
+// @param max_shift A \code{integer(1)} of the maximum alignment shift allowed.
+// 
+// [[Rcpp::export]]
+bool C_is_bimera(std::string sq, std::vector<std::string> pars, bool allow_one_off, int min_one_off_par_dist, Rcpp::NumericMatrix score, int gap_p, int max_shift) {
+  // For now only finding perfect bimeras
+  int i, j, left, right, left_oo, right_oo, pos, len, max_len;
+  // Make  c-style 2d score array
+  int c_score[4][4];
+  for(i=0;i<4;i++) {
+    for(j=0;j<4;j++) {
+      c_score[i][j] = (int) score(i,j);
+    }
+  }
+
+  // Make integer-ized c-style sequence strings
+  char *seq1 = (char *) malloc(sq.size()+1); //E
+  // Get max length of the parent sequences
+  max_len=0;
+  for(i=0;i<pars.size();i++) {
+    len = pars[i].size();
+    if(len>max_len) { max_len = len; }
+  }
+  char *seq2 = (char *) malloc(max_len+1); //E
+  if (seq1 == NULL || seq2 == NULL)  Rcpp::stop("Memory allocation failed.");
+  nt2int(seq1, sq.c_str());
+
+  char **al;
+  int max_left=0, max_right=0;
+  int oo_max_left=0, oo_max_right=0, oo_max_left_oo=0, oo_max_right_oo=0;
+
+  bool rval = false;
+  for(i=0;i<pars.size() && rval==false;i++) {
+    nt2int(seq2, pars[i].c_str());
+    al = nwalign_endsfree(seq1, seq2, c_score, gap_p, max_shift);  // Remember, alignments must be freed!
+    len = strlen(al[0]);
+
+    pos=0; left=0;
+    while(al[0][pos] == 6 && pos<len) {
+      pos++; // Scan in until query starts
+    }
+    while(al[1][pos] == 6 && pos<max_shift) {
+      pos++; left++; // Credit as ends-free coverage until parent starts
+    }
+    while(pos<len && al[0][pos] == al[1][pos]) {
+      pos++; left++; // Credit as covered until a mismatch
+    }
+    if(allow_one_off) {
+      // Step forward, and credit to one-off further matches (and this mismatch if not a gap)
+      left_oo = left;
+      pos++;
+      if(pos<len && al[0][pos] != 6) { left_oo++; }
+      while(pos<len && al[0][pos] == al[1][pos]) {
+        pos++; left_oo++;
+      }
+    }
+
+    pos=len-1; right=0;
+    while(al[0][pos] == 6 && pos >= 0) { 
+      pos--;
+    }
+    while(al[1][pos] == 6 && pos>+(len-max_shift)) {
+      pos--; right++;
+    }
+    while(pos>=0 && al[0][pos] == al[1][pos]) {
+      pos--; right++;
+    }
+    if(allow_one_off) {
+      // Step forward, and credit to one-off further matches (and this mismatch if not a gap)
+      right_oo = right;
+      pos--;
+      if(pos>=0 && al[0][pos] != 6) { right_oo++; }
+      while(pos>=0 && al[0][pos] == al[1][pos]) {
+        pos--; right_oo++;
+      }
+    }
+
+    if((left+right) >= sq.size()) { // Toss id/pure-shift/internal-indel "parents"
+      continue;
+    }
+    if(left > max_left) { max_left=left; }
+    if(right > max_right) { max_right=right; }
+
+    // Need to evaluate whether parents are allowed for one-off models
+    if(allow_one_off && get_ham_endsfree(al[0], al[1], len) >= min_one_off_par_dist) {
+      if(left > oo_max_left) { oo_max_left=left; }
+      if(right > oo_max_right) { oo_max_right=right; }
+      if(left_oo > oo_max_left_oo) { oo_max_left_oo=left_oo; }
+      if(right_oo > oo_max_right_oo) { oo_max_right_oo=right_oo; }
+    }
+
+    // Evaluate, and break if found bimeric model
+    if((max_right+max_left)>=sq.size()) {
+      rval = true;
+    }
+    if(allow_one_off) {
+      if((oo_max_left+oo_max_right_oo)>=sq.size() || (oo_max_left_oo+oo_max_right)>=sq.size()) {
+        rval=true;
+      }
+    }
+    free(al[0]);
+    free(al[1]);
+    free(al);
+  }
+
+  free(seq1);
+  free(seq2);
+  return(rval);
+}
+
+// Internal function to get hamming distance between aligned seqs
+//  without counting end gaps
+int get_ham_endsfree(const char *seq1, const char *seq2, int len) {
+  int i, j, pos, ham;
+  bool gap1, gap2;
+  // Find start of internal part of alignment
+  i=0;
+  gap1 = (seq1[i]==6); 
+  gap2 = (seq2[i]==6);
+  while(gap1 || gap2) {
+    i++;
+    gap1 = (gap1 && (seq1[i]==6));
+    gap2 = (gap2 && (seq2[i]==6));
+  }
+  // Find end of internal part of alignment
+  j=len-1;
+  gap1 = (seq1[j]==6); 
+  gap2 = (seq2[j]==6);
+  while(gap1 || gap2) {
+    j--;
+    gap1 = (gap1 && (seq1[j]==6));
+    gap2 = (gap2 && (seq2[j]==6));
+  }
+  // Calculate hamming distance over internal part
+  ham=0;
+  for(pos=i;pos<=j;pos++) {
+    if(seq1[pos] != seq2[pos]) { ham++; }
+  }
+  return(ham);
+}
+