From 9e7f9c96d271dd3ff5064ebd6da4291e49f65ef0 Mon Sep 17 00:00:00 2001 From: benjjneb Date: Sun, 22 May 2016 12:58:37 -0700 Subject: [PATCH] Fixed memory leak in isBimera --- DESCRIPTION | 4 +- R/RcppExports.R | 20 +++--- R/multiSample.R | 2 +- src/RcppExports.cpp | 78 +++++++++++------------ src/chimera.cpp | 151 ++++++++++++++++++++++++++++++++++++++++++++ src/evaluate.cpp | 147 ------------------------------------------ 6 files changed, 203 insertions(+), 199 deletions(-) create mode 100644 src/chimera.cpp diff --git a/DESCRIPTION b/DESCRIPTION index 7ad82b9..8443fe2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -7,8 +7,8 @@ Description: The dada2 package provides "OTU picking" functionality, but instead sequences and associated abundances after removing substitution and chimeric errors. Taxonomic classification is also available via a native implementation of the RDP classifier method. -Version: 1.0.0 -Date: 2016-02-18 +Version: 1.0.3 +Date: 2016-05-22 Maintainer: Benjamin Callahan Author: Benjamin Callahan , Paul McMurdie, Susan Holmes diff --git a/R/RcppExports.R b/R/RcppExports.R index d82ee3e..6383a0b 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,12 +1,8 @@ # This file was generated by Rcpp::compileAttributes # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -#' @useDynLib dada2 -#' @importFrom Rcpp evalCpp -NULL - -dada_uniques <- function(seqs, abundances, err, quals, score, gap, use_kmers, kdist_cutoff, band_size, omegaA, max_clust, min_fold, min_hamming, use_quals, qmax, final_consensus, vectorized_alignment, homo_gap, verbose) { - .Call('dada2_dada_uniques', PACKAGE = 'dada2', seqs, abundances, err, quals, score, gap, use_kmers, kdist_cutoff, band_size, omegaA, max_clust, min_fold, min_hamming, use_quals, qmax, final_consensus, vectorized_alignment, homo_gap, verbose) +C_is_bimera <- function(sq, pars, allow_one_off, min_one_off_par_dist, score, gap_p, max_shift) { + .Call('dada2_C_is_bimera', PACKAGE = 'dada2', sq, pars, allow_one_off, min_one_off_par_dist, score, gap_p, max_shift) } C_nwalign <- function(s1, s2, score, gap_p, homo_gap_p, band, endsfree) { @@ -17,10 +13,6 @@ C_eval_pair <- function(s1, s2) { .Call('dada2_C_eval_pair', PACKAGE = 'dada2', s1, s2) } -C_is_bimera <- function(sq, pars, allow_one_off, min_one_off_par_dist, score, gap_p, max_shift) { - .Call('dada2_C_is_bimera', PACKAGE = 'dada2', sq, pars, allow_one_off, min_one_off_par_dist, score, gap_p, max_shift) -} - C_pair_consensus <- function(s1, s2, prefer) { .Call('dada2_C_pair_consensus', PACKAGE = 'dada2', s1, s2, prefer) } @@ -72,6 +64,14 @@ C_nwvec <- function(s1, s2, match, mismatch, gap_p, band) { .Call('dada2_C_nwvec', PACKAGE = 'dada2', s1, s2, match, mismatch, gap_p, band) } +#' @useDynLib dada2 +#' @importFrom Rcpp evalCpp +NULL + +dada_uniques <- function(seqs, abundances, err, quals, score, gap, use_kmers, kdist_cutoff, band_size, omegaA, max_clust, min_fold, min_hamming, use_quals, qmax, final_consensus, vectorized_alignment, homo_gap, verbose) { + .Call('dada2_dada_uniques', PACKAGE = 'dada2', seqs, abundances, err, quals, score, gap, use_kmers, kdist_cutoff, band_size, omegaA, max_clust, min_fold, min_hamming, use_quals, qmax, final_consensus, vectorized_alignment, homo_gap, verbose) +} + C_assign_taxonomy <- function(seqs, refs, ref_to_genus, genusmat, verbose) { .Call('dada2_C_assign_taxonomy', PACKAGE = 'dada2', seqs, refs, ref_to_genus, genusmat, verbose) } diff --git a/R/multiSample.R b/R/multiSample.R index fb89fa5..b3878f3 100644 --- a/R/multiSample.R +++ b/R/multiSample.R @@ -32,7 +32,7 @@ makeSequenceTable <- function(samples, orderBy = "abundance") { unqs <- lapply(samples, getUniques) unqsqs <- unique(do.call(c, lapply(unqs, names))) if(length(unique(nchar(unqsqs)))>1) { message("The sequences being tabled vary in length.") } - rval <- matrix(0, nrow=length(unqs), ncol=length(unqsqs)) + rval <- matrix(0L, nrow=length(unqs), ncol=length(unqsqs)) # Samples are rows, columns are sequences colnames(rval) <- unqsqs for(i in seq_along(unqs)) { diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 480161b..eb3b343 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -8,32 +8,20 @@ using namespace Rcpp; -// dada_uniques -Rcpp::List dada_uniques(std::vector< std::string > seqs, std::vector abundances, Rcpp::NumericMatrix err, Rcpp::NumericMatrix quals, Rcpp::NumericMatrix score, int gap, bool use_kmers, double kdist_cutoff, int band_size, double omegaA, int max_clust, double min_fold, int min_hamming, bool use_quals, int qmax, bool final_consensus, bool vectorized_alignment, int homo_gap, bool verbose); -RcppExport SEXP dada2_dada_uniques(SEXP seqsSEXP, SEXP abundancesSEXP, SEXP errSEXP, SEXP qualsSEXP, SEXP scoreSEXP, SEXP gapSEXP, SEXP use_kmersSEXP, SEXP kdist_cutoffSEXP, SEXP band_sizeSEXP, SEXP omegaASEXP, SEXP max_clustSEXP, SEXP min_foldSEXP, SEXP min_hammingSEXP, SEXP use_qualsSEXP, SEXP qmaxSEXP, SEXP final_consensusSEXP, SEXP vectorized_alignmentSEXP, SEXP homo_gapSEXP, SEXP verboseSEXP) { +// C_is_bimera +bool C_is_bimera(std::string sq, std::vector pars, bool allow_one_off, int min_one_off_par_dist, Rcpp::NumericMatrix score, int gap_p, int max_shift); +RcppExport SEXP dada2_C_is_bimera(SEXP sqSEXP, SEXP parsSEXP, SEXP allow_one_offSEXP, SEXP min_one_off_par_distSEXP, SEXP scoreSEXP, SEXP gap_pSEXP, SEXP max_shiftSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; - Rcpp::traits::input_parameter< std::vector< std::string > >::type seqs(seqsSEXP); - Rcpp::traits::input_parameter< std::vector >::type abundances(abundancesSEXP); - Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type err(errSEXP); - Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type quals(qualsSEXP); + Rcpp::traits::input_parameter< std::string >::type sq(sqSEXP); + Rcpp::traits::input_parameter< std::vector >::type pars(parsSEXP); + Rcpp::traits::input_parameter< bool >::type allow_one_off(allow_one_offSEXP); + Rcpp::traits::input_parameter< int >::type min_one_off_par_dist(min_one_off_par_distSEXP); Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type score(scoreSEXP); - Rcpp::traits::input_parameter< int >::type gap(gapSEXP); - Rcpp::traits::input_parameter< bool >::type use_kmers(use_kmersSEXP); - Rcpp::traits::input_parameter< double >::type kdist_cutoff(kdist_cutoffSEXP); - Rcpp::traits::input_parameter< int >::type band_size(band_sizeSEXP); - Rcpp::traits::input_parameter< double >::type omegaA(omegaASEXP); - Rcpp::traits::input_parameter< int >::type max_clust(max_clustSEXP); - Rcpp::traits::input_parameter< double >::type min_fold(min_foldSEXP); - Rcpp::traits::input_parameter< int >::type min_hamming(min_hammingSEXP); - Rcpp::traits::input_parameter< bool >::type use_quals(use_qualsSEXP); - Rcpp::traits::input_parameter< int >::type qmax(qmaxSEXP); - Rcpp::traits::input_parameter< bool >::type final_consensus(final_consensusSEXP); - Rcpp::traits::input_parameter< bool >::type vectorized_alignment(vectorized_alignmentSEXP); - Rcpp::traits::input_parameter< int >::type homo_gap(homo_gapSEXP); - Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); - __result = Rcpp::wrap(dada_uniques(seqs, abundances, err, quals, score, gap, use_kmers, kdist_cutoff, band_size, omegaA, max_clust, min_fold, min_hamming, use_quals, qmax, final_consensus, vectorized_alignment, homo_gap, verbose)); + Rcpp::traits::input_parameter< int >::type gap_p(gap_pSEXP); + Rcpp::traits::input_parameter< int >::type max_shift(max_shiftSEXP); + __result = Rcpp::wrap(C_is_bimera(sq, pars, allow_one_off, min_one_off_par_dist, score, gap_p, max_shift)); return __result; END_RCPP } @@ -66,23 +54,6 @@ BEGIN_RCPP return __result; END_RCPP } -// C_is_bimera -bool C_is_bimera(std::string sq, std::vector pars, bool allow_one_off, int min_one_off_par_dist, Rcpp::NumericMatrix score, int gap_p, int max_shift); -RcppExport SEXP dada2_C_is_bimera(SEXP sqSEXP, SEXP parsSEXP, SEXP allow_one_offSEXP, SEXP min_one_off_par_distSEXP, SEXP scoreSEXP, SEXP gap_pSEXP, SEXP max_shiftSEXP) { -BEGIN_RCPP - Rcpp::RObject __result; - Rcpp::RNGScope __rngScope; - Rcpp::traits::input_parameter< std::string >::type sq(sqSEXP); - Rcpp::traits::input_parameter< std::vector >::type pars(parsSEXP); - Rcpp::traits::input_parameter< bool >::type allow_one_off(allow_one_offSEXP); - Rcpp::traits::input_parameter< int >::type min_one_off_par_dist(min_one_off_par_distSEXP); - Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type score(scoreSEXP); - Rcpp::traits::input_parameter< int >::type gap_p(gap_pSEXP); - Rcpp::traits::input_parameter< int >::type max_shift(max_shiftSEXP); - __result = Rcpp::wrap(C_is_bimera(sq, pars, allow_one_off, min_one_off_par_dist, score, gap_p, max_shift)); - return __result; -END_RCPP -} // C_pair_consensus Rcpp::CharacterVector C_pair_consensus(std::string s1, std::string s2, int prefer); RcppExport SEXP dada2_C_pair_consensus(SEXP s1SEXP, SEXP s2SEXP, SEXP preferSEXP) { @@ -165,6 +136,35 @@ BEGIN_RCPP return __result; END_RCPP } +// dada_uniques +Rcpp::List dada_uniques(std::vector< std::string > seqs, std::vector abundances, Rcpp::NumericMatrix err, Rcpp::NumericMatrix quals, Rcpp::NumericMatrix score, int gap, bool use_kmers, double kdist_cutoff, int band_size, double omegaA, int max_clust, double min_fold, int min_hamming, bool use_quals, int qmax, bool final_consensus, bool vectorized_alignment, int homo_gap, bool verbose); +RcppExport SEXP dada2_dada_uniques(SEXP seqsSEXP, SEXP abundancesSEXP, SEXP errSEXP, SEXP qualsSEXP, SEXP scoreSEXP, SEXP gapSEXP, SEXP use_kmersSEXP, SEXP kdist_cutoffSEXP, SEXP band_sizeSEXP, SEXP omegaASEXP, SEXP max_clustSEXP, SEXP min_foldSEXP, SEXP min_hammingSEXP, SEXP use_qualsSEXP, SEXP qmaxSEXP, SEXP final_consensusSEXP, SEXP vectorized_alignmentSEXP, SEXP homo_gapSEXP, SEXP verboseSEXP) { +BEGIN_RCPP + Rcpp::RObject __result; + Rcpp::RNGScope __rngScope; + Rcpp::traits::input_parameter< std::vector< std::string > >::type seqs(seqsSEXP); + Rcpp::traits::input_parameter< std::vector >::type abundances(abundancesSEXP); + Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type err(errSEXP); + Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type quals(qualsSEXP); + Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type score(scoreSEXP); + Rcpp::traits::input_parameter< int >::type gap(gapSEXP); + Rcpp::traits::input_parameter< bool >::type use_kmers(use_kmersSEXP); + Rcpp::traits::input_parameter< double >::type kdist_cutoff(kdist_cutoffSEXP); + Rcpp::traits::input_parameter< int >::type band_size(band_sizeSEXP); + Rcpp::traits::input_parameter< double >::type omegaA(omegaASEXP); + Rcpp::traits::input_parameter< int >::type max_clust(max_clustSEXP); + Rcpp::traits::input_parameter< double >::type min_fold(min_foldSEXP); + Rcpp::traits::input_parameter< int >::type min_hamming(min_hammingSEXP); + Rcpp::traits::input_parameter< bool >::type use_quals(use_qualsSEXP); + Rcpp::traits::input_parameter< int >::type qmax(qmaxSEXP); + Rcpp::traits::input_parameter< bool >::type final_consensus(final_consensusSEXP); + Rcpp::traits::input_parameter< bool >::type vectorized_alignment(vectorized_alignmentSEXP); + Rcpp::traits::input_parameter< int >::type homo_gap(homo_gapSEXP); + Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); + __result = Rcpp::wrap(dada_uniques(seqs, abundances, err, quals, score, gap, use_kmers, kdist_cutoff, band_size, omegaA, max_clust, min_fold, min_hamming, use_quals, qmax, final_consensus, vectorized_alignment, homo_gap, verbose)); + return __result; +END_RCPP +} // C_assign_taxonomy Rcpp::List C_assign_taxonomy(std::vector seqs, std::vector refs, std::vector ref_to_genus, Rcpp::IntegerMatrix genusmat, bool verbose); RcppExport SEXP dada2_C_assign_taxonomy(SEXP seqsSEXP, SEXP refsSEXP, SEXP ref_to_genusSEXP, SEXP genusmatSEXP, SEXP verboseSEXP) { diff --git a/src/chimera.cpp b/src/chimera.cpp new file mode 100644 index 0000000..c297313 --- /dev/null +++ b/src/chimera.cpp @@ -0,0 +1,151 @@ +#include "dada.h" +#include +using namespace Rcpp; + +int get_ham_endsfree(const char *seq1, const char *seq2, int len); + +//------------------------------------------------------------------ +// Determines whether sq is a perfect bimera of some combination from pars. +// +// @param sq The query DNA sequence. +// @param pars Potential bimeric "parents". +// @param max_shift A \code{integer(1)} of the maximum alignment shift allowed. +// +// [[Rcpp::export]] +bool C_is_bimera(std::string sq, std::vector pars, bool allow_one_off, int min_one_off_par_dist, Rcpp::NumericMatrix score, int gap_p, int max_shift) { + // For now only finding perfect bimeras + int i, j, left, right, left_oo, right_oo, pos, len, max_len; + // Make c-style 2d score array + int c_score[4][4]; + for(i=0;i<4;i++) { + for(j=0;j<4;j++) { + c_score[i][j] = (int) score(i,j); + } + } + + // Make integer-ized c-style sequence strings + char *seq1 = (char *) malloc(sq.size()+1); //E + // Get max length of the parent sequences + max_len=0; + for(i=0;imax_len) { max_len = len; } + } + char *seq2 = (char *) malloc(max_len+1); //E + if (seq1 == NULL || seq2 == NULL) Rcpp::stop("Memory allocation failed."); + nt2int(seq1, sq.c_str()); + + char **al; + int max_left=0, max_right=0; + int oo_max_left=0, oo_max_right=0, oo_max_left_oo=0, oo_max_right_oo=0; + + bool rval = false; + for(i=0;i= 0) { + pos--; + } + while(al[1][pos] == 6 && pos>+(len-max_shift)) { + pos--; right++; + } + while(pos>=0 && al[0][pos] == al[1][pos]) { + pos--; right++; + } + if(allow_one_off) { + // Step forward, and credit to one-off further matches (and this mismatch if not a gap) + right_oo = right; + pos--; + if(pos>=0 && al[0][pos] != 6) { right_oo++; } + while(pos>=0 && al[0][pos] == al[1][pos]) { + pos--; right_oo++; + } + } + + if((left+right) >= sq.size()) { // Toss id/pure-shift/internal-indel "parents" + continue; + } + if(left > max_left) { max_left=left; } + if(right > max_right) { max_right=right; } + + // Need to evaluate whether parents are allowed for one-off models + if(allow_one_off && get_ham_endsfree(al[0], al[1], len) >= min_one_off_par_dist) { + if(left > oo_max_left) { oo_max_left=left; } + if(right > oo_max_right) { oo_max_right=right; } + if(left_oo > oo_max_left_oo) { oo_max_left_oo=left_oo; } + if(right_oo > oo_max_right_oo) { oo_max_right_oo=right_oo; } + } + + // Evaluate, and break if found bimeric model + if((max_right+max_left)>=sq.size()) { + rval = true; + } + if(allow_one_off) { + if((oo_max_left+oo_max_right_oo)>=sq.size() || (oo_max_left_oo+oo_max_right)>=sq.size()) { + rval=true; + } + } + free(al[0]); + free(al[1]); + free(al); + } + + free(seq1); + free(seq2); + return(rval); +} + +// Internal function to get hamming distance between aligned seqs +// without counting end gaps +int get_ham_endsfree(const char *seq1, const char *seq2, int len) { + int i, j, pos, ham; + bool gap1, gap2; + // Find start of internal part of alignment + i=0; + gap1 = (seq1[i]==6); + gap2 = (seq2[i]==6); + while(gap1 || gap2) { + i++; + gap1 = (gap1 && (seq1[i]==6)); + gap2 = (gap2 && (seq2[i]==6)); + } + // Find end of internal part of alignment + j=len-1; + gap1 = (seq1[j]==6); + gap2 = (seq2[j]==6); + while(gap1 || gap2) { + j--; + gap1 = (gap1 && (seq1[j]==6)); + gap2 = (gap2 && (seq2[j]==6)); + } + // Calculate hamming distance over internal part + ham=0; + for(pos=i;pos<=j;pos++) { + if(seq1[pos] != seq2[pos]) { ham++; } + } + return(ham); +} + diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 9595218..1024fb6 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -111,153 +111,6 @@ Rcpp::IntegerVector C_eval_pair(std::string s1, std::string s2) { return(rval); } -// Internal function to get hamming distance between aligned seqs -// without counting end gaps -int get_ham_endsfree(const char *seq1, const char *seq2, int len) { - int i, j, pos, ham; - bool gap1, gap2; - // Find start of internal part of alignment - i=0; - gap1 = (seq1[i]==6); - gap2 = (seq2[i]==6); - while(gap1 || gap2) { - i++; - gap1 = (gap1 && (seq1[i]==6)); - gap2 = (gap2 && (seq2[i]==6)); - } - // Find end of internal part of alignment - j=len-1; - gap1 = (seq1[j]==6); - gap2 = (seq2[j]==6); - while(gap1 || gap2) { - j--; - gap1 = (gap1 && (seq1[j]==6)); - gap2 = (gap2 && (seq2[j]==6)); - } - // Calculate hamming distance over internal part - ham=0; - for(pos=i;pos<=j;pos++) { - if(seq1[pos] != seq2[pos]) { ham++; } - } - return(ham); -} - -//------------------------------------------------------------------ -// Determines whether sq is a perfect bimera of some combination from pars. -// -// @param sq The query DNA sequence. -// @param pars Potential bimeric "parents". -// @param max_shift A \code{integer(1)} of the maximum alignment shift allowed. -// -// [[Rcpp::export]] -bool C_is_bimera(std::string sq, std::vector pars, bool allow_one_off, int min_one_off_par_dist, Rcpp::NumericMatrix score, int gap_p, int max_shift) { - // For now only finding perfect bimeras - int i, j, left, right, left_oo, right_oo, pos, len, max_len; - // Make c-style 2d score array - int c_score[4][4]; - for(i=0;i<4;i++) { - for(j=0;j<4;j++) { - c_score[i][j] = (int) score(i,j); - } - } - - // Make integer-ized c-style sequence strings - char *seq1 = (char *) malloc(sq.size()+1); //E - // Get max length of the parent sequences - max_len=0; - for(i=0;imax_len) { max_len = len; } - } - char *seq2 = (char *) malloc(max_len+1); //E - if (seq1 == NULL || seq2 == NULL) Rcpp::stop("Memory allocation failed."); - nt2int(seq1, sq.c_str()); - - char **al; // Remember, alignments must be freed! - int max_left=0, max_right=0; - int oo_max_left=0, oo_max_right=0, oo_max_left_oo=0, oo_max_right_oo=0; - - bool rval = false; - for(i=0;i= 0) { - pos--; - } - while(al[1][pos] == 6 && pos>+(len-max_shift)) { - pos--; right++; - } - while(pos>=0 && al[0][pos] == al[1][pos]) { - pos--; right++; - } - if(allow_one_off) { - // Step forward, and credit to one-off further matches (and this mismatch if not a gap) - right_oo = right; - pos--; - if(pos>=0 && al[0][pos] != 6) { right_oo++; } - while(pos>=0 && al[0][pos] == al[1][pos]) { - pos--; right_oo++; - } - } - - if((left+right) >= sq.size()) { // Toss id/pure-shift/internal-indel "parents" - continue; - } - if(left > max_left) { max_left=left; } - if(right > max_right) { max_right=right; } - - // Need to evaluate whether parents are allowed for one-off models - if(allow_one_off && get_ham_endsfree(al[0], al[1], len) >= min_one_off_par_dist) { - if(left > oo_max_left) { oo_max_left=left; } - if(right > oo_max_right) { oo_max_right=right; } - if(left_oo > oo_max_left_oo) { oo_max_left_oo=left_oo; } - if(right_oo > oo_max_right_oo) { oo_max_right_oo=right_oo; } - } - - // Evaluate, and break if found bimeric model - if((max_right+max_left)>=sq.size()) { - rval = true; - break; - } - if(allow_one_off) { - if((oo_max_left+oo_max_right_oo)>=sq.size() || (oo_max_left_oo+oo_max_right)>=sq.size()) { - rval=true; - break; - } - } - } - - free(seq1); - free(seq2); - free(al[0]); - free(al[1]); - free(al); - return(rval); -} - //------------------------------------------------------------------ // Calculates the consensus of two sequences (prefer sequence wins mismatches). //