From 7315c67e235c8f96c89ce38c02c1b3bf93dc579a Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Wed, 20 Apr 2016 15:27:01 +0100 Subject: [PATCH 01/15] Adding soapsplice2bed script. Fixed a bug in the full pipeline which prevented junction filtering from working. --- Makefile.am | 1 + src/portcullis.cc | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile.am b/Makefile.am index 59ddf4de..e78793b8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -39,6 +39,7 @@ dist_bin_SCRIPTS = scripts/analyse_alignments.py \ scripts/predict.snakefile \ scripts/read_gen.snakefile \ scripts/snakey.py \ + scripts/soapsplice2bed.py \ scripts/spanki2bed.py \ scripts/spanki_filter.py \ scripts/split_error_classes.py \ diff --git a/src/portcullis.cc b/src/portcullis.cc index 10f9b5fc..74af944f 100644 --- a/src/portcullis.cc +++ b/src/portcullis.cc @@ -260,7 +260,8 @@ int mainFull(int argc, char *argv[]) { cout << "Identifying junctions and calculating metrics" << endl << "---------------------------------------------" << endl << endl; - path juncOut = outputDir.string() + "/2-junc/portcullis_all"; + path juncDir = outputDir.string() + "/2-junc"; + path juncOut = juncDir.string() + "/portcullis_all"; // Identify junctions and calculate metrics JunctionBuilder jb(prepDir.string(), juncOut); @@ -281,7 +282,7 @@ int mainFull(int argc, char *argv[]) { << "-------------------" << endl << endl; path filtOut = outputDir.string() + "/3-filt/portcullis_filtered"; - path juncTab = juncOut.string() + "/portcullis_all.junctions.tab"; + path juncTab = juncDir.string() + "/portcullis_all.junctions.tab"; JunctionFilter filter(juncTab, filtOut); filter.setVerbose(verbose); From 7fa519c12ef60beb6243f387679fac9ae653a09f Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Thu, 21 Apr 2016 19:00:44 +0100 Subject: [PATCH 02/15] Fixed a critical bug in the self training routine which prevented it from working unless you label the data. 
Added ability to generate an intron score to help boost accuracy. --- configure.ac | 2 +- data/selftrain_initial_neg.json | 44 +++++++++++---- data/selftrain_initial_neg.json.old | 57 +++++++++++++++++++ data/selftrain_initial_pos.json | 34 ++++++++--- data/selftrain_initial_pos.json.old | 37 ++++++++++++ doc/source/conf.py | 4 +- lib/include/portcullis/junction.hpp | 57 +++++++++++++------ lib/include/portcullis/junction_system.hpp | 2 - lib/include/portcullis/rule_parser.hpp | 11 ++-- lib/src/junction.cc | 63 +++++++++++---------- lib/src/junction_system.cc | 13 ----- lib/src/rule_parser.cc | 14 +++-- src/junction_filter.cc | 65 ++++++++++++++++++---- src/junction_filter.hpp | 4 +- src/train.cc | 37 ++++++------ src/train.hpp | 32 +++++++++-- 16 files changed, 348 insertions(+), 128 deletions(-) create mode 100644 data/selftrain_initial_neg.json.old create mode 100644 data/selftrain_initial_pos.json.old diff --git a/configure.ac b/configure.ac index 94687932..346afb02 100644 --- a/configure.ac +++ b/configure.ac @@ -4,7 +4,7 @@ # Autoconf setup AC_PREREQ([2.68]) -AC_INIT([portcullis],[0.15.0],[daniel.mapleson@tgac.ac.uk],[portcullis],[http://www.tgac.ac.uk]) +AC_INIT([portcullis],[0.16.0],[daniel.mapleson@tgac.ac.uk],[portcullis],[http://www.tgac.ac.uk]) AC_CONFIG_SRCDIR([src/portcullis.cc]) AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_MACRO_DIR([m4]) diff --git a/data/selftrain_initial_neg.json b/data/selftrain_initial_neg.json index 23e962fb..4885e954 100644 --- a/data/selftrain_initial_neg.json +++ b/data/selftrain_initial_neg.json @@ -2,27 +2,35 @@ "parameters": { "M1-canonical_ss": { "operator": "in", - "value": ["N"] + "value": ["N", "S"] }, "M2-nb_reads": { "operator": "lte", "value": 1 }, "M3-nb_dist_aln": { - "operator": "eq", - "value": 1 + "operator": "gte", + "value": 2 }, "M4-nb_rel_aln": { "operator": "eq", "value": 0 }, "M11-entropy": { - "operator": "eq", - "value": 0.0 + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + 
"value": 2.0 }, "M12-maxmmes": { "operator": "lt", - "value": 8 + "value": 7 + }, + "M12-maxmmes.2": { + "operator": "lte", + "value": 10 }, "M8-max_min_anc": { "operator": "lt", @@ -38,16 +46,32 @@ }, "M13-hamming5p": { "operator": "lte", - "value": 1 + "value": 2 }, "M14-hamming3p": { "operator": "lte", - "value": 1 + "value": 2 }, "M19-mean_mismatches": { "operator": "gte", "value": 5.0 - } + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + }, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + } }, - "expression": "M2-nb_reads | M12-maxmmes | Suspect" + "expression": "( M20-nb_usrs & M21-nb_msrs & M12-maxmmes.2 ) | ( M20-nb_usrs.2 & M1-canonical_ss & M12-maxmmes.2 ) | ( Suspect & M11-entropy.2 ) | M12-maxmmes | ( M1-canonical_ss & M4-nb_rel_aln )" } diff --git a/data/selftrain_initial_neg.json.old b/data/selftrain_initial_neg.json.old new file mode 100644 index 00000000..86eb3f59 --- /dev/null +++ b/data/selftrain_initial_neg.json.old @@ -0,0 +1,57 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + "M3-nb_dist_aln": { + "operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 9 + }, + "M12-maxmmes.2": { + "operator": "lte", + "value": 12 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + "Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + "M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + } + }, + "expression": "M19-mean_mismatches | ( M1-canonical_ss & M11-entropy ) | ( M4-nb_rel_aln & 
M11-entropy ) | ( M12-maxmmes & M11-entropy ) | ( M12-maxmmes.2 & M2-nb_reads ) | ( M13-hamming5p & M14-hamming3p ) | PFP | ( Suspect & M2-nb_reads )" +} diff --git a/data/selftrain_initial_pos.json b/data/selftrain_initial_pos.json index d850bd63..a3440720 100644 --- a/data/selftrain_initial_pos.json +++ b/data/selftrain_initial_pos.json @@ -2,32 +2,52 @@ "parameters": { "M1-canonical_ss": { "operator": "in", - "value": ["C"] + "value": ["C", "S"] }, "M4-nb_rel_aln": { "operator": "gte", "value": 5 }, + "M4-nb_rel_aln.2": { + "operator": "gte", + "value": 1 + }, "M12-maxmmes": { "operator": "gte", "value": 20 }, "M11-entropy": { "operator": "gt", - "value": 2.5 + "value": 5.0 }, "M13-hamming5p": { "operator": "gte", - "value": 8 + "value": 5 + }, + "M13-hamming5p.2": { + "operator": "gte", + "value": 7 }, "M14-hamming3p": { "operator": "gte", - "value": 8 + "value": 5 + }, + "M14-hamming3p.2": { + "operator": "gte", + "value": 7 }, "M19-mean_mismatches": { - "operator": "lte", + "operator": "eq", + "value": 0 + }, + "M19-mean_mismatches.2": { + "operator": "lt", "value": 1.0 - } + }, + "M20-nb_usrs": { + "operator": "gte", + "value": 5 + } }, - "expression": "M1-canonical_ss & M4-nb_rel_aln & M11-entropy & M12-maxmmes & M13-hamming5p & M14-hamming3p & M19-mean_mismatches" + "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & ( ( M19-mean_mismatches & M20-nb_usrs ) | ( M12-maxmmes & M11-entropy & M19-mean_mismatches.2 ) )" } diff --git a/data/selftrain_initial_pos.json.old b/data/selftrain_initial_pos.json.old new file mode 100644 index 00000000..c4552792 --- /dev/null +++ b/data/selftrain_initial_pos.json.old @@ -0,0 +1,37 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["C"] + }, + "M4-nb_rel_aln": { + "operator": "gte", + "value": 5 + }, + "M12-maxmmes": { + "operator": "gte", + "value": 18 + }, + "M11-entropy": { + "operator": "gt", + "value": 2.5 + }, + "M13-hamming5p": { + "operator": "gte", + "value": 7 + }, + 
"M14-hamming3p": { + "operator": "gte", + "value": 7 + }, + "M19-mean_mismatches": { + "operator": "eq", + "value": 0.0 + }, + "M20-nb_usrs": { + "operator": "gte", + "value": 5 + } + }, + "expression": "M1-canonical_ss & M4-nb_rel_aln & M11-entropy & M12-maxmmes & M13-hamming5p & M14-hamming3p & M19-mean_mismatches & M20-nb_usrs" +} diff --git a/doc/source/conf.py b/doc/source/conf.py index cec4720f..419f9401 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -53,9 +53,9 @@ # built documents. # # The short X.Y version. -version = '0.15.0' +version = '0.16.0' # The full version, including alpha/beta/rc tags. -release = '0.15.0' +release = '0.16.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/lib/include/portcullis/junction.hpp b/lib/include/portcullis/junction.hpp index ed510d25..047c0744 100644 --- a/lib/include/portcullis/junction.hpp +++ b/lib/include/portcullis/junction.hpp @@ -24,6 +24,7 @@ #include #include using std::ostream; +using std::cout; using std::endl; using std::min; using std::max; @@ -97,14 +98,15 @@ const vector METRIC_NAMES = { "M17-primary_junc", "M18-mm_score", "M19-mean_mismatches", - "M20-nb_msrs", - "M21-nb_up_juncs", - "M22-nb_down_juncs", - "M23-up_aln", - "M24-down_aln", - "M25-dist_2_up_junc", - "M26-dist_2_down_junc", - "M27-dist_nearest_junc" + "M20-nb_usrs", + "M21-nb_msrs", + "M22-nb_up_juncs", + "M23-nb_down_juncs", + "M24-up_aln", + "M25-down_aln", + "M26-dist_2_up_junc", + "M27-dist_2_down_junc", + "M28-dist_nearest_junc" }; const vector STRAND_NAMES = { @@ -257,15 +259,16 @@ class Junction { bool primaryJunction; // Metric 17 double multipleMappingScore; // Metric 18 double meanMismatches; // Metric 19 - uint32_t nbMultipleSplicedReads; // Metric 20 - uint16_t nbUpstreamJunctions; // Metric 21 - uint16_t nbDownstreamJunctions; // Metric 22 - uint32_t nbUpstreamFlankingAlignments; // Metric 23 - uint32_t nbDownstreamFlankingAlignments; // 
Metric 24 - int32_t distanceToNextUpstreamJunction; // Metric 25 - int32_t distanceToNextDownstreamJunction; // Metric 26 - int32_t distanceToNearestJunction; // Metric 27 - vector junctionOverhangs; // Metric 28-37 + //uint32_t nbUniquelySplicedReads; // Metric 20 (Use getter) + uint32_t nbMultipleSplicedReads; // Metric 21 + uint16_t nbUpstreamJunctions; // Metric 22 + uint16_t nbDownstreamJunctions; // Metric 23 + uint32_t nbUpstreamFlankingAlignments; // Metric 24 + uint32_t nbDownstreamFlankingAlignments; // Metric 25 + int32_t distanceToNextUpstreamJunction; // Metric 26 + int32_t distanceToNextDownstreamJunction; // Metric 27 + int32_t distanceToNearestJunction; // Metric 28 + vector junctionOverhangs; // Metric 29-38 // **** Predictions **** @@ -689,6 +692,11 @@ class Junction { return nbMultipleSplicedReads; } + uint32_t getNbUniquelySplicedReads() const { + return nbJunctionAlignments - nbMultipleSplicedReads; + } + + uint16_t getNbDownstreamJunctions() const { return nbDownstreamJunctions; } @@ -813,7 +821,7 @@ class Junction { void setMeanMismatches(double meanMismatches) { this->meanMismatches = meanMismatches; } - + void setNbMultipleSplicedReads(uint32_t nbMultipleSplicedReads) { this->nbMultipleSplicedReads = nbMultipleSplicedReads; } @@ -842,6 +850,18 @@ class Junction { string locationAsString() const { return this->intron->toString() + strandToChar(this->consensusStrand); } + + /** + * Calculates a score for this intron size based on how this intron size fits + * into an expected distribution specified by the length at the 95th percentile + * (L95) provided by the user. Introns of length < L95 have a score of 0. + * Introns with length > L95 have score: -ln(size - L95) + * @param L95 Intron size at 95 percentile of a correct distribution + * @return A score for this intron size given the L95 + */ + double calcIntronScore(const uint32_t L95) const { + return this->intron->size() <= L95 ? 
0.0 : -log(this->intron->size() - L95); + } @@ -921,6 +941,7 @@ class Junction { << j.primaryJunction << "\t" << j.multipleMappingScore << "\t" << j.meanMismatches << "\t" + << j.getNbUniquelySplicedReads() << "\t" << j.nbMultipleSplicedReads << "\t" << j.nbDownstreamJunctions << "\t" << j.nbUpstreamJunctions << "\t" diff --git a/lib/include/portcullis/junction_system.hpp b/lib/include/portcullis/junction_system.hpp index 61b7bc8e..47b91d03 100644 --- a/lib/include/portcullis/junction_system.hpp +++ b/lib/include/portcullis/junction_system.hpp @@ -152,8 +152,6 @@ class JunctionSystem { void calcJunctionStats(bool verbose); - void calcTrimmedOverhangScore(); - void sort(); void saveAll(const path& outputPrefix, const string& source); diff --git a/lib/include/portcullis/rule_parser.hpp b/lib/include/portcullis/rule_parser.hpp index 27dfd2b5..ac24a33b 100644 --- a/lib/include/portcullis/rule_parser.hpp +++ b/lib/include/portcullis/rule_parser.hpp @@ -88,11 +88,12 @@ const unordered_set numericParams = { "M17-primary_junc", "M18-mm_score", "M19-nb_mismatches", - "M20-nb_msrs", - "M21-nb_up_juncs", - "M22-nb_down_juncs", - "M23-up_aln", - "M24-down_aln" + "M20-nb_usrs", + "M21-nb_msrs", + "M22-nb_up_juncs", + "M23-nb_down_juncs", + "M24-up_aln", + "M25-down_aln" }; const unordered_set stringParams = { diff --git a/lib/src/junction.cc b/lib/src/junction.cc index 5e0d4c2d..af3a438d 100644 --- a/lib/src/junction.cc +++ b/lib/src/junction.cc @@ -823,10 +823,10 @@ void portcullis::Junction::calcMismatchStats() { // Assuming we have some mismatches determine if this junction has no overhangs // extending beyond first mismatch. 
If so determine if that distance is small // enough to consider the junction as suspicious - if (nbMismatches > 0) { + if (nbMismatches > 0 && firstMismatch < 20) { bool found = false; for(const auto& a : alignments) { - if (a->mmes > firstMismatch) { + if (a->minMatch > firstMismatch) { found = true; break; } @@ -941,14 +941,15 @@ void portcullis::Junction::outputDescription(std::ostream &strm, string delimite << "M17: Primary Junction: " << boolalpha << primaryJunction << delimiter << "M18: Multiple mapping score: " << multipleMappingScore << delimiter << "M19: Mean mismatches: " << meanMismatches << delimiter - << "M20: # Multiple Spliced Reads: " << nbMultipleSplicedReads << delimiter - << "M21: # Upstream Junctions: " << nbUpstreamJunctions << delimiter - << "M22: # Downstream Junctions: " << nbDownstreamJunctions << delimiter - << "M23: # Upstream Non-Spliced Alignments: " << nbUpstreamFlankingAlignments << delimiter - << "M24: # Downstream Non-Spliced Alignments: " << nbDownstreamFlankingAlignments << delimiter - << "M25: Distance to next upstream junction: " << distanceToNextUpstreamJunction << delimiter - << "M26: Distance to next downstream junction: " << distanceToNextDownstreamJunction << delimiter - << "M27: Distance to nearest junction: " << distanceToNearestJunction; + << "M20: # Uniquely Spliced Reads: " << getNbUniquelySplicedReads() << delimiter + << "M21: # Multiple Spliced Reads: " << nbMultipleSplicedReads << delimiter + << "M22: # Upstream Junctions: " << nbUpstreamJunctions << delimiter + << "M23: # Downstream Junctions: " << nbDownstreamJunctions << delimiter + << "M24: # Upstream Non-Spliced Alignments: " << nbUpstreamFlankingAlignments << delimiter + << "M25: # Downstream Non-Spliced Alignments: " << nbDownstreamFlankingAlignments << delimiter + << "M26: Distance to next upstream junction: " << distanceToNextUpstreamJunction << delimiter + << "M27: Distance to next downstream junction: " << distanceToNextDownstreamJunction << delimiter + 
<< "M28: Distance to nearest junction: " << distanceToNearestJunction; } /** @@ -979,14 +980,15 @@ void portcullis::Junction::condensedOutputDescription(std::ostream &strm, string << "M17-PrimaryJunction=" << boolalpha << primaryJunction << delimiter << "M18-MultipleMappingScore=" << multipleMappingScore << delimiter << "M19-MeanMismatches=" << meanMismatches << delimiter - << "M20-NbMultipleSplicedReads=" << nbMultipleSplicedReads << delimiter - << "M21-NbUpstreamJunctions=" << nbUpstreamJunctions << delimiter - << "M22-NbDownstreamJunctions=" << nbDownstreamJunctions << delimiter - << "M23-NbUpstreamNonSplicedAlignments=" << nbUpstreamFlankingAlignments << delimiter - << "M24-NbDownstreamNonSplicedAlignments=" << nbDownstreamFlankingAlignments << delimiter - << "M25-DistanceToNextUpstreamJunction=" << distanceToNextUpstreamJunction << delimiter - << "M26-DistanceToNextDownstreamJunction=" << distanceToNextDownstreamJunction << delimiter - << "M27-DistanceToNearestJunction=" << distanceToNearestJunction << delimiter + << "M20-NbUniquelySplicedReads=" << getNbUniquelySplicedReads() << delimiter + << "M21-NbMultipleSplicedReads=" << nbMultipleSplicedReads << delimiter + << "M22-NbUpstreamJunctions=" << nbUpstreamJunctions << delimiter + << "M23-NbDownstreamJunctions=" << nbDownstreamJunctions << delimiter + << "M24-NbUpstreamNonSplicedAlignments=" << nbUpstreamFlankingAlignments << delimiter + << "M25-NbDownstreamNonSplicedAlignments=" << nbDownstreamFlankingAlignments << delimiter + << "M26-DistanceToNextUpstreamJunction=" << distanceToNextUpstreamJunction << delimiter + << "M27-DistanceToNextDownstreamJunction=" << distanceToNextDownstreamJunction << delimiter + << "M28-DistanceToNearestJunction=" << distanceToNearestJunction << delimiter << "PFP=" << boolalpha << pfp; } @@ -1200,21 +1202,22 @@ shared_ptr portcullis::Junction::parse(const string& line) j->setPrimaryJunction(lexical_cast(parts[29])); j->setMultipleMappingScore(lexical_cast(parts[30])); 
j->setMeanMismatches(lexical_cast(parts[31])); - j->setNbMultipleSplicedReads(lexical_cast(parts[32])); - j->setNbUpstreamJunctions(lexical_cast(parts[33])); - j->setNbDownstreamJunctions(lexical_cast(parts[34])); - j->setNbUpstreamFlankingAlignments(lexical_cast(parts[35])); - j->setNbDownstreamFlankingAlignments(lexical_cast(parts[36])); - j->setDistanceToNextUpstreamJunction(lexical_cast(parts[37])); - j->setDistanceToNextDownstreamJunction(lexical_cast(parts[38])); - j->setDistanceToNearestJunction(lexical_cast(parts[39])); + //j->setNbUniquelySplicedReads(lexical_cast(parts[32])); + j->setNbMultipleSplicedReads(lexical_cast(parts[33])); + j->setNbUpstreamJunctions(lexical_cast(parts[34])); + j->setNbDownstreamJunctions(lexical_cast(parts[35])); + j->setNbUpstreamFlankingAlignments(lexical_cast(parts[36])); + j->setNbDownstreamFlankingAlignments(lexical_cast(parts[37])); + j->setDistanceToNextUpstreamJunction(lexical_cast(parts[38])); + j->setDistanceToNextDownstreamJunction(lexical_cast(parts[39])); + j->setDistanceToNearestJunction(lexical_cast(parts[40])); // Read Junction overhangs - j->setMeanQueryLength(lexical_cast(parts[40])); - j->setSuspicious(lexical_cast(parts[41])); - j->setPotentialFalsePositive(lexical_cast(parts[42])); + j->setMeanQueryLength(lexical_cast(parts[41])); + j->setSuspicious(lexical_cast(parts[42])); + j->setPotentialFalsePositive(lexical_cast(parts[43])); for(size_t i = 0; i < JO_NAMES.size(); i++) { - j->setJunctionOverhangs(i, lexical_cast(parts[43 + i])); + j->setJunctionOverhangs(i, lexical_cast(parts[44 + i])); } return j; diff --git a/lib/src/junction_system.cc b/lib/src/junction_system.cc index 7104d459..6a6c0b2a 100644 --- a/lib/src/junction_system.cc +++ b/lib/src/junction_system.cc @@ -368,19 +368,6 @@ void portcullis::JunctionSystem::calcJunctionStats(bool verbose) { } } -/** - * Used of calculating metric 27. Trimmed Overhang score. 
This provide a probability - * that this junction is genuine based on comparisons of expected distribution - * of coverage to observed distribution of coverage, after reads have been trimmed - * to the first mismatch up and downstream of the junction. An L1 regularized - * logistic regression is then fitted to - * - * This metric similar to this is used in FineSplice. - */ -void portcullis::JunctionSystem::calcTrimmedOverhangScore() { - - -} void portcullis::JunctionSystem::sort() { diff --git a/lib/src/rule_parser.cc b/lib/src/rule_parser.cc index 0c9a0380..a3bb74d6 100644 --- a/lib/src/rule_parser.cc +++ b/lib/src/rule_parser.cc @@ -219,14 +219,16 @@ double portcullis::eval::getNumericFromJunc(const var& fullname) const { case 18: return junc->getMeanMismatches(); case 19: - return junc->getNbMultipleSplicedReads(); + return junc->getNbUniquelySplicedReads(); case 20: - return junc->getNbUpstreamJunctions(); + return junc->getNbMultipleSplicedReads(); case 21: - return junc->getNbDownstreamJunctions(); + return junc->getNbUpstreamJunctions(); case 22: - return junc->getNbUpstreamFlankingAlignments(); + return junc->getNbDownstreamJunctions(); case 23: + return junc->getNbUpstreamFlankingAlignments(); + case 24: return junc->getNbDownstreamFlankingAlignments(); } @@ -235,7 +237,7 @@ double portcullis::eval::getNumericFromJunc(const var& fullname) const { return junc->isSuspicious(); } else if (boost::iequals(name, "PFP")) { - return junc->getSpliceSiteType(); + return junc->isPotentialFalsePositive(); } @@ -257,7 +259,7 @@ string portcullis::eval::getStringFromJunc(const var& fullname) const { } BOOST_THROW_EXCEPTION(RuleParserException() << RuleParserErrorInfo(string( - "Unrecognised param: ") + name)); + "Unrecognised param: ") + name)); } bool portcullis::eval::evalNumberLeaf(Operator op, double threshold, double value) const { diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 2e3bd4f6..cdee0531 100644 --- a/src/junction_filter.cc +++ 
b/src/junction_filter.cc @@ -196,6 +196,9 @@ void portcullis::JunctionFilter::filter() { currentJuncs.push_back(j); } + // To be overridden if we are training + uint32_t L95 = 0; + if (train) { JunctionList initialSet; @@ -216,7 +219,9 @@ void portcullis::JunctionFilter::filter() { pos = passJuncs.size(); for(auto& j : passJuncs) { - initialSet.push_back(j); + JunctionPtr copy = make_shared(*j); + copy->setGenuine(true); + initialSet.push_back(copy); } if (!genuineFile.empty()) { shared_ptr p = calcPerformance(passJuncs, failJuncs); @@ -243,7 +248,11 @@ void portcullis::JunctionFilter::filter() { cout << "Saving missed positive junctions to disk:" << endl; missedPos.saveAll(output.string() + ".selftrain.initialset.missedpos", "portcullis_missed_isp"); } - + + + // Analyse positive set to get L0.05 of intron size + L95 = this->calcIntronThreshold(passJuncs); + cout << "Length of intron at 95th percentile of positive set: " << L95 << endl << endl; passJuncs.clear(); failJuncs.clear(); @@ -257,7 +266,9 @@ void portcullis::JunctionFilter::filter() { neg = passJuncs.size(); for(auto& j : passJuncs) { - initialSet.push_back(j); + JunctionPtr copy = make_shared(*j); + copy->setGenuine(false); + initialSet.push_back(copy); } if (!genuineFile.empty()) { @@ -286,11 +297,17 @@ void portcullis::JunctionFilter::filter() { cout << "Saving missed negative junctions to disk:" << endl; missedNeg.saveAll(output.string() + ".selftrain.initialset.missedneg", "portcullis_missed_isn"); } + + const uint32_t L95_fail = this->calcIntronThreshold(passJuncs); + cout << "Length of intron at 95th percentile of negative set: " << L95_fail << endl << endl; + cout << "Initial training set consists of " << pos << " positive and " << neg << " negative junctions." 
<< endl << endl; + + cout << "Training a random forest model using the initial positive and negative datasets" << endl; - shared_ptr forest = Train::trainInstance(initialSet, output.string() + ".selftrain", DEFAULT_TRAIN_TREES, threads, true, true); + shared_ptr forest = Train::trainInstance(initialSet, output.string() + ".selftrain", DEFAULT_TRAIN_TREES, threads, true, true, L95); forest->saveToFile(); forest->writeOutput(&cout); @@ -311,7 +328,7 @@ void portcullis::JunctionFilter::filter() { JunctionList passJuncs; JunctionList failJuncs; - forestPredict(currentJuncs, passJuncs, failJuncs); + forestPredict(currentJuncs, passJuncs, failJuncs, L95); printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Random Forest filtering")); @@ -451,6 +468,18 @@ void portcullis::JunctionFilter::filter() { } } +uint32_t portcullis::JunctionFilter::calcIntronThreshold(const JunctionList& juncs) { + + vector intron_sizes; + for(auto& j : juncs) { + intron_sizes.push_back(j->getIntronSize()); + } + + std::sort(intron_sizes.begin(), intron_sizes.end()); + + return intron_sizes[intron_sizes.size() * 0.95]; +} + void portcullis::JunctionFilter::printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix) { // Output stats size_t diff = in.size() - pass.size(); @@ -512,13 +541,13 @@ void portcullis::JunctionFilter::doRuleBasedFiltering(const path& ruleFile, cons } -void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail) { +void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, const uint32_t L95) { if (verbose) { cerr << endl << "Preparing junction metrics into matrix" << endl; } - Data* testingData = Train::juncs2FeatureVectors(all); + Data* testingData = Train::juncs2FeatureVectors(all, L95); // Load forest from disk @@ -571,9 +600,15 @@ void portcullis::JunctionFilter::forestPredict(const 
JunctionList& all, Junction f->run(verbose); if (verbose) { - cerr << "Separating original junction data into pass and fail categories" << endl; + cerr << "Separating original junction data into pass and fail categories." << endl; } + + path scorepath = output.string() + ".scores"; + ofstream scoreStream(scorepath.c_str()); + for(size_t i = 0; i < all.size(); i++) { + + scoreStream << f->getPredictions()[i][0] << endl; if (f->getPredictions()[i][0] >= 0.5) { pass.push_back(all[i]); } @@ -582,6 +617,11 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction } } + scoreStream.close(); + + cout << "Saved junction scores to: " << scorepath << endl; + + delete testingData; } @@ -594,6 +634,7 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { path referenceFile; path output; uint16_t threads; + bool no_ml; bool saveBad; int32_t max_length; string canonical; @@ -618,6 +659,8 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { "If you have a list of line separated boolean values in a file, indicating whether each junction in your input is genuine or not, then we can use that information here to gauge the accuracy of the predictions.") ("reference,r", po::value(&referenceFile), "Reference annotation of junctions in BED format. Any junctions found by the junction analysis tool will be preserved if found in this reference file regardless of any other filtering criteria. If you need to convert a reference annotation from GTF or GFF to BED format portcullis contains scripts for this.") + ("no_ml,n", po::bool_switch(&no_ml)->default_value(false), + "Disables machine learning filtering") ("save_bad,b", po::bool_switch(&saveBad)->default_value(false), "Saves bad junctions (i.e. 
junctions that fail the filter), as well as good junctions (those that pass)") ("source", po::value(&source)->default_value(DEFAULT_FILTER_SOURCE), @@ -679,12 +722,14 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { // Only set the filter rules if specified. filter.setFilterFile(filterFile); filter.setGenuineFile(genuineFile); - if (modelFile.empty()) { + if (modelFile.empty() && !no_ml) { filter.setTrain(true); } else { filter.setTrain(false); - filter.setModelFile(modelFile); + if (!no_ml) { + filter.setModelFile(modelFile); + } } filter.setReferenceFile(referenceFile); diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index 12642bcd..5ff86eef 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -261,7 +261,7 @@ class JunctionFilter { protected: - void forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail); + void forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, const uint32_t L95); shared_ptr calcPerformance(const JunctionList& pass, const JunctionList& fail) { return calcPerformance(pass, fail, false); @@ -272,6 +272,8 @@ class JunctionFilter { void doRuleBasedFiltering(const path& ruleFile, const JunctionList& all, JunctionList& pass, JunctionList& fail, const string& prefix, JuncResultMap& resultMap); + uint32_t calcIntronThreshold(const JunctionList& pass); + public: static string helpMessage() { diff --git a/src/train.cc b/src/train.cc index 82c2abc3..5907e434 100644 --- a/src/train.cc +++ b/src/train.cc @@ -68,7 +68,7 @@ portcullis::Train::Train(const path& _junctionFile, const path& _refFile){ verbose = false; } -Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x) { +Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint32_t L95) { vector headers; headers.reserve( VAR_NAMES.size() + JO_NAMES.size() ); @@ -82,16 +82,17 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x) { for (const auto& j : x) 
{ d[0 * x.size() + row] = j->getNbJunctionAlignments(); - d[1 * x.size() + row] = j->getNbDistinctAlignments(); - d[2 * x.size() + row] = j->getNbReliableAlignments(); - d[3 * x.size() + row] = j->getMaxMinAnchor(); - d[4 * x.size() + row] = j->getDiffAnchor(); - d[5 * x.size() + row] = j->getNbDistinctAnchors(); - d[6 * x.size() + row] = j->getEntropy(); - d[7 * x.size() + row] = j->getMaxMMES(); - d[8 * x.size() + row] = j->getHammingDistance5p(); - d[9 * x.size() + row] = j->getHammingDistance3p(); - d[10 * x.size() + row] = j->isGenuine(); + //d[1 * x.size() + row] = j->getNbDistinctAlignments(); + d[1 * x.size() + row] = j->getNbReliableAlignments(); + //d[3 * x.size() + row] = j->getMaxMinAnchor(); + //d[4 * x.size() + row] = j->getDiffAnchor(); + //d[5 * x.size() + row] = j->getNbDistinctAnchors(); + d[2 * x.size() + row] = j->getEntropy(); + d[3 * x.size() + row] = j->getMaxMMES(); + d[4 * x.size() + row] = j->getHammingDistance5p(); + d[5 * x.size() + row] = j->getHammingDistance3p(); + //d[6 * x.size() + row] = L95 == 0 ? 
0.0 : j->calcIntronScore(L95); // Intron score + d[6 * x.size() + row] = j->isGenuine(); // Junction overhang values at each position are first converted into deviation from expected distributions double half_read_length = (double)j->getMeanQueryLength() / 2.0; for(size_t i = 0; i < JO_NAMES.size(); i++) { @@ -101,7 +102,7 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x) { double Ei = (double)j->getNbJunctionAlignments() * Pi; // Expected count at this position double Xi = abs(log2(Ni / Ei)); // Log deviation - d[(i + 11) * x.size() + row] = Xi; + d[(i + 7) * x.size() + row] = Xi; } row++; @@ -112,10 +113,12 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x) { return data; } -shared_ptr portcullis::Train::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose) { +shared_ptr portcullis::Train::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, const uint32_t L95) { - Data* trainingData = juncs2FeatureVectors(x); + cout << "Creating feature vector" << endl; + Data* trainingData = juncs2FeatureVectors(x, L95); + cout << "Initialising random forest" << endl; shared_ptr f = nullptr; if (regressionMode) { f = make_shared(); @@ -146,17 +149,17 @@ shared_ptr portcullis::Train::trainInstance(const JunctionList& x, strin false, // predall 1.0); // Sample fraction - + cout << "Training" << endl; f->setVerboseOut(&cerr); f->run(verbose); return f; } -void portcullis::Train::testInstance(shared_ptr f, const JunctionList& x) { +void portcullis::Train::testInstance(shared_ptr f, const JunctionList& x, const uint32_t L95) { // Convert testing set junctions into feature vector - Data* testingData = juncs2FeatureVectors(x); + Data* testingData = juncs2FeatureVectors(x, L95); f->setPredictionMode(true); f->setData(testingData); diff --git a/src/train.hpp b/src/train.hpp index 68fefdff..7abed309 
100644 --- a/src/train.hpp +++ b/src/train.hpp @@ -51,9 +51,18 @@ const double DEFAULT_TRAIN_FRACTION = 1.0; const int DEFAULT_SEED = 1234567; // To avoid non-deterministic behaviour // List of variable names -const vector VAR_NAMES = { "M2-nb-reads", "M3-nb_dist_aln", "M4-nb_rel_aln", - "M8-max_min_anc", "M9-dif_anc", "M10-dist_anc", "M11-entropy", - "M12-maxmmes", "M13-hammping5p", "M14-hamming3p", +const vector VAR_NAMES = { + "M2-nb-reads", + //"M3-nb_dist_aln", + "M4-nb_rel_aln", + //"M8-max_min_anc", + //"M9-dif_anc", + //"M10-dist_anc", + "M11-entropy", + "M12-maxmmes", + "M13-hamming5p", + "M14-hamming3p", + //"IntronScore", "Genuine" }; // Derived from https://sureshamrita.wordpress.com/2011/08/24/c-implementation-of-k-fold-cross-validation/ @@ -220,15 +229,26 @@ class Train { static int main(int argc, char *argv[]); - static Data* juncs2FeatureVectors(const JunctionList& x); + static Data* juncs2FeatureVectors(const JunctionList& x) { + return juncs2FeatureVectors(x, 0); + } + static Data* juncs2FeatureVectors(const JunctionList& x, const uint32_t L95); static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, - uint16_t trees, uint16_t threads, bool regressionMode, bool verbose); + uint16_t trees, uint16_t threads, bool regressionMode, bool verbose) { + return trainInstance(x, outputPrefix, trees, threads, regressionMode, verbose, 0); + } + static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, + uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, const uint32_t L95); protected: - void testInstance(shared_ptr f, const JunctionList& y); + void testInstance(shared_ptr f, const JunctionList& y) { + return testInstance(f, y, 0); + } + + void testInstance(shared_ptr f, const JunctionList& y, const uint32_t L95); void getRandomSubset(const JunctionList& in, JunctionList& out); From f6165ff5177056b485c8074690ec05ff357bf532 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Fri, 22 Apr 2016 13:26:28 +0100 
Subject: [PATCH 03/15] Updating the junction analysis tool to include reliable 2 raw alignment ratio scores. Also removed some of the features going into the prediction tool which seems to ensure that all features are now contributing to the decision making process under all the situations I've tested so far. Performance seems on par with finesplice, but we are still a little way behind truesight and soapsplice with this version. --- data/selftrain_initial_pos.json | 6 ++- lib/include/portcullis/junction.hpp | 54 ++++++++++++++------------ lib/include/portcullis/rule_parser.hpp | 9 +++-- lib/src/junction.cc | 39 ++++++++++--------- lib/src/rule_parser.cc | 9 +++-- src/junction_filter.cc | 31 ++++++++++----- src/train.cc | 24 ++++++------ src/train.hpp | 16 ++++---- 8 files changed, 110 insertions(+), 78 deletions(-) diff --git a/data/selftrain_initial_pos.json b/data/selftrain_initial_pos.json index a3440720..83539e6f 100644 --- a/data/selftrain_initial_pos.json +++ b/data/selftrain_initial_pos.json @@ -47,7 +47,11 @@ "M20-nb_usrs": { "operator": "gte", "value": 5 + }, + "M22-rel2raw": { + "operator": "gte", + "value": 0.1 } }, - "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & ( ( M19-mean_mismatches & M20-nb_usrs ) | ( M12-maxmmes & M11-entropy & M19-mean_mismatches.2 ) )" + "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M22-rel2raw & ( ( M19-mean_mismatches & M20-nb_usrs ) | ( M12-maxmmes & M11-entropy & M19-mean_mismatches.2 ) )" } diff --git a/lib/include/portcullis/junction.hpp b/lib/include/portcullis/junction.hpp index 047c0744..f7219007 100644 --- a/lib/include/portcullis/junction.hpp +++ b/lib/include/portcullis/junction.hpp @@ -100,13 +100,14 @@ const vector METRIC_NAMES = { "M19-mean_mismatches", "M20-nb_usrs", "M21-nb_msrs", - "M22-nb_up_juncs", - "M23-nb_down_juncs", - "M24-up_aln", - "M25-down_aln", - "M26-dist_2_up_junc", - "M27-dist_2_down_junc", - "M28-dist_nearest_junc" + "M22-rel2raw", + "M23-nb_up_juncs", + 
"M24-nb_down_juncs", + "M25-up_aln", + "M26-down_aln", + "M27-dist_2_up_junc", + "M28-dist_2_down_junc", + "M29-dist_nearest_junc" }; const vector STRAND_NAMES = { @@ -259,16 +260,17 @@ class Junction { bool primaryJunction; // Metric 17 double multipleMappingScore; // Metric 18 double meanMismatches; // Metric 19 - //uint32_t nbUniquelySplicedReads; // Metric 20 (Use getter) + //uint32_t nbUniquelySplicedReads; // Metric 20 (Use getter) uint32_t nbMultipleSplicedReads; // Metric 21 - uint16_t nbUpstreamJunctions; // Metric 22 - uint16_t nbDownstreamJunctions; // Metric 23 - uint32_t nbUpstreamFlankingAlignments; // Metric 24 - uint32_t nbDownstreamFlankingAlignments; // Metric 25 - int32_t distanceToNextUpstreamJunction; // Metric 26 - int32_t distanceToNextDownstreamJunction; // Metric 27 - int32_t distanceToNearestJunction; // Metric 28 - vector junctionOverhangs; // Metric 29-38 + //double reliableVsRawReadRatio; // Metric 22 (Use getter) + uint16_t nbUpstreamJunctions; // Metric 23 + uint16_t nbDownstreamJunctions; // Metric 24 + uint32_t nbUpstreamFlankingAlignments; // Metric 25 + uint32_t nbDownstreamFlankingAlignments; // Metric 26 + int32_t distanceToNextUpstreamJunction; // Metric 27 + int32_t distanceToNextDownstreamJunction; // Metric 28 + int32_t distanceToNearestJunction; // Metric 29 + vector junctionOverhangs; // Metric 30-49 // **** Predictions **** @@ -695,7 +697,10 @@ class Junction { uint32_t getNbUniquelySplicedReads() const { return nbJunctionAlignments - nbMultipleSplicedReads; } - + + double getReliable2RawRatio() const { + return (double)nbReliableAlignments / (double)nbJunctionAlignments; + } uint16_t getNbDownstreamJunctions() const { return nbDownstreamJunctions; @@ -853,14 +858,14 @@ class Junction { /** * Calculates a score for this intron size based on how this intron size fits - * into an expected distribution specified by the length at the 95th percentile - * (L95) provided by the user. Introns of length < L95 have a score of 0. 
- * Introns with length > L95 have score: -ln(size - L95) - * @param L95 Intron size at 95 percentile of a correct distribution - * @return A score for this intron size given the L95 + * into an expected distribution specified by the length at the threhsold percentile + * (threshold) provided by the user. Introns of length < threshold have a score of 0. + * Introns with length > threshold have score: -ln(size - length_threshold) + * @param Length of threshold Intron size + * @return A score for this intron size given the threshold value */ - double calcIntronScore(const uint32_t L95) const { - return this->intron->size() <= L95 ? 0.0 : -log(this->intron->size() - L95); + double calcIntronScore(const uint32_t threshold) const { + return this->intron->size() <= threshold ? 0.0 : log(this->intron->size() - threshold); } @@ -943,6 +948,7 @@ class Junction { << j.meanMismatches << "\t" << j.getNbUniquelySplicedReads() << "\t" << j.nbMultipleSplicedReads << "\t" + << j.getReliable2RawRatio() << "\t" << j.nbDownstreamJunctions << "\t" << j.nbUpstreamJunctions << "\t" << j.nbUpstreamFlankingAlignments << "\t" diff --git a/lib/include/portcullis/rule_parser.hpp b/lib/include/portcullis/rule_parser.hpp index ac24a33b..31f39ec0 100644 --- a/lib/include/portcullis/rule_parser.hpp +++ b/lib/include/portcullis/rule_parser.hpp @@ -90,10 +90,11 @@ const unordered_set numericParams = { "M19-nb_mismatches", "M20-nb_usrs", "M21-nb_msrs", - "M22-nb_up_juncs", - "M23-nb_down_juncs", - "M24-up_aln", - "M25-down_aln" + "M22-rel2raw", + "M23-nb_up_juncs", + "M24-nb_down_juncs", + "M25-up_aln", + "M26-down_aln" }; const unordered_set stringParams = { diff --git a/lib/src/junction.cc b/lib/src/junction.cc index af3a438d..b77dd369 100644 --- a/lib/src/junction.cc +++ b/lib/src/junction.cc @@ -982,13 +982,15 @@ void portcullis::Junction::condensedOutputDescription(std::ostream &strm, string << "M19-MeanMismatches=" << meanMismatches << delimiter << "M20-NbUniquelySplicedReads=" << 
getNbUniquelySplicedReads() << delimiter << "M21-NbMultipleSplicedReads=" << nbMultipleSplicedReads << delimiter - << "M22-NbUpstreamJunctions=" << nbUpstreamJunctions << delimiter - << "M23-NbDownstreamJunctions=" << nbDownstreamJunctions << delimiter - << "M24-NbUpstreamNonSplicedAlignments=" << nbUpstreamFlankingAlignments << delimiter - << "M25-NbDownstreamNonSplicedAlignments=" << nbDownstreamFlankingAlignments << delimiter - << "M26-DistanceToNextUpstreamJunction=" << distanceToNextUpstreamJunction << delimiter - << "M27-DistanceToNextDownstreamJunction=" << distanceToNextDownstreamJunction << delimiter - << "M28-DistanceToNearestJunction=" << distanceToNearestJunction << delimiter + << "M22-Reliable2RawRatio=" << getReliable2RawRatio() << delimiter + << "M23-NbUpstreamJunctions=" << nbUpstreamJunctions << delimiter + << "M24-NbDownstreamJunctions=" << nbDownstreamJunctions << delimiter + << "M25-NbUpstreamNonSplicedAlignments=" << nbUpstreamFlankingAlignments << delimiter + << "M26-NbDownstreamNonSplicedAlignments=" << nbDownstreamFlankingAlignments << delimiter + << "M27-DistanceToNextUpstreamJunction=" << distanceToNextUpstreamJunction << delimiter + << "M28-DistanceToNextDownstreamJunction=" << distanceToNextDownstreamJunction << delimiter + << "M29-DistanceToNearestJunction=" << distanceToNearestJunction << delimiter + << "Suspect=" << boolalpha << suspicious << delimiter << "PFP=" << boolalpha << pfp; } @@ -1204,20 +1206,21 @@ shared_ptr portcullis::Junction::parse(const string& line) j->setMeanMismatches(lexical_cast(parts[31])); //j->setNbUniquelySplicedReads(lexical_cast(parts[32])); j->setNbMultipleSplicedReads(lexical_cast(parts[33])); - j->setNbUpstreamJunctions(lexical_cast(parts[34])); - j->setNbDownstreamJunctions(lexical_cast(parts[35])); - j->setNbUpstreamFlankingAlignments(lexical_cast(parts[36])); - j->setNbDownstreamFlankingAlignments(lexical_cast(parts[37])); - j->setDistanceToNextUpstreamJunction(lexical_cast(parts[38])); - 
j->setDistanceToNextDownstreamJunction(lexical_cast(parts[39])); - j->setDistanceToNearestJunction(lexical_cast(parts[40])); + //j->setNbMultipleSplicedReads(lexical_cast(parts[33])); + j->setNbUpstreamJunctions(lexical_cast(parts[35])); + j->setNbDownstreamJunctions(lexical_cast(parts[36])); + j->setNbUpstreamFlankingAlignments(lexical_cast(parts[37])); + j->setNbDownstreamFlankingAlignments(lexical_cast(parts[38])); + j->setDistanceToNextUpstreamJunction(lexical_cast(parts[39])); + j->setDistanceToNextDownstreamJunction(lexical_cast(parts[40])); + j->setDistanceToNearestJunction(lexical_cast(parts[41])); // Read Junction overhangs - j->setMeanQueryLength(lexical_cast(parts[41])); - j->setSuspicious(lexical_cast(parts[42])); - j->setPotentialFalsePositive(lexical_cast(parts[43])); + j->setMeanQueryLength(lexical_cast(parts[42])); + j->setSuspicious(lexical_cast(parts[43])); + j->setPotentialFalsePositive(lexical_cast(parts[44])); for(size_t i = 0; i < JO_NAMES.size(); i++) { - j->setJunctionOverhangs(i, lexical_cast(parts[44 + i])); + j->setJunctionOverhangs(i, lexical_cast(parts[45 + i])); } return j; diff --git a/lib/src/rule_parser.cc b/lib/src/rule_parser.cc index a3bb74d6..71f42b80 100644 --- a/lib/src/rule_parser.cc +++ b/lib/src/rule_parser.cc @@ -223,14 +223,15 @@ double portcullis::eval::getNumericFromJunc(const var& fullname) const { case 20: return junc->getNbMultipleSplicedReads(); case 21: - return junc->getNbUpstreamJunctions(); + return junc->getReliable2RawRatio(); case 22: - return junc->getNbDownstreamJunctions(); + return junc->getNbUpstreamJunctions(); case 23: - return junc->getNbUpstreamFlankingAlignments(); + return junc->getNbDownstreamJunctions(); case 24: + return junc->getNbUpstreamFlankingAlignments(); + case 25: return junc->getNbDownstreamFlankingAlignments(); - } if (boost::iequals(name, "Suspect")) { diff --git a/src/junction_filter.cc b/src/junction_filter.cc index cdee0531..0d625a9d 100644 --- a/src/junction_filter.cc +++ 
b/src/junction_filter.cc @@ -260,6 +260,25 @@ void portcullis::JunctionFilter::filter() { doRuleBasedFiltering(this->getIntitalNegRulesFile(), currentJuncs, passJuncs, failJuncs, "Creating initial negative set for training", resultMap); + const uint32_t longintronthreshold = L95 * 6; + cout << endl << "Adding junctions to negative set with MaxMMES < 10 and excessively long intron size of positive set (> L95 x 6 = " << longintronthreshold << ")..."; + cout.flush(); + + uint32_t nblongintrons = 0; + JunctionList fail2Juncs; + for(auto& j : failJuncs) { + if (j->getIntronSize() > longintronthreshold && j->getMaxMMES() < 10 ) { + passJuncs.push_back(j); + nblongintrons++; + } + else { + fail2Juncs.push_back(j); + } + } + + cout << " done." << endl + << "Found an additional " << nblongintrons << " junctions with long introns." << endl << endl; + cout << "Saving initial negative set:" << endl; JunctionSystem isn(passJuncs); isn.saveAll(output.string() + ".selftrain.initialset.neg", "portcullis_isn"); @@ -272,7 +291,7 @@ void portcullis::JunctionFilter::filter() { } if (!genuineFile.empty()) { - shared_ptr p = calcPerformance(passJuncs, failJuncs, true); + shared_ptr p = calcPerformance(passJuncs, fail2Juncs, true); cout << "Performance of initial negative set (High NPV is important):" << endl; cout << Performance::longHeader() << endl; cout << p->toLongString() << endl << endl; @@ -285,7 +304,7 @@ void portcullis::JunctionFilter::filter() { invalidNeg.addJunction(j); } } - for(auto& j : failJuncs) { + for(auto& j : fail2Juncs) { if (!j->isGenuine()) { missedNeg.addJunction(j); } @@ -298,14 +317,8 @@ void portcullis::JunctionFilter::filter() { missedNeg.saveAll(output.string() + ".selftrain.initialset.missedneg", "portcullis_missed_isn"); } - const uint32_t L95_fail = this->calcIntronThreshold(passJuncs); - cout << "Length of intron at 95th percentile of negative set: " << L95_fail << endl << endl; - - cout << "Initial training set consists of " << pos << " positive and " 
<< neg << " negative junctions." << endl << endl; - - cout << "Training a random forest model using the initial positive and negative datasets" << endl; shared_ptr forest = Train::trainInstance(initialSet, output.string() + ".selftrain", DEFAULT_TRAIN_TREES, threads, true, true, L95); @@ -609,7 +622,7 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction for(size_t i = 0; i < all.size(); i++) { scoreStream << f->getPredictions()[i][0] << endl; - if (f->getPredictions()[i][0] >= 0.5) { + if (f->getPredictions()[i][0] >= 0.1) { pass.push_back(all[i]); } else { diff --git a/src/train.cc b/src/train.cc index 5907e434..044020d2 100644 --- a/src/train.cc +++ b/src/train.cc @@ -73,7 +73,7 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint3 vector headers; headers.reserve( VAR_NAMES.size() + JO_NAMES.size() ); headers.insert( headers.end(), VAR_NAMES.begin(), VAR_NAMES.end() ); - headers.insert( headers.end(), JO_NAMES.begin(), JO_NAMES.end() ); + headers.insert( headers.end(), JO_NAMES.begin()+14, JO_NAMES.end() ); // Convert junction list info to double* double* d = new double[x.size() * headers.size()]; @@ -81,28 +81,30 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint3 uint32_t row = 0; for (const auto& j : x) { - d[0 * x.size() + row] = j->getNbJunctionAlignments(); - //d[1 * x.size() + row] = j->getNbDistinctAlignments(); - d[1 * x.size() + row] = j->getNbReliableAlignments(); + //d[0 * x.size() + row] = j->getNbJunctionAlignments(); + //d[0 * x.size() + row] = j->getNbDistinctAlignments(); + d[0 * x.size() + row] = j->getNbReliableAlignments(); //d[3 * x.size() + row] = j->getMaxMinAnchor(); //d[4 * x.size() + row] = j->getDiffAnchor(); //d[5 * x.size() + row] = j->getNbDistinctAnchors(); - d[2 * x.size() + row] = j->getEntropy(); - d[3 * x.size() + row] = j->getMaxMMES(); - d[4 * x.size() + row] = j->getHammingDistance5p(); - d[5 * x.size() + row] = 
j->getHammingDistance3p(); - //d[6 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score + d[1 * x.size() + row] = j->getEntropy(); + d[2 * x.size() + row] = j->getMaxMMES(); + d[3 * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); + //d[3 * x.size() + row] = ; + d[4 * x.size() + row] = j->getReliable2RawRatio(); + //d[5 * x.size() + row] = j->getMeanMismatches(); + d[5 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score d[6 * x.size() + row] = j->isGenuine(); // Junction overhang values at each position are first converted into deviation from expected distributions double half_read_length = (double)j->getMeanQueryLength() / 2.0; - for(size_t i = 0; i < JO_NAMES.size(); i++) { + for(size_t i = 14; i < JO_NAMES.size(); i++) { double Ni = j->getJunctionOverhangs(i); // Actual count at this position if (Ni == 0.0) Ni = 0.000000000001; // Ensure some value > 0 here otherwise we get -infinity later. double Pi = 1.0 - ((double)i / half_read_length); // Likely scale at this position double Ei = (double)j->getNbJunctionAlignments() * Pi; // Expected count at this position double Xi = abs(log2(Ni / Ei)); // Log deviation - d[(i + 7) * x.size() + row] = Xi; + d[(i-14 + 7) * x.size() + row] = Xi; } row++; diff --git a/src/train.hpp b/src/train.hpp index 7abed309..3af77509 100644 --- a/src/train.hpp +++ b/src/train.hpp @@ -52,17 +52,19 @@ const int DEFAULT_SEED = 1234567; // To avoid non-deterministic behaviour // List of variable names const vector VAR_NAMES = { - "M2-nb-reads", + //"M2-nb-reads", //"M3-nb_dist_aln", - "M4-nb_rel_aln", + "nb_rel_aln", //"M8-max_min_anc", //"M9-dif_anc", //"M10-dist_anc", - "M11-entropy", - "M12-maxmmes", - "M13-hamming5p", - "M14-hamming3p", - //"IntronScore", + "entropy", + "maxmmes", + "min_hamming_score", + //"M14-hamming3p", + "rel2raw_ratio", + //"mean_mismatches", + "IntronScore", "Genuine" }; // Derived from 
https://sureshamrita.wordpress.com/2011/08/24/c-implementation-of-k-fold-cross-validation/ From afb55ab3ccc983f5c018c9c1338af01ef120da33 Mon Sep 17 00:00:00 2001 From: Dan Date: Mon, 25 Apr 2016 00:38:12 +0100 Subject: [PATCH 04/15] Implemented a markov model and kmer hash. Implemented coding potential score for junctions, however it seems to detract from the overall usefulness of the predictor at the moment, which means there's probably an error in the calculation somewhere. I should write some unit tests in the future to verify or remove the feature. --- lib/Makefile.am | 3 + lib/include/portcullis/bam/genome_mapper.hpp | 4 +- lib/include/portcullis/junction.hpp | 5 +- lib/include/portcullis/kmer.hpp | 94 ++++++++++++++++++++ lib/include/portcullis/markov_model.hpp | 83 +++++++++++++++++ lib/src/junction.cc | 38 ++++++++ lib/src/markov_model.cc | 58 ++++++++++++ src/junction_filter.cc | 93 ++++++++++++++++--- src/junction_filter.hpp | 18 +++- src/train.cc | 15 ++-- src/train.hpp | 24 +++-- tests/Makefile.am | 3 +- tests/kmer_tests.cpp | 53 +++++++++++ 13 files changed, 457 insertions(+), 34 deletions(-) create mode 100644 lib/include/portcullis/kmer.hpp create mode 100644 lib/include/portcullis/markov_model.hpp create mode 100644 lib/src/markov_model.cc create mode 100644 tests/kmer_tests.cpp diff --git a/lib/Makefile.am b/lib/Makefile.am index dd10476e..0390e79e 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -15,6 +15,7 @@ libportcullis_la_SOURCES = \ src/bam_writer.cc \ src/depth_parser.cc \ src/genome_mapper.cc \ + src/markov_model.cc \ src/python_exe.cc \ src/intron.cc \ src/junction.cc \ @@ -30,6 +31,8 @@ library_include_HEADERS = \ $(PI)/bam/bam_writer.hpp \ $(PI)/bam/depth_parser.hpp \ $(PI)/bam/genome_mapper.hpp \ + $(PI)/kmer.hpp \ + $(PI)/markov_model.hpp \ $(PI)/python_exe.hpp \ $(PI)/intron.hpp \ $(PI)/junction.hpp \ diff --git a/lib/include/portcullis/bam/genome_mapper.hpp b/lib/include/portcullis/bam/genome_mapper.hpp index 
2bf028ed..91c55f9c 100644 --- a/lib/include/portcullis/bam/genome_mapper.hpp +++ b/lib/include/portcullis/bam/genome_mapper.hpp @@ -97,7 +97,7 @@ class GenomeMapper { /** * @abstract Fetch the sequence in a region. * @param reg Region in the format "chr2:20,000-30,000" - * @param len Length of the region + * @param len Length of the region returned * @return The sequence as a string; empty string if no seq found */ string fetchBases(const char* reg, int* len) const; @@ -107,7 +107,7 @@ class GenomeMapper { * @param name Region name * @param start Start location on region (zero-based, inclusive) * @param end End position (zero-based, exclusive) - * @param len Length of the region + * @param len Length of the region returned * @return The sequence as a string; empty string if no seq found */ string fetchBases(const char* name, int start, int end, int* len) const; diff --git a/lib/include/portcullis/junction.hpp b/lib/include/portcullis/junction.hpp index f7219007..73279b1c 100644 --- a/lib/include/portcullis/junction.hpp +++ b/lib/include/portcullis/junction.hpp @@ -45,6 +45,7 @@ using namespace portcullis::bam; #include "intron.hpp" #include "seq_utils.hpp" +#include "markov_model.hpp" using portcullis::Intron; @@ -867,7 +868,9 @@ class Junction { double calcIntronScore(const uint32_t threshold) const { return this->intron->size() <= threshold ? 0.0 : log(this->intron->size() - threshold); } - + + + double calcCodingPotential(GenomeMapper& gmap, MarkovModel& exon, MarkovModel& intron) const; // **** Output methods **** diff --git a/lib/include/portcullis/kmer.hpp b/lib/include/portcullis/kmer.hpp new file mode 100644 index 00000000..76247a22 --- /dev/null +++ b/lib/include/portcullis/kmer.hpp @@ -0,0 +1,94 @@ +// ******************************************************************** +// This file is part of Portcullis. 
+// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#pragma once + +#include +#include +#include +#include +using std::ostream; +using std::string; +using std::unordered_map; +using std::vector; + +#include +#include + +namespace portcullis { + +typedef boost::error_info KmerErrorInfo; +struct KmerException: virtual boost::exception, virtual std::exception { }; + +/** + * An extremely basic kmer hash class that doesn't worry about canonical kmers + * or reverse complementing the data. + */ +class KmerHash { +private: + unordered_map khash; + uint16_t k; + +public: + KmerHash(const uint16_t _k, const string& seq) { + k = _k; + + for(size_t i = k; i <= seq.size(); i++) { + khash[boost::to_upper_copy(seq.substr(i - k, k))]++; + } + } + + uint32_t getCount(const string& kmer) { + if (kmer.size() != k) { + BOOST_THROW_EXCEPTION(KmerException() << KmerErrorInfo(string( + "Given kmer size is ") + std::to_string(kmer.size()) + ". Expected kmer of size " + std::to_string(k))); + } + + string kup = boost::to_upper_copy(kmer); + return (khash.find(kup) != khash.end()) ? 
khash[kup] : 0; + } + + size_t nbDistinctKmers() const { + return khash.size(); + } + + void print(ostream& out) const { + for(auto& kmer : khash) { + out << kmer.first << "\t" << kmer.second << endl; + } + } + + void printAbundanceHistogram(ostream& out, const uint32_t hist_size) { + + vector hist(hist_size, 0); + + for(auto& kmer : khash) { + if (hist.size() > kmer.second) { + hist[kmer.second]++; + } + else { + hist[hist.size() -1]++; + } + } + + for(auto& e : hist) { + out << e << endl; + } + } +}; + +} \ No newline at end of file diff --git a/lib/include/portcullis/markov_model.hpp b/lib/include/portcullis/markov_model.hpp new file mode 100644 index 00000000..1b926361 --- /dev/null +++ b/lib/include/portcullis/markov_model.hpp @@ -0,0 +1,83 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#pragma once + +#include +#include +#include +using std::string; +using std::unordered_map; +using std::vector; + + +namespace portcullis { + +typedef boost::error_info MMErrorInfo; +struct MMException: virtual boost::exception, virtual std::exception {}; + + + +/** + * Intended to be used as a map containing the probability of seeing a given character + */ +typedef unordered_map NTP; + +/** + * This datastructure consists of a map of kmers to a map of nucleotides with assoicated + * probabilities. + */ +typedef unordered_map MMU; + +/** + * Simple Markov chain implementation derived originally from Truesight. + * Constructor trains the model on a set of sequences. "getScore" returns the score + * for a given sequence based on the pre-trained model. + * Automatically converts sequences to uppercase + */ +class MarkovModel { + +private: + portcullis::MMU model; + uint16_t order; + +public: + + MarkovModel(const uint32_t _order) { + order = _order; + } + + MarkovModel(const vector& input, const uint32_t _order) { + order = _order; + train(input); + } + + void train(const vector& input); + + uint16_t getOrder() const { + return order; + } + + size_t size() const { + return model.size(); + } + + double getScore(const string& seq); +}; + +} + diff --git a/lib/src/junction.cc b/lib/src/junction.cc index b77dd369..f10b9f9b 100644 --- a/lib/src/junction.cc +++ b/lib/src/junction.cc @@ -47,7 +47,9 @@ using portcullis::bam::Strand; #include #include +#include using portcullis::Intron; +using portcullis::MarkovModel; #include @@ -1225,3 +1227,39 @@ shared_ptr portcullis::Junction::parse(const string& line) return j; } + +double portcullis::Junction::calcCodingPotential(GenomeMapper& gmap, MarkovModel& exon, MarkovModel& intron) const { + + int len = 0; + + const char* ref = this->intron->ref.name.c_str(); + + string left_exon = gmap.fetchBases(ref, this->intron->start - 80, this->intron->start, &len); 
+ if (this->getConsensusStrand() == Strand::NEGATIVE) { + left_exon = SeqUtils::reverseComplement(left_exon); + } + + string left_intron = gmap.fetchBases(ref, this->intron->start, this->intron->start+81, &len); + if (this->getConsensusStrand() == Strand::NEGATIVE) { + left_intron = SeqUtils::reverseComplement(left_intron); + } + + string right_intron = gmap.fetchBases(ref, this->intron->end-80, this->intron->end+1, &len); + if (this->getConsensusStrand() == Strand::NEGATIVE) { + right_intron = SeqUtils::reverseComplement(right_intron); + } + + string right_exon = gmap.fetchBases(ref, this->intron->end + 1, this->intron->end + 80, &len); + if (this->getConsensusStrand() == Strand::NEGATIVE) { + right_exon = SeqUtils::reverseComplement(right_exon); + } + + double score = exon.getScore(left_intron) + intron.getScore(left_exon) + - ( intron.getScore(left_intron) + exon.getScore(left_exon) ) + + ( intron.getScore(right_exon) + exon.getScore(right_intron) ) + - ( exon.getScore(right_exon) + intron.getScore(right_intron) ); + + //cout << score << endl; + + return score; +} diff --git a/lib/src/markov_model.cc b/lib/src/markov_model.cc new file mode 100644 index 00000000..11bdb41e --- /dev/null +++ b/lib/src/markov_model.cc @@ -0,0 +1,58 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#include +using std::cout; +using std::endl; + +#include +#include + +#include + +void portcullis::MarkovModel::train(const vector& input) { + MMU temp; + for(auto& seq : input) { + string sequp = boost::to_upper_copy(seq); + for(uint16_t i = order; i < sequp.size(); i++) { + temp[sequp.substr(i-order, order)][sequp.substr(i, 1)]++; + } + } + for(auto& i : temp) { + double sum = 0; + for(auto& j : i.second) { + sum += j.second; + } + for(auto& j : i.second) { + //cout << i.first << " " << j.first << " " << j.second << endl; + model[i.first][j.first] = j.second / sum; + } + } +} + + +double portcullis::MarkovModel::getScore(const string& seq) { + string sequp = boost::to_upper_copy(seq); + double score = 1.0; + for(uint16_t i = order; i < sequp.size(); i++){ + score *= model[sequp.substr(i-order, order)][sequp.substr(i, 1)]; + } + if(score == 0.0) { + return -100.0; + } + return log(score); +} diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 0d625a9d..4d1886bc 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -52,17 +52,21 @@ namespace po = boost::program_options; #include #include +#include #include #include #include #include #include #include +#include using portcullis::PortcullisFS; using portcullis::Intron; using portcullis::IntronHasher; using portcullis::Performance; using portcullis::eval; +using portcullis::bam::GenomeMapper; +using portcullis::MarkovModel; #include "train.hpp" using portcullis::Train; @@ -72,6 +76,7 @@ using portcullis::Train; portcullis::JunctionFilter::JunctionFilter( const path& _junctionFile, const path& _output) { junctionFile = _junctionFile; + genomeFile = ""; modelFile = ""; genuineFile = ""; output = _output; @@ -198,12 +203,20 @@ void portcullis::JunctionFilter::filter() { // To be overridden if we are training uint32_t L95 = 0; + MarkovModel exon_model(5); + MarkovModel intron_model(5); + + // Create the genome mapper + 
GenomeMapper gmap(this->genomeFile); + + // Load the fasta index + gmap.loadFastaIndex(); + if (train) { - JunctionList initialSet; - uint32_t pos = 0; - uint32_t neg = 0; + // The initial positive and negative sets + JunctionList pos, neg; cout << "Self training mode activated. Collecting initial positive and negative sets from input." << endl; @@ -216,12 +229,11 @@ void portcullis::JunctionFilter::filter() { cout << "Saving initial positive set:" << endl; JunctionSystem isp(passJuncs); isp.saveAll(output.string() + ".selftrain.initialset.pos", "portcullis_isp"); - pos = passJuncs.size(); for(auto& j : passJuncs) { JunctionPtr copy = make_shared(*j); copy->setGenuine(true); - initialSet.push_back(copy); + pos.push_back(copy); } if (!genuineFile.empty()) { shared_ptr p = calcPerformance(passJuncs, failJuncs); @@ -254,6 +266,13 @@ void portcullis::JunctionFilter::filter() { L95 = this->calcIntronThreshold(passJuncs); cout << "Length of intron at 95th percentile of positive set: " << L95 << endl << endl; + cout << "Training coding potential markov models on positive set ..."; + cout.flush(); + trainMMs(pos, exon_model, intron_model, gmap); + cout << " done." << endl; + cout << "Exon model contains " << exon_model.size() << " 5-mers." << endl; + cout << "Intron model contains " << intron_model.size() << " 5-mers." 
<< endl; + passJuncs.clear(); failJuncs.clear(); resultMap.clear(); @@ -282,12 +301,11 @@ void portcullis::JunctionFilter::filter() { cout << "Saving initial negative set:" << endl; JunctionSystem isn(passJuncs); isn.saveAll(output.string() + ".selftrain.initialset.neg", "portcullis_isn"); - neg = passJuncs.size(); for(auto& j : passJuncs) { JunctionPtr copy = make_shared(*j); copy->setGenuine(false); - initialSet.push_back(copy); + neg.push_back(copy); } if (!genuineFile.empty()) { @@ -317,10 +335,17 @@ void portcullis::JunctionFilter::filter() { missedNeg.saveAll(output.string() + ".selftrain.initialset.missedneg", "portcullis_missed_isn"); } - cout << "Initial training set consists of " << pos << " positive and " << neg << " negative junctions." << endl << endl; + cout << "Initial training set consists of " << pos.size() << " positive and " << neg.size() << " negative junctions." << endl << endl; + + // Build the training set by combining the positive and negative sets + JunctionList training; + training.reserve(pos.size() + neg.size()); + training.insert(training.end(), pos.begin(), pos.end()); + training.insert(training.end(), neg.begin(), neg.end()); + cout << "Training a random forest model using the initial positive and negative datasets" << endl; - shared_ptr forest = Train::trainInstance(initialSet, output.string() + ".selftrain", DEFAULT_TRAIN_TREES, threads, true, true, L95); + shared_ptr forest = Train::trainInstance(training, output.string() + ".selftrain", DEFAULT_TRAIN_TREES, threads, true, true, L95, exon_model, intron_model, gmap); forest->saveToFile(); forest->writeOutput(&cout); @@ -341,7 +366,7 @@ void portcullis::JunctionFilter::filter() { JunctionList passJuncs; JunctionList failJuncs; - forestPredict(currentJuncs, passJuncs, failJuncs, L95); + forestPredict(currentJuncs, passJuncs, failJuncs, L95, exon_model, intron_model, gmap); printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Random Forest filtering")); @@ -480,6 +505,44 
@@ void portcullis::JunctionFilter::filter() { } } } + +void portcullis::JunctionFilter::trainMMs(const JunctionList& in, MarkovModel& exon_model, MarkovModel& intron_model, GenomeMapper& gmap) { + + vector exons; + vector introns; + for(auto& j : in) { + + int len = 0; + + string left_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start - 80, j->getIntron()->start, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + left_exon = SeqUtils::reverseComplement(left_exon); + } + exons.push_back(left_exon); + + string left_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->start+81, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + left_intron = SeqUtils::reverseComplement(left_intron); + } + introns.push_back(left_intron); + + string right_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end-80, j->getIntron()->end+1, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + right_intron = SeqUtils::reverseComplement(right_intron); + } + introns.push_back(right_intron); + + + string right_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end + 1, j->getIntron()->end + 80, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + right_exon = SeqUtils::reverseComplement(right_exon); + } + exons.push_back(right_exon); + } + + exon_model.train(exons); + intron_model.train(introns); +} uint32_t portcullis::JunctionFilter::calcIntronThreshold(const JunctionList& juncs) { @@ -554,13 +617,13 @@ void portcullis::JunctionFilter::doRuleBasedFiltering(const path& ruleFile, cons } -void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, const uint32_t L95) { +void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap) { if (verbose) { cerr << endl 
<< "Preparing junction metrics into matrix" << endl; } - Data* testingData = Train::juncs2FeatureVectors(all, L95); + Data* testingData = Train::juncs2FeatureVectors(all, L95, exon, intron, gmap); // Load forest from disk @@ -641,6 +704,7 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction int portcullis::JunctionFilter::main(int argc, char *argv[]) { path junctionFile; + path genomeFile; path modelFile; path genuineFile; path filterFile; @@ -693,11 +757,13 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { // in config file, but will not be shown to the user. po::options_description hidden_options("Hidden options"); hidden_options.add_options() - ("junction-file", po::value(&junctionFile), "Path to the junction file to process.") + ("genome-file", po::value(&genomeFile), "Path to the genome file to process.") + ("junction-file", po::value(&junctionFile), "Path to the junction file to process.") ; // Positional option for the input bam file po::positional_options_description p; + p.add("genome-file", 1); p.add("junction-file", 1); @@ -745,6 +811,7 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { } } filter.setReferenceFile(referenceFile); + filter.setGenomeFile(genomeFile); filter.filter(); diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index 5ff86eef..ff70137c 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -46,16 +46,20 @@ using boost::filesystem::create_symlink; using boost::filesystem::create_directory; using boost::filesystem::symbolic_link_exists; +#include #include #include #include #include #include +#include +using portcullis::bam::GenomeMapper; using portcullis::PortcullisFS; using portcullis::Intron; using portcullis::IntronHasher; using portcullis::Performance; using portcullis::JuncResultMap; +using portcullis::MarkovModel; namespace portcullis { @@ -78,6 +82,7 @@ class JunctionFilter { private: path junctionFile; + path genomeFile; path modelFile; 
path filterFile; path genuineFile; @@ -124,6 +129,15 @@ class JunctionFilter { void setJunctionFile(path junctionFile) { this->junctionFile = junctionFile; } + + path getGenomeFile() const { + return genomeFile; + } + + void setGenomeFile(path genomeFile) { + this->genomeFile = genomeFile; + } + path getOutput() const { return output; @@ -261,7 +275,7 @@ class JunctionFilter { protected: - void forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, const uint32_t L95); + void forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap); shared_ptr calcPerformance(const JunctionList& pass, const JunctionList& fail) { return calcPerformance(pass, fail, false); @@ -274,6 +288,8 @@ class JunctionFilter { uint32_t calcIntronThreshold(const JunctionList& pass); + void trainMMs(const JunctionList& in, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap); + public: static string helpMessage() { diff --git a/src/train.cc b/src/train.cc index 044020d2..b65ef7c2 100644 --- a/src/train.cc +++ b/src/train.cc @@ -68,7 +68,7 @@ portcullis::Train::Train(const path& _junctionFile, const path& _refFile){ verbose = false; } -Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint32_t L95) { +Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap) { vector headers; headers.reserve( VAR_NAMES.size() + JO_NAMES.size() ); @@ -94,7 +94,8 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint3 d[4 * x.size() + row] = j->getReliable2RawRatio(); //d[5 * x.size() + row] = j->getMeanMismatches(); d[5 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score - d[6 * x.size() + row] = j->isGenuine(); + d[6 * x.size() + row] = exon.size() > 0 && intron.size() > 0 ? 
j->calcCodingPotential(gmap, exon, intron) : 0.0; // Coding potential score + d[7 * x.size() + row] = j->isGenuine(); // Junction overhang values at each position are first converted into deviation from expected distributions double half_read_length = (double)j->getMeanQueryLength() / 2.0; for(size_t i = 14; i < JO_NAMES.size(); i++) { @@ -104,7 +105,7 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint3 double Ei = (double)j->getNbJunctionAlignments() * Pi; // Expected count at this position double Xi = abs(log2(Ni / Ei)); // Log deviation - d[(i-14 + 7) * x.size() + row] = Xi; + d[(i-14 + 8) * x.size() + row] = Xi; } row++; @@ -115,10 +116,10 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint3 return data; } -shared_ptr portcullis::Train::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, const uint32_t L95) { +shared_ptr portcullis::Train::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap) { cout << "Creating feature vector" << endl; - Data* trainingData = juncs2FeatureVectors(x, L95); + Data* trainingData = juncs2FeatureVectors(x, L95, exon, intron, gmap); cout << "Initialising random forest" << endl; shared_ptr f = nullptr; @@ -158,10 +159,10 @@ shared_ptr portcullis::Train::trainInstance(const JunctionList& x, strin return f; } -void portcullis::Train::testInstance(shared_ptr f, const JunctionList& x, const uint32_t L95) { +void portcullis::Train::testInstance(shared_ptr f, const JunctionList& x) { // Convert testing set junctions into feature vector - Data* testingData = juncs2FeatureVectors(x, L95); + Data* testingData = juncs2FeatureVectors(x); f->setPredictionMode(true); f->setData(testingData); diff --git a/src/train.hpp b/src/train.hpp index 3af77509..6fedc455 
100644 --- a/src/train.hpp +++ b/src/train.hpp @@ -36,6 +36,8 @@ using boost::filesystem::path; #include #include +#include +using portcullis::bam::GenomeMapper; namespace portcullis { @@ -65,6 +67,7 @@ const vector VAR_NAMES = { "rel2raw_ratio", //"mean_mismatches", "IntronScore", + "CodingPotential", "Genuine" }; // Derived from https://sureshamrita.wordpress.com/2011/08/24/c-implementation-of-k-fold-cross-validation/ @@ -232,25 +235,28 @@ class Train { static int main(int argc, char *argv[]); static Data* juncs2FeatureVectors(const JunctionList& x) { - return juncs2FeatureVectors(x, 0); + GenomeMapper gmap; + MarkovModel m(1); + return juncs2FeatureVectors(x, 0, m, m, gmap); } - static Data* juncs2FeatureVectors(const JunctionList& x, const uint32_t L95); + static Data* juncs2FeatureVectors(const JunctionList& x, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap); static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose) { - return trainInstance(x, outputPrefix, trees, threads, regressionMode, verbose, 0); + GenomeMapper gmap; + MarkovModel m(1); + return trainInstance(x, outputPrefix, trees, threads, regressionMode, verbose, + 0, m, m, gmap); } + static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, - uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, const uint32_t L95); + uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, + const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap); protected: - void testInstance(shared_ptr f, const JunctionList& y) { - return testInstance(f, y, 0); - } - - void testInstance(shared_ptr f, const JunctionList& y, const uint32_t L95); + void testInstance(shared_ptr f, const JunctionList& y); void getRandomSubset(const JunctionList& in, JunctionList& out); diff --git a/tests/Makefile.am b/tests/Makefile.am index 9cda768c..b3a42b38 100755 
--- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -24,8 +24,9 @@ check_PROGRAMS = check_unit_tests check_unit_tests_SOURCES = bam_tests.cpp \ seq_utils_tests.cpp \ + kmer_tests.cpp \ intron_tests.cpp \ - junction_tests.cpp \ + junction_tests.cpp \ check_portcullis.cc check_unit_tests_CXXFLAGS = -O0 @AM_CXXFLAGS@ @CXXFLAGS@ diff --git a/tests/kmer_tests.cpp b/tests/kmer_tests.cpp new file mode 100644 index 00000000..d7895d2d --- /dev/null +++ b/tests/kmer_tests.cpp @@ -0,0 +1,53 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#include + +#include +#include +#include +using std::cout; +using std::endl; +using std::stringstream; + + +#include +#include +#include +namespace bfs = boost::filesystem; +using bfs::path; + +#include +using portcullis::KmerHash; + + +TEST(kmer, make) { + + string seq = "ATGCATGCATCGNATATATATTGAC"; + + KmerHash kh(4, seq); + + //kh.print(std::cout); + + EXPECT_EQ(1, kh.getCount("tgac")); + + EXPECT_EQ(0, kh.getCount("kjls")); + + EXPECT_EQ(16, kh.nbDistinctKmers()); + +} + From 68a29428d4f90b25084bbe90eba9f02c1787d3ad Mon Sep 17 00:00:00 2001 From: Dan Date: Mon, 25 Apr 2016 10:24:15 +0100 Subject: [PATCH 05/15] Moving code relating to additional features calculated during self training into a class of its own stored in the library. --- lib/Makefile.am | 2 + lib/include/portcullis/markov_model.hpp | 8 +- lib/include/portcullis/model_features.hpp | 50 ++++++++++++ lib/src/markov_model.cc | 4 +- lib/src/model_features.cc | 88 +++++++++++++++++++++ src/junction_filter.cc | 93 +++++------------------ src/junction_filter.hpp | 10 +-- src/train.cc | 10 +-- src/train.hpp | 17 ++--- 9 files changed, 187 insertions(+), 95 deletions(-) create mode 100644 lib/include/portcullis/model_features.hpp create mode 100644 lib/src/model_features.cc diff --git a/lib/Makefile.am b/lib/Makefile.am index 0390e79e..529a4809 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -16,6 +16,7 @@ libportcullis_la_SOURCES = \ src/depth_parser.cc \ src/genome_mapper.cc \ src/markov_model.cc \ + src/model_features.cc \ src/python_exe.cc \ src/intron.cc \ src/junction.cc \ @@ -33,6 +34,7 @@ library_include_HEADERS = \ $(PI)/bam/genome_mapper.hpp \ $(PI)/kmer.hpp \ $(PI)/markov_model.hpp \ + $(PI)/model_features.hpp \ $(PI)/python_exe.hpp \ $(PI)/intron.hpp \ $(PI)/junction.hpp \ diff --git a/lib/include/portcullis/markov_model.hpp b/lib/include/portcullis/markov_model.hpp index 1b926361..1b524858 100644 --- 
a/lib/include/portcullis/markov_model.hpp +++ b/lib/include/portcullis/markov_model.hpp @@ -57,6 +57,8 @@ class MarkovModel { public: + MarkovModel() : MarkovModel(1) {} + MarkovModel(const uint32_t _order) { order = _order; } @@ -66,7 +68,11 @@ class MarkovModel { train(input); } - void train(const vector& input); + void train(const vector& input) { + train(input, order); + } + + void train(const vector& input, const uint32_t order); uint16_t getOrder() const { return order; diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/model_features.hpp new file mode 100644 index 00000000..8179e57c --- /dev/null +++ b/lib/include/portcullis/model_features.hpp @@ -0,0 +1,50 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#pragma once + +#include +using boost::filesystem::path; + +#include +#include +using portcullis::bam::GenomeMapper; +using portcullis::MarkovModel; + +namespace portcullis { + +class ModelFeatures { +public: + uint32_t L95; + MarkovModel exonModel; + MarkovModel intronModel; + GenomeMapper gmap; + + ModelFeatures() : L95(0) { + } + + bool isCodingPotentialModelEmpty() { + return exonModel.size() == 0 || intronModel.size() == 0; + } + + void initGenomeMapper(const path& genomeFile); + + uint32_t calcIntronThreshold(const JunctionList& juncs); + + void trainCodingPotentialModel(const JunctionList& in); +}; +} \ No newline at end of file diff --git a/lib/src/markov_model.cc b/lib/src/markov_model.cc index 11bdb41e..50d061db 100644 --- a/lib/src/markov_model.cc +++ b/lib/src/markov_model.cc @@ -24,7 +24,8 @@ using std::endl; #include -void portcullis::MarkovModel::train(const vector& input) { +void portcullis::MarkovModel::train(const vector& input, const uint32_t _order) { + order = _order; MMU temp; for(auto& seq : input) { string sequp = boost::to_upper_copy(seq); @@ -32,6 +33,7 @@ void portcullis::MarkovModel::train(const vector& input) { temp[sequp.substr(i-order, order)][sequp.substr(i, 1)]++; } } + model.clear(); for(auto& i : temp) { double sum = 0; for(auto& j : i.second) { diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc new file mode 100644 index 00000000..5bb344ad --- /dev/null +++ b/lib/src/model_features.cc @@ -0,0 +1,88 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#include +using std::cout; +using std::endl; + +#include +using portcullis::Junction; + +#include + + + +void portcullis::ModelFeatures::initGenomeMapper(const path& genomeFile) { + + // Initialise + gmap.setGenomeFile(genomeFile); + + // Load the fasta index + gmap.loadFastaIndex(); +} + +uint32_t portcullis::ModelFeatures::calcIntronThreshold(const JunctionList& juncs) { + + vector intron_sizes; + for(auto& j : juncs) { + intron_sizes.push_back(j->getIntronSize()); + } + + std::sort(intron_sizes.begin(), intron_sizes.end()); + + L95 = intron_sizes[intron_sizes.size() * 0.95]; + + return L95; +} + +void portcullis::ModelFeatures::trainCodingPotentialModel(const JunctionList& in) { + + vector exons; + vector introns; + for(auto& j : in) { + + int len = 0; + + string left_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start - 80, j->getIntron()->start, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + left_exon = SeqUtils::reverseComplement(left_exon); + } + exons.push_back(left_exon); + + string left_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->start+81, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + left_intron = SeqUtils::reverseComplement(left_intron); + } + introns.push_back(left_intron); + + string right_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end-80, j->getIntron()->end+1, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + right_intron = 
SeqUtils::reverseComplement(right_intron); + } + introns.push_back(right_intron); + + + string right_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end + 1, j->getIntron()->end + 80, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + right_exon = SeqUtils::reverseComplement(right_exon); + } + exons.push_back(right_exon); + } + + exonModel.train(exons, 5); + intronModel.train(introns, 5); +} \ No newline at end of file diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 4d1886bc..05e5ae26 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -202,16 +202,9 @@ void portcullis::JunctionFilter::filter() { } // To be overridden if we are training - uint32_t L95 = 0; - MarkovModel exon_model(5); - MarkovModel intron_model(5); + ModelFeatures mf; + mf.initGenomeMapper(this->genomeFile); - // Create the genome mapper - GenomeMapper gmap(this->genomeFile); - - // Load the fasta index - gmap.loadFastaIndex(); - if (train) { @@ -263,15 +256,14 @@ void portcullis::JunctionFilter::filter() { // Analyse positive set to get L0.05 of intron size - L95 = this->calcIntronThreshold(passJuncs); - cout << "Length of intron at 95th percentile of positive set: " << L95 << endl << endl; + cout << "Length of intron at 95th percentile of positive set: " << mf.calcIntronThreshold(passJuncs) << endl << endl; cout << "Training coding potential markov models on positive set ..."; cout.flush(); - trainMMs(pos, exon_model, intron_model, gmap); + mf.trainCodingPotentialModel(pos); cout << " done." << endl; - cout << "Exon model contains " << exon_model.size() << " 5-mers." << endl; - cout << "Intron model contains " << intron_model.size() << " 5-mers." << endl; + cout << "Exon model contains " << mf.exonModel.size() << " 5-mers." << endl; + cout << "Intron model contains " << mf.intronModel.size() << " 5-mers." 
<< endl; passJuncs.clear(); failJuncs.clear(); @@ -279,7 +271,7 @@ void portcullis::JunctionFilter::filter() { doRuleBasedFiltering(this->getIntitalNegRulesFile(), currentJuncs, passJuncs, failJuncs, "Creating initial negative set for training", resultMap); - const uint32_t longintronthreshold = L95 * 6; + const uint32_t longintronthreshold = mf.L95 * 6; cout << endl << "Adding junctions to negative set with MaxMMES < 10 and excessively long intron size of positive set (> L95 x 6 = " << longintronthreshold << ")..."; cout.flush(); @@ -343,9 +335,8 @@ void portcullis::JunctionFilter::filter() { training.insert(training.end(), pos.begin(), pos.end()); training.insert(training.end(), neg.begin(), neg.end()); - cout << "Training a random forest model using the initial positive and negative datasets" << endl; - shared_ptr forest = Train::trainInstance(training, output.string() + ".selftrain", DEFAULT_TRAIN_TREES, threads, true, true, L95, exon_model, intron_model, gmap); + shared_ptr forest = Train::trainInstance(training, output.string() + ".selftrain", DEFAULT_TRAIN_TREES, threads, true, true, mf); forest->saveToFile(); forest->writeOutput(&cout); @@ -366,9 +357,9 @@ void portcullis::JunctionFilter::filter() { JunctionList passJuncs; JunctionList failJuncs; - forestPredict(currentJuncs, passJuncs, failJuncs, L95, exon_model, intron_model, gmap); + forestPredict(currentJuncs, passJuncs, failJuncs, mf); - printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Random Forest filtering")); + printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Random Forest filtering results")); // Reset currentJuncs currentJuncs.clear(); @@ -442,7 +433,7 @@ void portcullis::JunctionFilter::filter() { } } - printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Post filtering (length and/or canonical)")); + printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Post filtering (length and/or canonical) results")); // Reset currentJuncs 
currentJuncs.clear(); @@ -451,6 +442,8 @@ void portcullis::JunctionFilter::filter() { } } + cout << endl; + JunctionSystem filteredJuncs; JunctionSystem refKeptJuncs; @@ -459,7 +452,7 @@ void portcullis::JunctionFilter::filter() { } else { - cout << "Recalculating junction grouping and distance stats based on new junction list that passed filters ..."; + cout << "Recalculating junction grouping and distance stats based on new junction list that passed filters ..."; cout.flush(); for(auto& j : currentJuncs) { @@ -489,7 +482,7 @@ void portcullis::JunctionFilter::filter() { discardedJuncs.getJunctions(), string("Overall results")); - cout << "Saving junctions passing filter to disk:" << endl; + cout << endl << "Saving junctions passing filter to disk:" << endl; filteredJuncs.saveAll(outputDir.string() + "/" + outputPrefix + ".pass", source + "_pass"); @@ -506,61 +499,15 @@ void portcullis::JunctionFilter::filter() { } } -void portcullis::JunctionFilter::trainMMs(const JunctionList& in, MarkovModel& exon_model, MarkovModel& intron_model, GenomeMapper& gmap) { - - vector exons; - vector introns; - for(auto& j : in) { - - int len = 0; - - string left_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start - 80, j->getIntron()->start, &len); - if (j->getConsensusStrand() == Strand::NEGATIVE) { - left_exon = SeqUtils::reverseComplement(left_exon); - } - exons.push_back(left_exon); - - string left_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->start+81, &len); - if (j->getConsensusStrand() == Strand::NEGATIVE) { - left_intron = SeqUtils::reverseComplement(left_intron); - } - introns.push_back(left_intron); - - string right_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end-80, j->getIntron()->end+1, &len); - if (j->getConsensusStrand() == Strand::NEGATIVE) { - right_intron = SeqUtils::reverseComplement(right_intron); - } - introns.push_back(right_intron); - - - string 
right_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end + 1, j->getIntron()->end + 80, &len); - if (j->getConsensusStrand() == Strand::NEGATIVE) { - right_exon = SeqUtils::reverseComplement(right_exon); - } - exons.push_back(right_exon); - } - - exon_model.train(exons); - intron_model.train(introns); -} - -uint32_t portcullis::JunctionFilter::calcIntronThreshold(const JunctionList& juncs) { - - vector intron_sizes; - for(auto& j : juncs) { - intron_sizes.push_back(j->getIntronSize()); - } - - std::sort(intron_sizes.begin(), intron_sizes.end()); + - return intron_sizes[intron_sizes.size() * 0.95]; -} + void portcullis::JunctionFilter::printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix) { // Output stats size_t diff = in.size() - pass.size(); - cout << prefix << endl + cout << endl << prefix << endl << "-------------------------" << endl << "Input contained " << in.size() << " junctions." << endl << "Output contains " << pass.size() << " junctions." 
<< endl @@ -617,13 +564,13 @@ void portcullis::JunctionFilter::doRuleBasedFiltering(const path& ruleFile, cons } -void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap) { +void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, ModelFeatures& mf) { if (verbose) { cerr << endl << "Preparing junction metrics into matrix" << endl; } - Data* testingData = Train::juncs2FeatureVectors(all, L95, exon, intron, gmap); + Data* testingData = Train::juncs2FeatureVectors(all, mf); // Load forest from disk diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index ff70137c..ec3767ff 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -53,6 +53,7 @@ using boost::filesystem::symbolic_link_exists; #include #include #include +#include using portcullis::bam::GenomeMapper; using portcullis::PortcullisFS; using portcullis::Intron; @@ -60,6 +61,7 @@ using portcullis::IntronHasher; using portcullis::Performance; using portcullis::JuncResultMap; using portcullis::MarkovModel; +using portcullis::ModelFeatures; namespace portcullis { @@ -275,7 +277,7 @@ class JunctionFilter { protected: - void forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap); + void forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, ModelFeatures& mf); shared_ptr calcPerformance(const JunctionList& pass, const JunctionList& fail) { return calcPerformance(pass, fail, false); @@ -285,11 +287,7 @@ class JunctionFilter { void printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix); void doRuleBasedFiltering(const path& ruleFile, const JunctionList& all, JunctionList& pass, JunctionList& fail, const string& 
prefix, JuncResultMap& resultMap); - - uint32_t calcIntronThreshold(const JunctionList& pass); - - void trainMMs(const JunctionList& in, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap); - + public: static string helpMessage() { diff --git a/src/train.cc b/src/train.cc index b65ef7c2..801af91f 100644 --- a/src/train.cc +++ b/src/train.cc @@ -68,7 +68,7 @@ portcullis::Train::Train(const path& _junctionFile, const path& _refFile){ verbose = false; } -Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap) { +Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, ModelFeatures& mf) { vector headers; headers.reserve( VAR_NAMES.size() + JO_NAMES.size() ); @@ -93,8 +93,8 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint3 //d[3 * x.size() + row] = ; d[4 * x.size() + row] = j->getReliable2RawRatio(); //d[5 * x.size() + row] = j->getMeanMismatches(); - d[5 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score - d[6 * x.size() + row] = exon.size() > 0 && intron.size() > 0 ? j->calcCodingPotential(gmap, exon, intron) : 0.0; // Coding potential score + d[5 * x.size() + row] = mf.L95 == 0 ? 0.0 : j->calcIntronScore(mf.L95); // Intron score + d[6 * x.size() + row] = mf.isCodingPotentialModelEmpty() ? 
0.0 : j->calcCodingPotential(mf.gmap, mf.exonModel, mf.intronModel); // Coding potential score d[7 * x.size() + row] = j->isGenuine(); // Junction overhang values at each position are first converted into deviation from expected distributions double half_read_length = (double)j->getMeanQueryLength() / 2.0; @@ -116,10 +116,10 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint3 return data; } -shared_ptr portcullis::Train::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap) { +shared_ptr portcullis::Train::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, ModelFeatures& mf) { cout << "Creating feature vector" << endl; - Data* trainingData = juncs2FeatureVectors(x, L95, exon, intron, gmap); + Data* trainingData = juncs2FeatureVectors(x, mf); cout << "Initialising random forest" << endl; shared_ptr f = nullptr; diff --git a/src/train.hpp b/src/train.hpp index 6fedc455..7e2f009b 100644 --- a/src/train.hpp +++ b/src/train.hpp @@ -37,8 +37,9 @@ using boost::filesystem::path; #include #include #include +#include using portcullis::bam::GenomeMapper; - +using portcullis::ModelFeatures; namespace portcullis { @@ -235,23 +236,21 @@ class Train { static int main(int argc, char *argv[]); static Data* juncs2FeatureVectors(const JunctionList& x) { - GenomeMapper gmap; - MarkovModel m(1); - return juncs2FeatureVectors(x, 0, m, m, gmap); + ModelFeatures mf; + return juncs2FeatureVectors(x, mf); } - static Data* juncs2FeatureVectors(const JunctionList& x, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap); + static Data* juncs2FeatureVectors(const JunctionList& x, ModelFeatures& modelFeatures); static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t 
threads, bool regressionMode, bool verbose) { - GenomeMapper gmap; - MarkovModel m(1); + ModelFeatures mf; return trainInstance(x, outputPrefix, trees, threads, regressionMode, verbose, - 0, m, m, gmap); + mf); } static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, - const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap); + ModelFeatures& modelFeatures); protected: From c4625ed11ccf7722af1dc8e5c6cc8936dba343f6 Mon Sep 17 00:00:00 2001 From: Dan Date: Mon, 25 Apr 2016 13:40:33 +0100 Subject: [PATCH 06/15] Shifted creation of feature vector into model_features. --- lib/Makefile.am | 1 + lib/include/portcullis/model_features.hpp | 25 +++++++++++ lib/src/model_features.cc | 50 +++++++++++++++++++++ src/junction_filter.cc | 2 +- src/train.cc | 53 ++--------------------- src/train.hpp | 24 +--------- 6 files changed, 82 insertions(+), 73 deletions(-) diff --git a/lib/Makefile.am b/lib/Makefile.am index 529a4809..643163b7 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -46,6 +46,7 @@ library_include_HEADERS = \ libportcullis_la_CPPFLAGS = \ -isystem $(top_srcdir)/deps/htslib-1.3 \ + -isystem $(top_srcdir)/deps/ranger-0.3.8/include \ -isystem $(top_srcdir)/lib/include \ -DDATADIR=\"$(datadir)\" \ @AM_CPPFLAGS@ \ diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/model_features.hpp index 8179e57c..b3e5a917 100644 --- a/lib/include/portcullis/model_features.hpp +++ b/lib/include/portcullis/model_features.hpp @@ -20,12 +20,35 @@ #include using boost::filesystem::path; +#include + #include #include +#include using portcullis::bam::GenomeMapper; using portcullis::MarkovModel; +using portcullis::JunctionList; namespace portcullis { + +// List of variable names +const vector VAR_NAMES = { + //"M2-nb-reads", + //"M3-nb_dist_aln", + "nb_rel_aln", + //"M8-max_min_anc", + //"M9-dif_anc", + //"M10-dist_anc", + "entropy", + "maxmmes", + 
"min_hamming_score", + //"M14-hamming3p", + "rel2raw_ratio", + //"mean_mismatches", + "IntronScore", + "CodingPotential", + "Genuine" }; + class ModelFeatures { public: @@ -46,5 +69,7 @@ class ModelFeatures { uint32_t calcIntronThreshold(const JunctionList& juncs); void trainCodingPotentialModel(const JunctionList& in); + + Data* juncs2FeatureVectors(const JunctionList& x); }; } \ No newline at end of file diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 5bb344ad..8d40b880 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -19,6 +19,8 @@ using std::cout; using std::endl; +#include + #include using portcullis::Junction; @@ -85,4 +87,52 @@ void portcullis::ModelFeatures::trainCodingPotentialModel(const JunctionList& in exonModel.train(exons, 5); intronModel.train(introns, 5); +} + +Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { + + vector headers; + headers.reserve( VAR_NAMES.size() + JO_NAMES.size() ); + headers.insert( headers.end(), VAR_NAMES.begin(), VAR_NAMES.end() ); + headers.insert( headers.end(), JO_NAMES.begin()+14, JO_NAMES.end() ); + + // Convert junction list info to double* + double* d = new double[x.size() * headers.size()]; + + uint32_t row = 0; + for (const auto& j : x) { + + //d[0 * x.size() + row] = j->getNbJunctionAlignments(); + //d[0 * x.size() + row] = j->getNbDistinctAlignments(); + d[0 * x.size() + row] = j->getNbReliableAlignments(); + //d[3 * x.size() + row] = j->getMaxMinAnchor(); + //d[4 * x.size() + row] = j->getDiffAnchor(); + //d[5 * x.size() + row] = j->getNbDistinctAnchors(); + d[1 * x.size() + row] = j->getEntropy(); + d[2 * x.size() + row] = j->getMaxMMES(); + d[3 * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); + //d[3 * x.size() + row] = ; + d[4 * x.size() + row] = j->getReliable2RawRatio(); + //d[5 * x.size() + row] = j->getMeanMismatches(); + d[5 * x.size() + row] = L95 == 0 ? 
0.0 : j->calcIntronScore(L95); // Intron score + d[6 * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); // Coding potential score + d[7 * x.size() + row] = j->isGenuine(); + // Junction overhang values at each position are first converted into deviation from expected distributions + double half_read_length = (double)j->getMeanQueryLength() / 2.0; + for(size_t i = 14; i < JO_NAMES.size(); i++) { + double Ni = j->getJunctionOverhangs(i); // Actual count at this position + if (Ni == 0.0) Ni = 0.000000000001; // Ensure some value > 0 here otherwise we get -infinity later. + double Pi = 1.0 - ((double)i / half_read_length); // Likely scale at this position + double Ei = (double)j->getNbJunctionAlignments() * Pi; // Expected count at this position + double Xi = abs(log2(Ni / Ei)); // Log deviation + + d[(i-14 + 8) * x.size() + row] = Xi; + } + + row++; + } + + Data* data = new DataDouble(d, headers, x.size(), headers.size()); + data->setExternalData(false); // This causes 'd' to be deleted when 'data' is deleted + return data; } \ No newline at end of file diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 05e5ae26..526c5ad6 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -570,7 +570,7 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction cerr << endl << "Preparing junction metrics into matrix" << endl; } - Data* testingData = Train::juncs2FeatureVectors(all, mf); + Data* testingData = mf.juncs2FeatureVectors(all); // Load forest from disk diff --git a/src/train.cc b/src/train.cc index 801af91f..9c59ab63 100644 --- a/src/train.cc +++ b/src/train.cc @@ -68,58 +68,12 @@ portcullis::Train::Train(const path& _junctionFile, const path& _refFile){ verbose = false; } -Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, ModelFeatures& mf) { - - vector headers; - headers.reserve( VAR_NAMES.size() + JO_NAMES.size() ); - headers.insert( 
headers.end(), VAR_NAMES.begin(), VAR_NAMES.end() ); - headers.insert( headers.end(), JO_NAMES.begin()+14, JO_NAMES.end() ); - - // Convert junction list info to double* - double* d = new double[x.size() * headers.size()]; - - uint32_t row = 0; - for (const auto& j : x) { - - //d[0 * x.size() + row] = j->getNbJunctionAlignments(); - //d[0 * x.size() + row] = j->getNbDistinctAlignments(); - d[0 * x.size() + row] = j->getNbReliableAlignments(); - //d[3 * x.size() + row] = j->getMaxMinAnchor(); - //d[4 * x.size() + row] = j->getDiffAnchor(); - //d[5 * x.size() + row] = j->getNbDistinctAnchors(); - d[1 * x.size() + row] = j->getEntropy(); - d[2 * x.size() + row] = j->getMaxMMES(); - d[3 * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); - //d[3 * x.size() + row] = ; - d[4 * x.size() + row] = j->getReliable2RawRatio(); - //d[5 * x.size() + row] = j->getMeanMismatches(); - d[5 * x.size() + row] = mf.L95 == 0 ? 0.0 : j->calcIntronScore(mf.L95); // Intron score - d[6 * x.size() + row] = mf.isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(mf.gmap, mf.exonModel, mf.intronModel); // Coding potential score - d[7 * x.size() + row] = j->isGenuine(); - // Junction overhang values at each position are first converted into deviation from expected distributions - double half_read_length = (double)j->getMeanQueryLength() / 2.0; - for(size_t i = 14; i < JO_NAMES.size(); i++) { - double Ni = j->getJunctionOverhangs(i); // Actual count at this position - if (Ni == 0.0) Ni = 0.000000000001; // Ensure some value > 0 here otherwise we get -infinity later. 
- double Pi = 1.0 - ((double)i / half_read_length); // Likely scale at this position - double Ei = (double)j->getNbJunctionAlignments() * Pi; // Expected count at this position - double Xi = abs(log2(Ni / Ei)); // Log deviation - - d[(i-14 + 8) * x.size() + row] = Xi; - } - - row++; - } - - Data* data = new DataDouble(d, headers, x.size(), headers.size()); - data->setExternalData(false); // This causes 'd' to be deleted when 'data' is deleted - return data; -} + shared_ptr portcullis::Train::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, ModelFeatures& mf) { cout << "Creating feature vector" << endl; - Data* trainingData = juncs2FeatureVectors(x, mf); + Data* trainingData = mf.juncs2FeatureVectors(x); cout << "Initialising random forest" << endl; shared_ptr f = nullptr; @@ -162,7 +116,8 @@ shared_ptr portcullis::Train::trainInstance(const JunctionList& x, strin void portcullis::Train::testInstance(shared_ptr f, const JunctionList& x) { // Convert testing set junctions into feature vector - Data* testingData = juncs2FeatureVectors(x); + ModelFeatures mf; + Data* testingData = mf.juncs2FeatureVectors(x); f->setPredictionMode(true); f->setData(testingData); diff --git a/src/train.hpp b/src/train.hpp index 7e2f009b..2525c188 100644 --- a/src/train.hpp +++ b/src/train.hpp @@ -53,23 +53,7 @@ const uint16_t DEFAULT_TRAIN_THREADS = 1; const double DEFAULT_TRAIN_FRACTION = 1.0; const int DEFAULT_SEED = 1234567; // To avoid non-deterministic behaviour -// List of variable names -const vector VAR_NAMES = { - //"M2-nb-reads", - //"M3-nb_dist_aln", - "nb_rel_aln", - //"M8-max_min_anc", - //"M9-dif_anc", - //"M10-dist_anc", - "entropy", - "maxmmes", - "min_hamming_score", - //"M14-hamming3p", - "rel2raw_ratio", - //"mean_mismatches", - "IntronScore", - "CodingPotential", - "Genuine" }; + // Derived from https://sureshamrita.wordpress.com/2011/08/24/c-implementation-of-k-fold-cross-validation/ 
template @@ -235,12 +219,6 @@ class Train { static int main(int argc, char *argv[]); - static Data* juncs2FeatureVectors(const JunctionList& x) { - ModelFeatures mf; - return juncs2FeatureVectors(x, mf); - } - static Data* juncs2FeatureVectors(const JunctionList& x, ModelFeatures& modelFeatures); - static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose) { ModelFeatures mf; From eff0ac4c46b63d213b7e80151e55368495b53da4 Mon Sep 17 00:00:00 2001 From: Dan Date: Mon, 25 Apr 2016 14:01:05 +0100 Subject: [PATCH 07/15] Removed filter tool dependence on the training tool. Training now handled in modelfeatures. --- lib/include/portcullis/model_features.hpp | 9 ++++ lib/src/model_features.cc | 51 +++++++++++++++++++++++ src/junction_filter.cc | 9 ++-- src/junction_filter.hpp | 1 + src/train.cc | 46 +------------------- src/train.hpp | 10 ----- 6 files changed, 66 insertions(+), 60 deletions(-) diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/model_features.hpp index b3e5a917..50f1245e 100644 --- a/lib/include/portcullis/model_features.hpp +++ b/lib/include/portcullis/model_features.hpp @@ -17,10 +17,14 @@ #pragma once +#include +using std::shared_ptr; + #include using boost::filesystem::path; #include +#include #include #include @@ -30,6 +34,8 @@ using portcullis::MarkovModel; using portcullis::JunctionList; namespace portcullis { + +typedef shared_ptr ForestPtr; // List of variable names const vector VAR_NAMES = { @@ -71,5 +77,8 @@ class ModelFeatures { void trainCodingPotentialModel(const JunctionList& in); Data* juncs2FeatureVectors(const JunctionList& x); + + ForestPtr trainInstance(const JunctionList& x, string outputPrefix, + uint16_t trees, uint16_t threads, bool regressionMode, bool verbose); }; } \ No newline at end of file diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 8d40b880..e6adfc9d 100644 --- 
a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -16,10 +16,15 @@ // ******************************************************************* #include +#include using std::cout; +using std::cerr; using std::endl; +using std::make_shared; #include +#include +#include #include using portcullis::Junction; @@ -135,4 +140,50 @@ Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { Data* data = new DataDouble(d, headers, x.size(), headers.size()); data->setExternalData(false); // This causes 'd' to be deleted when 'data' is deleted return data; +} + + + +portcullis::ForestPtr portcullis::ModelFeatures::trainInstance(const JunctionList& x, + string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose) { + + cout << "Creating feature vector" << endl; + Data* trainingData = juncs2FeatureVectors(x); + + cout << "Initialising random forest" << endl; + ForestPtr f = nullptr; + if (regressionMode) { + f = make_shared(); + } + else { + f = make_shared(); + } + + vector catVars; + + f->init( + "Genuine", // Dependant variable name + MEM_DOUBLE, // Memory mode + trainingData, // Data object + 0, // M Try (0 == use default) + outputPrefix, // Output prefix + trees, // Number of trees + 1236456789, // Use fixed seed to avoid non-deterministic behaviour as much as possible + threads, // Number of threads + IMP_GINI, // Importance measure + regressionMode ? 
DEFAULT_MIN_NODE_SIZE_REGRESSION : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size + "", // Status var name + false, // Prediction mode + true, // Replace + catVars, // Unordered categorical variable names (vector) + false, // Memory saving + DEFAULT_SPLITRULE, // Split rule + false, // predall + 1.0); // Sample fraction + + cout << "Training" << endl; + f->setVerboseOut(&cerr); + f->run(verbose); + + return f; } \ No newline at end of file diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 526c5ad6..7b5eaa35 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -68,9 +68,6 @@ using portcullis::eval; using portcullis::bam::GenomeMapper; using portcullis::MarkovModel; -#include "train.hpp" -using portcullis::Train; - #include "junction_filter.hpp" portcullis::JunctionFilter::JunctionFilter( const path& _junctionFile, @@ -336,7 +333,7 @@ void portcullis::JunctionFilter::filter() { training.insert(training.end(), neg.begin(), neg.end()); cout << "Training a random forest model using the initial positive and negative datasets" << endl; - shared_ptr forest = Train::trainInstance(training, output.string() + ".selftrain", DEFAULT_TRAIN_TREES, threads, true, true, mf); + shared_ptr forest = mf.trainInstance(training, output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); forest->saveToFile(); forest->writeOutput(&cout); @@ -594,8 +591,8 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction testingData, // Data object 0, // M Try (0 == use default) "", // Output prefix - DEFAULT_TRAIN_TREES, // Number of trees (will be overwritten when loading the model) - DEFAULT_SEED, + DEFAULT_SELFTRAIN_TREES, // Number of trees (will be overwritten when loading the model) + 1234567890, // Seed for random generator threads, // Number of threads IMP_GINI, // Importance measure train ? 
DEFAULT_MIN_NODE_SIZE_REGRESSION : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index ec3767ff..760bf45c 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -77,6 +77,7 @@ const string DEFAULT_FILTER_MODEL_FILE = "default_model.forest"; const string ST_IPOS_RULES_FILE = "selftrain_initial_pos.json"; const string ST_INEG_RULES_FILE = "selftrain_initial_neg.json"; const uint16_t DEFAULT_FILTER_THREADS = 1; +const uint16_t DEFAULT_SELFTRAIN_TREES = 100; class JunctionFilter { diff --git a/src/train.cc b/src/train.cc index 9c59ab63..689fa1c5 100644 --- a/src/train.cc +++ b/src/train.cc @@ -70,48 +70,6 @@ portcullis::Train::Train(const path& _junctionFile, const path& _refFile){ -shared_ptr portcullis::Train::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, ModelFeatures& mf) { - - cout << "Creating feature vector" << endl; - Data* trainingData = mf.juncs2FeatureVectors(x); - - cout << "Initialising random forest" << endl; - shared_ptr f = nullptr; - if (regressionMode) { - f = make_shared(); - } - else { - f = make_shared(); - } - - vector catVars; - - f->init( - "Genuine", // Dependant variable name - MEM_DOUBLE, // Memory mode - trainingData, // Data object - 0, // M Try (0 == use default) - outputPrefix, // Output prefix - trees, // Number of trees - DEFAULT_SEED, // Use fixed seed to avoid non-deterministic behaviour - threads, // Number of threads - IMP_GINI, // Importance measure - regressionMode ? 
DEFAULT_MIN_NODE_SIZE_REGRESSION : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size - "", // Status var name - false, // Prediction mode - true, // Replace - catVars, // Unordered categorical variable names (vector) - false, // Memory saving - DEFAULT_SPLITRULE, // Split rule - false, // predall - 1.0); // Sample fraction - - cout << "Training" << endl; - f->setVerboseOut(&cerr); - f->run(verbose); - - return f; -} void portcullis::Train::testInstance(shared_ptr f, const JunctionList& x) { @@ -198,7 +156,7 @@ void portcullis::Train::train() { cout << "Training on full dataset" << endl; - shared_ptr f = trainInstance(junctions, outputPrefix.string(), trees, threads, false, true); + ForestPtr f = ModelFeatures().trainInstance(junctions, outputPrefix.string(), trees, threads, false, true); f->saveToFile(); f->writeOutput(&cout); @@ -233,7 +191,7 @@ void portcullis::Train::train() { kf.getFold(i, back_inserter(train), back_inserter(test)); // Train on this particular set - shared_ptr f = trainInstance(train, outputPrefix.string(), trees, threads, false, false); + ForestPtr f = ModelFeatures().trainInstance(train, outputPrefix.string(), trees, threads, false, false); // Test model instance testInstance(f, test); diff --git a/src/train.hpp b/src/train.hpp index 2525c188..40fe2b34 100644 --- a/src/train.hpp +++ b/src/train.hpp @@ -219,16 +219,6 @@ class Train { static int main(int argc, char *argv[]); - static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, - uint16_t trees, uint16_t threads, bool regressionMode, bool verbose) { - ModelFeatures mf; - return trainInstance(x, outputPrefix, trees, threads, regressionMode, verbose, - mf); - } - - static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, - uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, - ModelFeatures& modelFeatures); protected: From db1b54ce69a519dc17cd650ea8979d42436f1504 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Tue, 26 Apr 2016 
16:33:30 +0100 Subject: [PATCH 08/15] Implemented splicing positional weights scores, although it doesn't appear to improve performance. --- deps/ranger-0.3.8/include/ranger/Data.h | 178 +++++++------ deps/ranger-0.3.8/src/Data.cpp | 296 ++++++++++++---------- lib/include/portcullis/junction.hpp | 16 +- lib/include/portcullis/markov_model.hpp | 44 +++- lib/include/portcullis/model_features.hpp | 44 ++-- lib/include/portcullis/seq_utils.hpp | 16 +- lib/src/junction.cc | 66 +++-- lib/src/markov_model.cc | 57 ++++- lib/src/model_features.cc | 99 ++++++-- src/junction_filter.cc | 55 ++-- src/junction_filter.hpp | 2 - 11 files changed, 546 insertions(+), 327 deletions(-) diff --git a/deps/ranger-0.3.8/include/ranger/Data.h b/deps/ranger-0.3.8/include/ranger/Data.h index 0894fdb3..56dd3810 100644 --- a/deps/ranger-0.3.8/include/ranger/Data.h +++ b/deps/ranger-0.3.8/include/ranger/Data.h @@ -31,107 +31,131 @@ #include #include +#include +#include #include "globals.h" class Data { public: - Data(); - virtual ~Data(); + Data(); + virtual ~Data(); - virtual double get(size_t row, size_t col) const = 0; + virtual double get(size_t row, size_t col) const = 0; - size_t getVariableID(std::string variable_name); + size_t getVariableID(std::string variable_name); - virtual void reserveMemory() = 0; - virtual void set(size_t col, size_t row, double value, bool& error) = 0; + virtual void reserveMemory() = 0; + virtual void set(size_t col, size_t row, double value, bool& error) = 0; - void addSparseData(unsigned char* sparse_data, size_t num_cols_sparse); + void addSparseData(unsigned char* sparse_data, size_t num_cols_sparse); - bool loadFromFile(std::string filename); - bool loadFromFileWhitespace(std::ifstream& input_file, std::string header_line); - bool loadFromFileOther(std::ifstream& input_file, std::string header_line, char seperator); + bool loadFromFile(std::string filename); + bool loadFromFileWhitespace(std::ifstream& input_file, std::string header_line); + bool 
loadFromFileOther(std::ifstream& input_file, std::string header_line, char seperator); - void getAllValues(std::vector& all_values, std::vector& sampleIDs, size_t varID); + void getAllValues(std::vector& all_values, std::vector& sampleIDs, size_t varID); - size_t getIndex(size_t row, size_t col) const { - if (col < num_cols_no_sparse) { - return index_data[col * num_rows + row]; - } else { - // Get data out of sparse storage. -1 because of GenABEL coding. - size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; - size_t result = (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); + size_t getIndex(size_t row, size_t col) const { + if (col < num_cols_no_sparse) { + return index_data[col * num_rows + row]; + } else { + // Get data out of sparse storage. -1 because of GenABEL coding. + size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; + size_t result = (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); - // TODO: Better way to treat missing values? - if (result > 2) { - return 0; - } else { - return result; - } + // TODO: Better way to treat missing values? 
+ if (result > 2) { + return 0; + } else { + return result; + } + } } - } - - double getUniqueDataValue(size_t varID, size_t index) const { - if (varID < num_cols_no_sparse) { - return unique_data_values[varID][index]; - } else { - // For GWAS data the index is the value - return (index); + + double getUniqueDataValue(size_t varID, size_t index) const { + if (varID < num_cols_no_sparse) { + return unique_data_values[varID][index]; + } else { + // For GWAS data the index is the value + return (index); + } + } + + size_t getNumUniqueDataValues(size_t varID) const { + if (varID < num_cols_no_sparse) { + return unique_data_values[varID].size(); + } else { + // For GWAS data 0,1,2 + return (3); + } + } + + void sort(); + + const std::vector& getVariableNames() const { + return variable_names; + } + + size_t getNumCols() const { + return num_cols; } - } - - size_t getNumUniqueDataValues(size_t varID) const { - if (varID < num_cols_no_sparse) { - return unique_data_values[varID].size(); - } else { - // For GWAS data 0,1,2 - return (3); + + size_t getNumRows() const { + return num_rows; + } + + size_t getMaxNumUniqueValues() const { + if (sparse_data == 0 || max_num_unique_values > 3) { + // If no sparse data or one variable with more than 3 unique values, return that value + return max_num_unique_values; + } else { + // If sparse data and no variable with more than 3 unique values, return 3 + return 3; + } + } + + void setExternalData(bool external) { + this->externalData = external; + } + + void saveToTSV(const std::string& file); + + std::string getRow(size_t row) const { + std::stringstream ss; + ss << this->get(row, 0); + for (size_t j = 1; j < num_cols; j++) { + ss << "\t" << this->get(row, j); + } + return ss.str(); } - } - - void sort(); - - const std::vector& getVariableNames() const { - return variable_names; - } - size_t getNumCols() const { - return num_cols; - } - size_t getNumRows() const { - return num_rows; - } - - size_t getMaxNumUniqueValues() const { - 
if (sparse_data == 0 || max_num_unique_values > 3) { - // If no sparse data or one variable with more than 3 unique values, return that value - return max_num_unique_values; - } else { - // If sparse data and no variable with more than 3 unique values, return 3 - return 3; + + std::string getHeader() const { + std::stringstream ss; + ss << variable_names[0]; + for (size_t i = 1; i < variable_names.size(); i++) { + ss << "\t" << variable_names[i]; + } + return ss.str(); } - } - - void setExternalData(bool external) { - this->externalData = external; - } protected: - std::vector variable_names; - size_t num_rows; - size_t num_rows_rounded; - size_t num_cols; + std::vector variable_names; + size_t num_rows; + size_t num_rows_rounded; + size_t num_cols; - unsigned char* sparse_data; - size_t num_cols_no_sparse; + unsigned char* sparse_data; + size_t num_cols_no_sparse; - bool externalData; + bool externalData; - size_t* index_data; - std::vector> unique_data_values; - size_t max_num_unique_values; + size_t* index_data; + std::vector> unique_data_values; + size_t max_num_unique_values; private: - DISALLOW_COPY_AND_ASSIGN(Data); + DISALLOW_COPY_AND_ASSIGN(Data); }; #endif /* DATA_H_ */ diff --git a/deps/ranger-0.3.8/src/Data.cpp b/deps/ranger-0.3.8/src/Data.cpp index 03b09203..712afb81 100644 --- a/deps/ranger-0.3.8/src/Data.cpp +++ b/deps/ranger-0.3.8/src/Data.cpp @@ -36,181 +36,201 @@ #include Data::Data() : - num_rows(0), num_rows_rounded(0), num_cols(0), sparse_data(0), num_cols_no_sparse(0), externalData(true), index_data( - 0), max_num_unique_values(0) { +num_rows(0), num_rows_rounded(0), num_cols(0), sparse_data(0), num_cols_no_sparse(0), externalData(true), index_data( +0), max_num_unique_values(0) { } Data::~Data() { - if (index_data != 0) { - delete[] index_data; - } + if (index_data != 0) { + delete[] index_data; + } } size_t Data::getVariableID(std::string variable_name) { - std::vector::iterator it = std::find(variable_names.begin(), variable_names.end(), 
variable_name); - if (it == variable_names.end()) { - throw std::runtime_error("Variable " + variable_name + " not found."); - } - return (std::distance(variable_names.begin(), it)); + std::vector::iterator it = std::find(variable_names.begin(), variable_names.end(), variable_name); + if (it == variable_names.end()) { + throw std::runtime_error("Variable " + variable_name + " not found."); + } + return (std::distance(variable_names.begin(), it)); } void Data::addSparseData(unsigned char* sparse_data, size_t num_cols_sparse) { - num_cols = num_cols_no_sparse + num_cols_sparse; - num_rows_rounded = roundToNextMultiple(num_rows, 4); - this->sparse_data = sparse_data; + num_cols = num_cols_no_sparse + num_cols_sparse; + num_rows_rounded = roundToNextMultiple(num_rows, 4); + this->sparse_data = sparse_data; } bool Data::loadFromFile(std::string filename) { - bool result; - - // Open input file - std::ifstream input_file; - input_file.open(filename); - if (!input_file.good()) { - throw std::runtime_error("Could not open input file."); - } - - // Count number of rows - size_t line_count = 0; - std::string line; - while (getline(input_file, line)) { - ++line_count; - } - num_rows = line_count - 1; - input_file.close(); - input_file.open(filename); - - // Check if comma, semicolon or whitespace seperated - std::string header_line; - getline(input_file, header_line); - - // Find out if comma, semicolon or whitespace seperated and call appropriate method - if (header_line.find(",") != std::string::npos) { - result = loadFromFileOther(input_file, header_line, ','); - } else if (header_line.find(";") != std::string::npos) { - result = loadFromFileOther(input_file, header_line, ';'); - } else { - result = loadFromFileWhitespace(input_file, header_line); - } - - externalData = false; - input_file.close(); - return result; + bool result; + + // Open input file + std::ifstream input_file; + input_file.open(filename); + if (!input_file.good()) { + throw std::runtime_error("Could not 
open input file."); + } + + // Count number of rows + size_t line_count = 0; + std::string line; + while (getline(input_file, line)) { + ++line_count; + } + num_rows = line_count - 1; + input_file.close(); + input_file.open(filename); + + // Check if comma, semicolon or whitespace seperated + std::string header_line; + getline(input_file, header_line); + + // Find out if comma, semicolon or whitespace seperated and call appropriate method + if (header_line.find(",") != std::string::npos) { + result = loadFromFileOther(input_file, header_line, ','); + } else if (header_line.find(";") != std::string::npos) { + result = loadFromFileOther(input_file, header_line, ';'); + } else { + result = loadFromFileWhitespace(input_file, header_line); + } + + externalData = false; + input_file.close(); + return result; } bool Data::loadFromFileWhitespace(std::ifstream& input_file, std::string header_line) { - // Read header - std::string header_token; - std::stringstream header_line_stream(header_line); - while (header_line_stream >> header_token) { - variable_names.push_back(header_token); - } - num_cols = variable_names.size(); - num_cols_no_sparse = num_cols; - - // Read body - reserveMemory(); - bool error = false; - std::string line; - size_t row = 0; - while (getline(input_file, line)) { - double token; - std::stringstream line_stream(line); - size_t column = 0; - while (line_stream >> token) { - set(column, row, token, error); - ++column; + // Read header + std::string header_token; + std::stringstream header_line_stream(header_line); + while (header_line_stream >> header_token) { + variable_names.push_back(header_token); } - if (column > num_cols) { - throw std::runtime_error("Could not open input file. Too many columns in a row."); - } else if (column < num_cols) { - throw std::runtime_error("Could not open input file. Too few columns in a row. 
Are all values numeric?"); + num_cols = variable_names.size(); + num_cols_no_sparse = num_cols; + + // Read body + reserveMemory(); + bool error = false; + std::string line; + size_t row = 0; + while (getline(input_file, line)) { + double token; + std::stringstream line_stream(line); + size_t column = 0; + while (line_stream >> token) { + set(column, row, token, error); + ++column; + } + if (column > num_cols) { + throw std::runtime_error("Could not open input file. Too many columns in a row."); + } else if (column < num_cols) { + throw std::runtime_error("Could not open input file. Too few columns in a row. Are all values numeric?"); + } + ++row; } - ++row; - } - num_rows = row; - return error; + num_rows = row; + return error; } bool Data::loadFromFileOther(std::ifstream& input_file, std::string header_line, char seperator) { - // Read header - std::string header_token; - std::stringstream header_line_stream(header_line); - while (getline(header_line_stream, header_token, seperator)) { - variable_names.push_back(header_token); - } - num_cols = variable_names.size(); - num_cols_no_sparse = num_cols; - - // Read body - reserveMemory(); - bool error = false; - std::string line; - size_t row = 0; - while (getline(input_file, line)) { - std::string token_string; - double token; - std::stringstream line_stream(line); - size_t column = 0; - while (getline(line_stream, token_string, seperator)) { - std::stringstream token_stream(token_string); - token_stream >> token; - set(column, row, token, error); - ++column; + // Read header + std::string header_token; + std::stringstream header_line_stream(header_line); + while (getline(header_line_stream, header_token, seperator)) { + variable_names.push_back(header_token); + } + num_cols = variable_names.size(); + num_cols_no_sparse = num_cols; + + // Read body + reserveMemory(); + bool error = false; + std::string line; + size_t row = 0; + while (getline(input_file, line)) { + std::string token_string; + double token; + 
std::stringstream line_stream(line); + size_t column = 0; + while (getline(line_stream, token_string, seperator)) { + std::stringstream token_stream(token_string); + token_stream >> token; + set(column, row, token, error); + ++column; + } + ++row; } - ++row; - } - num_rows = row; - return error; + num_rows = row; + return error; } void Data::getAllValues(std::vector& all_values, std::vector& sampleIDs, size_t varID) { - // All values for varID (no duplicates) for given sampleIDs - if (varID < num_cols_no_sparse) { - - all_values.reserve(sampleIDs.size()); - for (size_t i = 0; i < sampleIDs.size(); ++i) { - all_values.push_back(get(sampleIDs[i], varID)); + // All values for varID (no duplicates) for given sampleIDs + if (varID < num_cols_no_sparse) { + + all_values.reserve(sampleIDs.size()); + for (size_t i = 0; i < sampleIDs.size(); ++i) { + all_values.push_back(get(sampleIDs[i], varID)); + } + std::sort(all_values.begin(), all_values.end()); + all_values.erase(unique(all_values.begin(), all_values.end()), all_values.end()); + } else { + // If GWA data just use 0, 1, 2 + all_values = std::vector({0, 1, 2}); } - std::sort(all_values.begin(), all_values.end()); - all_values.erase(unique(all_values.begin(), all_values.end()), all_values.end()); - } else { - // If GWA data just use 0, 1, 2 - all_values = std::vector( { 0, 1, 2 }); - } } void Data::sort() { - // Reserve memory - index_data = new size_t[num_cols_no_sparse * num_rows]; + // Reserve memory + index_data = new size_t[num_cols_no_sparse * num_rows]; + + // For all columns, get unique values and save index for each observation + for (size_t col = 0; col < num_cols_no_sparse; ++col) { + + // Get all unique values + std::vector unique_values(num_rows); + for (size_t row = 0; row < num_rows; ++row) { + unique_values[row] = get(row, col); + } + std::sort(unique_values.begin(), unique_values.end()); + unique_values.erase(unique(unique_values.begin(), unique_values.end()), unique_values.end()); + + // Get index of 
unique value + for (size_t row = 0; row < num_rows; ++row) { + size_t idx = std::lower_bound(unique_values.begin(), unique_values.end(), get(row, col)) - unique_values.begin(); + index_data[col * num_rows + row] = idx; + } + + // Save unique values + unique_data_values.push_back(unique_values); + if (unique_values.size() > max_num_unique_values) { + max_num_unique_values = unique_values.size(); + } + } +} - // For all columns, get unique values and save index for each observation - for (size_t col = 0; col < num_cols_no_sparse; ++col) { +void Data::saveToTSV(const std::string& file) { + std::ofstream fout(file.c_str(), std::ofstream::out); - // Get all unique values - std::vector unique_values(num_rows); - for (size_t row = 0; row < num_rows; ++row) { - unique_values[row] = get(row, col); + fout << variable_names[0]; + for (size_t i = 1; i < variable_names.size(); i++) { + fout << "\t" << variable_names[i]; } - std::sort(unique_values.begin(), unique_values.end()); - unique_values.erase(unique(unique_values.begin(), unique_values.end()), unique_values.end()); - - // Get index of unique value - for (size_t row = 0; row < num_rows; ++row) { - size_t idx = std::lower_bound(unique_values.begin(), unique_values.end(), get(row, col)) - unique_values.begin(); - index_data[col * num_rows + row] = idx; + fout << std::endl; + + for (size_t i = 0; i < num_rows; i++) { + fout << this->get(i, 0); + for (size_t j = 1; j < num_cols; j++) { + fout << "\t" << this->get(i, j); + } + fout << std::endl; } - // Save unique values - unique_data_values.push_back(unique_values); - if (unique_values.size() > max_num_unique_values) { - max_num_unique_values = unique_values.size(); - } - } + fout.close(); } diff --git a/lib/include/portcullis/junction.hpp b/lib/include/portcullis/junction.hpp index 73279b1c..4bd64869 100644 --- a/lib/include/portcullis/junction.hpp +++ b/lib/include/portcullis/junction.hpp @@ -46,6 +46,8 @@ using namespace portcullis::bam; #include "intron.hpp" #include 
"seq_utils.hpp" #include "markov_model.hpp" +using portcullis::KmerMarkovModel; +using portcullis::PosMarkovModel; using portcullis::Intron; @@ -843,7 +845,15 @@ class Junction { void setJunctionOverhangs(size_t index, uint32_t val) { junctionOverhangs[index] = val; } - + + double getJunctionOverhangLogDeviation(size_t i) const { + double Ni = junctionOverhangs[i]; // Actual count at this position + if (Ni == 0.0) Ni = 0.000000000001; // Ensure some value > 0 here otherwise we get -infinity later. + double Pi = 1.0 - ((double)i / (double)(this->meanQueryLength / 2.0)); // Likely scale at this position + double Ei = (double)this->getNbJunctionAlignments() * Pi; // Expected count at this position + double Xi = log2(Ni / Ei); + return Xi; + } bool isGenuine() const { return genuine; @@ -870,7 +880,9 @@ class Junction { } - double calcCodingPotential(GenomeMapper& gmap, MarkovModel& exon, MarkovModel& intron) const; + double calcCodingPotential(GenomeMapper& gmap, KmerMarkovModel& exon, KmerMarkovModel& intron) const; + + double calcPositionWeightScore(GenomeMapper& gmap, PosMarkovModel& donor, PosMarkovModel& acceptor) const; // **** Output methods **** diff --git a/lib/include/portcullis/markov_model.hpp b/lib/include/portcullis/markov_model.hpp index 1b524858..bcc4744e 100644 --- a/lib/include/portcullis/markov_model.hpp +++ b/lib/include/portcullis/markov_model.hpp @@ -30,18 +30,12 @@ namespace portcullis { typedef boost::error_info MMErrorInfo; struct MMException: virtual boost::exception, virtual std::exception {}; - - -/** - * Intended to be used as a map containing the probability of seeing a given character - */ -typedef unordered_map NTP; - /** * This datastructure consists of a map of kmers to a map of nucleotides with assoicated * probabilities. */ -typedef unordered_map MMU; +typedef unordered_map> KMMU; +typedef unordered_map> PMMU; /** * Simple Markov chain implementation derived originally from Truesight. 
@@ -51,10 +45,9 @@ typedef unordered_map MMU; */ class MarkovModel { -private: - portcullis::MMU model; +protected: uint16_t order; - + public: MarkovModel() : MarkovModel(1) {} @@ -72,18 +65,45 @@ class MarkovModel { train(input, order); } - void train(const vector& input, const uint32_t order); + virtual void train(const vector& input, const uint32_t order) = 0; uint16_t getOrder() const { return order; } + virtual double getScore(const string& seq) = 0; +}; + +class KmerMarkovModel : public portcullis::MarkovModel { +private: + portcullis::KMMU model; +public: + KmerMarkovModel() : MarkovModel(1) {} + KmerMarkovModel(const uint32_t _order) : MarkovModel(_order) {}; + KmerMarkovModel(const vector& input, const uint32_t _order) : MarkovModel(input, _order) {} + + void train(const vector& input, const uint32_t order); + double getScore(const string& seq); size_t size() const { return model.size(); } +}; + +class PosMarkovModel : public portcullis::MarkovModel { +private: + portcullis::PMMU model; +public: + PosMarkovModel() : MarkovModel(1) {} + PosMarkovModel(const uint32_t _order) : MarkovModel(_order) {}; + PosMarkovModel(const vector& input, const uint32_t _order) : MarkovModel(input, _order) {} + void train(const vector& input, const uint32_t order); double getScore(const string& seq); + size_t size() const { + return model.size(); + } }; + } diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/model_features.hpp index 50f1245e..28393f72 100644 --- a/lib/include/portcullis/model_features.hpp +++ b/lib/include/portcullis/model_features.hpp @@ -39,28 +39,32 @@ typedef shared_ptr ForestPtr; // List of variable names const vector VAR_NAMES = { - //"M2-nb-reads", - //"M3-nb_dist_aln", - "nb_rel_aln", - //"M8-max_min_anc", - //"M9-dif_anc", - //"M10-dist_anc", - "entropy", - "maxmmes", - "min_hamming_score", - //"M14-hamming3p", - "rel2raw_ratio", - //"mean_mismatches", - "IntronScore", - "CodingPotential", - "Genuine" }; + "Genuine", + 
//"M2-nb-reads", + //"M3-nb_dist_aln", + "nb_rel_aln", + //"M8-max_min_anc", + //"M9-dif_anc", + //"M10-dist_anc", + "entropy", + "maxmmes", + "min_hamming_score", + //"M14-hamming3p", + "rel2raw_ratio", + //"mean_mismatches", + "IntronScore", + //"CodingPotential" + "PWS" +}; class ModelFeatures { public: uint32_t L95; - MarkovModel exonModel; - MarkovModel intronModel; + KmerMarkovModel exonModel; + KmerMarkovModel intronModel; + PosMarkovModel donorPWModel; + PosMarkovModel acceptorPWModel; GenomeMapper gmap; ModelFeatures() : L95(0) { @@ -70,12 +74,18 @@ class ModelFeatures { return exonModel.size() == 0 || intronModel.size() == 0; } + bool isPWModelEmpty() { + return donorPWModel.size() == 0 || acceptorPWModel.size() == 0; + } + void initGenomeMapper(const path& genomeFile); uint32_t calcIntronThreshold(const JunctionList& juncs); void trainCodingPotentialModel(const JunctionList& in); + void trainPositionWeightModel(const JunctionList& in); + Data* juncs2FeatureVectors(const JunctionList& x); ForestPtr trainInstance(const JunctionList& x, string outputPrefix, diff --git a/lib/include/portcullis/seq_utils.hpp b/lib/include/portcullis/seq_utils.hpp index 09bcc0a8..867f8b37 100644 --- a/lib/include/portcullis/seq_utils.hpp +++ b/lib/include/portcullis/seq_utils.hpp @@ -40,12 +40,24 @@ typedef boost::error_info SeqUtilsErrorInfo; struct SeqUtilsException: virtual boost::exception, virtual std::exception { }; - - class SeqUtils { public: + static bool dnaNt(const char c) { + return c == 'A' || c == 'T' || c == 'G' || c == 'C'; + } + + static string makeClean(const string& s) { + string sequp = boost::to_upper_copy(s); + + for(size_t i = 0; i < sequp.size(); i++) { + sequp[i] = dnaNt(sequp[i]) ? 
sequp[i] : 'N'; + } + + return sequp; + } + static int16_t hammingDistance(const string& s1, const string& s2) { if (s1.size() != s2.size()) diff --git a/lib/src/junction.cc b/lib/src/junction.cc index f10b9f9b..e2237ac9 100644 --- a/lib/src/junction.cc +++ b/lib/src/junction.cc @@ -45,12 +45,6 @@ using portcullis::bam::CigarOp; using portcullis::bam::GenomeMapper; using portcullis::bam::Strand; -#include -#include -#include -using portcullis::Intron; -using portcullis::MarkovModel; - #include @@ -1228,38 +1222,64 @@ shared_ptr portcullis::Junction::parse(const string& line) return j; } -double portcullis::Junction::calcCodingPotential(GenomeMapper& gmap, MarkovModel& exon, MarkovModel& intron) const { +double portcullis::Junction::calcCodingPotential(GenomeMapper& gmap, KmerMarkovModel& exon, KmerMarkovModel& intron) const { int len = 0; const char* ref = this->intron->ref.name.c_str(); + const bool neg = getConsensusStrand() == Strand::NEGATIVE; - string left_exon = gmap.fetchBases(ref, this->intron->start - 80, this->intron->start, &len); - if (this->getConsensusStrand() == Strand::NEGATIVE) { + string left_exon = gmap.fetchBases(ref, this->intron->start - 82, this->intron->start-2, &len); + if (neg) { left_exon = SeqUtils::reverseComplement(left_exon); - } - - string left_intron = gmap.fetchBases(ref, this->intron->start, this->intron->start+81, &len); - if (this->getConsensusStrand() == Strand::NEGATIVE) { + } + + string left_intron = gmap.fetchBases(ref, this->intron->start, this->intron->start+80, &len); + if (neg) { left_intron = SeqUtils::reverseComplement(left_intron); - } + } - string right_intron = gmap.fetchBases(ref, this->intron->end-80, this->intron->end+1, &len); - if (this->getConsensusStrand() == Strand::NEGATIVE) { + string right_intron = gmap.fetchBases(ref, this->intron->end-80, this->intron->end, &len); + if (neg) { right_intron = SeqUtils::reverseComplement(right_intron); } - string right_exon = gmap.fetchBases(ref, this->intron->end + 1, 
this->intron->end + 80, &len); - if (this->getConsensusStrand() == Strand::NEGATIVE) { + string right_exon = gmap.fetchBases(ref, this->intron->end + 1, this->intron->end + 81, &len); + if (neg) { right_exon = SeqUtils::reverseComplement(right_exon); - } - - double score = exon.getScore(left_intron) + intron.getScore(left_exon) + } + /* + cout << "Left exon : " << this->consensusStrand << " : " << left_exon << endl; + cout << "Left intron : " << this->consensusStrand << " : " << left_intron << endl; + cout << "Right intron: " << this->consensusStrand << " : " << right_intron << endl; + cout << "Right exon : " << this->consensusStrand << " : " << right_exon << endl; + */ + double score =( exon.getScore(left_intron) + intron.getScore(left_exon) ) - ( intron.getScore(left_intron) + exon.getScore(left_exon) ) + ( intron.getScore(right_exon) + exon.getScore(right_intron) ) - ( exon.getScore(right_exon) + intron.getScore(right_intron) ); - //cout << score << endl; - return score; } + +double portcullis::Junction::calcPositionWeightScore(GenomeMapper& gmap, PosMarkovModel& donor, PosMarkovModel& acceptor) const { + + int len = 0; + const char* ref = this->intron->ref.name.c_str(); + const bool neg = getConsensusStrand() == Strand::NEGATIVE; + + string left = gmap.fetchBases(ref, intron->start - 3, intron->start + 20, &len); + if (neg) { + left = SeqUtils::reverseComplement(left); + } + + string right = gmap.fetchBases(ref, intron->start, intron->end, &len); + if (neg) { + right = SeqUtils::reverseComplement(right); + } + + string donorseq = neg ? right : left; + string acceptorseq = neg ? 
left : right; + + return donor.getScore(donorseq) + acceptor.getScore(acceptorseq); +} diff --git a/lib/src/markov_model.cc b/lib/src/markov_model.cc index 50d061db..18cd2d92 100644 --- a/lib/src/markov_model.cc +++ b/lib/src/markov_model.cc @@ -22,15 +22,18 @@ using std::endl; #include #include +#include +using portcullis::SeqUtils; + #include -void portcullis::MarkovModel::train(const vector& input, const uint32_t _order) { +void portcullis::KmerMarkovModel::train(const vector& input, const uint32_t _order) { order = _order; - MMU temp; + KMMU temp; for(auto& seq : input) { - string sequp = boost::to_upper_copy(seq); - for(uint16_t i = order; i < sequp.size(); i++) { - temp[sequp.substr(i-order, order)][sequp.substr(i, 1)]++; + string s = SeqUtils::makeClean(seq); + for(uint16_t i = order; i < s.size(); i++) { + temp[s.substr(i-order, order)][s.substr(i, 1)]++; } } model.clear(); @@ -47,14 +50,50 @@ void portcullis::MarkovModel::train(const vector& input, const uint32_t } -double portcullis::MarkovModel::getScore(const string& seq) { - string sequp = boost::to_upper_copy(seq); +double portcullis::KmerMarkovModel::getScore(const string& seq) { + string s = SeqUtils::makeClean(seq); double score = 1.0; - for(uint16_t i = order; i < sequp.size(); i++){ - score *= model[sequp.substr(i-order, order)][sequp.substr(i, 1)]; + for(uint16_t i = order; i < s.size(); i++){ + score *= model[s.substr(i-order, order)][s.substr(i, 1)]; } if(score == 0.0) { return -100.0; } return log(score); } + +void portcullis::PosMarkovModel::train(const vector& input, const uint32_t _order) { + order = _order; + PMMU temp; + for(auto& seq : input) { + string s = SeqUtils::makeClean(seq); + for(uint16_t i = order; i < s.size(); i++) { + temp[i][s.substr(i, 1)]++; + } + } + model.clear(); + for(auto& i : temp) { + double sum = 0; + for(auto& j : i.second) { + sum += j.second; + } + for(auto& j : i.second) { + //cout << i.first << " " << j.first << " " << j.second << endl; + 
model[i.first][j.first] = j.second / sum; + } + } +} + + +double portcullis::PosMarkovModel::getScore(const string& seq) { + string s = SeqUtils::makeClean(seq); + double score = 1.0; + for(uint16_t i = order; i < s.size(); i++){ + score *= model[i][s.substr(i, 1)]; + } + if(score == 0.0) { + return -300.0; + } + return log(score); +} + diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index e6adfc9d..f00bee43 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -15,11 +15,13 @@ // along with Portcullis. If not, see . // ******************************************************************* +#include #include #include using std::cout; using std::cerr; using std::endl; +using std::ofstream; using std::make_shared; #include @@ -64,26 +66,32 @@ void portcullis::ModelFeatures::trainCodingPotentialModel(const JunctionList& in int len = 0; - string left_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start - 80, j->getIntron()->start, &len); + string left_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start - 202, j->getIntron()->start - 2, &len); if (j->getConsensusStrand() == Strand::NEGATIVE) { left_exon = SeqUtils::reverseComplement(left_exon); } exons.push_back(left_exon); - string left_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->start+81, &len); + /*string left_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->start+80, &len); if (j->getConsensusStrand() == Strand::NEGATIVE) { left_intron = SeqUtils::reverseComplement(left_intron); } introns.push_back(left_intron); - string right_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end-80, j->getIntron()->end+1, &len); + string right_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end-80, j->getIntron()->end, &len); if (j->getConsensusStrand() == Strand::NEGATIVE) { right_intron = 
SeqUtils::reverseComplement(right_intron); } - introns.push_back(right_intron); + introns.push_back(right_intron);*/ + + string intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->end, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + intron = SeqUtils::reverseComplement(intron); + } + introns.push_back(intron); - string right_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end + 1, j->getIntron()->end + 80, &len); + string right_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end + 1, j->getIntron()->end + 201, &len); if (j->getConsensusStrand() == Strand::NEGATIVE) { right_exon = SeqUtils::reverseComplement(right_exon); } @@ -94,44 +102,73 @@ void portcullis::ModelFeatures::trainCodingPotentialModel(const JunctionList& in intronModel.train(introns, 5); } +void portcullis::ModelFeatures::trainPositionWeightModel(const JunctionList& in) { + + vector donors; + vector acceptors; + for(auto& j : in) { + + int len = 0; + + string left = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start - 3, j->getIntron()->start + 20, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + left = SeqUtils::reverseComplement(left); + } + + string right = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->end, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + right = SeqUtils::reverseComplement(right); + } + + if (j->getConsensusStrand() == Strand::NEGATIVE) { + donors.push_back(right); + acceptors.push_back(left); + } + else { + donors.push_back(left); + acceptors.push_back(right); + } + + } + + donorPWModel.train(donors, 1); + acceptorPWModel.train(acceptors, 1); +} + Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { + const size_t JO_OFFSET = 10; vector headers; headers.reserve( VAR_NAMES.size() + JO_NAMES.size() ); headers.insert( headers.end(), VAR_NAMES.begin(), 
VAR_NAMES.end() ); - headers.insert( headers.end(), JO_NAMES.begin()+14, JO_NAMES.end() ); + headers.insert( headers.end(), JO_NAMES.begin()+JO_OFFSET-1, JO_NAMES.end() ); // Convert junction list info to double* double* d = new double[x.size() * headers.size()]; uint32_t row = 0; for (const auto& j : x) { - + d[0 * x.size() + row] = j->isGenuine(); + // //d[0 * x.size() + row] = j->getNbJunctionAlignments(); //d[0 * x.size() + row] = j->getNbDistinctAlignments(); - d[0 * x.size() + row] = j->getNbReliableAlignments(); + d[1 * x.size() + row] = j->getNbReliableAlignments(); //d[3 * x.size() + row] = j->getMaxMinAnchor(); //d[4 * x.size() + row] = j->getDiffAnchor(); //d[5 * x.size() + row] = j->getNbDistinctAnchors(); - d[1 * x.size() + row] = j->getEntropy(); - d[2 * x.size() + row] = j->getMaxMMES(); - d[3 * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); + d[2 * x.size() + row] = j->getEntropy(); + d[3 * x.size() + row] = j->getMaxMMES(); + d[4 * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); //d[3 * x.size() + row] = ; - d[4 * x.size() + row] = j->getReliable2RawRatio(); + d[5 * x.size() + row] = j->getReliable2RawRatio(); //d[5 * x.size() + row] = j->getMeanMismatches(); - d[5 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score - d[6 * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); // Coding potential score - d[7 * x.size() + row] = j->isGenuine(); - // Junction overhang values at each position are first converted into deviation from expected distributions - double half_read_length = (double)j->getMeanQueryLength() / 2.0; - for(size_t i = 14; i < JO_NAMES.size(); i++) { - double Ni = j->getJunctionOverhangs(i); // Actual count at this position - if (Ni == 0.0) Ni = 0.000000000001; // Ensure some value > 0 here otherwise we get -infinity later. 
- double Pi = 1.0 - ((double)i / half_read_length); // Likely scale at this position - double Ei = (double)j->getNbJunctionAlignments() * Pi; // Expected count at this position - double Xi = abs(log2(Ni / Ei)); // Log deviation - - d[(i-14 + 8) * x.size() + row] = Xi; + d[6 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score + //d[7 * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); + d[7 * x.size() + row] = isPWModelEmpty() ? 0.0 : j->calcPositionWeightScore(gmap, donorPWModel, acceptorPWModel); + + //Junction overhang values at each position are first converted into deviation from expected distributions + for(size_t i = JO_OFFSET; i <= JO_NAMES.size(); i++) { + d[(i-JO_OFFSET + 8) * x.size() + row] = j->getJunctionOverhangLogDeviation(i-1); } row++; @@ -150,6 +187,18 @@ portcullis::ForestPtr portcullis::ModelFeatures::trainInstance(const JunctionLis cout << "Creating feature vector" << endl; Data* trainingData = juncs2FeatureVectors(x); + path feature_file = outputPrefix + ".features"; + cout << "Saving feature vector to disk: " << feature_file << endl; + + ofstream fout(feature_file.c_str(), std::ofstream::out); + + fout << Intron::locationOutputHeader() << "\t" << trainingData->getHeader() << endl; + for(size_t i = 0; i < x.size(); i++) { + fout << *(x[i]->getIntron()) << "\t" << trainingData->getRow(i) << endl; + } + + fout.close(); + cout << "Initialising random forest" << endl; ForestPtr f = nullptr; if (regressionMode) { diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 7b5eaa35..232b807e 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -59,14 +59,12 @@ namespace po = boost::program_options; #include #include #include -#include using portcullis::PortcullisFS; using portcullis::Intron; using portcullis::IntronHasher; using portcullis::Performance; using portcullis::eval; using portcullis::bam::GenomeMapper; -using 
portcullis::MarkovModel; #include "junction_filter.hpp" @@ -220,19 +218,35 @@ void portcullis::JunctionFilter::filter() { JunctionSystem isp(passJuncs); isp.saveAll(output.string() + ".selftrain.initialset.pos", "portcullis_isp"); + // Analyse positive set to get L0.05 of intron size + cout << "Length of intron at 95th percentile of positive set (L95): " << mf.calcIntronThreshold(passJuncs) << endl << endl; + + const uint32_t poslongintronthreshold = mf.L95 * 2; + JunctionList pass2juncs; for(auto& j : passJuncs) { + if (j->getIntronSize() <= poslongintronthreshold) { + pass2juncs.push_back(j); + } + else { + failJuncs.push_back(j); + } + } + cout << "Removed " << (passJuncs.size() - pass2juncs.size()) << " junctions from positive set with introns > L95 x 2 = " << poslongintronthreshold << endl << endl; + + + for(auto& j : pass2juncs) { JunctionPtr copy = make_shared(*j); copy->setGenuine(true); pos.push_back(copy); } if (!genuineFile.empty()) { - shared_ptr p = calcPerformance(passJuncs, failJuncs); + shared_ptr p = calcPerformance(pass2juncs, failJuncs); cout << "Performance of initial positive set (High PPV is important):" << endl; cout << Performance::longHeader() << endl; cout << p->toLongString() << endl << endl; JunctionSystem invalidPos; - for(auto& j : passJuncs) { + for(auto& j : pass2juncs) { if (!j->isGenuine()) { invalidPos.addJunction(j); } @@ -251,16 +265,17 @@ void portcullis::JunctionFilter::filter() { missedPos.saveAll(output.string() + ".selftrain.initialset.missedpos", "portcullis_missed_isp"); } - - // Analyse positive set to get L0.05 of intron size - cout << "Length of intron at 95th percentile of positive set: " << mf.calcIntronThreshold(passJuncs) << endl << endl; - cout << "Training coding potential markov models on positive set ..."; cout.flush(); mf.trainCodingPotentialModel(pos); cout << " done." << endl; cout << "Exon model contains " << mf.exonModel.size() << " 5-mers." 
<< endl; - cout << "Intron model contains " << mf.intronModel.size() << " 5-mers." << endl; + cout << "Intron model contains " << mf.intronModel.size() << " 5-mers." << endl << endl; + + cout << "Training position weight markov models on positive set ..."; + cout.flush(); + mf.trainPositionWeightModel(pos); + cout << " done." << endl; passJuncs.clear(); failJuncs.clear(); @@ -268,14 +283,14 @@ void portcullis::JunctionFilter::filter() { doRuleBasedFiltering(this->getIntitalNegRulesFile(), currentJuncs, passJuncs, failJuncs, "Creating initial negative set for training", resultMap); - const uint32_t longintronthreshold = mf.L95 * 6; - cout << endl << "Adding junctions to negative set with MaxMMES < 10 and excessively long intron size of positive set (> L95 x 6 = " << longintronthreshold << ")..."; + const uint32_t neglongintronthreshold = mf.L95 * 6; + cout << endl << "Adding junctions to negative set with MaxMMES < 10 and excessively long intron size of positive set (> L95 x 6 = " << neglongintronthreshold << ")..."; cout.flush(); uint32_t nblongintrons = 0; JunctionList fail2Juncs; for(auto& j : failJuncs) { - if (j->getIntronSize() > longintronthreshold && j->getMaxMMES() < 10 ) { + if (j->getIntronSize() > neglongintronthreshold && j->getMaxMMES() < 10 ) { passJuncs.push_back(j); nblongintrons++; } @@ -332,8 +347,11 @@ void portcullis::JunctionFilter::filter() { training.insert(training.end(), pos.begin(), pos.end()); training.insert(training.end(), neg.begin(), neg.end()); + JunctionSystem trainingSystem(training); + trainingSystem.sort(); + cout << "Training a random forest model using the initial positive and negative datasets" << endl; - shared_ptr forest = mf.trainInstance(training, output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); + shared_ptr forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); forest->saveToFile(); forest->writeOutput(&cout); @@ 
-341,9 +359,7 @@ void portcullis::JunctionFilter::filter() { modelFile = output.string() + ".selftrain.forest"; cout << endl; } - - - + // Manage a junction system of all discarded junctions JunctionSystem discardedJuncs; @@ -497,9 +513,6 @@ void portcullis::JunctionFilter::filter() { } - - - void portcullis::JunctionFilter::printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix) { // Output stats size_t diff = in.size() - pass.size(); @@ -626,9 +639,11 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction path scorepath = output.string() + ".scores"; ofstream scoreStream(scorepath.c_str()); + scoreStream << "Score\t" << Intron::locationOutputHeader() << "\t" << testingData->getHeader() << endl; + for(size_t i = 0; i < all.size(); i++) { - scoreStream << f->getPredictions()[i][0] << endl; + scoreStream << f->getPredictions()[i][0] << "\t" << *(all[i]->getIntron()) << "\t" << testingData->getRow(i) << endl; if (f->getPredictions()[i][0] >= 0.1) { pass.push_back(all[i]); } diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index 760bf45c..dd970caf 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -52,7 +52,6 @@ using boost::filesystem::symbolic_link_exists; #include #include #include -#include #include using portcullis::bam::GenomeMapper; using portcullis::PortcullisFS; @@ -60,7 +59,6 @@ using portcullis::Intron; using portcullis::IntronHasher; using portcullis::Performance; using portcullis::JuncResultMap; -using portcullis::MarkovModel; using portcullis::ModelFeatures; From c95babb67efe942ea5a6a71f69dbff578e33fb5a Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Tue, 26 Apr 2016 17:16:57 +0100 Subject: [PATCH 09/15] Implemented ability to control filtering threshold value as a hidden command line argument. 
--- lib/include/portcullis/markov_model.hpp | 2 +- lib/include/portcullis/model_features.hpp | 7 +++---- lib/src/model_features.cc | 23 +++++++++++------------ src/junction_filter.cc | 7 ++++++- src/junction_filter.hpp | 10 ++++++++++ 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/lib/include/portcullis/markov_model.hpp b/lib/include/portcullis/markov_model.hpp index bcc4744e..613fb177 100644 --- a/lib/include/portcullis/markov_model.hpp +++ b/lib/include/portcullis/markov_model.hpp @@ -101,7 +101,7 @@ class PosMarkovModel : public portcullis::MarkovModel { double getScore(const string& seq); size_t size() const { return model.size(); - } + } }; diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/model_features.hpp index 28393f72..6451530f 100644 --- a/lib/include/portcullis/model_features.hpp +++ b/lib/include/portcullis/model_features.hpp @@ -41,7 +41,7 @@ typedef shared_ptr ForestPtr; const vector VAR_NAMES = { "Genuine", //"M2-nb-reads", - //"M3-nb_dist_aln", + "M3-nb_dist_aln", "nb_rel_aln", //"M8-max_min_anc", //"M9-dif_anc", @@ -49,11 +49,10 @@ const vector VAR_NAMES = { "entropy", "maxmmes", "min_hamming_score", - //"M14-hamming3p", "rel2raw_ratio", - //"mean_mismatches", + "mean_mismatches", "IntronScore", - //"CodingPotential" + "CodingPotential", "PWS" }; diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index f00bee43..30672ee2 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -151,24 +151,23 @@ Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { d[0 * x.size() + row] = j->isGenuine(); // //d[0 * x.size() + row] = j->getNbJunctionAlignments(); - //d[0 * x.size() + row] = j->getNbDistinctAlignments(); - d[1 * x.size() + row] = j->getNbReliableAlignments(); + d[1 * x.size() + row] = j->getNbDistinctAlignments(); + d[2 * x.size() + row] = j->getNbReliableAlignments(); //d[3 * x.size() + row] = j->getMaxMinAnchor(); //d[4 * x.size() + row] = 
j->getDiffAnchor(); //d[5 * x.size() + row] = j->getNbDistinctAnchors(); - d[2 * x.size() + row] = j->getEntropy(); - d[3 * x.size() + row] = j->getMaxMMES(); - d[4 * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); - //d[3 * x.size() + row] = ; - d[5 * x.size() + row] = j->getReliable2RawRatio(); - //d[5 * x.size() + row] = j->getMeanMismatches(); - d[6 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score - //d[7 * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); - d[7 * x.size() + row] = isPWModelEmpty() ? 0.0 : j->calcPositionWeightScore(gmap, donorPWModel, acceptorPWModel); + d[3 * x.size() + row] = j->getEntropy(); + d[4 * x.size() + row] = j->getMaxMMES(); + d[5 * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); + d[6 * x.size() + row] = j->getReliable2RawRatio(); + d[7 * x.size() + row] = j->getMeanMismatches(); + d[8 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score + d[9 * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); + d[10 * x.size() + row] = isPWModelEmpty() ? 
0.0 : j->calcPositionWeightScore(gmap, donorPWModel, acceptorPWModel); //Junction overhang values at each position are first converted into deviation from expected distributions for(size_t i = JO_OFFSET; i <= JO_NAMES.size(); i++) { - d[(i-JO_OFFSET + 8) * x.size() + row] = j->getJunctionOverhangLogDeviation(i-1); + d[(i-JO_OFFSET + 11) * x.size() + row] = j->getJunctionOverhangLogDeviation(i-1); } row++; diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 232b807e..a25f045e 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -85,6 +85,7 @@ portcullis::JunctionFilter::JunctionFilter( const path& _junctionFile, filterNovel = false; source = DEFAULT_FILTER_SOURCE; verbose = false; + threshold = DEFAULT_FILTER_THRESHOLD; } @@ -644,7 +645,7 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction for(size_t i = 0; i < all.size(); i++) { scoreStream << f->getPredictions()[i][0] << "\t" << *(all[i]->getIntron()) << "\t" << testingData->getRow(i) << endl; - if (f->getPredictions()[i][0] >= 0.1) { + if (f->getPredictions()[i][0] >= this->threshold) { pass.push_back(all[i]); } else { @@ -675,6 +676,7 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { int32_t max_length; string canonical; string source; + double threshold; bool verbose; bool help; @@ -718,6 +720,8 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { hidden_options.add_options() ("genome-file", po::value(&genomeFile), "Path to the genome file to process.") ("junction-file", po::value(&junctionFile), "Path to the junction file to process.") + ("threshold", po::value(&threshold)->default_value(DEFAULT_FILTER_THRESHOLD), + "The threshold score at which we determine a junction to be genuine or not.") ; // Positional option for the input bam file @@ -771,6 +775,7 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { } filter.setReferenceFile(referenceFile); filter.setGenomeFile(genomeFile); + 
filter.setThreshold(threshold); filter.filter(); diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index dd970caf..ce568a66 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -76,6 +76,7 @@ const string ST_IPOS_RULES_FILE = "selftrain_initial_pos.json"; const string ST_INEG_RULES_FILE = "selftrain_initial_neg.json"; const uint16_t DEFAULT_FILTER_THREADS = 1; const uint16_t DEFAULT_SELFTRAIN_TREES = 100; +const double DEFAULT_FILTER_THRESHOLD = 0.1; class JunctionFilter { @@ -97,6 +98,7 @@ class JunctionFilter { bool filterSemi; bool filterNovel; string source; + double threshold; bool verbose; @@ -139,6 +141,14 @@ class JunctionFilter { this->genomeFile = genomeFile; } + double getThreshold() const { + return threshold; + } + + void setThreshold(double threshold) { + this->threshold = threshold; + } + path getOutput() const { return output; From 6a2c46907da679592c42e038869c8a4dd92dc21d Mon Sep 17 00:00:00 2001 From: Dan Date: Tue, 26 Apr 2016 22:28:10 +0100 Subject: [PATCH 10/15] Added splicing signal feature and cleaned up some bugs. 
--- lib/include/portcullis/junction.hpp | 9 +++- lib/include/portcullis/model_features.hpp | 13 ++++-- lib/src/junction.cc | 21 ++++++--- lib/src/markov_model.cc | 6 ++- lib/src/model_features.cc | 47 ++++++++++++++++--- src/junction_filter.cc | 55 +++++++++-------------- 6 files changed, 97 insertions(+), 54 deletions(-) diff --git a/lib/include/portcullis/junction.hpp b/lib/include/portcullis/junction.hpp index 4bd64869..b18c3d2a 100644 --- a/lib/include/portcullis/junction.hpp +++ b/lib/include/portcullis/junction.hpp @@ -180,7 +180,10 @@ static string cssToString(CanonicalSS css) { return string("No"); } - +struct SplicingScores { + double positionWeighting = 0.0; + double splicingSignal = 0.0; +}; struct AlignmentInfo { BamAlignmentPtr ba; @@ -882,7 +885,9 @@ class Junction { double calcCodingPotential(GenomeMapper& gmap, KmerMarkovModel& exon, KmerMarkovModel& intron) const; - double calcPositionWeightScore(GenomeMapper& gmap, PosMarkovModel& donor, PosMarkovModel& acceptor) const; + SplicingScores calcSplicingScores(GenomeMapper& gmap, KmerMarkovModel& donorT, KmerMarkovModel& donorF, + KmerMarkovModel& acceptorT, KmerMarkovModel& acceptorF, + PosMarkovModel& donorP, PosMarkovModel& acceptorP) const; // **** Output methods **** diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/model_features.hpp index 6451530f..7d35f3ca 100644 --- a/lib/include/portcullis/model_features.hpp +++ b/lib/include/portcullis/model_features.hpp @@ -31,7 +31,10 @@ using boost::filesystem::path; #include using portcullis::bam::GenomeMapper; using portcullis::MarkovModel; +using portcullis::Junction; +using portcullis::JunctionPtr; using portcullis::JunctionList; +using portcullis::SplicingScores; namespace portcullis { @@ -53,15 +56,19 @@ const vector VAR_NAMES = { "mean_mismatches", "IntronScore", "CodingPotential", - "PWS" + "PWS", + "SS" }; - class ModelFeatures { public: uint32_t L95; KmerMarkovModel exonModel; KmerMarkovModel intronModel; + 
KmerMarkovModel donorTModel; + KmerMarkovModel donorFModel; + KmerMarkovModel acceptorTModel; + KmerMarkovModel acceptorFModel; PosMarkovModel donorPWModel; PosMarkovModel acceptorPWModel; GenomeMapper gmap; @@ -83,7 +90,7 @@ class ModelFeatures { void trainCodingPotentialModel(const JunctionList& in); - void trainPositionWeightModel(const JunctionList& in); + void trainSplicingModels(const JunctionList& pass, const JunctionList& fail); Data* juncs2FeatureVectors(const JunctionList& x); diff --git a/lib/src/junction.cc b/lib/src/junction.cc index e2237ac9..62f1db20 100644 --- a/lib/src/junction.cc +++ b/lib/src/junction.cc @@ -1254,15 +1254,18 @@ double portcullis::Junction::calcCodingPotential(GenomeMapper& gmap, KmerMarkovM cout << "Right intron: " << this->consensusStrand << " : " << right_intron << endl; cout << "Right exon : " << this->consensusStrand << " : " << right_exon << endl; */ - double score =( exon.getScore(left_intron) + intron.getScore(left_exon) ) - - ( intron.getScore(left_intron) + exon.getScore(left_exon) ) - + ( intron.getScore(right_exon) + exon.getScore(right_intron) ) - - ( exon.getScore(right_exon) + intron.getScore(right_intron) ); + double score =( exon.getScore(left_exon) - intron.getScore(left_exon) ) + + ( intron.getScore(left_intron) - exon.getScore(left_intron) ) + + ( intron.getScore(right_intron) - exon.getScore(right_intron) ) + + ( exon.getScore(right_exon) + intron.getScore(right_exon) ); return score; } -double portcullis::Junction::calcPositionWeightScore(GenomeMapper& gmap, PosMarkovModel& donor, PosMarkovModel& acceptor) const { + +portcullis::SplicingScores portcullis::Junction::calcSplicingScores(GenomeMapper& gmap, KmerMarkovModel& donorT, KmerMarkovModel& donorF, + KmerMarkovModel& acceptorT, KmerMarkovModel& acceptorF, + PosMarkovModel& donorP, PosMarkovModel& acceptorP) const { int len = 0; const char* ref = this->intron->ref.name.c_str(); @@ -1273,7 +1276,7 @@ double 
portcullis::Junction::calcPositionWeightScore(GenomeMapper& gmap, PosMark left = SeqUtils::reverseComplement(left); } - string right = gmap.fetchBases(ref, intron->start, intron->end, &len); + string right = gmap.fetchBases(ref, intron->end - 20, intron->end + 2, &len); if (neg) { right = SeqUtils::reverseComplement(right); } @@ -1281,5 +1284,9 @@ double portcullis::Junction::calcPositionWeightScore(GenomeMapper& gmap, PosMark string donorseq = neg ? right : left; string acceptorseq = neg ? left : right; - return donor.getScore(donorseq) + acceptor.getScore(acceptorseq); + SplicingScores ss; + ss.positionWeighting = donorP.getScore(donorseq) + acceptorP.getScore(acceptorseq); + ss.splicingSignal = (donorT.getScore(donorseq) - donorF.getScore(donorseq)) + + (acceptorT.getScore(acceptorseq) - acceptorF.getScore(acceptorseq)); + return ss; } diff --git a/lib/src/markov_model.cc b/lib/src/markov_model.cc index 18cd2d92..2e7828d7 100644 --- a/lib/src/markov_model.cc +++ b/lib/src/markov_model.cc @@ -32,8 +32,10 @@ void portcullis::KmerMarkovModel::train(const vector& input, const uint3 KMMU temp; for(auto& seq : input) { string s = SeqUtils::makeClean(seq); - for(uint16_t i = order; i < s.size(); i++) { - temp[s.substr(i-order, order)][s.substr(i, 1)]++; + if (s.size() > order+1) { + for(uint16_t i = order; i < s.size(); i++) { + temp[s.substr(i-order, order)][s.substr(i, 1)]++; + } } } model.clear(); diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 30672ee2..11f8af77 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -102,11 +102,11 @@ void portcullis::ModelFeatures::trainCodingPotentialModel(const JunctionList& in intronModel.train(introns, 5); } -void portcullis::ModelFeatures::trainPositionWeightModel(const JunctionList& in) { +void portcullis::ModelFeatures::trainSplicingModels(const JunctionList& pass, const JunctionList& fail) { vector donors; vector acceptors; - for(auto& j : in) { + for(auto& j : pass) { int len = 
0; @@ -115,7 +115,7 @@ void portcullis::ModelFeatures::trainPositionWeightModel(const JunctionList& in) left = SeqUtils::reverseComplement(left); } - string right = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->end, &len); + string right = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end - 20, j->getIntron()->end + 2, &len); if (j->getConsensusStrand() == Strand::NEGATIVE) { right = SeqUtils::reverseComplement(right); } @@ -133,6 +133,38 @@ void portcullis::ModelFeatures::trainPositionWeightModel(const JunctionList& in) donorPWModel.train(donors, 1); acceptorPWModel.train(acceptors, 1); + donorTModel.train(donors, 5); + acceptorTModel.train(acceptors, 5); + + donors.clear(); + acceptors.clear(); + + for(auto& j : fail) { + + int len = 0; + + string left = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start - 3, j->getIntron()->start + 20, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + left = SeqUtils::reverseComplement(left); + } + + string right = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end - 20, j->getIntron()->end + 2, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + right = SeqUtils::reverseComplement(right); + } + + if (j->getConsensusStrand() == Strand::NEGATIVE) { + donors.push_back(right); + acceptors.push_back(left); + } + else { + donors.push_back(left); + acceptors.push_back(right); + } + } + + donorFModel.train(donors, 5); + acceptorFModel.train(acceptors, 5); } Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { @@ -148,6 +180,9 @@ Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { uint32_t row = 0; for (const auto& j : x) { + SplicingScores ss = j->calcSplicingScores(gmap, donorTModel, donorFModel, acceptorTModel, acceptorFModel, + donorPWModel, acceptorPWModel); + d[0 * x.size() + row] = j->isGenuine(); // //d[0 * x.size() + row] = j->getNbJunctionAlignments(); 
@@ -163,11 +198,13 @@ Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { d[7 * x.size() + row] = j->getMeanMismatches(); d[8 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score d[9 * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); - d[10 * x.size() + row] = isPWModelEmpty() ? 0.0 : j->calcPositionWeightScore(gmap, donorPWModel, acceptorPWModel); + d[10 * x.size() + row] = isPWModelEmpty() ? 0.0 : ss.positionWeighting; + d[11 * x.size() + row] = isPWModelEmpty() ? 0.0 : ss.splicingSignal; + //Junction overhang values at each position are first converted into deviation from expected distributions for(size_t i = JO_OFFSET; i <= JO_NAMES.size(); i++) { - d[(i-JO_OFFSET + 11) * x.size() + row] = j->getJunctionOverhangLogDeviation(i-1); + d[(i-JO_OFFSET + 12) * x.size() + row] = j->getJunctionOverhangLogDeviation(i-1); } row++; diff --git a/src/junction_filter.cc b/src/junction_filter.cc index a25f045e..e0848f6d 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -266,18 +266,6 @@ void portcullis::JunctionFilter::filter() { missedPos.saveAll(output.string() + ".selftrain.initialset.missedpos", "portcullis_missed_isp"); } - cout << "Training coding potential markov models on positive set ..."; - cout.flush(); - mf.trainCodingPotentialModel(pos); - cout << " done." << endl; - cout << "Exon model contains " << mf.exonModel.size() << " 5-mers." << endl; - cout << "Intron model contains " << mf.intronModel.size() << " 5-mers." << endl << endl; - - cout << "Training position weight markov models on positive set ..."; - cout.flush(); - mf.trainPositionWeightModel(pos); - cout << " done." << endl; - passJuncs.clear(); failJuncs.clear(); resultMap.clear(); @@ -342,6 +330,14 @@ void portcullis::JunctionFilter::filter() { cout << "Initial training set consists of " << pos.size() << " positive and " << neg.size() << " negative junctions." 
<< endl << endl; + cout << "Training markov models ..."; + cout.flush(); + mf.trainCodingPotentialModel(pos); + mf.trainSplicingModels(pos, neg); + cout << " done." << endl << endl; + + + // Build the training set by combining the positive and negative sets JunctionList training; training.reserve(pos.size() + neg.size()); @@ -351,7 +347,8 @@ void portcullis::JunctionFilter::filter() { JunctionSystem trainingSystem(training); trainingSystem.sort(); - cout << "Training a random forest model using the initial positive and negative datasets" << endl; + cout << "Training a random forest model" << endl + << "------------------------------" << endl << endl; shared_ptr forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); forest->saveToFile(); @@ -366,7 +363,8 @@ void portcullis::JunctionFilter::filter() { // Do ML based filtering if requested if(!modelFile.empty() && exists(modelFile)){ - cout << "Predicting valid junctions using random forest model" << endl << endl; + cout << "Predicting valid junctions using random forest model" << endl + << "----------------------------------------------------" << endl << endl; JunctionList passJuncs; JunctionList failJuncs; @@ -577,17 +575,11 @@ void portcullis::JunctionFilter::doRuleBasedFiltering(const path& ruleFile, cons void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, ModelFeatures& mf) { - if (verbose) { - cerr << endl << "Preparing junction metrics into matrix" << endl; - } + cout << "Creating feature vector" << endl; Data* testingData = mf.juncs2FeatureVectors(all); - // Load forest from disk - - if (verbose) { - cerr << "Initialising random forest" << endl; - } + cout << "Initialising random forest" << endl; shared_ptr f = nullptr; if (train) { @@ -621,30 +613,23 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction f->setVerboseOut(&cerr); - if 
(verbose) { - cerr << "Loading forest model from disk" << endl; - } - // Load trees from saved model f->loadFromFile(modelFile.string()); - if (verbose) { - cerr << "Making predictions" << endl; - } + cout << "Making predictions" << endl; f->run(verbose); - if (verbose) { - cerr << "Separating original junction data into pass and fail categories." << endl; - } - path scorepath = output.string() + ".scores"; ofstream scoreStream(scorepath.c_str()); - scoreStream << "Score\t" << Intron::locationOutputHeader() << "\t" << testingData->getHeader() << endl; + scoreStream << "Score\t" << Intron::locationOutputHeader() << "\tStrand\tSS\t" << testingData->getHeader() << endl; for(size_t i = 0; i < all.size(); i++) { - scoreStream << f->getPredictions()[i][0] << "\t" << *(all[i]->getIntron()) << "\t" << testingData->getRow(i) << endl; + scoreStream << f->getPredictions()[i][0] << "\t" << *(all[i]->getIntron()) + << "\t" << strandToChar(all[i]->getConsensusStrand()) + << "\t" << cssToChar(all[i]->getSpliceSiteType()) + << "\t" << testingData->getRow(i) << endl; if (f->getPredictions()[i][0] >= this->threshold) { pass.push_back(all[i]); } From 1853cc2e9f8a3745773a71e701f65c5387b1d5a3 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Thu, 28 Apr 2016 18:10:48 +0100 Subject: [PATCH 11/15] Tidying up filtering code and trying to tweak performance. 
--- Makefile.am | 10 +- configure.ac | 1 + data/selftrain_initial_neg.layer1.json | 93 ++++ data/selftrain_initial_neg.layer2.json | 93 ++++ data/selftrain_initial_neg.layer3.json | 93 ++++ data/selftrain_initial_neg.layer4.json | 93 ++++ data/selftrain_initial_neg.layer5.json | 93 ++++ ...json => selftrain_initial_neg.layer6.json} | 20 +- ...json => selftrain_initial_pos.layer1.json} | 18 +- data/selftrain_initial_pos.layer2.json | 69 +++ lib/include/portcullis/model_features.hpp | 66 ++- lib/src/junction.cc | 2 +- lib/src/model_features.cc | 77 ++-- src/junction_filter.cc | 397 ++++++++++++------ src/junction_filter.hpp | 20 +- 15 files changed, 961 insertions(+), 184 deletions(-) create mode 100644 data/selftrain_initial_neg.layer1.json create mode 100644 data/selftrain_initial_neg.layer2.json create mode 100644 data/selftrain_initial_neg.layer3.json create mode 100644 data/selftrain_initial_neg.layer4.json create mode 100644 data/selftrain_initial_neg.layer5.json rename data/{selftrain_initial_neg.json => selftrain_initial_neg.layer6.json} (77%) rename data/{selftrain_initial_pos.json => selftrain_initial_pos.layer1.json} (79%) create mode 100644 data/selftrain_initial_pos.layer2.json diff --git a/Makefile.am b/Makefile.am index e78793b8..0d39c81c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -56,8 +56,14 @@ dist_bin_SCRIPTS = scripts/analyse_alignments.py \ configdir = $(datadir)/portcullis dist_config_DATA = \ data/default_filter.json \ - data/selftrain_initial_neg.json \ - data/selftrain_initial_pos.json + data/selftrain_initial_neg.layer1.json \ + data/selftrain_initial_neg.layer2.json \ + data/selftrain_initial_neg.layer3.json \ + data/selftrain_initial_neg.layer4.json \ + data/selftrain_initial_neg.layer5.json \ + data/selftrain_initial_neg.layer6.json \ + data/selftrain_initial_pos.layer1.json \ + data/selftrain_initial_pos.layer2.json # SRC DIRS make_dirs=deps/htslib-1.3 deps/ranger-0.3.8 lib src tests diff --git a/configure.ac b/configure.ac index 
346afb02..6996e2cd 100644 --- a/configure.ac +++ b/configure.ac @@ -143,6 +143,7 @@ AX_BOOST_CHRONO AX_BOOST_PROGRAM_OPTIONS AX_BOOST_TIMER + # Combine BOOST variables BOOST_LIBS="${BOOST_TIMER_STATIC_LIB} ${BOOST_CHRONO_STATIC_LIB} ${BOOST_FILESYSTEM_STATIC_LIB} ${BOOST_PROGRAM_OPTIONS_STATIC_LIB} ${BOOST_SYSTEM_STATIC_LIB}" AC_SUBST([BOOST_LIBS]) diff --git a/data/selftrain_initial_neg.layer1.json b/data/selftrain_initial_neg.layer1.json new file mode 100644 index 00000000..9b1a5844 --- /dev/null +++ b/data/selftrain_initial_neg.layer1.json @@ -0,0 +1,93 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + "M3-nb_dist_aln": { + "operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 7 + }, + "M12-maxmmes.2": { + "operator": "lt", + "value": 10 + }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + "Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + "M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + }, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 + } + }, + "expression": "( M20-nb_usrs & M21-nb_msrs & M12-maxmmes.2 & M22-rel2raw.2 )" +} diff 
--git a/data/selftrain_initial_neg.layer2.json b/data/selftrain_initial_neg.layer2.json new file mode 100644 index 00000000..e9a9fc33 --- /dev/null +++ b/data/selftrain_initial_neg.layer2.json @@ -0,0 +1,93 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + "M3-nb_dist_aln": { + "operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 7 + }, + "M12-maxmmes.2": { + "operator": "lt", + "value": 10 + }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + "Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + "M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + }, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 + } + }, + "expression": "( M20-nb_usrs.2 & M1-canonical_ss & M12-maxmmes.2 )" +} diff --git a/data/selftrain_initial_neg.layer3.json b/data/selftrain_initial_neg.layer3.json new file mode 100644 index 00000000..ec5cbc5d --- /dev/null +++ b/data/selftrain_initial_neg.layer3.json @@ -0,0 +1,93 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + 
"M3-nb_dist_aln": { + "operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 7 + }, + "M12-maxmmes.2": { + "operator": "lt", + "value": 10 + }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + "Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + "M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + }, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 + } + }, + "expression": "( Suspect & M11-entropy.2 )" +} diff --git a/data/selftrain_initial_neg.layer4.json b/data/selftrain_initial_neg.layer4.json new file mode 100644 index 00000000..44890918 --- /dev/null +++ b/data/selftrain_initial_neg.layer4.json @@ -0,0 +1,93 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + "M3-nb_dist_aln": { + "operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 7 + }, + "M12-maxmmes.2": { + 
"operator": "lt", + "value": 10 + }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + "Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + "M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + }, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 + } + }, + "expression": "( M12-maxmmes & M22-rel2raw )" +} diff --git a/data/selftrain_initial_neg.layer5.json b/data/selftrain_initial_neg.layer5.json new file mode 100644 index 00000000..945bd289 --- /dev/null +++ b/data/selftrain_initial_neg.layer5.json @@ -0,0 +1,93 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + "M3-nb_dist_aln": { + "operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 7 + }, + "M12-maxmmes.2": { + "operator": "lt", + "value": 10 + }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + "Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + 
"M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + }, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 + } + }, + "expression": "( M1-canonical_ss & M22-rel2raw.2 )" +} diff --git a/data/selftrain_initial_neg.json b/data/selftrain_initial_neg.layer6.json similarity index 77% rename from data/selftrain_initial_neg.json rename to data/selftrain_initial_neg.layer6.json index 4885e954..0265d1eb 100644 --- a/data/selftrain_initial_neg.json +++ b/data/selftrain_initial_neg.layer6.json @@ -24,14 +24,22 @@ "operator": "gt", "value": 2.0 }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, "M12-maxmmes": { "operator": "lt", "value": 7 }, "M12-maxmmes.2": { - "operator": "lte", + "operator": "lt", "value": 10 }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, "M8-max_min_anc": { "operator": "lt", "value": 16 @@ -71,7 +79,15 @@ "M21-nb_msrs": { "operator": "gte", "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 } }, - "expression": "( M20-nb_usrs & M21-nb_msrs & M12-maxmmes.2 ) | ( M20-nb_usrs.2 & M1-canonical_ss & M12-maxmmes.2 ) | ( Suspect & M11-entropy.2 ) | M12-maxmmes | ( M1-canonical_ss & M4-nb_rel_aln )" + "expression": "( PFP )" } diff --git a/data/selftrain_initial_pos.json b/data/selftrain_initial_pos.layer1.json similarity index 79% rename from data/selftrain_initial_pos.json rename to data/selftrain_initial_pos.layer1.json index 83539e6f..d351e911 100644 --- a/data/selftrain_initial_pos.json +++ b/data/selftrain_initial_pos.layer1.json @@ -4,6 +4,10 @@ "operator": "in", "value": ["C", "S"] }, + "M1-canonical_ss.2": { + "operator": "in", + 
"value": ["N"] + }, "M4-nb_rel_aln": { "operator": "gte", "value": 5 @@ -16,10 +20,18 @@ "operator": "gte", "value": 20 }, + "M12-maxmmes.2": { + "operator": "gte", + "value": 10 + }, "M11-entropy": { "operator": "gt", "value": 5.0 }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, "M13-hamming5p": { "operator": "gte", "value": 5 @@ -42,7 +54,7 @@ }, "M19-mean_mismatches.2": { "operator": "lt", - "value": 1.0 + "value": 0.2 }, "M20-nb_usrs": { "operator": "gte", @@ -50,8 +62,8 @@ }, "M22-rel2raw": { "operator": "gte", - "value": 0.1 + "value": 0.5 } }, - "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M22-rel2raw & ( ( M19-mean_mismatches & M20-nb_usrs ) | ( M12-maxmmes & M11-entropy & M19-mean_mismatches.2 ) )" + "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M22-rel2raw & M12-maxmmes.2" } diff --git a/data/selftrain_initial_pos.layer2.json b/data/selftrain_initial_pos.layer2.json new file mode 100644 index 00000000..d91483ac --- /dev/null +++ b/data/selftrain_initial_pos.layer2.json @@ -0,0 +1,69 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["C", "S"] + }, + "M1-canonical_ss.2": { + "operator": "in", + "value": ["N"] + }, + "M4-nb_rel_aln": { + "operator": "gte", + "value": 5 + }, + "M4-nb_rel_aln.2": { + "operator": "gte", + "value": 1 + }, + "M12-maxmmes": { + "operator": "gte", + "value": 20 + }, + "M12-maxmmes.2": { + "operator": "gte", + "value": 10 + }, + "M11-entropy": { + "operator": "gt", + "value": 5.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M13-hamming5p": { + "operator": "gte", + "value": 5 + }, + "M13-hamming5p.2": { + "operator": "gte", + "value": 7 + }, + "M14-hamming3p": { + "operator": "gte", + "value": 5 + }, + "M14-hamming3p.2": { + "operator": "gte", + "value": 7 + }, + "M19-mean_mismatches": { + "operator": "eq", + "value": 0 + }, + "M19-mean_mismatches.2": { + "operator": "lt", + "value": 0.2 + }, + "M20-nb_usrs": { + "operator": "gte", 
+ "value": 5 + }, + "M22-rel2raw": { + "operator": "gte", + "value": 0.5 + } + }, + "expression": "( ( M1-canonical_ss & M19-mean_mismatches & M20-nb_usrs ) | ( M1-canonical_ss & M12-maxmmes & M11-entropy & M19-mean_mismatches.2 ) | ( M1-canonical_ss.2 & M19-mean_mismatches & M12-maxmmes & M11-entropy ) )" +} diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/model_features.hpp index 7d35f3ca..cdc76d05 100644 --- a/lib/include/portcullis/model_features.hpp +++ b/lib/include/portcullis/model_features.hpp @@ -43,24 +43,29 @@ typedef shared_ptr ForestPtr; // List of variable names const vector VAR_NAMES = { "Genuine", - //"M2-nb-reads", - "M3-nb_dist_aln", - "nb_rel_aln", - //"M8-max_min_anc", - //"M9-dif_anc", - //"M10-dist_anc", - "entropy", - "maxmmes", - "min_hamming_score", - "rel2raw_ratio", - "mean_mismatches", - "IntronScore", - "CodingPotential", - "PWS", - "SS" + "rna_usrs", + "rna_dist", + "rna_rel", + "rna_entropy", + "rna_rel2raw", + "rna_maxminanc", + "rna_maxmmes", + "rna_missmatch", + "rna_intron", + "dna_minhamm", + "dna_coding", + "dna_pws", + "dna_ss" +}; + +struct Feature { + string name; + bool active; }; class ModelFeatures { +private: + size_t fi; public: uint32_t L95; KmerMarkovModel exonModel; @@ -72,10 +77,25 @@ class ModelFeatures { PosMarkovModel donorPWModel; PosMarkovModel acceptorPWModel; GenomeMapper gmap; + vector features; ModelFeatures() : L95(0) { + fi = 1; + features.clear(); + for(size_t i = 0; i < VAR_NAMES.size(); i++) { + Feature f; + f.name = VAR_NAMES[i]; + f.active = true; + features.push_back(f); + } + for(size_t i = 0; i < JO_NAMES.size(); i++) { + Feature f; + f.name = JO_NAMES[i]; + f.active = true; + features.push_back(f); + } } - + bool isCodingPotentialModelEmpty() { return exonModel.size() == 0 || intronModel.size() == 0; } @@ -96,5 +116,19 @@ class ModelFeatures { ForestPtr trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, 
bool verbose); + + void resetActiveFeatureIndex() { + fi = 0; + } + int16_t getNextActiveFeatureIndex() { + for(int16_t i = fi+1; i < features.size(); i++) { + if (features[i].active) { + fi = i; + return i; + } + } + return -1; + } + }; } \ No newline at end of file diff --git a/lib/src/junction.cc b/lib/src/junction.cc index 62f1db20..d791b272 100644 --- a/lib/src/junction.cc +++ b/lib/src/junction.cc @@ -1257,7 +1257,7 @@ double portcullis::Junction::calcCodingPotential(GenomeMapper& gmap, KmerMarkovM double score =( exon.getScore(left_exon) - intron.getScore(left_exon) ) + ( intron.getScore(left_intron) - exon.getScore(left_intron) ) + ( intron.getScore(right_intron) - exon.getScore(right_intron) ) - + ( exon.getScore(right_exon) + intron.getScore(right_exon) ); + + ( exon.getScore(right_exon) - intron.getScore(right_exon) ); return score; } diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 11f8af77..f5cf9a57 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -168,43 +168,72 @@ void portcullis::ModelFeatures::trainSplicingModels(const JunctionList& pass, co } Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { - - const size_t JO_OFFSET = 10; + vector headers; - headers.reserve( VAR_NAMES.size() + JO_NAMES.size() ); - headers.insert( headers.end(), VAR_NAMES.begin(), VAR_NAMES.end() ); - headers.insert( headers.end(), JO_NAMES.begin()+JO_OFFSET-1, JO_NAMES.end() ); + for(auto& f : features) { + if (f.active) { + headers.push_back(f.name); + } + } // Convert junction list info to double* double* d = new double[x.size() * headers.size()]; uint32_t row = 0; - for (const auto& j : x) { + for (const auto& j : x) { SplicingScores ss = j->calcSplicingScores(gmap, donorTModel, donorFModel, acceptorTModel, acceptorFModel, donorPWModel, acceptorPWModel); d[0 * x.size() + row] = j->isGenuine(); - // - //d[0 * x.size() + row] = j->getNbJunctionAlignments(); - d[1 * x.size() + row] = 
j->getNbDistinctAlignments(); - d[2 * x.size() + row] = j->getNbReliableAlignments(); - //d[3 * x.size() + row] = j->getMaxMinAnchor(); - //d[4 * x.size() + row] = j->getDiffAnchor(); - //d[5 * x.size() + row] = j->getNbDistinctAnchors(); - d[3 * x.size() + row] = j->getEntropy(); - d[4 * x.size() + row] = j->getMaxMMES(); - d[5 * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); - d[6 * x.size() + row] = j->getReliable2RawRatio(); - d[7 * x.size() + row] = j->getMeanMismatches(); - d[8 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score - d[9 * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); - d[10 * x.size() + row] = isPWModelEmpty() ? 0.0 : ss.positionWeighting; - d[11 * x.size() + row] = isPWModelEmpty() ? 0.0 : ss.splicingSignal; + uint16_t i = 1; + + if (features[1].active) { + d[i++ * x.size() + row] = j->getNbUniquelySplicedReads(); + } + if (features[2].active) { + d[i++ * x.size() + row] = j->getNbDistinctAlignments(); + } + if (features[3].active) { + d[i++ * x.size() + row] = j->getNbReliableAlignments(); + } + if (features[4].active) { + d[i++ * x.size() + row] = j->getEntropy(); + } + if (features[5].active) { + d[i++ * x.size() + row] = j->getReliable2RawRatio(); + } + if (features[6].active) { + d[i++ * x.size() + row] = j->getMaxMinAnchor(); + } + if (features[7].active) { + d[i++ * x.size() + row] = j->getMaxMMES(); + } + if (features[8].active) { + d[i++ * x.size() + row] = j->getMeanMismatches(); + } + if (features[9].active) { + d[i++ * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); + } + if (features[10].active) { + d[i++ * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p());; + } + if (features[11].active) { + d[i++ * x.size() + row] = isCodingPotentialModelEmpty() ? 
0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); + } + if (features[12].active) { + d[i++ * x.size() + row] = isPWModelEmpty() ? 0.0 : ss.positionWeighting; + } + if (features[13].active) { + d[i++ * x.size() + row] = isPWModelEmpty() ? 0.0 : ss.splicingSignal; + } + //Junction overhang values at each position are first converted into deviation from expected distributions - for(size_t i = JO_OFFSET; i <= JO_NAMES.size(); i++) { - d[(i-JO_OFFSET + 12) * x.size() + row] = j->getJunctionOverhangLogDeviation(i-1); + for(size_t joi = 0; joi < JO_NAMES.size(); joi++) { + if (features[joi + 14].active) { + d[i++ * x.size() + row] = j->getJunctionOverhangLogDeviation(joi); + } } row++; diff --git a/src/junction_filter.cc b/src/junction_filter.cc index e0848f6d..e4994a78 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -207,126 +207,11 @@ void portcullis::JunctionFilter::filter() { // The initial positive and negative sets JunctionList pos, neg; - cout << "Self training mode activated. Collecting initial positive and negative sets from input." << endl; + cout << "Self training mode activated." 
<< endl << endl; - JunctionList passJuncs; - JunctionList failJuncs; - JuncResultMap resultMap; - - doRuleBasedFiltering(this->getIntitalPosRulesFile(), currentJuncs, passJuncs, failJuncs, "Creating initial positive set for training", resultMap); - - cout << "Saving initial positive set:" << endl; - JunctionSystem isp(passJuncs); - isp.saveAll(output.string() + ".selftrain.initialset.pos", "portcullis_isp"); - - // Analyse positive set to get L0.05 of intron size - cout << "Length of intron at 95th percentile of positive set (L95): " << mf.calcIntronThreshold(passJuncs) << endl << endl; - - const uint32_t poslongintronthreshold = mf.L95 * 2; - JunctionList pass2juncs; - for(auto& j : passJuncs) { - if (j->getIntronSize() <= poslongintronthreshold) { - pass2juncs.push_back(j); - } - else { - failJuncs.push_back(j); - } - } - cout << "Removed " << (passJuncs.size() - pass2juncs.size()) << " junctions from positive set with introns > L95 x 2 = " << poslongintronthreshold << endl << endl; - - - for(auto& j : pass2juncs) { - JunctionPtr copy = make_shared(*j); - copy->setGenuine(true); - pos.push_back(copy); - } - if (!genuineFile.empty()) { - shared_ptr p = calcPerformance(pass2juncs, failJuncs); - cout << "Performance of initial positive set (High PPV is important):" << endl; - cout << Performance::longHeader() << endl; - cout << p->toLongString() << endl << endl; - - JunctionSystem invalidPos; - for(auto& j : pass2juncs) { - if (!j->isGenuine()) { - invalidPos.addJunction(j); - } - } - JunctionSystem missedPos; - for(auto& j : failJuncs) { - if (j->isGenuine()) { - missedPos.addJunction(j); - } - } - - cout << "Saving invalid junctions in initial positive set to disk:" << endl; - invalidPos.saveAll(output.string() + ".selftrain.initialset.invalidpos", "portcullis_invalid_isp"); - - cout << "Saving missed positive junctions to disk:" << endl; - missedPos.saveAll(output.string() + ".selftrain.initialset.missedpos", "portcullis_missed_isp"); - } - - passJuncs.clear(); - 
failJuncs.clear(); - resultMap.clear(); - - doRuleBasedFiltering(this->getIntitalNegRulesFile(), currentJuncs, passJuncs, failJuncs, "Creating initial negative set for training", resultMap); - - const uint32_t neglongintronthreshold = mf.L95 * 6; - cout << endl << "Adding junctions to negative set with MaxMMES < 10 and excessively long intron size of positive set (> L95 x 6 = " << neglongintronthreshold << ")..."; - cout.flush(); - - uint32_t nblongintrons = 0; - JunctionList fail2Juncs; - for(auto& j : failJuncs) { - if (j->getIntronSize() > neglongintronthreshold && j->getMaxMMES() < 10 ) { - passJuncs.push_back(j); - nblongintrons++; - } - else { - fail2Juncs.push_back(j); - } - } - - cout << " done." << endl - << "Found an additional " << nblongintrons << " junctions with long introns." << endl << endl; - - cout << "Saving initial negative set:" << endl; - JunctionSystem isn(passJuncs); - isn.saveAll(output.string() + ".selftrain.initialset.neg", "portcullis_isn"); - - for(auto& j : passJuncs) { - JunctionPtr copy = make_shared(*j); - copy->setGenuine(false); - neg.push_back(copy); - } + createPositiveSet(currentJuncs, pos, mf); - if (!genuineFile.empty()) { - shared_ptr p = calcPerformance(passJuncs, fail2Juncs, true); - cout << "Performance of initial negative set (High NPV is important):" << endl; - cout << Performance::longHeader() << endl; - cout << p->toLongString() << endl << endl; - - JunctionSystem invalidNeg; - JunctionSystem missedNeg; - - for(auto& j : passJuncs) { - if (j->isGenuine()) { - invalidNeg.addJunction(j); - } - } - for(auto& j : fail2Juncs) { - if (!j->isGenuine()) { - missedNeg.addJunction(j); - } - } - - cout << "Saving genuine valid junctions in initial negative set to disk:" << endl; - invalidNeg.saveAll(output.string() + ".selftrain.initialset.invalidneg", "portcullis_invalid_isn"); - - cout << "Saving missed negative junctions to disk:" << endl; - missedNeg.saveAll(output.string() + ".selftrain.initialset.missedneg", 
"portcullis_missed_isn"); - } + createNegativeSet(mf.L95, currentJuncs, neg); cout << "Initial training set consists of " << pos.size() << " positive and " << neg.size() << " negative junctions." << endl << endl; @@ -347,9 +232,47 @@ void portcullis::JunctionFilter::filter() { JunctionSystem trainingSystem(training); trainingSystem.sort(); - cout << "Training a random forest model" << endl - << "------------------------------" << endl << endl; - shared_ptr forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); + cout << "Training Random Forest" << endl + << "----------------------" << endl << endl; + bool done = false; + shared_ptr forest = nullptr; + while(!done) { + + forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); + const vector importance = forest->getVariableImportance(); + mf.resetActiveFeatureIndex(); + uint16_t min_importance_idx = 10000; + double min_importance_val = 100000.0; + uint16_t min_feature_idx = 10000; + // This approach to feature selection (i.e. removing worst features by importance) + // doesn't appear to have much impact on final results and is computationally + // expensive. Removing for now. 
+ /*for(size_t i = 0; i < importance.size(); i++) { + double imp = importance[i]; + int16_t j = mf.getNextActiveFeatureIndex(); + if (imp < 0.1 && imp < min_importance_val) { + min_importance_idx = i; + min_importance_val = imp; + min_feature_idx = j; + } + }*/ + if (min_importance_idx != 10000) { + mf.features[min_feature_idx].active = false; + cout << "Deactivating feature: " << mf.features[min_feature_idx].name << " - " << min_importance_val << endl; + } + else { + done = true; + } + } + + const vector importance = forest->getVariableImportance(); + bool foundIrrelevant = false; + mf.resetActiveFeatureIndex(); + cout << "Active features remaining:" << endl; + for(auto& i : importance) { + int16_t j = mf.getNextActiveFeatureIndex(); + cout << mf.features[j].name << " - " << i << endl; + } forest->saveToFile(); forest->writeOutput(&cout); @@ -511,6 +434,177 @@ void portcullis::JunctionFilter::filter() { } } +void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, JunctionList& pos, ModelFeatures& mf) { + JuncResultMap resultMap; + + cout << "Creating initial positive set for training" << endl + << "------------------------------------------" << endl << endl; + + if (!genuineFile.empty()) { + cout << "Performance of each positive filter layer (Low FPs is important):" << endl; + cout << "LAYER\t" << Performance::longHeader() << endl; + } + JunctionList p1, p2, f1, f2; + RuleFilter::filter(this->getIntitalPosRulesFile(1), all, p1, f1, "Creating initial positive set for training", resultMap); + if (!genuineFile.empty()) { + cout << "1\t" << calcPerformance(p1, f1, true)->toLongString() << endl; + } + RuleFilter::filter(this->getIntitalPosRulesFile(2), p1, p2, f2, "Creating initial positive set for training", resultMap); + if (!genuineFile.empty()) { + cout << "2\t" << calcPerformance(p2, f2, true)->toLongString() << endl; + } + + const uint32_t L95 = mf.calcIntronThreshold(p2); + const uint32_t L95x2 = L95 * 2; + + JunctionList passJuncs, 
failJuncs; + for(auto& j : p2) { + if (j->getIntronSize() <= L95x2) { + passJuncs.push_back(j); + } + else { + failJuncs.push_back(j); + } + } + if (!genuineFile.empty()) { + cout << "L95x2\t" << calcPerformance(passJuncs, failJuncs, true)->toLongString() << endl; + } + + // Analyse positive set to get L0.05 of intron size + cout << endl << "Length of intron at 95th percentile of positive set (L95): " << mf.calcIntronThreshold(passJuncs) << endl << endl; + + cout << "Saving initial positive set:" << endl; + JunctionSystem isp(passJuncs); + isp.saveAll(output.string() + ".selftrain.initialset.pos", "portcullis_isp"); + + + for(auto& j : passJuncs) { + JunctionPtr copy = make_shared(*j); + copy->setGenuine(true); + pos.push_back(copy); + } + if (!genuineFile.empty()) { + + JunctionSystem invalidPos; + for(auto& j : passJuncs) { + if (!j->isGenuine()) { + invalidPos.addJunction(j); + } + } + JunctionSystem missedPos; + for(auto& j : failJuncs) { + if (j->isGenuine()) { + missedPos.addJunction(j); + } + } + + cout << "Saving invalid junctions in initial positive set to disk:" << endl; + invalidPos.saveAll(output.string() + ".selftrain.initialset.invalidpos", "portcullis_invalid_isp"); + + cout << "Saving missed positive junctions to disk:" << endl; + missedPos.saveAll(output.string() + ".selftrain.initialset.missedpos", "portcullis_missed_isp"); + } +} + +void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg) { + + JuncResultMap resultMap; + + cout << "Creating initial negative set for training" << endl + << "------------------------------------------" << endl << endl; + + if (!genuineFile.empty()) { + cout << "Performance of each negative filter layer (Low FNs is important):" << endl; + cout << "LAYER\t" << Performance::longHeader() << endl; + } + JunctionList p1, p2, p3, p4, p5, p6, p7, f1, f2, f3, f4, f5, f6, f7; + RuleFilter::filter(this->getIntitalNegRulesFile(1), all, p1, f1, "Creating initial negative set for 
training", resultMap); + if (!genuineFile.empty()) { + cout << "1\t" << calcPerformance(p1, f1, true)->toLongString() << endl; + } + RuleFilter::filter(this->getIntitalNegRulesFile(2), f1, p2, f2, "Creating initial negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << "2\t" << calcPerformance(p2, f2, true)->toLongString() << endl; + } + RuleFilter::filter(this->getIntitalNegRulesFile(3), f2, p3, f3, "Creating initial negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << "3\t" << calcPerformance(p3, f3, true)->toLongString() << endl; + } + RuleFilter::filter(this->getIntitalNegRulesFile(4), f3, p4, f4, "Creating initial negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << "4\t" << calcPerformance(p4, f4, true)->toLongString() << endl; + } + RuleFilter::filter(this->getIntitalNegRulesFile(5), f4, p5, f5, "Creating initial negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << "5\t" << calcPerformance(p5, f5, true)->toLongString() << endl; + } + RuleFilter::filter(this->getIntitalNegRulesFile(6), f5, p6, f6, "Creating initial negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << "6\t" << calcPerformance(p6, f6, true)->toLongString() << endl; + } + + JunctionList passJuncs, failJuncs; + const uint32_t L95x5 = L95 * 5; + for(auto& j : f6) { + if (j->getIntronSize() > L95x5 && j->getMaxMMES() < 10 ) { + p7.push_back(j); + } + else { + failJuncs.push_back(j); + } + } + if (!genuineFile.empty()) { + cout << "L95x5\t" << calcPerformance(p7, failJuncs, true)->toLongString() << endl; + } + + cout << endl << "Concatenating TNs from each layer to create negative set" << endl << endl; + + passJuncs.insert(passJuncs.end(), p1.begin(), p1.end()); + passJuncs.insert(passJuncs.end(), p2.begin(), p2.end()); + passJuncs.insert(passJuncs.end(), p3.begin(), p3.end()); + passJuncs.insert(passJuncs.end(), p4.begin(), p4.end()); + 
passJuncs.insert(passJuncs.end(), p5.begin(), p5.end()); + passJuncs.insert(passJuncs.end(), p6.begin(), p6.end()); + passJuncs.insert(passJuncs.end(), p7.begin(), p7.end()); + + if (!genuineFile.empty()) { + cout << "Final\t" << calcPerformance(passJuncs, failJuncs, true)->toLongString() << endl; + } + + cout << endl << "Saving initial negative set:" << endl; + JunctionSystem isn(passJuncs); + isn.saveAll(output.string() + ".selftrain.initialset.neg", "portcullis_isn"); + + for(auto& j : passJuncs) { + JunctionPtr copy = make_shared(*j); + copy->setGenuine(false); + neg.push_back(copy); + } + + if (!genuineFile.empty()) { + + JunctionSystem invalidNeg; + JunctionSystem missedNeg; + + for(auto& j : passJuncs) { + if (j->isGenuine()) { + invalidNeg.addJunction(j); + } + } + for(auto& j : failJuncs) { + if (!j->isGenuine()) { + missedNeg.addJunction(j); + } + } + + cout << "Saving genuine valid junctions in initial negative set to disk:" << endl; + invalidNeg.saveAll(output.string() + ".selftrain.initialset.invalidneg", "portcullis_invalid_isn"); + + cout << "Saving missed negative junctions to disk:" << endl; + missedNeg.saveAll(output.string() + ".selftrain.initialset.missedneg", "portcullis_missed_isn"); + } +} + void portcullis::JunctionFilter::printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix) { // Output stats @@ -526,7 +620,7 @@ void portcullis::JunctionFilter::printFilteringResults(const JunctionList& in, c shared_ptr p = calcPerformance(pass, fail); cout << Performance::longHeader() << endl; cout << p->toLongString() << endl << endl; - } + } } shared_ptr portcullis::JunctionFilter::calcPerformance(const JunctionList& pass, const JunctionList& fail, bool invert) { @@ -625,27 +719,72 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction scoreStream << "Score\t" << Intron::locationOutputHeader() << "\tStrand\tSS\t" << testingData->getHeader() << endl; for(size_t i 
= 0; i < all.size(); i++) { - + scoreStream << f->getPredictions()[i][0] << "\t" << *(all[i]->getIntron()) << "\t" << strandToChar(all[i]->getConsensusStrand()) << "\t" << cssToChar(all[i]->getSpliceSiteType()) << "\t" << testingData->getRow(i) << endl; - if (f->getPredictions()[i][0] >= this->threshold) { - pass.push_back(all[i]); - } - else { - fail.push_back(all[i]); - } } scoreStream.close(); + + if (!genuineFile.empty() && exists(genuineFile)) { + vector thresholds; + for(double i = 0.05; i <= 0.5; i += 0.01) { + thresholds.push_back(i); + } + double max_mcc = 0.0; + double max_f1 = 0.0; + double best_t_mcc = 0.0; + double best_t_f1 = 0.0; + + cout << "Threshold\t" << Performance::longHeader() << endl; + for(auto& t : thresholds) { + JunctionList pjl; + JunctionList fjl; + categorise(f, all, pjl, fjl, t); + shared_ptr perf = calcPerformance(pjl, fjl); + double mcc = perf->getMCC(); + double f1 = perf->getF1Score(); + cout << t << "\t" << perf->toLongString() << endl; + if (mcc > max_mcc) { + max_mcc = mcc; + best_t_mcc = t; + } + if (f1 > max_f1) { + max_f1 = f1; + best_t_f1 = t; + } + } + + cout << "The best F1 score of " << max_f1 << " is achieved with threshold set at " << best_t_f1 << endl; + cout << "The best MCC score of " << max_mcc << " is achieved with threshold set at " << best_t_mcc << endl; + + categorise(f, all, pass, fail, best_t_mcc); + } + else { + categorise(f, all, pass, fail, threshold); + } + cout << "Saved junction scores to: " << scorepath << endl; delete testingData; } +void portcullis::JunctionFilter::categorise(shared_ptr f, const JunctionList& all, JunctionList& pass, JunctionList& fail, double t) { + + for(size_t i = 0; i < all.size(); i++) { + if (f->getPredictions()[i][0] >= t) { + pass.push_back(all[i]); + } + else { + fail.push_back(all[i]); + } + } +} + int portcullis::JunctionFilter::main(int argc, char *argv[]) { path junctionFile; diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index ce568a66..e0ebc948 100644 
--- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -72,11 +72,11 @@ const string DEFAULT_FILTER_OUTPUT = "portcullis_filter/portcullis"; const string DEFAULT_FILTER_SOURCE = "portcullis"; const string DEFAULT_FILTER_RULE_FILE = "default_filter.json"; const string DEFAULT_FILTER_MODEL_FILE = "default_model.forest"; -const string ST_IPOS_RULES_FILE = "selftrain_initial_pos.json"; -const string ST_INEG_RULES_FILE = "selftrain_initial_neg.json"; +const string ST_IPOS_RULES_FILE = "selftrain_initial_pos"; +const string ST_INEG_RULES_FILE = "selftrain_initial_neg"; const uint16_t DEFAULT_FILTER_THREADS = 1; const uint16_t DEFAULT_SELFTRAIN_TREES = 100; -const double DEFAULT_FILTER_THRESHOLD = 0.1; +const double DEFAULT_FILTER_THRESHOLD = 0.25; class JunctionFilter { @@ -273,12 +273,12 @@ class JunctionFilter { this->maxLength = maxLength; } - path getIntitalPosRulesFile() const { - return path(dataDir.string() + "/" + ST_IPOS_RULES_FILE); + path getIntitalPosRulesFile(uint16_t index) const { + return path(dataDir.string() + "/" + ST_IPOS_RULES_FILE + ".layer" + std::to_string(index) + ".json"); } - path getIntitalNegRulesFile() const { - return path(dataDir.string() + "/" + ST_INEG_RULES_FILE); + path getIntitalNegRulesFile(uint16_t index) const { + return path(dataDir.string() + "/" + ST_INEG_RULES_FILE + ".layer" + std::to_string(index) + ".json"); } void filter(); @@ -296,7 +296,13 @@ class JunctionFilter { void printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix); void doRuleBasedFiltering(const path& ruleFile, const JunctionList& all, JunctionList& pass, JunctionList& fail, const string& prefix, JuncResultMap& resultMap); + + void categorise(shared_ptr f, const JunctionList& all, JunctionList& pass, JunctionList& fail, double t); + void createPositiveSet(const JunctionList& all, JunctionList& pos, ModelFeatures& mf); + + void createNegativeSet(uint32_t L95, const JunctionList& all, 
JunctionList& neg); + public: static string helpMessage() { From d9ffcb223c9cfbe77bb822a6345ab082ed2252d9 Mon Sep 17 00:00:00 2001 From: Dan Date: Thu, 28 Apr 2016 22:49:34 +0100 Subject: [PATCH 12/15] Tidy up output from filtering tool. --- lib/src/model_features.cc | 2 +- src/junction_filter.cc | 42 +++++++++++++++++++++++++++++---------- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index f5cf9a57..49f1952d 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -216,7 +216,7 @@ Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { d[i++ * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); } if (features[10].active) { - d[i++ * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p());; + d[i++ * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); } if (features[11].active) { d[i++ * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); diff --git a/src/junction_filter.cc b/src/junction_filter.cc index e4994a78..c66d52d2 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -438,21 +438,30 @@ void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, Junc JuncResultMap resultMap; cout << "Creating initial positive set for training" << endl - << "------------------------------------------" << endl << endl; + << "------------------------------------------" << endl << endl + << "Applying a set of rule-based filters in " << dataDir.string() << " to create initial positive set." 
<< endl << endl; if (!genuineFile.empty()) { - cout << "Performance of each positive filter layer (Low FPs is important):" << endl; - cout << "LAYER\t" << Performance::longHeader() << endl; + cout << "Performance of each positive filter layer (Low FPs is important):" << endl; } + cout << "LAYER\t"; + if (!genuineFile.empty()) { + cout << Performance::longHeader(); + } + JunctionList p1, p2, f1, f2; + + cout << endl << "1\t"; RuleFilter::filter(this->getIntitalPosRulesFile(1), all, p1, f1, "Creating initial positive set for training", resultMap); if (!genuineFile.empty()) { - cout << "1\t" << calcPerformance(p1, f1, true)->toLongString() << endl; + cout << calcPerformance(p1, f1, true)->toLongString(); } + cout << endl << "2\t"; RuleFilter::filter(this->getIntitalPosRulesFile(2), p1, p2, f2, "Creating initial positive set for training", resultMap); if (!genuineFile.empty()) { - cout << "2\t" << calcPerformance(p2, f2, true)->toLongString() << endl; + cout << calcPerformance(p2, f2, true)->toLongString(); } + cout << endl; const uint32_t L95 = mf.calcIntronThreshold(p2); const uint32_t L95x2 = L95 * 2; @@ -511,37 +520,50 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL JuncResultMap resultMap; cout << "Creating initial negative set for training" << endl - << "------------------------------------------" << endl << endl; + << "------------------------------------------" << endl << endl + << "Applying a set of rule-based filters in " << dataDir.string() << " to create initial negative set." 
<< endl << endl; if (!genuineFile.empty()) { - cout << "Performance of each negative filter layer (Low FNs is important):" << endl; - cout << "LAYER\t" << Performance::longHeader() << endl; + cout << "Performance of each negative filter layer (Low FNs is important):" << endl; + } + cout << "LAYER\t"; + if (!genuineFile.empty()) { + cout << Performance::longHeader(); } + JunctionList p1, p2, p3, p4, p5, p6, p7, f1, f2, f3, f4, f5, f6, f7; + + cout << endl << "1\t"; RuleFilter::filter(this->getIntitalNegRulesFile(1), all, p1, f1, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << "1\t" << calcPerformance(p1, f1, true)->toLongString() << endl; + cout << calcPerformance(p1, f1, true)->toLongString(); } + cout << endl << "2\t"; RuleFilter::filter(this->getIntitalNegRulesFile(2), f1, p2, f2, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { cout << "2\t" << calcPerformance(p2, f2, true)->toLongString() << endl; } + cout << endl << "3\t"; RuleFilter::filter(this->getIntitalNegRulesFile(3), f2, p3, f3, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { cout << "3\t" << calcPerformance(p3, f3, true)->toLongString() << endl; } + cout << endl << "4\t"; RuleFilter::filter(this->getIntitalNegRulesFile(4), f3, p4, f4, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { cout << "4\t" << calcPerformance(p4, f4, true)->toLongString() << endl; } + cout << endl << "5\t"; RuleFilter::filter(this->getIntitalNegRulesFile(5), f4, p5, f5, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { cout << "5\t" << calcPerformance(p5, f5, true)->toLongString() << endl; } + cout << endl << "6\t"; RuleFilter::filter(this->getIntitalNegRulesFile(6), f5, p6, f6, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { cout << "6\t" << calcPerformance(p6, f6, true)->toLongString() << endl; } + 
cout << endl; JunctionList passJuncs, failJuncs; const uint32_t L95x5 = L95 * 5; @@ -557,7 +579,7 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL cout << "L95x5\t" << calcPerformance(p7, failJuncs, true)->toLongString() << endl; } - cout << endl << "Concatenating TNs from each layer to create negative set" << endl << endl; + cout << endl << "Concatenating negatives from each layer to create negative set" << endl << endl; passJuncs.insert(passJuncs.end(), p1.begin(), p1.end()); passJuncs.insert(passJuncs.end(), p2.begin(), p2.end()); From 999eafa71c16d6ea087ea8db4c790414ba6823ba Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Fri, 29 Apr 2016 11:14:08 +0100 Subject: [PATCH 13/15] Determined an optimal set of features for filtering. --- lib/include/portcullis/model_features.hpp | 2 +- lib/src/model_features.cc | 12 ++--- src/junction_filter.cc | 61 +++++++++++++++++------ src/junction_filter.hpp | 2 +- 4 files changed, 55 insertions(+), 22 deletions(-) diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/model_features.hpp index cdc76d05..1820cc01 100644 --- a/lib/include/portcullis/model_features.hpp +++ b/lib/include/portcullis/model_features.hpp @@ -115,7 +115,7 @@ class ModelFeatures { Data* juncs2FeatureVectors(const JunctionList& x); ForestPtr trainInstance(const JunctionList& x, string outputPrefix, - uint16_t trees, uint16_t threads, bool regressionMode, bool verbose); + uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose); void resetActiveFeatureIndex() { fi = 0; diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 49f1952d..902c7813 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -25,7 +25,7 @@ using std::ofstream; using std::make_shared; #include -#include +#include #include #include @@ -247,7 +247,7 @@ Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { portcullis::ForestPtr 
portcullis::ModelFeatures::trainInstance(const JunctionList& x, - string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose) { + string outputPrefix, uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose) { cout << "Creating feature vector" << endl; Data* trainingData = juncs2FeatureVectors(x); @@ -266,8 +266,8 @@ portcullis::ForestPtr portcullis::ModelFeatures::trainInstance(const JunctionLis cout << "Initialising random forest" << endl; ForestPtr f = nullptr; - if (regressionMode) { - f = make_shared(); + if (probabilityMode) { + f = make_shared(); } else { f = make_shared(); @@ -281,11 +281,11 @@ portcullis::ForestPtr portcullis::ModelFeatures::trainInstance(const JunctionLis trainingData, // Data object 0, // M Try (0 == use default) outputPrefix, // Output prefix - trees, // Number of trees + 250, //trees, // Number of trees 1236456789, // Use fixed seed to avoid non-deterministic behaviour as much as possible threads, // Number of threads IMP_GINI, // Importance measure - regressionMode ? DEFAULT_MIN_NODE_SIZE_REGRESSION : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size + probabilityMode ? 
DEFAULT_MIN_NODE_SIZE_PROBABILITY : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size "", // Status var name false, // Prediction mode true, // Replace diff --git a/src/junction_filter.cc b/src/junction_filter.cc index c66d52d2..aec33fc8 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -49,7 +49,7 @@ namespace po = boost::program_options; #include -#include +#include #include #include @@ -201,6 +201,38 @@ void portcullis::JunctionFilter::filter() { ModelFeatures mf; mf.initGenomeMapper(this->genomeFile); + mf.features[1].active=false; // NB USRS (BAD) + mf.features[2].active=false; // NB DISTRS (BAD) + //mf.features[3].active=false; // NB RELRS (GOOD) + mf.features[4].active = false; // ENTROPY (BAD - JO LOGDEV ARE BETTER) + //mf.features[5].active = false; // REL2RAW (GOOD) + mf.features[6].active=false; // MAXMINANC (BAD - MAXMMES IS BETTER) + //mf.features[7].active=false; // MAXMMES (GOOD) + //mf.features[8].active=false; // MEAN MISMATCH (GOOD) + //mf.features[9].active=false; // INTRON (GOOD) + //mf.features[10].active=false; // MIN_HAMM (GOOD) + mf.features[11].active=false; // CODING POTENTIAL (BAD) + //mf.features[12].active=false; // POS WEIGHTS (GOOD) + //mf.features[13].active=false; // SPLICE SIGNAL (GOOD) + /*mf.features[14].active=false; // JO LOGDEV FEATURES BETTER THAN ENTROPY + mf.features[15].active=false; + mf.features[16].active=false; + mf.features[17].active=false; + mf.features[18].active=false; + mf.features[19].active=false; + mf.features[20].active=false; + mf.features[21].active=false; + mf.features[22].active=false; + mf.features[23].active=false; + mf.features[24].active=false; + mf.features[25].active=false; + mf.features[26].active=false; + mf.features[27].active=false; + mf.features[28].active=false; + mf.features[29].active=false;*/ + + + if (train) { @@ -541,29 +573,29 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL cout << endl << "2\t"; 
RuleFilter::filter(this->getIntitalNegRulesFile(2), f1, p2, f2, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << "2\t" << calcPerformance(p2, f2, true)->toLongString() << endl; + cout << calcPerformance(p2, f2, true)->toLongString(); } cout << endl << "3\t"; RuleFilter::filter(this->getIntitalNegRulesFile(3), f2, p3, f3, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << "3\t" << calcPerformance(p3, f3, true)->toLongString() << endl; + cout << calcPerformance(p3, f3, true)->toLongString(); } cout << endl << "4\t"; RuleFilter::filter(this->getIntitalNegRulesFile(4), f3, p4, f4, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << "4\t" << calcPerformance(p4, f4, true)->toLongString() << endl; + cout << calcPerformance(p4, f4, true)->toLongString(); } cout << endl << "5\t"; RuleFilter::filter(this->getIntitalNegRulesFile(5), f4, p5, f5, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << "5\t" << calcPerformance(p5, f5, true)->toLongString() << endl; + cout << calcPerformance(p5, f5, true)->toLongString(); } cout << endl << "6\t"; RuleFilter::filter(this->getIntitalNegRulesFile(6), f5, p6, f6, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << "6\t" << calcPerformance(p6, f6, true)->toLongString() << endl; + cout << calcPerformance(p6, f6, true)->toLongString(); } - cout << endl; + cout << endl << "L95x5\t"; JunctionList passJuncs, failJuncs; const uint32_t L95x5 = L95 * 5; @@ -576,10 +608,10 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL } } if (!genuineFile.empty()) { - cout << "L95x5\t" << calcPerformance(p7, failJuncs, true)->toLongString() << endl; + cout << calcPerformance(p7, failJuncs, true)->toLongString(); } - cout << endl << "Concatenating negatives from each layer to create negative set" << endl << 
endl; + cout << endl << endl << "Concatenating negatives from each layer to create negative set" << endl << endl; passJuncs.insert(passJuncs.end(), p1.begin(), p1.end()); passJuncs.insert(passJuncs.end(), p2.begin(), p2.end()); @@ -699,7 +731,7 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction shared_ptr f = nullptr; if (train) { - f = make_shared(); + f = make_shared(); } else { f = make_shared(); @@ -713,11 +745,12 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction testingData, // Data object 0, // M Try (0 == use default) "", // Output prefix - DEFAULT_SELFTRAIN_TREES, // Number of trees (will be overwritten when loading the model) + 250, //DEFAULT_SELFTRAIN_TREES, // Number of trees (will be overwritten when loading the model) 1234567890, // Seed for random generator threads, // Number of threads IMP_GINI, // Importance measure - train ? DEFAULT_MIN_NODE_SIZE_REGRESSION : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size + train ? 
DEFAULT_MIN_NODE_SIZE_PROBABILITY : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size + //DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, "", // Status var name true, // Prediction mode true, // Replace @@ -753,7 +786,7 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction if (!genuineFile.empty() && exists(genuineFile)) { vector thresholds; - for(double i = 0.05; i <= 0.5; i += 0.01) { + for(double i = 0.0; i <= 1.0; i += 0.01) { thresholds.push_back(i); } double max_mcc = 0.0; @@ -798,7 +831,7 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction void portcullis::JunctionFilter::categorise(shared_ptr f, const JunctionList& all, JunctionList& pass, JunctionList& fail, double t) { for(size_t i = 0; i < all.size(); i++) { - if (f->getPredictions()[i][0] >= t) { + if (f->getPredictions()[i][0] <= t) { pass.push_back(all[i]); } else { diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index e0ebc948..d45edae9 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -76,7 +76,7 @@ const string ST_IPOS_RULES_FILE = "selftrain_initial_pos"; const string ST_INEG_RULES_FILE = "selftrain_initial_neg"; const uint16_t DEFAULT_FILTER_THREADS = 1; const uint16_t DEFAULT_SELFTRAIN_TREES = 100; -const double DEFAULT_FILTER_THRESHOLD = 0.25; +const double DEFAULT_FILTER_THRESHOLD = 0.75; class JunctionFilter { From 9404738c5233646ebf2152a74d0db1ccd4b87a14 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Fri, 29 Apr 2016 15:49:08 +0100 Subject: [PATCH 14/15] Additional improvements to the filtering tool. 
--- Makefile.am | 4 +- data/selftrain_initial_neg.layer7.json | 21 +++++++ data/selftrain_initial_pos.layer1.json | 52 +++-------------- data/selftrain_initial_pos.layer2.json | 52 +++++------------ data/selftrain_initial_pos.layer3.json | 77 +++++++++++++++++++++++++ src/junction_filter.cc | 78 ++++++++++++++++---------- src/junction_filter.hpp | 2 +- 7 files changed, 172 insertions(+), 114 deletions(-) create mode 100644 data/selftrain_initial_neg.layer7.json create mode 100644 data/selftrain_initial_pos.layer3.json diff --git a/Makefile.am b/Makefile.am index 0d39c81c..65431406 100644 --- a/Makefile.am +++ b/Makefile.am @@ -62,8 +62,10 @@ dist_config_DATA = \ data/selftrain_initial_neg.layer4.json \ data/selftrain_initial_neg.layer5.json \ data/selftrain_initial_neg.layer6.json \ + data/selftrain_initial_neg.layer7.json \ data/selftrain_initial_pos.layer1.json \ - data/selftrain_initial_pos.layer2.json + data/selftrain_initial_pos.layer2.json \ + data/selftrain_initial_pos.layer3.json # SRC DIRS make_dirs=deps/htslib-1.3 deps/ranger-0.3.8 lib src tests diff --git a/data/selftrain_initial_neg.layer7.json b/data/selftrain_initial_neg.layer7.json new file mode 100644 index 00000000..5b6bde7c --- /dev/null +++ b/data/selftrain_initial_neg.layer7.json @@ -0,0 +1,21 @@ +{ + "parameters": { + "M2-nb_reads": { + "operator": "gt", + "value": 100 + }, + "M22-rel2raw": { + "operator": "eq", + "value": 0.0 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 4 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 4 + } + }, + "expression": "( M2-nb_reads & M22-rel2raw & M13-hamming5p & M14-hamming3p )" +} diff --git a/data/selftrain_initial_pos.layer1.json b/data/selftrain_initial_pos.layer1.json index d351e911..1bc58557 100644 --- a/data/selftrain_initial_pos.layer1.json +++ b/data/selftrain_initial_pos.layer1.json @@ -1,69 +1,33 @@ { "parameters": { - "M1-canonical_ss": { - "operator": "in", - "value": ["C", "S"] - }, - "M1-canonical_ss.2": { - "operator": "in", 
- "value": ["N"] - }, "M4-nb_rel_aln": { - "operator": "gte", - "value": 5 - }, - "M4-nb_rel_aln.2": { "operator": "gte", "value": 1 }, "M12-maxmmes": { "operator": "gte", - "value": 20 - }, - "M12-maxmmes.2": { - "operator": "gte", - "value": 10 + "value": 8 }, - "M11-entropy": { - "operator": "gt", - "value": 5.0 - }, - "M11-entropy.2": { - "operator": "gt", - "value": 2.0 - }, - "M13-hamming5p": { + "M13-hamming5p": { "operator": "gte", "value": 5 }, - "M13-hamming5p.2": { - "operator": "gte", - "value": 7 - }, "M14-hamming3p": { "operator": "gte", "value": 5 }, - "M14-hamming3p.2": { - "operator": "gte", - "value": 7 - }, - "M19-mean_mismatches": { - "operator": "eq", - "value": 0 - }, - "M19-mean_mismatches.2": { - "operator": "lt", - "value": 0.2 + "M19-mean_mismatches": { + "operator": "lte", + "value": 1.5 }, "M20-nb_usrs": { "operator": "gte", - "value": 5 + "value": 1 }, "M22-rel2raw": { "operator": "gte", - "value": 0.5 + "value": 0.25 } }, - "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M22-rel2raw & M12-maxmmes.2" + "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M12-maxmmes & M20-nb_usrs & M19-mean_mismatches & M22-rel2raw" } diff --git a/data/selftrain_initial_pos.layer2.json b/data/selftrain_initial_pos.layer2.json index d91483ac..62c0273e 100644 --- a/data/selftrain_initial_pos.layer2.json +++ b/data/selftrain_initial_pos.layer2.json @@ -1,69 +1,45 @@ { "parameters": { - "M1-canonical_ss": { - "operator": "in", - "value": ["C", "S"] - }, - "M1-canonical_ss.2": { - "operator": "in", - "value": ["N"] - }, "M4-nb_rel_aln": { "operator": "gte", - "value": 5 + "value": 3 }, "M4-nb_rel_aln.2": { "operator": "gte", - "value": 1 + "value": 2 }, "M12-maxmmes": { "operator": "gte", - "value": 20 + "value": 25 }, "M12-maxmmes.2": { - "operator": "gte", - "value": 10 + "operator": "gt", + "value": 12 }, - "M11-entropy": { - "operator": "gt", - "value": 5.0 - }, - "M11-entropy.2": { - "operator": "gt", - "value": 2.0 - }, 
"M13-hamming5p": { "operator": "gte", - "value": 5 + "value": 8 }, "M13-hamming5p.2": { "operator": "gte", - "value": 7 + "value": 10 }, "M14-hamming3p": { "operator": "gte", - "value": 5 + "value": 8 }, "M14-hamming3p.2": { "operator": "gte", - "value": 7 + "value": 10 }, - "M19-mean_mismatches": { - "operator": "eq", + "M19-mean_mismatches": { + "operator": "lt", "value": 0 }, "M19-mean_mismatches.2": { "operator": "lt", - "value": 0.2 - }, - "M20-nb_usrs": { - "operator": "gte", - "value": 5 - }, - "M22-rel2raw": { - "operator": "gte", - "value": 0.5 - } + "value": 0.5 + } }, - "expression": "( ( M1-canonical_ss & M19-mean_mismatches & M20-nb_usrs ) | ( M1-canonical_ss & M12-maxmmes & M11-entropy & M19-mean_mismatches.2 ) | ( M1-canonical_ss.2 & M19-mean_mismatches & M12-maxmmes & M11-entropy ) )" + "expression": "( M4-nb_rel_aln & M12-maxmmes ) | ( M4-nb_rel_aln.2 & M12-maxmmes.2 & M13-hamming5p & M14-hamming3p & M19-mean_mismatches.2 ) | ( M13-hamming5p.2 & M14-hamming3p.2 & M19-mean_mismatches )" } diff --git a/data/selftrain_initial_pos.layer3.json b/data/selftrain_initial_pos.layer3.json new file mode 100644 index 00000000..349d1317 --- /dev/null +++ b/data/selftrain_initial_pos.layer3.json @@ -0,0 +1,77 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["C"] + }, + "M1-canonical_ss.2": { + "operator": "in", + "value": ["S"] + }, + "M1-canonical_ss.3": { + "operator": "in", + "value": ["N"] + }, + "M4-nb_rel_aln": { + "operator": "gte", + "value": 5 + }, + "M4-nb_rel_aln.2": { + "operator": "gte", + "value": 1 + }, + "M12-maxmmes": { + "operator": "gte", + "value": 20 + }, + "M12-maxmmes.2": { + "operator": "gte", + "value": 10 + }, + "M11-entropy": { + "operator": "gt", + "value": 5.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M13-hamming5p": { + "operator": "gte", + "value": 7 + }, + "M13-hamming5p.2": { + "operator": "gte", + "value": 8 + }, + "M14-hamming3p": { + "operator": "gte", + "value": 7 + }, 
+ "M14-hamming3p.2": { + "operator": "gte", + "value": 8 + }, + "M19-mean_mismatches": { + "operator": "eq", + "value": 0 + }, + "M19-mean_mismatches": { + "operator": "lt", + "value": 0.1 + }, + "M20-nb_usrs": { + "operator": "gte", + "value": 5 + }, + "M22-rel2raw": { + "operator": "gte", + "value": 0.5 + }, + "M22-rel2raw.2": { + "operator": "gte", + "value": 0.75 + } + }, + "expression": "( M1-canonical_ss ) | ( M1-canonical_ss.2 & M22-rel2raw & M13-hamming5p & M14-hamming3p ) | ( M1-canonical_ss.3 & M22-rel2raw.2 & M13-hamming5p.2 & M14-hamming3p.2 & M19-mean_mismatches & M11-entropy.2 )" +} diff --git a/src/junction_filter.cc b/src/junction_filter.cc index aec33fc8..b3e78942 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -237,13 +237,13 @@ void portcullis::JunctionFilter::filter() { if (train) { // The initial positive and negative sets - JunctionList pos, neg; + JunctionList pos, unlabelled, neg; cout << "Self training mode activated." << endl << endl; - createPositiveSet(currentJuncs, pos, mf); + createPositiveSet(currentJuncs, pos, unlabelled, mf); - createNegativeSet(mf.L95, currentJuncs, neg); + createNegativeSet(mf.L95, unlabelled, neg); cout << "Initial training set consists of " << pos.size() << " positive and " << neg.size() << " negative junctions." 
<< endl << endl; @@ -466,7 +466,7 @@ void portcullis::JunctionFilter::filter() { } } -void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, JunctionList& pos, ModelFeatures& mf) { +void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, JunctionList& pos, JunctionList& unlabelled, ModelFeatures& mf) { JuncResultMap resultMap; cout << "Creating initial positive set for training" << endl @@ -481,39 +481,47 @@ void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, Junc cout << Performance::longHeader(); } - JunctionList p1, p2, f1, f2; + JunctionList p1, p2, p3; cout << endl << "1\t"; - RuleFilter::filter(this->getIntitalPosRulesFile(1), all, p1, f1, "Creating initial positive set for training", resultMap); + RuleFilter::filter(this->getIntitalPosRulesFile(1), all, p1, unlabelled, "Creating initial positive set for training", resultMap); if (!genuineFile.empty()) { - cout << calcPerformance(p1, f1, true)->toLongString(); + cout << calcPerformance(p1, unlabelled)->toLongString(); } cout << endl << "2\t"; - RuleFilter::filter(this->getIntitalPosRulesFile(2), p1, p2, f2, "Creating initial positive set for training", resultMap); + RuleFilter::filter(this->getIntitalPosRulesFile(2), p1, p2, unlabelled, "Creating initial positive set for training", resultMap); if (!genuineFile.empty()) { - cout << calcPerformance(p2, f2, true)->toLongString(); + cout << calcPerformance(p2, unlabelled)->toLongString(); } - cout << endl; + cout << endl << "3\t"; + RuleFilter::filter(this->getIntitalPosRulesFile(3), p2, p3, unlabelled, "Creating initial positive set for training", resultMap); + if (!genuineFile.empty()) { + cout << calcPerformance(p3, unlabelled)->toLongString(); + } + cout << endl << "L95x1.5\t"; const uint32_t L95 = mf.calcIntronThreshold(p2); - const uint32_t L95x2 = L95 * 2; + const uint32_t pos_length_limit = L95 * 1.5; - JunctionList passJuncs, failJuncs; - for(auto& j : p2) { - if (j->getIntronSize() <= 
L95x2) { + JunctionList passJuncs; + for(auto& j : p3) { + if (j->getIntronSize() <= pos_length_limit) { passJuncs.push_back(j); } else { - failJuncs.push_back(j); + unlabelled.push_back(j); } } if (!genuineFile.empty()) { - cout << "L95x2\t" << calcPerformance(passJuncs, failJuncs, true)->toLongString() << endl; + cout << calcPerformance(passJuncs, unlabelled)->toLongString(); } - - // Analyse positive set to get L0.05 of intron size - cout << endl << "Length of intron at 95th percentile of positive set (L95): " << mf.calcIntronThreshold(passJuncs) << endl << endl; + + cout << endl << endl << "Found " << passJuncs.size() << " junctions for the positive set" << endl << endl; + // Analyse positive set to get L0.05 of intron size + cout << "Length of intron at 95th percentile of positive set (L95): " << mf.calcIntronThreshold(passJuncs) << endl << endl; + + cout << "Saving initial positive set:" << endl; JunctionSystem isp(passJuncs); isp.saveAll(output.string() + ".selftrain.initialset.pos", "portcullis_isp"); @@ -533,7 +541,7 @@ void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, Junc } } JunctionSystem missedPos; - for(auto& j : failJuncs) { + for(auto& j : unlabelled) { if (j->isGenuine()) { missedPos.addJunction(j); } @@ -563,7 +571,7 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL cout << Performance::longHeader(); } - JunctionList p1, p2, p3, p4, p5, p6, p7, f1, f2, f3, f4, f5, f6, f7; + JunctionList p1, p2, p3, p4, p5, p6, p7, p8, f1, f2, f3, f4, f5, f6, f7, f8; cout << endl << "1\t"; RuleFilter::filter(this->getIntitalNegRulesFile(1), all, p1, f1, "Creating initial negative set for training", resultMap); @@ -595,11 +603,16 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL if (!genuineFile.empty()) { cout << calcPerformance(p6, f6, true)->toLongString(); } + cout << endl << "7\t"; + RuleFilter::filter(this->getIntitalNegRulesFile(7), f6, p7, f7, "Creating initial 
negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << calcPerformance(p7, f7, true)->toLongString(); + } cout << endl << "L95x5\t"; JunctionList passJuncs, failJuncs; const uint32_t L95x5 = L95 * 5; - for(auto& j : f6) { + for(auto& j : f7) { if (j->getIntronSize() > L95x5 && j->getMaxMMES() < 10 ) { p7.push_back(j); } @@ -608,7 +621,7 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL } } if (!genuineFile.empty()) { - cout << calcPerformance(p7, failJuncs, true)->toLongString(); + cout << calcPerformance(p8, failJuncs, true)->toLongString(); } cout << endl << endl << "Concatenating negatives from each layer to create negative set" << endl << endl; @@ -620,16 +633,21 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL passJuncs.insert(passJuncs.end(), p5.begin(), p5.end()); passJuncs.insert(passJuncs.end(), p6.begin(), p6.end()); passJuncs.insert(passJuncs.end(), p7.begin(), p7.end()); + passJuncs.insert(passJuncs.end(), p8.begin(), p8.end()); + // This will remove any duplicates + JunctionSystem isn(passJuncs); + if (!genuineFile.empty()) { - cout << "Final\t" << calcPerformance(passJuncs, failJuncs, true)->toLongString() << endl; + cout << "Final\t" << calcPerformance(isn.getJunctions(), failJuncs, true)->toLongString() << endl << endl; } - + + cout << "Found " << isn.getJunctions().size() << " junctions for the negative set" << endl << endl; + cout << endl << "Saving initial negative set:" << endl; - JunctionSystem isn(passJuncs); isn.saveAll(output.string() + ".selftrain.initialset.neg", "portcullis_isn"); - for(auto& j : passJuncs) { + for(auto& j : isn.getJunctions()) { JunctionPtr copy = make_shared(*j); copy->setGenuine(false); neg.push_back(copy); @@ -640,7 +658,7 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL JunctionSystem invalidNeg; JunctionSystem missedNeg; - for(auto& j : passJuncs) { + for(auto& j : isn.getJunctions()) { if 
(j->isGenuine()) { invalidNeg.addJunction(j); } @@ -815,8 +833,8 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction cout << "The best F1 score of " << max_f1 << " is achieved with threshold set at " << best_t_f1 << endl; cout << "The best MCC score of " << max_mcc << " is achieved with threshold set at " << best_t_mcc << endl; - - categorise(f, all, pass, fail, best_t_mcc); + cout << "Actual threshold set at " << threshold << endl; + categorise(f, all, pass, fail, best_t_f1); } else { categorise(f, all, pass, fail, threshold); diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index d45edae9..302a5d86 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -299,7 +299,7 @@ class JunctionFilter { void categorise(shared_ptr f, const JunctionList& all, JunctionList& pass, JunctionList& fail, double t); - void createPositiveSet(const JunctionList& all, JunctionList& pos, ModelFeatures& mf); + void createPositiveSet(const JunctionList& all, JunctionList& pos, JunctionList& unlabelled, ModelFeatures& mf); void createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg); From e99e9ec2851a0aab3e858e98e789aad29d37deb6 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Fri, 29 Apr 2016 15:58:27 +0100 Subject: [PATCH 15/15] Changed the default filtering threshold level. 
--- src/junction_filter.hpp | 2 +- tests/Makefile.am | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index 302a5d86..d21b14e2 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -76,7 +76,7 @@ const string ST_IPOS_RULES_FILE = "selftrain_initial_pos"; const string ST_INEG_RULES_FILE = "selftrain_initial_neg"; const uint16_t DEFAULT_FILTER_THREADS = 1; const uint16_t DEFAULT_SELFTRAIN_TREES = 100; -const double DEFAULT_FILTER_THRESHOLD = 0.75; +const double DEFAULT_FILTER_THRESHOLD = 0.625; class JunctionFilter { diff --git a/tests/Makefile.am b/tests/Makefile.am index b3a42b38..d6d27253 100755 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -32,6 +32,7 @@ check_unit_tests_SOURCES = bam_tests.cpp \ check_unit_tests_CXXFLAGS = -O0 @AM_CXXFLAGS@ @CXXFLAGS@ check_unit_tests_CPPFLAGS = -isystem $(top_srcdir)/deps/htslib-1.3 \ + -isystem $(top_srcdir)/deps/ranger-0.3.8/include \ -I$(top_srcdir)/lib/include \ -DRESOURCESDIR=\"$(top_srcdir)/tests/resources\" \ -DDATADIR=\"$(datadir)\" \ @@ -40,12 +41,14 @@ check_unit_tests_CPPFLAGS = -isystem $(top_srcdir)/deps/htslib-1.3 \ check_unit_tests_LDFLAGS = -L../lib \ -L../deps/htslib-1.3 \ + -L../deps/ranger-0.3.8 \ @AM_LDFLAGS@ \ @LDFLAGS@ check_unit_tests_LDADD = libgtest.la \ -lportcullis \ -lphts \ + -lranger \ @AM_LIBS@ clean-local: clean-local-check