From 7315c67e235c8f96c89ce38c02c1b3bf93dc579a Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Wed, 20 Apr 2016 15:27:01 +0100 Subject: [PATCH 01/15] Adding soapsplice2bed script. Fixed a bug in the full pipeline which prevented junction filtering from working. --- Makefile.am | 1 + src/portcullis.cc | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile.am b/Makefile.am index 59ddf4de..e78793b8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -39,6 +39,7 @@ dist_bin_SCRIPTS = scripts/analyse_alignments.py \ scripts/predict.snakefile \ scripts/read_gen.snakefile \ scripts/snakey.py \ + scripts/soapsplice2bed.py \ scripts/spanki2bed.py \ scripts/spanki_filter.py \ scripts/split_error_classes.py \ diff --git a/src/portcullis.cc b/src/portcullis.cc index 10f9b5fc..74af944f 100644 --- a/src/portcullis.cc +++ b/src/portcullis.cc @@ -260,7 +260,8 @@ int mainFull(int argc, char *argv[]) { cout << "Identifying junctions and calculating metrics" << endl << "---------------------------------------------" << endl << endl; - path juncOut = outputDir.string() + "/2-junc/portcullis_all"; + path juncDir = outputDir.string() + "/2-junc"; + path juncOut = juncDir.string() + "/portcullis_all"; // Identify junctions and calculate metrics JunctionBuilder jb(prepDir.string(), juncOut); @@ -281,7 +282,7 @@ int mainFull(int argc, char *argv[]) { << "-------------------" << endl << endl; path filtOut = outputDir.string() + "/3-filt/portcullis_filtered"; - path juncTab = juncOut.string() + "/portcullis_all.junctions.tab"; + path juncTab = juncDir.string() + "/portcullis_all.junctions.tab"; JunctionFilter filter(juncTab, filtOut); filter.setVerbose(verbose); From 7fa519c12ef60beb6243f387679fac9ae653a09f Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Thu, 21 Apr 2016 19:00:44 +0100 Subject: [PATCH 02/15] Fixed a critical bug in the self training routine which prevented it from working unless you label the data. 
Added ability to generate an intron score to help boost accuracy. --- configure.ac | 2 +- data/selftrain_initial_neg.json | 44 +++++++++++---- data/selftrain_initial_neg.json.old | 57 +++++++++++++++++++ data/selftrain_initial_pos.json | 34 ++++++++--- data/selftrain_initial_pos.json.old | 37 ++++++++++++ doc/source/conf.py | 4 +- lib/include/portcullis/junction.hpp | 57 +++++++++++++------ lib/include/portcullis/junction_system.hpp | 2 - lib/include/portcullis/rule_parser.hpp | 11 ++-- lib/src/junction.cc | 63 +++++++++++---------- lib/src/junction_system.cc | 13 ----- lib/src/rule_parser.cc | 14 +++-- src/junction_filter.cc | 65 ++++++++++++++++++---- src/junction_filter.hpp | 4 +- src/train.cc | 37 ++++++------ src/train.hpp | 32 +++++++++-- 16 files changed, 348 insertions(+), 128 deletions(-) create mode 100644 data/selftrain_initial_neg.json.old create mode 100644 data/selftrain_initial_pos.json.old diff --git a/configure.ac b/configure.ac index 94687932..346afb02 100644 --- a/configure.ac +++ b/configure.ac @@ -4,7 +4,7 @@ # Autoconf setup AC_PREREQ([2.68]) -AC_INIT([portcullis],[0.15.0],[daniel.mapleson@tgac.ac.uk],[portcullis],[http://www.tgac.ac.uk]) +AC_INIT([portcullis],[0.16.0],[daniel.mapleson@tgac.ac.uk],[portcullis],[http://www.tgac.ac.uk]) AC_CONFIG_SRCDIR([src/portcullis.cc]) AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_MACRO_DIR([m4]) diff --git a/data/selftrain_initial_neg.json b/data/selftrain_initial_neg.json index 23e962fb..4885e954 100644 --- a/data/selftrain_initial_neg.json +++ b/data/selftrain_initial_neg.json @@ -2,27 +2,35 @@ "parameters": { "M1-canonical_ss": { "operator": "in", - "value": ["N"] + "value": ["N", "S"] }, "M2-nb_reads": { "operator": "lte", "value": 1 }, "M3-nb_dist_aln": { - "operator": "eq", - "value": 1 + "operator": "gte", + "value": 2 }, "M4-nb_rel_aln": { "operator": "eq", "value": 0 }, "M11-entropy": { - "operator": "eq", - "value": 0.0 + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + 
"value": 2.0 }, "M12-maxmmes": { "operator": "lt", - "value": 8 + "value": 7 + }, + "M12-maxmmes.2": { + "operator": "lte", + "value": 10 }, "M8-max_min_anc": { "operator": "lt", @@ -38,16 +46,32 @@ }, "M13-hamming5p": { "operator": "lte", - "value": 1 + "value": 2 }, "M14-hamming3p": { "operator": "lte", - "value": 1 + "value": 2 }, "M19-mean_mismatches": { "operator": "gte", "value": 5.0 - } + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + }, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + } }, - "expression": "M2-nb_reads | M12-maxmmes | Suspect" + "expression": "( M20-nb_usrs & M21-nb_msrs & M12-maxmmes.2 ) | ( M20-nb_usrs.2 & M1-canonical_ss & M12-maxmmes.2 ) | ( Suspect & M11-entropy.2 ) | M12-maxmmes | ( M1-canonical_ss & M4-nb_rel_aln )" } diff --git a/data/selftrain_initial_neg.json.old b/data/selftrain_initial_neg.json.old new file mode 100644 index 00000000..86eb3f59 --- /dev/null +++ b/data/selftrain_initial_neg.json.old @@ -0,0 +1,57 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + "M3-nb_dist_aln": { + "operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 9 + }, + "M12-maxmmes.2": { + "operator": "lte", + "value": 12 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + "Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + "M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + } + }, + "expression": "M19-mean_mismatches | ( M1-canonical_ss & M11-entropy ) | ( M4-nb_rel_aln & 
M11-entropy ) | ( M12-maxmmes & M11-entropy ) | ( M12-maxmmes.2 & M2-nb_reads ) | ( M13-hamming5p & M14-hamming3p ) | PFP | ( Suspect & M2-nb_reads )" +} diff --git a/data/selftrain_initial_pos.json b/data/selftrain_initial_pos.json index d850bd63..a3440720 100644 --- a/data/selftrain_initial_pos.json +++ b/data/selftrain_initial_pos.json @@ -2,32 +2,52 @@ "parameters": { "M1-canonical_ss": { "operator": "in", - "value": ["C"] + "value": ["C", "S"] }, "M4-nb_rel_aln": { "operator": "gte", "value": 5 }, + "M4-nb_rel_aln.2": { + "operator": "gte", + "value": 1 + }, "M12-maxmmes": { "operator": "gte", "value": 20 }, "M11-entropy": { "operator": "gt", - "value": 2.5 + "value": 5.0 }, "M13-hamming5p": { "operator": "gte", - "value": 8 + "value": 5 + }, + "M13-hamming5p.2": { + "operator": "gte", + "value": 7 }, "M14-hamming3p": { "operator": "gte", - "value": 8 + "value": 5 + }, + "M14-hamming3p.2": { + "operator": "gte", + "value": 7 }, "M19-mean_mismatches": { - "operator": "lte", + "operator": "eq", + "value": 0 + }, + "M19-mean_mismatches.2": { + "operator": "lt", "value": 1.0 - } + }, + "M20-nb_usrs": { + "operator": "gte", + "value": 5 + } }, - "expression": "M1-canonical_ss & M4-nb_rel_aln & M11-entropy & M12-maxmmes & M13-hamming5p & M14-hamming3p & M19-mean_mismatches" + "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & ( ( M19-mean_mismatches & M20-nb_usrs ) | ( M12-maxmmes & M11-entropy & M19-mean_mismatches.2 ) )" } diff --git a/data/selftrain_initial_pos.json.old b/data/selftrain_initial_pos.json.old new file mode 100644 index 00000000..c4552792 --- /dev/null +++ b/data/selftrain_initial_pos.json.old @@ -0,0 +1,37 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["C"] + }, + "M4-nb_rel_aln": { + "operator": "gte", + "value": 5 + }, + "M12-maxmmes": { + "operator": "gte", + "value": 18 + }, + "M11-entropy": { + "operator": "gt", + "value": 2.5 + }, + "M13-hamming5p": { + "operator": "gte", + "value": 7 + }, + 
"M14-hamming3p": { + "operator": "gte", + "value": 7 + }, + "M19-mean_mismatches": { + "operator": "eq", + "value": 0.0 + }, + "M20-nb_usrs": { + "operator": "gte", + "value": 5 + } + }, + "expression": "M1-canonical_ss & M4-nb_rel_aln & M11-entropy & M12-maxmmes & M13-hamming5p & M14-hamming3p & M19-mean_mismatches & M20-nb_usrs" +} diff --git a/doc/source/conf.py b/doc/source/conf.py index cec4720f..419f9401 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -53,9 +53,9 @@ # built documents. # # The short X.Y version. -version = '0.15.0' +version = '0.16.0' # The full version, including alpha/beta/rc tags. -release = '0.15.0' +release = '0.16.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/lib/include/portcullis/junction.hpp b/lib/include/portcullis/junction.hpp index ed510d25..047c0744 100644 --- a/lib/include/portcullis/junction.hpp +++ b/lib/include/portcullis/junction.hpp @@ -24,6 +24,7 @@ #include #include using std::ostream; +using std::cout; using std::endl; using std::min; using std::max; @@ -97,14 +98,15 @@ const vector METRIC_NAMES = { "M17-primary_junc", "M18-mm_score", "M19-mean_mismatches", - "M20-nb_msrs", - "M21-nb_up_juncs", - "M22-nb_down_juncs", - "M23-up_aln", - "M24-down_aln", - "M25-dist_2_up_junc", - "M26-dist_2_down_junc", - "M27-dist_nearest_junc" + "M20-nb_usrs", + "M21-nb_msrs", + "M22-nb_up_juncs", + "M23-nb_down_juncs", + "M24-up_aln", + "M25-down_aln", + "M26-dist_2_up_junc", + "M27-dist_2_down_junc", + "M28-dist_nearest_junc" }; const vector STRAND_NAMES = { @@ -257,15 +259,16 @@ class Junction { bool primaryJunction; // Metric 17 double multipleMappingScore; // Metric 18 double meanMismatches; // Metric 19 - uint32_t nbMultipleSplicedReads; // Metric 20 - uint16_t nbUpstreamJunctions; // Metric 21 - uint16_t nbDownstreamJunctions; // Metric 22 - uint32_t nbUpstreamFlankingAlignments; // Metric 23 - uint32_t nbDownstreamFlankingAlignments; // 
Metric 24 - int32_t distanceToNextUpstreamJunction; // Metric 25 - int32_t distanceToNextDownstreamJunction; // Metric 26 - int32_t distanceToNearestJunction; // Metric 27 - vector junctionOverhangs; // Metric 28-37 + //uint32_t nbUniquelySplicedReads; // Metric 20 (Use getter) + uint32_t nbMultipleSplicedReads; // Metric 21 + uint16_t nbUpstreamJunctions; // Metric 22 + uint16_t nbDownstreamJunctions; // Metric 23 + uint32_t nbUpstreamFlankingAlignments; // Metric 24 + uint32_t nbDownstreamFlankingAlignments; // Metric 25 + int32_t distanceToNextUpstreamJunction; // Metric 26 + int32_t distanceToNextDownstreamJunction; // Metric 27 + int32_t distanceToNearestJunction; // Metric 28 + vector junctionOverhangs; // Metric 29-38 // **** Predictions **** @@ -689,6 +692,11 @@ class Junction { return nbMultipleSplicedReads; } + uint32_t getNbUniquelySplicedReads() const { + return nbJunctionAlignments - nbMultipleSplicedReads; + } + + uint16_t getNbDownstreamJunctions() const { return nbDownstreamJunctions; } @@ -813,7 +821,7 @@ class Junction { void setMeanMismatches(double meanMismatches) { this->meanMismatches = meanMismatches; } - + void setNbMultipleSplicedReads(uint32_t nbMultipleSplicedReads) { this->nbMultipleSplicedReads = nbMultipleSplicedReads; } @@ -842,6 +850,18 @@ class Junction { string locationAsString() const { return this->intron->toString() + strandToChar(this->consensusStrand); } + + /** + * Calculates a score for this intron size based on how this intron size fits + * into an expected distribution specified by the length at the 95th percentile + * (L95) provided by the user. Introns of length < L95 have a score of 0. + * Introns with length > L95 have score: -ln(size - L95) + * @param L95 Intron size at 95 percentile of a correct distribution + * @return A score for this intron size given the L95 + */ + double calcIntronScore(const uint32_t L95) const { + return this->intron->size() <= L95 ? 
0.0 : -log(this->intron->size() - L95); + } @@ -921,6 +941,7 @@ class Junction { << j.primaryJunction << "\t" << j.multipleMappingScore << "\t" << j.meanMismatches << "\t" + << j.getNbUniquelySplicedReads() << "\t" << j.nbMultipleSplicedReads << "\t" << j.nbDownstreamJunctions << "\t" << j.nbUpstreamJunctions << "\t" diff --git a/lib/include/portcullis/junction_system.hpp b/lib/include/portcullis/junction_system.hpp index 61b7bc8e..47b91d03 100644 --- a/lib/include/portcullis/junction_system.hpp +++ b/lib/include/portcullis/junction_system.hpp @@ -152,8 +152,6 @@ class JunctionSystem { void calcJunctionStats(bool verbose); - void calcTrimmedOverhangScore(); - void sort(); void saveAll(const path& outputPrefix, const string& source); diff --git a/lib/include/portcullis/rule_parser.hpp b/lib/include/portcullis/rule_parser.hpp index 27dfd2b5..ac24a33b 100644 --- a/lib/include/portcullis/rule_parser.hpp +++ b/lib/include/portcullis/rule_parser.hpp @@ -88,11 +88,12 @@ const unordered_set numericParams = { "M17-primary_junc", "M18-mm_score", "M19-nb_mismatches", - "M20-nb_msrs", - "M21-nb_up_juncs", - "M22-nb_down_juncs", - "M23-up_aln", - "M24-down_aln" + "M20-nb_usrs", + "M21-nb_msrs", + "M22-nb_up_juncs", + "M23-nb_down_juncs", + "M24-up_aln", + "M25-down_aln" }; const unordered_set stringParams = { diff --git a/lib/src/junction.cc b/lib/src/junction.cc index 5e0d4c2d..af3a438d 100644 --- a/lib/src/junction.cc +++ b/lib/src/junction.cc @@ -823,10 +823,10 @@ void portcullis::Junction::calcMismatchStats() { // Assuming we have some mismatches determine if this junction has no overhangs // extending beyond first mismatch. 
If so determine if that distance is small // enough to consider the junction as suspicious - if (nbMismatches > 0) { + if (nbMismatches > 0 && firstMismatch < 20) { bool found = false; for(const auto& a : alignments) { - if (a->mmes > firstMismatch) { + if (a->minMatch > firstMismatch) { found = true; break; } @@ -941,14 +941,15 @@ void portcullis::Junction::outputDescription(std::ostream &strm, string delimite << "M17: Primary Junction: " << boolalpha << primaryJunction << delimiter << "M18: Multiple mapping score: " << multipleMappingScore << delimiter << "M19: Mean mismatches: " << meanMismatches << delimiter - << "M20: # Multiple Spliced Reads: " << nbMultipleSplicedReads << delimiter - << "M21: # Upstream Junctions: " << nbUpstreamJunctions << delimiter - << "M22: # Downstream Junctions: " << nbDownstreamJunctions << delimiter - << "M23: # Upstream Non-Spliced Alignments: " << nbUpstreamFlankingAlignments << delimiter - << "M24: # Downstream Non-Spliced Alignments: " << nbDownstreamFlankingAlignments << delimiter - << "M25: Distance to next upstream junction: " << distanceToNextUpstreamJunction << delimiter - << "M26: Distance to next downstream junction: " << distanceToNextDownstreamJunction << delimiter - << "M27: Distance to nearest junction: " << distanceToNearestJunction; + << "M20: # Uniquely Spliced Reads: " << getNbUniquelySplicedReads() << delimiter + << "M21: # Multiple Spliced Reads: " << nbMultipleSplicedReads << delimiter + << "M22: # Upstream Junctions: " << nbUpstreamJunctions << delimiter + << "M23: # Downstream Junctions: " << nbDownstreamJunctions << delimiter + << "M24: # Upstream Non-Spliced Alignments: " << nbUpstreamFlankingAlignments << delimiter + << "M25: # Downstream Non-Spliced Alignments: " << nbDownstreamFlankingAlignments << delimiter + << "M26: Distance to next upstream junction: " << distanceToNextUpstreamJunction << delimiter + << "M27: Distance to next downstream junction: " << distanceToNextDownstreamJunction << delimiter + 
<< "M28: Distance to nearest junction: " << distanceToNearestJunction; } /** @@ -979,14 +980,15 @@ void portcullis::Junction::condensedOutputDescription(std::ostream &strm, string << "M17-PrimaryJunction=" << boolalpha << primaryJunction << delimiter << "M18-MultipleMappingScore=" << multipleMappingScore << delimiter << "M19-MeanMismatches=" << meanMismatches << delimiter - << "M20-NbMultipleSplicedReads=" << nbMultipleSplicedReads << delimiter - << "M21-NbUpstreamJunctions=" << nbUpstreamJunctions << delimiter - << "M22-NbDownstreamJunctions=" << nbDownstreamJunctions << delimiter - << "M23-NbUpstreamNonSplicedAlignments=" << nbUpstreamFlankingAlignments << delimiter - << "M24-NbDownstreamNonSplicedAlignments=" << nbDownstreamFlankingAlignments << delimiter - << "M25-DistanceToNextUpstreamJunction=" << distanceToNextUpstreamJunction << delimiter - << "M26-DistanceToNextDownstreamJunction=" << distanceToNextDownstreamJunction << delimiter - << "M27-DistanceToNearestJunction=" << distanceToNearestJunction << delimiter + << "M20-NbUniquelySplicedReads=" << getNbUniquelySplicedReads() << delimiter + << "M21-NbMultipleSplicedReads=" << nbMultipleSplicedReads << delimiter + << "M22-NbUpstreamJunctions=" << nbUpstreamJunctions << delimiter + << "M23-NbDownstreamJunctions=" << nbDownstreamJunctions << delimiter + << "M24-NbUpstreamNonSplicedAlignments=" << nbUpstreamFlankingAlignments << delimiter + << "M25-NbDownstreamNonSplicedAlignments=" << nbDownstreamFlankingAlignments << delimiter + << "M26-DistanceToNextUpstreamJunction=" << distanceToNextUpstreamJunction << delimiter + << "M27-DistanceToNextDownstreamJunction=" << distanceToNextDownstreamJunction << delimiter + << "M28-DistanceToNearestJunction=" << distanceToNearestJunction << delimiter << "PFP=" << boolalpha << pfp; } @@ -1200,21 +1202,22 @@ shared_ptr portcullis::Junction::parse(const string& line) j->setPrimaryJunction(lexical_cast(parts[29])); j->setMultipleMappingScore(lexical_cast(parts[30])); 
j->setMeanMismatches(lexical_cast(parts[31])); - j->setNbMultipleSplicedReads(lexical_cast(parts[32])); - j->setNbUpstreamJunctions(lexical_cast(parts[33])); - j->setNbDownstreamJunctions(lexical_cast(parts[34])); - j->setNbUpstreamFlankingAlignments(lexical_cast(parts[35])); - j->setNbDownstreamFlankingAlignments(lexical_cast(parts[36])); - j->setDistanceToNextUpstreamJunction(lexical_cast(parts[37])); - j->setDistanceToNextDownstreamJunction(lexical_cast(parts[38])); - j->setDistanceToNearestJunction(lexical_cast(parts[39])); + //j->setNbUniquelySplicedReads(lexical_cast(parts[32])); + j->setNbMultipleSplicedReads(lexical_cast(parts[33])); + j->setNbUpstreamJunctions(lexical_cast(parts[34])); + j->setNbDownstreamJunctions(lexical_cast(parts[35])); + j->setNbUpstreamFlankingAlignments(lexical_cast(parts[36])); + j->setNbDownstreamFlankingAlignments(lexical_cast(parts[37])); + j->setDistanceToNextUpstreamJunction(lexical_cast(parts[38])); + j->setDistanceToNextDownstreamJunction(lexical_cast(parts[39])); + j->setDistanceToNearestJunction(lexical_cast(parts[40])); // Read Junction overhangs - j->setMeanQueryLength(lexical_cast(parts[40])); - j->setSuspicious(lexical_cast(parts[41])); - j->setPotentialFalsePositive(lexical_cast(parts[42])); + j->setMeanQueryLength(lexical_cast(parts[41])); + j->setSuspicious(lexical_cast(parts[42])); + j->setPotentialFalsePositive(lexical_cast(parts[43])); for(size_t i = 0; i < JO_NAMES.size(); i++) { - j->setJunctionOverhangs(i, lexical_cast(parts[43 + i])); + j->setJunctionOverhangs(i, lexical_cast(parts[44 + i])); } return j; diff --git a/lib/src/junction_system.cc b/lib/src/junction_system.cc index 7104d459..6a6c0b2a 100644 --- a/lib/src/junction_system.cc +++ b/lib/src/junction_system.cc @@ -368,19 +368,6 @@ void portcullis::JunctionSystem::calcJunctionStats(bool verbose) { } } -/** - * Used of calculating metric 27. Trimmed Overhang score. 
This provide a probability - * that this junction is genuine based on comparisons of expected distribution - * of coverage to observed distribution of coverage, after reads have been trimmed - * to the first mismatch up and downstream of the junction. An L1 regularized - * logistic regression is then fitted to - * - * This metric similar to this is used in FineSplice. - */ -void portcullis::JunctionSystem::calcTrimmedOverhangScore() { - - -} void portcullis::JunctionSystem::sort() { diff --git a/lib/src/rule_parser.cc b/lib/src/rule_parser.cc index 0c9a0380..a3bb74d6 100644 --- a/lib/src/rule_parser.cc +++ b/lib/src/rule_parser.cc @@ -219,14 +219,16 @@ double portcullis::eval::getNumericFromJunc(const var& fullname) const { case 18: return junc->getMeanMismatches(); case 19: - return junc->getNbMultipleSplicedReads(); + return junc->getNbUniquelySplicedReads(); case 20: - return junc->getNbUpstreamJunctions(); + return junc->getNbMultipleSplicedReads(); case 21: - return junc->getNbDownstreamJunctions(); + return junc->getNbUpstreamJunctions(); case 22: - return junc->getNbUpstreamFlankingAlignments(); + return junc->getNbDownstreamJunctions(); case 23: + return junc->getNbUpstreamFlankingAlignments(); + case 24: return junc->getNbDownstreamFlankingAlignments(); } @@ -235,7 +237,7 @@ double portcullis::eval::getNumericFromJunc(const var& fullname) const { return junc->isSuspicious(); } else if (boost::iequals(name, "PFP")) { - return junc->getSpliceSiteType(); + return junc->isPotentialFalsePositive(); } @@ -257,7 +259,7 @@ string portcullis::eval::getStringFromJunc(const var& fullname) const { } BOOST_THROW_EXCEPTION(RuleParserException() << RuleParserErrorInfo(string( - "Unrecognised param: ") + name)); + "Unrecognised param: ") + name)); } bool portcullis::eval::evalNumberLeaf(Operator op, double threshold, double value) const { diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 2e3bd4f6..cdee0531 100644 --- a/src/junction_filter.cc +++ 
b/src/junction_filter.cc @@ -196,6 +196,9 @@ void portcullis::JunctionFilter::filter() { currentJuncs.push_back(j); } + // To be overridden if we are training + uint32_t L95 = 0; + if (train) { JunctionList initialSet; @@ -216,7 +219,9 @@ void portcullis::JunctionFilter::filter() { pos = passJuncs.size(); for(auto& j : passJuncs) { - initialSet.push_back(j); + JunctionPtr copy = make_shared(*j); + copy->setGenuine(true); + initialSet.push_back(copy); } if (!genuineFile.empty()) { shared_ptr p = calcPerformance(passJuncs, failJuncs); @@ -243,7 +248,11 @@ void portcullis::JunctionFilter::filter() { cout << "Saving missed positive junctions to disk:" << endl; missedPos.saveAll(output.string() + ".selftrain.initialset.missedpos", "portcullis_missed_isp"); } - + + + // Analyse positive set to get L0.05 of intron size + L95 = this->calcIntronThreshold(passJuncs); + cout << "Length of intron at 95th percentile of positive set: " << L95 << endl << endl; passJuncs.clear(); failJuncs.clear(); @@ -257,7 +266,9 @@ void portcullis::JunctionFilter::filter() { neg = passJuncs.size(); for(auto& j : passJuncs) { - initialSet.push_back(j); + JunctionPtr copy = make_shared(*j); + copy->setGenuine(false); + initialSet.push_back(copy); } if (!genuineFile.empty()) { @@ -286,11 +297,17 @@ void portcullis::JunctionFilter::filter() { cout << "Saving missed negative junctions to disk:" << endl; missedNeg.saveAll(output.string() + ".selftrain.initialset.missedneg", "portcullis_missed_isn"); } + + const uint32_t L95_fail = this->calcIntronThreshold(passJuncs); + cout << "Length of intron at 95th percentile of negative set: " << L95_fail << endl << endl; + cout << "Initial training set consists of " << pos << " positive and " << neg << " negative junctions." 
<< endl << endl; + + cout << "Training a random forest model using the initial positive and negative datasets" << endl; - shared_ptr forest = Train::trainInstance(initialSet, output.string() + ".selftrain", DEFAULT_TRAIN_TREES, threads, true, true); + shared_ptr forest = Train::trainInstance(initialSet, output.string() + ".selftrain", DEFAULT_TRAIN_TREES, threads, true, true, L95); forest->saveToFile(); forest->writeOutput(&cout); @@ -311,7 +328,7 @@ void portcullis::JunctionFilter::filter() { JunctionList passJuncs; JunctionList failJuncs; - forestPredict(currentJuncs, passJuncs, failJuncs); + forestPredict(currentJuncs, passJuncs, failJuncs, L95); printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Random Forest filtering")); @@ -451,6 +468,18 @@ void portcullis::JunctionFilter::filter() { } } +uint32_t portcullis::JunctionFilter::calcIntronThreshold(const JunctionList& juncs) { + + vector intron_sizes; + for(auto& j : juncs) { + intron_sizes.push_back(j->getIntronSize()); + } + + std::sort(intron_sizes.begin(), intron_sizes.end()); + + return intron_sizes[intron_sizes.size() * 0.95]; +} + void portcullis::JunctionFilter::printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix) { // Output stats size_t diff = in.size() - pass.size(); @@ -512,13 +541,13 @@ void portcullis::JunctionFilter::doRuleBasedFiltering(const path& ruleFile, cons } -void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail) { +void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, const uint32_t L95) { if (verbose) { cerr << endl << "Preparing junction metrics into matrix" << endl; } - Data* testingData = Train::juncs2FeatureVectors(all); + Data* testingData = Train::juncs2FeatureVectors(all, L95); // Load forest from disk @@ -571,9 +600,15 @@ void portcullis::JunctionFilter::forestPredict(const 
JunctionList& all, Junction f->run(verbose); if (verbose) { - cerr << "Separating original junction data into pass and fail categories" << endl; + cerr << "Separating original junction data into pass and fail categories." << endl; } + + path scorepath = output.string() + ".scores"; + ofstream scoreStream(scorepath.c_str()); + for(size_t i = 0; i < all.size(); i++) { + + scoreStream << f->getPredictions()[i][0] << endl; if (f->getPredictions()[i][0] >= 0.5) { pass.push_back(all[i]); } @@ -582,6 +617,11 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction } } + scoreStream.close(); + + cout << "Saved junction scores to: " << scorepath << endl; + + delete testingData; } @@ -594,6 +634,7 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { path referenceFile; path output; uint16_t threads; + bool no_ml; bool saveBad; int32_t max_length; string canonical; @@ -618,6 +659,8 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { "If you have a list of line separated boolean values in a file, indicating whether each junction in your input is genuine or not, then we can use that information here to gauge the accuracy of the predictions.") ("reference,r", po::value(&referenceFile), "Reference annotation of junctions in BED format. Any junctions found by the junction analysis tool will be preserved if found in this reference file regardless of any other filtering criteria. If you need to convert a reference annotation from GTF or GFF to BED format portcullis contains scripts for this.") + ("no_ml,n", po::bool_switch(&no_ml)->default_value(false), + "Disables machine learning filtering") ("save_bad,b", po::bool_switch(&saveBad)->default_value(false), "Saves bad junctions (i.e. 
junctions that fail the filter), as well as good junctions (those that pass)") ("source", po::value(&source)->default_value(DEFAULT_FILTER_SOURCE), @@ -679,12 +722,14 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { // Only set the filter rules if specified. filter.setFilterFile(filterFile); filter.setGenuineFile(genuineFile); - if (modelFile.empty()) { + if (modelFile.empty() && !no_ml) { filter.setTrain(true); } else { filter.setTrain(false); - filter.setModelFile(modelFile); + if (!no_ml) { + filter.setModelFile(modelFile); + } } filter.setReferenceFile(referenceFile); diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index 12642bcd..5ff86eef 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -261,7 +261,7 @@ class JunctionFilter { protected: - void forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail); + void forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, const uint32_t L95); shared_ptr calcPerformance(const JunctionList& pass, const JunctionList& fail) { return calcPerformance(pass, fail, false); @@ -272,6 +272,8 @@ class JunctionFilter { void doRuleBasedFiltering(const path& ruleFile, const JunctionList& all, JunctionList& pass, JunctionList& fail, const string& prefix, JuncResultMap& resultMap); + uint32_t calcIntronThreshold(const JunctionList& pass); + public: static string helpMessage() { diff --git a/src/train.cc b/src/train.cc index 82c2abc3..5907e434 100644 --- a/src/train.cc +++ b/src/train.cc @@ -68,7 +68,7 @@ portcullis::Train::Train(const path& _junctionFile, const path& _refFile){ verbose = false; } -Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x) { +Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint32_t L95) { vector headers; headers.reserve( VAR_NAMES.size() + JO_NAMES.size() ); @@ -82,16 +82,17 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x) { for (const auto& j : x) 
{ d[0 * x.size() + row] = j->getNbJunctionAlignments(); - d[1 * x.size() + row] = j->getNbDistinctAlignments(); - d[2 * x.size() + row] = j->getNbReliableAlignments(); - d[3 * x.size() + row] = j->getMaxMinAnchor(); - d[4 * x.size() + row] = j->getDiffAnchor(); - d[5 * x.size() + row] = j->getNbDistinctAnchors(); - d[6 * x.size() + row] = j->getEntropy(); - d[7 * x.size() + row] = j->getMaxMMES(); - d[8 * x.size() + row] = j->getHammingDistance5p(); - d[9 * x.size() + row] = j->getHammingDistance3p(); - d[10 * x.size() + row] = j->isGenuine(); + //d[1 * x.size() + row] = j->getNbDistinctAlignments(); + d[1 * x.size() + row] = j->getNbReliableAlignments(); + //d[3 * x.size() + row] = j->getMaxMinAnchor(); + //d[4 * x.size() + row] = j->getDiffAnchor(); + //d[5 * x.size() + row] = j->getNbDistinctAnchors(); + d[2 * x.size() + row] = j->getEntropy(); + d[3 * x.size() + row] = j->getMaxMMES(); + d[4 * x.size() + row] = j->getHammingDistance5p(); + d[5 * x.size() + row] = j->getHammingDistance3p(); + //d[6 * x.size() + row] = L95 == 0 ? 
0.0 : j->calcIntronScore(L95); // Intron score + d[6 * x.size() + row] = j->isGenuine(); // Junction overhang values at each position are first converted into deviation from expected distributions double half_read_length = (double)j->getMeanQueryLength() / 2.0; for(size_t i = 0; i < JO_NAMES.size(); i++) { @@ -101,7 +102,7 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x) { double Ei = (double)j->getNbJunctionAlignments() * Pi; // Expected count at this position double Xi = abs(log2(Ni / Ei)); // Log deviation - d[(i + 11) * x.size() + row] = Xi; + d[(i + 7) * x.size() + row] = Xi; } row++; @@ -112,10 +113,12 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x) { return data; } -shared_ptr portcullis::Train::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose) { +shared_ptr portcullis::Train::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, const uint32_t L95) { - Data* trainingData = juncs2FeatureVectors(x); + cout << "Creating feature vector" << endl; + Data* trainingData = juncs2FeatureVectors(x, L95); + cout << "Initialising random forest" << endl; shared_ptr f = nullptr; if (regressionMode) { f = make_shared(); @@ -146,17 +149,17 @@ shared_ptr portcullis::Train::trainInstance(const JunctionList& x, strin false, // predall 1.0); // Sample fraction - + cout << "Training" << endl; f->setVerboseOut(&cerr); f->run(verbose); return f; } -void portcullis::Train::testInstance(shared_ptr f, const JunctionList& x) { +void portcullis::Train::testInstance(shared_ptr f, const JunctionList& x, const uint32_t L95) { // Convert testing set junctions into feature vector - Data* testingData = juncs2FeatureVectors(x); + Data* testingData = juncs2FeatureVectors(x, L95); f->setPredictionMode(true); f->setData(testingData); diff --git a/src/train.hpp b/src/train.hpp index 68fefdff..7abed309 
100644 --- a/src/train.hpp +++ b/src/train.hpp @@ -51,9 +51,18 @@ const double DEFAULT_TRAIN_FRACTION = 1.0; const int DEFAULT_SEED = 1234567; // To avoid non-deterministic behaviour // List of variable names -const vector VAR_NAMES = { "M2-nb-reads", "M3-nb_dist_aln", "M4-nb_rel_aln", - "M8-max_min_anc", "M9-dif_anc", "M10-dist_anc", "M11-entropy", - "M12-maxmmes", "M13-hammping5p", "M14-hamming3p", +const vector VAR_NAMES = { + "M2-nb-reads", + //"M3-nb_dist_aln", + "M4-nb_rel_aln", + //"M8-max_min_anc", + //"M9-dif_anc", + //"M10-dist_anc", + "M11-entropy", + "M12-maxmmes", + "M13-hamming5p", + "M14-hamming3p", + //"IntronScore", "Genuine" }; // Derived from https://sureshamrita.wordpress.com/2011/08/24/c-implementation-of-k-fold-cross-validation/ @@ -220,15 +229,26 @@ class Train { static int main(int argc, char *argv[]); - static Data* juncs2FeatureVectors(const JunctionList& x); + static Data* juncs2FeatureVectors(const JunctionList& x) { + return juncs2FeatureVectors(x, 0); + } + static Data* juncs2FeatureVectors(const JunctionList& x, const uint32_t L95); static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, - uint16_t trees, uint16_t threads, bool regressionMode, bool verbose); + uint16_t trees, uint16_t threads, bool regressionMode, bool verbose) { + return trainInstance(x, outputPrefix, trees, threads, regressionMode, verbose, 0); + } + static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, + uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, const uint32_t L95); protected: - void testInstance(shared_ptr f, const JunctionList& y); + void testInstance(shared_ptr f, const JunctionList& y) { + return testInstance(f, y, 0); + } + + void testInstance(shared_ptr f, const JunctionList& y, const uint32_t L95); void getRandomSubset(const JunctionList& in, JunctionList& out); From f6165ff5177056b485c8074690ec05ff357bf532 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Fri, 22 Apr 2016 13:26:28 +0100 
Subject: [PATCH 03/15] Updating the junction analysis tool to include reliable 2 raw alignment ratio scores. Also removed some of the features going into the prediction tool which seems to ensure that all features are now contributing to the decision making process under all the situations I've tested so far. Performance seems on par with finesplice, but we are still a little way behind truesight and soapsplice with this version. --- data/selftrain_initial_pos.json | 6 ++- lib/include/portcullis/junction.hpp | 54 ++++++++++++++------------ lib/include/portcullis/rule_parser.hpp | 9 +++-- lib/src/junction.cc | 39 ++++++++++--------- lib/src/rule_parser.cc | 9 +++-- src/junction_filter.cc | 31 ++++++++++----- src/train.cc | 24 ++++++------ src/train.hpp | 16 ++++---- 8 files changed, 110 insertions(+), 78 deletions(-) diff --git a/data/selftrain_initial_pos.json b/data/selftrain_initial_pos.json index a3440720..83539e6f 100644 --- a/data/selftrain_initial_pos.json +++ b/data/selftrain_initial_pos.json @@ -47,7 +47,11 @@ "M20-nb_usrs": { "operator": "gte", "value": 5 + }, + "M22-rel2raw": { + "operator": "gte", + "value": 0.1 } }, - "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & ( ( M19-mean_mismatches & M20-nb_usrs ) | ( M12-maxmmes & M11-entropy & M19-mean_mismatches.2 ) )" + "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M22-rel2raw & ( ( M19-mean_mismatches & M20-nb_usrs ) | ( M12-maxmmes & M11-entropy & M19-mean_mismatches.2 ) )" } diff --git a/lib/include/portcullis/junction.hpp b/lib/include/portcullis/junction.hpp index 047c0744..f7219007 100644 --- a/lib/include/portcullis/junction.hpp +++ b/lib/include/portcullis/junction.hpp @@ -100,13 +100,14 @@ const vector METRIC_NAMES = { "M19-mean_mismatches", "M20-nb_usrs", "M21-nb_msrs", - "M22-nb_up_juncs", - "M23-nb_down_juncs", - "M24-up_aln", - "M25-down_aln", - "M26-dist_2_up_junc", - "M27-dist_2_down_junc", - "M28-dist_nearest_junc" + "M22-rel2raw", + "M23-nb_up_juncs", + 
"M24-nb_down_juncs", + "M25-up_aln", + "M26-down_aln", + "M27-dist_2_up_junc", + "M28-dist_2_down_junc", + "M29-dist_nearest_junc" }; const vector STRAND_NAMES = { @@ -259,16 +260,17 @@ class Junction { bool primaryJunction; // Metric 17 double multipleMappingScore; // Metric 18 double meanMismatches; // Metric 19 - //uint32_t nbUniquelySplicedReads; // Metric 20 (Use getter) + //uint32_t nbUniquelySplicedReads; // Metric 20 (Use getter) uint32_t nbMultipleSplicedReads; // Metric 21 - uint16_t nbUpstreamJunctions; // Metric 22 - uint16_t nbDownstreamJunctions; // Metric 23 - uint32_t nbUpstreamFlankingAlignments; // Metric 24 - uint32_t nbDownstreamFlankingAlignments; // Metric 25 - int32_t distanceToNextUpstreamJunction; // Metric 26 - int32_t distanceToNextDownstreamJunction; // Metric 27 - int32_t distanceToNearestJunction; // Metric 28 - vector junctionOverhangs; // Metric 29-38 + //double reliableVsRawReadRatio; // Metric 22 (Use getter) + uint16_t nbUpstreamJunctions; // Metric 23 + uint16_t nbDownstreamJunctions; // Metric 24 + uint32_t nbUpstreamFlankingAlignments; // Metric 25 + uint32_t nbDownstreamFlankingAlignments; // Metric 26 + int32_t distanceToNextUpstreamJunction; // Metric 27 + int32_t distanceToNextDownstreamJunction; // Metric 28 + int32_t distanceToNearestJunction; // Metric 29 + vector junctionOverhangs; // Metric 30-49 // **** Predictions **** @@ -695,7 +697,10 @@ class Junction { uint32_t getNbUniquelySplicedReads() const { return nbJunctionAlignments - nbMultipleSplicedReads; } - + + double getReliable2RawRatio() const { + return (double)nbReliableAlignments / (double)nbJunctionAlignments; + } uint16_t getNbDownstreamJunctions() const { return nbDownstreamJunctions; @@ -853,14 +858,14 @@ class Junction { /** * Calculates a score for this intron size based on how this intron size fits - * into an expected distribution specified by the length at the 95th percentile - * (L95) provided by the user. Introns of length < L95 have a score of 0. 
- * Introns with length > L95 have score: -ln(size - L95) - * @param L95 Intron size at 95 percentile of a correct distribution - * @return A score for this intron size given the L95 + * into an expected distribution specified by the length at the threhsold percentile + * (threshold) provided by the user. Introns of length < threshold have a score of 0. + * Introns with length > threshold have score: -ln(size - length_threshold) + * @param Length of threshold Intron size + * @return A score for this intron size given the threshold value */ - double calcIntronScore(const uint32_t L95) const { - return this->intron->size() <= L95 ? 0.0 : -log(this->intron->size() - L95); + double calcIntronScore(const uint32_t threshold) const { + return this->intron->size() <= threshold ? 0.0 : log(this->intron->size() - threshold); } @@ -943,6 +948,7 @@ class Junction { << j.meanMismatches << "\t" << j.getNbUniquelySplicedReads() << "\t" << j.nbMultipleSplicedReads << "\t" + << j.getReliable2RawRatio() << "\t" << j.nbDownstreamJunctions << "\t" << j.nbUpstreamJunctions << "\t" << j.nbUpstreamFlankingAlignments << "\t" diff --git a/lib/include/portcullis/rule_parser.hpp b/lib/include/portcullis/rule_parser.hpp index ac24a33b..31f39ec0 100644 --- a/lib/include/portcullis/rule_parser.hpp +++ b/lib/include/portcullis/rule_parser.hpp @@ -90,10 +90,11 @@ const unordered_set numericParams = { "M19-nb_mismatches", "M20-nb_usrs", "M21-nb_msrs", - "M22-nb_up_juncs", - "M23-nb_down_juncs", - "M24-up_aln", - "M25-down_aln" + "M22-rel2raw", + "M23-nb_up_juncs", + "M24-nb_down_juncs", + "M25-up_aln", + "M26-down_aln" }; const unordered_set stringParams = { diff --git a/lib/src/junction.cc b/lib/src/junction.cc index af3a438d..b77dd369 100644 --- a/lib/src/junction.cc +++ b/lib/src/junction.cc @@ -982,13 +982,15 @@ void portcullis::Junction::condensedOutputDescription(std::ostream &strm, string << "M19-MeanMismatches=" << meanMismatches << delimiter << "M20-NbUniquelySplicedReads=" << 
getNbUniquelySplicedReads() << delimiter << "M21-NbMultipleSplicedReads=" << nbMultipleSplicedReads << delimiter - << "M22-NbUpstreamJunctions=" << nbUpstreamJunctions << delimiter - << "M23-NbDownstreamJunctions=" << nbDownstreamJunctions << delimiter - << "M24-NbUpstreamNonSplicedAlignments=" << nbUpstreamFlankingAlignments << delimiter - << "M25-NbDownstreamNonSplicedAlignments=" << nbDownstreamFlankingAlignments << delimiter - << "M26-DistanceToNextUpstreamJunction=" << distanceToNextUpstreamJunction << delimiter - << "M27-DistanceToNextDownstreamJunction=" << distanceToNextDownstreamJunction << delimiter - << "M28-DistanceToNearestJunction=" << distanceToNearestJunction << delimiter + << "M22-Reliable2RawRatio=" << getReliable2RawRatio() << delimiter + << "M23-NbUpstreamJunctions=" << nbUpstreamJunctions << delimiter + << "M24-NbDownstreamJunctions=" << nbDownstreamJunctions << delimiter + << "M25-NbUpstreamNonSplicedAlignments=" << nbUpstreamFlankingAlignments << delimiter + << "M26-NbDownstreamNonSplicedAlignments=" << nbDownstreamFlankingAlignments << delimiter + << "M27-DistanceToNextUpstreamJunction=" << distanceToNextUpstreamJunction << delimiter + << "M28-DistanceToNextDownstreamJunction=" << distanceToNextDownstreamJunction << delimiter + << "M29-DistanceToNearestJunction=" << distanceToNearestJunction << delimiter + << "Suspect=" << boolalpha << suspicious << delimiter << "PFP=" << boolalpha << pfp; } @@ -1204,20 +1206,21 @@ shared_ptr portcullis::Junction::parse(const string& line) j->setMeanMismatches(lexical_cast(parts[31])); //j->setNbUniquelySplicedReads(lexical_cast(parts[32])); j->setNbMultipleSplicedReads(lexical_cast(parts[33])); - j->setNbUpstreamJunctions(lexical_cast(parts[34])); - j->setNbDownstreamJunctions(lexical_cast(parts[35])); - j->setNbUpstreamFlankingAlignments(lexical_cast(parts[36])); - j->setNbDownstreamFlankingAlignments(lexical_cast(parts[37])); - j->setDistanceToNextUpstreamJunction(lexical_cast(parts[38])); - 
j->setDistanceToNextDownstreamJunction(lexical_cast(parts[39])); - j->setDistanceToNearestJunction(lexical_cast(parts[40])); + //j->setNbMultipleSplicedReads(lexical_cast(parts[33])); + j->setNbUpstreamJunctions(lexical_cast(parts[35])); + j->setNbDownstreamJunctions(lexical_cast(parts[36])); + j->setNbUpstreamFlankingAlignments(lexical_cast(parts[37])); + j->setNbDownstreamFlankingAlignments(lexical_cast(parts[38])); + j->setDistanceToNextUpstreamJunction(lexical_cast(parts[39])); + j->setDistanceToNextDownstreamJunction(lexical_cast(parts[40])); + j->setDistanceToNearestJunction(lexical_cast(parts[41])); // Read Junction overhangs - j->setMeanQueryLength(lexical_cast(parts[41])); - j->setSuspicious(lexical_cast(parts[42])); - j->setPotentialFalsePositive(lexical_cast(parts[43])); + j->setMeanQueryLength(lexical_cast(parts[42])); + j->setSuspicious(lexical_cast(parts[43])); + j->setPotentialFalsePositive(lexical_cast(parts[44])); for(size_t i = 0; i < JO_NAMES.size(); i++) { - j->setJunctionOverhangs(i, lexical_cast(parts[44 + i])); + j->setJunctionOverhangs(i, lexical_cast(parts[45 + i])); } return j; diff --git a/lib/src/rule_parser.cc b/lib/src/rule_parser.cc index a3bb74d6..71f42b80 100644 --- a/lib/src/rule_parser.cc +++ b/lib/src/rule_parser.cc @@ -223,14 +223,15 @@ double portcullis::eval::getNumericFromJunc(const var& fullname) const { case 20: return junc->getNbMultipleSplicedReads(); case 21: - return junc->getNbUpstreamJunctions(); + return junc->getReliable2RawRatio(); case 22: - return junc->getNbDownstreamJunctions(); + return junc->getNbUpstreamJunctions(); case 23: - return junc->getNbUpstreamFlankingAlignments(); + return junc->getNbDownstreamJunctions(); case 24: + return junc->getNbUpstreamFlankingAlignments(); + case 25: return junc->getNbDownstreamFlankingAlignments(); - } if (boost::iequals(name, "Suspect")) { diff --git a/src/junction_filter.cc b/src/junction_filter.cc index cdee0531..0d625a9d 100644 --- a/src/junction_filter.cc +++ 
b/src/junction_filter.cc @@ -260,6 +260,25 @@ void portcullis::JunctionFilter::filter() { doRuleBasedFiltering(this->getIntitalNegRulesFile(), currentJuncs, passJuncs, failJuncs, "Creating initial negative set for training", resultMap); + const uint32_t longintronthreshold = L95 * 6; + cout << endl << "Adding junctions to negative set with MaxMMES < 10 and excessively long intron size of positive set (> L95 x 6 = " << longintronthreshold << ")..."; + cout.flush(); + + uint32_t nblongintrons = 0; + JunctionList fail2Juncs; + for(auto& j : failJuncs) { + if (j->getIntronSize() > longintronthreshold && j->getMaxMMES() < 10 ) { + passJuncs.push_back(j); + nblongintrons++; + } + else { + fail2Juncs.push_back(j); + } + } + + cout << " done." << endl + << "Found an additional " << nblongintrons << " junctions with long introns." << endl << endl; + cout << "Saving initial negative set:" << endl; JunctionSystem isn(passJuncs); isn.saveAll(output.string() + ".selftrain.initialset.neg", "portcullis_isn"); @@ -272,7 +291,7 @@ void portcullis::JunctionFilter::filter() { } if (!genuineFile.empty()) { - shared_ptr p = calcPerformance(passJuncs, failJuncs, true); + shared_ptr p = calcPerformance(passJuncs, fail2Juncs, true); cout << "Performance of initial negative set (High NPV is important):" << endl; cout << Performance::longHeader() << endl; cout << p->toLongString() << endl << endl; @@ -285,7 +304,7 @@ void portcullis::JunctionFilter::filter() { invalidNeg.addJunction(j); } } - for(auto& j : failJuncs) { + for(auto& j : fail2Juncs) { if (!j->isGenuine()) { missedNeg.addJunction(j); } @@ -298,14 +317,8 @@ void portcullis::JunctionFilter::filter() { missedNeg.saveAll(output.string() + ".selftrain.initialset.missedneg", "portcullis_missed_isn"); } - const uint32_t L95_fail = this->calcIntronThreshold(passJuncs); - cout << "Length of intron at 95th percentile of negative set: " << L95_fail << endl << endl; - - cout << "Initial training set consists of " << pos << " positive and " 
<< neg << " negative junctions." << endl << endl; - - cout << "Training a random forest model using the initial positive and negative datasets" << endl; shared_ptr forest = Train::trainInstance(initialSet, output.string() + ".selftrain", DEFAULT_TRAIN_TREES, threads, true, true, L95); @@ -609,7 +622,7 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction for(size_t i = 0; i < all.size(); i++) { scoreStream << f->getPredictions()[i][0] << endl; - if (f->getPredictions()[i][0] >= 0.5) { + if (f->getPredictions()[i][0] >= 0.1) { pass.push_back(all[i]); } else { diff --git a/src/train.cc b/src/train.cc index 5907e434..044020d2 100644 --- a/src/train.cc +++ b/src/train.cc @@ -73,7 +73,7 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint3 vector headers; headers.reserve( VAR_NAMES.size() + JO_NAMES.size() ); headers.insert( headers.end(), VAR_NAMES.begin(), VAR_NAMES.end() ); - headers.insert( headers.end(), JO_NAMES.begin(), JO_NAMES.end() ); + headers.insert( headers.end(), JO_NAMES.begin()+14, JO_NAMES.end() ); // Convert junction list info to double* double* d = new double[x.size() * headers.size()]; @@ -81,28 +81,30 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint3 uint32_t row = 0; for (const auto& j : x) { - d[0 * x.size() + row] = j->getNbJunctionAlignments(); - //d[1 * x.size() + row] = j->getNbDistinctAlignments(); - d[1 * x.size() + row] = j->getNbReliableAlignments(); + //d[0 * x.size() + row] = j->getNbJunctionAlignments(); + //d[0 * x.size() + row] = j->getNbDistinctAlignments(); + d[0 * x.size() + row] = j->getNbReliableAlignments(); //d[3 * x.size() + row] = j->getMaxMinAnchor(); //d[4 * x.size() + row] = j->getDiffAnchor(); //d[5 * x.size() + row] = j->getNbDistinctAnchors(); - d[2 * x.size() + row] = j->getEntropy(); - d[3 * x.size() + row] = j->getMaxMMES(); - d[4 * x.size() + row] = j->getHammingDistance5p(); - d[5 * x.size() + row] = 
j->getHammingDistance3p(); - //d[6 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score + d[1 * x.size() + row] = j->getEntropy(); + d[2 * x.size() + row] = j->getMaxMMES(); + d[3 * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); + //d[3 * x.size() + row] = ; + d[4 * x.size() + row] = j->getReliable2RawRatio(); + //d[5 * x.size() + row] = j->getMeanMismatches(); + d[5 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score d[6 * x.size() + row] = j->isGenuine(); // Junction overhang values at each position are first converted into deviation from expected distributions double half_read_length = (double)j->getMeanQueryLength() / 2.0; - for(size_t i = 0; i < JO_NAMES.size(); i++) { + for(size_t i = 14; i < JO_NAMES.size(); i++) { double Ni = j->getJunctionOverhangs(i); // Actual count at this position if (Ni == 0.0) Ni = 0.000000000001; // Ensure some value > 0 here otherwise we get -infinity later. double Pi = 1.0 - ((double)i / half_read_length); // Likely scale at this position double Ei = (double)j->getNbJunctionAlignments() * Pi; // Expected count at this position double Xi = abs(log2(Ni / Ei)); // Log deviation - d[(i + 7) * x.size() + row] = Xi; + d[(i-14 + 7) * x.size() + row] = Xi; } row++; diff --git a/src/train.hpp b/src/train.hpp index 7abed309..3af77509 100644 --- a/src/train.hpp +++ b/src/train.hpp @@ -52,17 +52,19 @@ const int DEFAULT_SEED = 1234567; // To avoid non-deterministic behaviour // List of variable names const vector VAR_NAMES = { - "M2-nb-reads", + //"M2-nb-reads", //"M3-nb_dist_aln", - "M4-nb_rel_aln", + "nb_rel_aln", //"M8-max_min_anc", //"M9-dif_anc", //"M10-dist_anc", - "M11-entropy", - "M12-maxmmes", - "M13-hamming5p", - "M14-hamming3p", - //"IntronScore", + "entropy", + "maxmmes", + "min_hamming_score", + //"M14-hamming3p", + "rel2raw_ratio", + //"mean_mismatches", + "IntronScore", "Genuine" }; // Derived from 
https://sureshamrita.wordpress.com/2011/08/24/c-implementation-of-k-fold-cross-validation/ From afb55ab3ccc983f5c018c9c1338af01ef120da33 Mon Sep 17 00:00:00 2001 From: Dan Date: Mon, 25 Apr 2016 00:38:12 +0100 Subject: [PATCH 04/15] Implemented a markov model and kmer hash. Implemented coding potential score for junctions, however it seems to detract from the overall usefulness of the predictor at the moment, which means there's probably an error in the calculation somewhere. I should write some unit tests in the future to verify or remove the feature. --- lib/Makefile.am | 3 + lib/include/portcullis/bam/genome_mapper.hpp | 4 +- lib/include/portcullis/junction.hpp | 5 +- lib/include/portcullis/kmer.hpp | 94 ++++++++++++++++++++ lib/include/portcullis/markov_model.hpp | 83 +++++++++++++++++ lib/src/junction.cc | 38 ++++++++ lib/src/markov_model.cc | 58 ++++++++++++ src/junction_filter.cc | 93 ++++++++++++++++--- src/junction_filter.hpp | 18 +++- src/train.cc | 15 ++-- src/train.hpp | 24 +++-- tests/Makefile.am | 3 +- tests/kmer_tests.cpp | 53 +++++++++++ 13 files changed, 457 insertions(+), 34 deletions(-) create mode 100644 lib/include/portcullis/kmer.hpp create mode 100644 lib/include/portcullis/markov_model.hpp create mode 100644 lib/src/markov_model.cc create mode 100644 tests/kmer_tests.cpp diff --git a/lib/Makefile.am b/lib/Makefile.am index dd10476e..0390e79e 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -15,6 +15,7 @@ libportcullis_la_SOURCES = \ src/bam_writer.cc \ src/depth_parser.cc \ src/genome_mapper.cc \ + src/markov_model.cc \ src/python_exe.cc \ src/intron.cc \ src/junction.cc \ @@ -30,6 +31,8 @@ library_include_HEADERS = \ $(PI)/bam/bam_writer.hpp \ $(PI)/bam/depth_parser.hpp \ $(PI)/bam/genome_mapper.hpp \ + $(PI)/kmer.hpp \ + $(PI)/markov_model.hpp \ $(PI)/python_exe.hpp \ $(PI)/intron.hpp \ $(PI)/junction.hpp \ diff --git a/lib/include/portcullis/bam/genome_mapper.hpp b/lib/include/portcullis/bam/genome_mapper.hpp index 
2bf028ed..91c55f9c 100644 --- a/lib/include/portcullis/bam/genome_mapper.hpp +++ b/lib/include/portcullis/bam/genome_mapper.hpp @@ -97,7 +97,7 @@ class GenomeMapper { /** * @abstract Fetch the sequence in a region. * @param reg Region in the format "chr2:20,000-30,000" - * @param len Length of the region + * @param len Length of the region returned * @return The sequence as a string; empty string if no seq found */ string fetchBases(const char* reg, int* len) const; @@ -107,7 +107,7 @@ class GenomeMapper { * @param name Region name * @param start Start location on region (zero-based, inclusive) * @param end End position (zero-based, exclusive) - * @param len Length of the region + * @param len Length of the region returned * @return The sequence as a string; empty string if no seq found */ string fetchBases(const char* name, int start, int end, int* len) const; diff --git a/lib/include/portcullis/junction.hpp b/lib/include/portcullis/junction.hpp index f7219007..73279b1c 100644 --- a/lib/include/portcullis/junction.hpp +++ b/lib/include/portcullis/junction.hpp @@ -45,6 +45,7 @@ using namespace portcullis::bam; #include "intron.hpp" #include "seq_utils.hpp" +#include "markov_model.hpp" using portcullis::Intron; @@ -867,7 +868,9 @@ class Junction { double calcIntronScore(const uint32_t threshold) const { return this->intron->size() <= threshold ? 0.0 : log(this->intron->size() - threshold); } - + + + double calcCodingPotential(GenomeMapper& gmap, MarkovModel& exon, MarkovModel& intron) const; // **** Output methods **** diff --git a/lib/include/portcullis/kmer.hpp b/lib/include/portcullis/kmer.hpp new file mode 100644 index 00000000..76247a22 --- /dev/null +++ b/lib/include/portcullis/kmer.hpp @@ -0,0 +1,94 @@ +// ******************************************************************** +// This file is part of Portcullis. 
+// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#pragma once + +#include +#include +#include +#include +using std::ostream; +using std::string; +using std::unordered_map; +using std::vector; + +#include +#include + +namespace portcullis { + +typedef boost::error_info KmerErrorInfo; +struct KmerException: virtual boost::exception, virtual std::exception { }; + +/** + * An extremely basic kmer hash class that doesn't worry about canonical kmers + * or reverse complementing the data. + */ +class KmerHash { +private: + unordered_map khash; + uint16_t k; + +public: + KmerHash(const uint16_t _k, const string& seq) { + k = _k; + + for(size_t i = k; i <= seq.size(); i++) { + khash[boost::to_upper_copy(seq.substr(i - k, k))]++; + } + } + + uint32_t getCount(const string& kmer) { + if (kmer.size() != k) { + BOOST_THROW_EXCEPTION(KmerException() << KmerErrorInfo(string( + "Given kmer size is ") + std::to_string(kmer.size()) + ". Expected kmer of size " + std::to_string(k))); + } + + string kup = boost::to_upper_copy(kmer); + return (khash.find(kup) != khash.end()) ? 
khash[kup] : 0; + } + + size_t nbDistinctKmers() const { + return khash.size(); + } + + void print(ostream& out) const { + for(auto& kmer : khash) { + out << kmer.first << "\t" << kmer.second << endl; + } + } + + void printAbundanceHistogram(ostream& out, const uint32_t hist_size) { + + vector hist(hist_size, 0); + + for(auto& kmer : khash) { + if (hist.size() > kmer.second) { + hist[kmer.second]++; + } + else { + hist[hist.size() -1]++; + } + } + + for(auto& e : hist) { + out << e << endl; + } + } +}; + +} \ No newline at end of file diff --git a/lib/include/portcullis/markov_model.hpp b/lib/include/portcullis/markov_model.hpp new file mode 100644 index 00000000..1b926361 --- /dev/null +++ b/lib/include/portcullis/markov_model.hpp @@ -0,0 +1,83 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#pragma once + +#include +#include +#include +using std::string; +using std::unordered_map; +using std::vector; + + +namespace portcullis { + +typedef boost::error_info MMErrorInfo; +struct MMException: virtual boost::exception, virtual std::exception {}; + + + +/** + * Intended to be used as a map containing the probability of seeing a given character + */ +typedef unordered_map NTP; + +/** + * This datastructure consists of a map of kmers to a map of nucleotides with assoicated + * probabilities. + */ +typedef unordered_map MMU; + +/** + * Simple Markov chain implementation derived originally from Truesight. + * Constructor trains the model on a set of sequences. "getScore" returns the score + * for a given sequence based on the pre-trained model. + * Automatically converts sequences to uppercase + */ +class MarkovModel { + +private: + portcullis::MMU model; + uint16_t order; + +public: + + MarkovModel(const uint32_t _order) { + order = _order; + } + + MarkovModel(const vector& input, const uint32_t _order) { + order = _order; + train(input); + } + + void train(const vector& input); + + uint16_t getOrder() const { + return order; + } + + size_t size() const { + return model.size(); + } + + double getScore(const string& seq); +}; + +} + diff --git a/lib/src/junction.cc b/lib/src/junction.cc index b77dd369..f10b9f9b 100644 --- a/lib/src/junction.cc +++ b/lib/src/junction.cc @@ -47,7 +47,9 @@ using portcullis::bam::Strand; #include #include +#include using portcullis::Intron; +using portcullis::MarkovModel; #include @@ -1225,3 +1227,39 @@ shared_ptr portcullis::Junction::parse(const string& line) return j; } + +double portcullis::Junction::calcCodingPotential(GenomeMapper& gmap, MarkovModel& exon, MarkovModel& intron) const { + + int len = 0; + + const char* ref = this->intron->ref.name.c_str(); + + string left_exon = gmap.fetchBases(ref, this->intron->start - 80, this->intron->start, &len); 
+ if (this->getConsensusStrand() == Strand::NEGATIVE) { + left_exon = SeqUtils::reverseComplement(left_exon); + } + + string left_intron = gmap.fetchBases(ref, this->intron->start, this->intron->start+81, &len); + if (this->getConsensusStrand() == Strand::NEGATIVE) { + left_intron = SeqUtils::reverseComplement(left_intron); + } + + string right_intron = gmap.fetchBases(ref, this->intron->end-80, this->intron->end+1, &len); + if (this->getConsensusStrand() == Strand::NEGATIVE) { + right_intron = SeqUtils::reverseComplement(right_intron); + } + + string right_exon = gmap.fetchBases(ref, this->intron->end + 1, this->intron->end + 80, &len); + if (this->getConsensusStrand() == Strand::NEGATIVE) { + right_exon = SeqUtils::reverseComplement(right_exon); + } + + double score = exon.getScore(left_intron) + intron.getScore(left_exon) + - ( intron.getScore(left_intron) + exon.getScore(left_exon) ) + + ( intron.getScore(right_exon) + exon.getScore(right_intron) ) + - ( exon.getScore(right_exon) + intron.getScore(right_intron) ); + + //cout << score << endl; + + return score; +} diff --git a/lib/src/markov_model.cc b/lib/src/markov_model.cc new file mode 100644 index 00000000..11bdb41e --- /dev/null +++ b/lib/src/markov_model.cc @@ -0,0 +1,58 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#include +using std::cout; +using std::endl; + +#include +#include + +#include + +void portcullis::MarkovModel::train(const vector& input) { + MMU temp; + for(auto& seq : input) { + string sequp = boost::to_upper_copy(seq); + for(uint16_t i = order; i < sequp.size(); i++) { + temp[sequp.substr(i-order, order)][sequp.substr(i, 1)]++; + } + } + for(auto& i : temp) { + double sum = 0; + for(auto& j : i.second) { + sum += j.second; + } + for(auto& j : i.second) { + //cout << i.first << " " << j.first << " " << j.second << endl; + model[i.first][j.first] = j.second / sum; + } + } +} + + +double portcullis::MarkovModel::getScore(const string& seq) { + string sequp = boost::to_upper_copy(seq); + double score = 1.0; + for(uint16_t i = order; i < sequp.size(); i++){ + score *= model[sequp.substr(i-order, order)][sequp.substr(i, 1)]; + } + if(score == 0.0) { + return -100.0; + } + return log(score); +} diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 0d625a9d..4d1886bc 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -52,17 +52,21 @@ namespace po = boost::program_options; #include #include +#include #include #include #include #include #include #include +#include using portcullis::PortcullisFS; using portcullis::Intron; using portcullis::IntronHasher; using portcullis::Performance; using portcullis::eval; +using portcullis::bam::GenomeMapper; +using portcullis::MarkovModel; #include "train.hpp" using portcullis::Train; @@ -72,6 +76,7 @@ using portcullis::Train; portcullis::JunctionFilter::JunctionFilter( const path& _junctionFile, const path& _output) { junctionFile = _junctionFile; + genomeFile = ""; modelFile = ""; genuineFile = ""; output = _output; @@ -198,12 +203,20 @@ void portcullis::JunctionFilter::filter() { // To be overridden if we are training uint32_t L95 = 0; + MarkovModel exon_model(5); + MarkovModel intron_model(5); + + // Create the genome mapper + 
GenomeMapper gmap(this->genomeFile); + + // Load the fasta index + gmap.loadFastaIndex(); + if (train) { - JunctionList initialSet; - uint32_t pos = 0; - uint32_t neg = 0; + // The initial positive and negative sets + JunctionList pos, neg; cout << "Self training mode activated. Collecting initial positive and negative sets from input." << endl; @@ -216,12 +229,11 @@ void portcullis::JunctionFilter::filter() { cout << "Saving initial positive set:" << endl; JunctionSystem isp(passJuncs); isp.saveAll(output.string() + ".selftrain.initialset.pos", "portcullis_isp"); - pos = passJuncs.size(); for(auto& j : passJuncs) { JunctionPtr copy = make_shared(*j); copy->setGenuine(true); - initialSet.push_back(copy); + pos.push_back(copy); } if (!genuineFile.empty()) { shared_ptr p = calcPerformance(passJuncs, failJuncs); @@ -254,6 +266,13 @@ void portcullis::JunctionFilter::filter() { L95 = this->calcIntronThreshold(passJuncs); cout << "Length of intron at 95th percentile of positive set: " << L95 << endl << endl; + cout << "Training coding potential markov models on positive set ..."; + cout.flush(); + trainMMs(pos, exon_model, intron_model, gmap); + cout << " done." << endl; + cout << "Exon model contains " << exon_model.size() << " 5-mers." << endl; + cout << "Intron model contains " << intron_model.size() << " 5-mers." 
<< endl; + passJuncs.clear(); failJuncs.clear(); resultMap.clear(); @@ -282,12 +301,11 @@ void portcullis::JunctionFilter::filter() { cout << "Saving initial negative set:" << endl; JunctionSystem isn(passJuncs); isn.saveAll(output.string() + ".selftrain.initialset.neg", "portcullis_isn"); - neg = passJuncs.size(); for(auto& j : passJuncs) { JunctionPtr copy = make_shared(*j); copy->setGenuine(false); - initialSet.push_back(copy); + neg.push_back(copy); } if (!genuineFile.empty()) { @@ -317,10 +335,17 @@ void portcullis::JunctionFilter::filter() { missedNeg.saveAll(output.string() + ".selftrain.initialset.missedneg", "portcullis_missed_isn"); } - cout << "Initial training set consists of " << pos << " positive and " << neg << " negative junctions." << endl << endl; + cout << "Initial training set consists of " << pos.size() << " positive and " << neg.size() << " negative junctions." << endl << endl; + + // Build the training set by combining the positive and negative sets + JunctionList training; + training.reserve(pos.size() + neg.size()); + training.insert(training.end(), pos.begin(), pos.end()); + training.insert(training.end(), neg.begin(), neg.end()); + cout << "Training a random forest model using the initial positive and negative datasets" << endl; - shared_ptr forest = Train::trainInstance(initialSet, output.string() + ".selftrain", DEFAULT_TRAIN_TREES, threads, true, true, L95); + shared_ptr forest = Train::trainInstance(training, output.string() + ".selftrain", DEFAULT_TRAIN_TREES, threads, true, true, L95, exon_model, intron_model, gmap); forest->saveToFile(); forest->writeOutput(&cout); @@ -341,7 +366,7 @@ void portcullis::JunctionFilter::filter() { JunctionList passJuncs; JunctionList failJuncs; - forestPredict(currentJuncs, passJuncs, failJuncs, L95); + forestPredict(currentJuncs, passJuncs, failJuncs, L95, exon_model, intron_model, gmap); printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Random Forest filtering")); @@ -480,6 +505,44 
@@ void portcullis::JunctionFilter::filter() { } } } + +void portcullis::JunctionFilter::trainMMs(const JunctionList& in, MarkovModel& exon_model, MarkovModel& intron_model, GenomeMapper& gmap) { + + vector exons; + vector introns; + for(auto& j : in) { + + int len = 0; + + string left_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start - 80, j->getIntron()->start, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + left_exon = SeqUtils::reverseComplement(left_exon); + } + exons.push_back(left_exon); + + string left_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->start+81, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + left_intron = SeqUtils::reverseComplement(left_intron); + } + introns.push_back(left_intron); + + string right_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end-80, j->getIntron()->end+1, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + right_intron = SeqUtils::reverseComplement(right_intron); + } + introns.push_back(right_intron); + + + string right_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end + 1, j->getIntron()->end + 80, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + right_exon = SeqUtils::reverseComplement(right_exon); + } + exons.push_back(right_exon); + } + + exon_model.train(exons); + intron_model.train(introns); +} uint32_t portcullis::JunctionFilter::calcIntronThreshold(const JunctionList& juncs) { @@ -554,13 +617,13 @@ void portcullis::JunctionFilter::doRuleBasedFiltering(const path& ruleFile, cons } -void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, const uint32_t L95) { +void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap) { if (verbose) { cerr << endl 
<< "Preparing junction metrics into matrix" << endl; } - Data* testingData = Train::juncs2FeatureVectors(all, L95); + Data* testingData = Train::juncs2FeatureVectors(all, L95, exon, intron, gmap); // Load forest from disk @@ -641,6 +704,7 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction int portcullis::JunctionFilter::main(int argc, char *argv[]) { path junctionFile; + path genomeFile; path modelFile; path genuineFile; path filterFile; @@ -693,11 +757,13 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { // in config file, but will not be shown to the user. po::options_description hidden_options("Hidden options"); hidden_options.add_options() - ("junction-file", po::value(&junctionFile), "Path to the junction file to process.") + ("genome-file", po::value(&genomeFile), "Path to the genome file to process.") + ("junction-file", po::value(&junctionFile), "Path to the junction file to process.") ; // Positional option for the input bam file po::positional_options_description p; + p.add("genome-file", 1); p.add("junction-file", 1); @@ -745,6 +811,7 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { } } filter.setReferenceFile(referenceFile); + filter.setGenomeFile(genomeFile); filter.filter(); diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index 5ff86eef..ff70137c 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -46,16 +46,20 @@ using boost::filesystem::create_symlink; using boost::filesystem::create_directory; using boost::filesystem::symbolic_link_exists; +#include #include #include #include #include #include +#include +using portcullis::bam::GenomeMapper; using portcullis::PortcullisFS; using portcullis::Intron; using portcullis::IntronHasher; using portcullis::Performance; using portcullis::JuncResultMap; +using portcullis::MarkovModel; namespace portcullis { @@ -78,6 +82,7 @@ class JunctionFilter { private: path junctionFile; + path genomeFile; path modelFile; 
path filterFile; path genuineFile; @@ -124,6 +129,15 @@ class JunctionFilter { void setJunctionFile(path junctionFile) { this->junctionFile = junctionFile; } + + path getGenomeFile() const { + return genomeFile; + } + + void setGenomeFile(path genomeFile) { + this->genomeFile = genomeFile; + } + path getOutput() const { return output; @@ -261,7 +275,7 @@ class JunctionFilter { protected: - void forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, const uint32_t L95); + void forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap); shared_ptr calcPerformance(const JunctionList& pass, const JunctionList& fail) { return calcPerformance(pass, fail, false); @@ -274,6 +288,8 @@ class JunctionFilter { uint32_t calcIntronThreshold(const JunctionList& pass); + void trainMMs(const JunctionList& in, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap); + public: static string helpMessage() { diff --git a/src/train.cc b/src/train.cc index 044020d2..b65ef7c2 100644 --- a/src/train.cc +++ b/src/train.cc @@ -68,7 +68,7 @@ portcullis::Train::Train(const path& _junctionFile, const path& _refFile){ verbose = false; } -Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint32_t L95) { +Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap) { vector headers; headers.reserve( VAR_NAMES.size() + JO_NAMES.size() ); @@ -94,7 +94,8 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint3 d[4 * x.size() + row] = j->getReliable2RawRatio(); //d[5 * x.size() + row] = j->getMeanMismatches(); d[5 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score - d[6 * x.size() + row] = j->isGenuine(); + d[6 * x.size() + row] = exon.size() > 0 && intron.size() > 0 ? 
j->calcCodingPotential(gmap, exon, intron) : 0.0; // Coding potential score + d[7 * x.size() + row] = j->isGenuine(); // Junction overhang values at each position are first converted into deviation from expected distributions double half_read_length = (double)j->getMeanQueryLength() / 2.0; for(size_t i = 14; i < JO_NAMES.size(); i++) { @@ -104,7 +105,7 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint3 double Ei = (double)j->getNbJunctionAlignments() * Pi; // Expected count at this position double Xi = abs(log2(Ni / Ei)); // Log deviation - d[(i-14 + 7) * x.size() + row] = Xi; + d[(i-14 + 8) * x.size() + row] = Xi; } row++; @@ -115,10 +116,10 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint3 return data; } -shared_ptr portcullis::Train::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, const uint32_t L95) { +shared_ptr portcullis::Train::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap) { cout << "Creating feature vector" << endl; - Data* trainingData = juncs2FeatureVectors(x, L95); + Data* trainingData = juncs2FeatureVectors(x, L95, exon, intron, gmap); cout << "Initialising random forest" << endl; shared_ptr f = nullptr; @@ -158,10 +159,10 @@ shared_ptr portcullis::Train::trainInstance(const JunctionList& x, strin return f; } -void portcullis::Train::testInstance(shared_ptr f, const JunctionList& x, const uint32_t L95) { +void portcullis::Train::testInstance(shared_ptr f, const JunctionList& x) { // Convert testing set junctions into feature vector - Data* testingData = juncs2FeatureVectors(x, L95); + Data* testingData = juncs2FeatureVectors(x); f->setPredictionMode(true); f->setData(testingData); diff --git a/src/train.hpp b/src/train.hpp index 3af77509..6fedc455 
100644 --- a/src/train.hpp +++ b/src/train.hpp @@ -36,6 +36,8 @@ using boost::filesystem::path; #include #include +#include +using portcullis::bam::GenomeMapper; namespace portcullis { @@ -65,6 +67,7 @@ const vector VAR_NAMES = { "rel2raw_ratio", //"mean_mismatches", "IntronScore", + "CodingPotential", "Genuine" }; // Derived from https://sureshamrita.wordpress.com/2011/08/24/c-implementation-of-k-fold-cross-validation/ @@ -232,25 +235,28 @@ class Train { static int main(int argc, char *argv[]); static Data* juncs2FeatureVectors(const JunctionList& x) { - return juncs2FeatureVectors(x, 0); + GenomeMapper gmap; + MarkovModel m(1); + return juncs2FeatureVectors(x, 0, m, m, gmap); } - static Data* juncs2FeatureVectors(const JunctionList& x, const uint32_t L95); + static Data* juncs2FeatureVectors(const JunctionList& x, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap); static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose) { - return trainInstance(x, outputPrefix, trees, threads, regressionMode, verbose, 0); + GenomeMapper gmap; + MarkovModel m(1); + return trainInstance(x, outputPrefix, trees, threads, regressionMode, verbose, + 0, m, m, gmap); } + static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, - uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, const uint32_t L95); + uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, + const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap); protected: - void testInstance(shared_ptr f, const JunctionList& y) { - return testInstance(f, y, 0); - } - - void testInstance(shared_ptr f, const JunctionList& y, const uint32_t L95); + void testInstance(shared_ptr f, const JunctionList& y); void getRandomSubset(const JunctionList& in, JunctionList& out); diff --git a/tests/Makefile.am b/tests/Makefile.am index 9cda768c..b3a42b38 100755 
--- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -24,8 +24,9 @@ check_PROGRAMS = check_unit_tests check_unit_tests_SOURCES = bam_tests.cpp \ seq_utils_tests.cpp \ + kmer_tests.cpp \ intron_tests.cpp \ - junction_tests.cpp \ + junction_tests.cpp \ check_portcullis.cc check_unit_tests_CXXFLAGS = -O0 @AM_CXXFLAGS@ @CXXFLAGS@ diff --git a/tests/kmer_tests.cpp b/tests/kmer_tests.cpp new file mode 100644 index 00000000..d7895d2d --- /dev/null +++ b/tests/kmer_tests.cpp @@ -0,0 +1,53 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#include + +#include +#include +#include +using std::cout; +using std::endl; +using std::stringstream; + + +#include +#include +#include +namespace bfs = boost::filesystem; +using bfs::path; + +#include +using portcullis::KmerHash; + + +TEST(kmer, make) { + + string seq = "ATGCATGCATCGNATATATATTGAC"; + + KmerHash kh(4, seq); + + //kh.print(std::cout); + + EXPECT_EQ(1, kh.getCount("tgac")); + + EXPECT_EQ(0, kh.getCount("kjls")); + + EXPECT_EQ(16, kh.nbDistinctKmers()); + +} + From 68a29428d4f90b25084bbe90eba9f02c1787d3ad Mon Sep 17 00:00:00 2001 From: Dan Date: Mon, 25 Apr 2016 10:24:15 +0100 Subject: [PATCH 05/15] Moving code relating to additional features calculated during self training into a class of its own stored in the library. --- lib/Makefile.am | 2 + lib/include/portcullis/markov_model.hpp | 8 +- lib/include/portcullis/model_features.hpp | 50 ++++++++++++ lib/src/markov_model.cc | 4 +- lib/src/model_features.cc | 88 +++++++++++++++++++++ src/junction_filter.cc | 93 +++++------------------ src/junction_filter.hpp | 10 +-- src/train.cc | 10 +-- src/train.hpp | 17 ++--- 9 files changed, 187 insertions(+), 95 deletions(-) create mode 100644 lib/include/portcullis/model_features.hpp create mode 100644 lib/src/model_features.cc diff --git a/lib/Makefile.am b/lib/Makefile.am index 0390e79e..529a4809 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -16,6 +16,7 @@ libportcullis_la_SOURCES = \ src/depth_parser.cc \ src/genome_mapper.cc \ src/markov_model.cc \ + src/model_features.cc \ src/python_exe.cc \ src/intron.cc \ src/junction.cc \ @@ -33,6 +34,7 @@ library_include_HEADERS = \ $(PI)/bam/genome_mapper.hpp \ $(PI)/kmer.hpp \ $(PI)/markov_model.hpp \ + $(PI)/model_features.hpp \ $(PI)/python_exe.hpp \ $(PI)/intron.hpp \ $(PI)/junction.hpp \ diff --git a/lib/include/portcullis/markov_model.hpp b/lib/include/portcullis/markov_model.hpp index 1b926361..1b524858 100644 --- 
a/lib/include/portcullis/markov_model.hpp +++ b/lib/include/portcullis/markov_model.hpp @@ -57,6 +57,8 @@ class MarkovModel { public: + MarkovModel() : MarkovModel(1) {} + MarkovModel(const uint32_t _order) { order = _order; } @@ -66,7 +68,11 @@ class MarkovModel { train(input); } - void train(const vector& input); + void train(const vector& input) { + train(input, order); + } + + void train(const vector& input, const uint32_t order); uint16_t getOrder() const { return order; diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/model_features.hpp new file mode 100644 index 00000000..8179e57c --- /dev/null +++ b/lib/include/portcullis/model_features.hpp @@ -0,0 +1,50 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#pragma once + +#include +using boost::filesystem::path; + +#include +#include +using portcullis::bam::GenomeMapper; +using portcullis::MarkovModel; + +namespace portcullis { + +class ModelFeatures { +public: + uint32_t L95; + MarkovModel exonModel; + MarkovModel intronModel; + GenomeMapper gmap; + + ModelFeatures() : L95(0) { + } + + bool isCodingPotentialModelEmpty() { + return exonModel.size() == 0 || intronModel.size() == 0; + } + + void initGenomeMapper(const path& genomeFile); + + uint32_t calcIntronThreshold(const JunctionList& juncs); + + void trainCodingPotentialModel(const JunctionList& in); +}; +} \ No newline at end of file diff --git a/lib/src/markov_model.cc b/lib/src/markov_model.cc index 11bdb41e..50d061db 100644 --- a/lib/src/markov_model.cc +++ b/lib/src/markov_model.cc @@ -24,7 +24,8 @@ using std::endl; #include -void portcullis::MarkovModel::train(const vector& input) { +void portcullis::MarkovModel::train(const vector& input, const uint32_t _order) { + order = _order; MMU temp; for(auto& seq : input) { string sequp = boost::to_upper_copy(seq); @@ -32,6 +33,7 @@ void portcullis::MarkovModel::train(const vector& input) { temp[sequp.substr(i-order, order)][sequp.substr(i, 1)]++; } } + model.clear(); for(auto& i : temp) { double sum = 0; for(auto& j : i.second) { diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc new file mode 100644 index 00000000..5bb344ad --- /dev/null +++ b/lib/src/model_features.cc @@ -0,0 +1,88 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#include +using std::cout; +using std::endl; + +#include +using portcullis::Junction; + +#include + + + +void portcullis::ModelFeatures::initGenomeMapper(const path& genomeFile) { + + // Initialise + gmap.setGenomeFile(genomeFile); + + // Load the fasta index + gmap.loadFastaIndex(); +} + +uint32_t portcullis::ModelFeatures::calcIntronThreshold(const JunctionList& juncs) { + + vector intron_sizes; + for(auto& j : juncs) { + intron_sizes.push_back(j->getIntronSize()); + } + + std::sort(intron_sizes.begin(), intron_sizes.end()); + + L95 = intron_sizes[intron_sizes.size() * 0.95]; + + return L95; +} + +void portcullis::ModelFeatures::trainCodingPotentialModel(const JunctionList& in) { + + vector exons; + vector introns; + for(auto& j : in) { + + int len = 0; + + string left_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start - 80, j->getIntron()->start, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + left_exon = SeqUtils::reverseComplement(left_exon); + } + exons.push_back(left_exon); + + string left_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->start+81, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + left_intron = SeqUtils::reverseComplement(left_intron); + } + introns.push_back(left_intron); + + string right_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end-80, j->getIntron()->end+1, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + right_intron = 
SeqUtils::reverseComplement(right_intron); + } + introns.push_back(right_intron); + + + string right_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end + 1, j->getIntron()->end + 80, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + right_exon = SeqUtils::reverseComplement(right_exon); + } + exons.push_back(right_exon); + } + + exonModel.train(exons, 5); + intronModel.train(introns, 5); +} \ No newline at end of file diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 4d1886bc..05e5ae26 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -202,16 +202,9 @@ void portcullis::JunctionFilter::filter() { } // To be overridden if we are training - uint32_t L95 = 0; - MarkovModel exon_model(5); - MarkovModel intron_model(5); + ModelFeatures mf; + mf.initGenomeMapper(this->genomeFile); - // Create the genome mapper - GenomeMapper gmap(this->genomeFile); - - // Load the fasta index - gmap.loadFastaIndex(); - if (train) { @@ -263,15 +256,14 @@ void portcullis::JunctionFilter::filter() { // Analyse positive set to get L0.05 of intron size - L95 = this->calcIntronThreshold(passJuncs); - cout << "Length of intron at 95th percentile of positive set: " << L95 << endl << endl; + cout << "Length of intron at 95th percentile of positive set: " << mf.calcIntronThreshold(passJuncs) << endl << endl; cout << "Training coding potential markov models on positive set ..."; cout.flush(); - trainMMs(pos, exon_model, intron_model, gmap); + mf.trainCodingPotentialModel(pos); cout << " done." << endl; - cout << "Exon model contains " << exon_model.size() << " 5-mers." << endl; - cout << "Intron model contains " << intron_model.size() << " 5-mers." << endl; + cout << "Exon model contains " << mf.exonModel.size() << " 5-mers." << endl; + cout << "Intron model contains " << mf.intronModel.size() << " 5-mers." 
<< endl; passJuncs.clear(); failJuncs.clear(); @@ -279,7 +271,7 @@ void portcullis::JunctionFilter::filter() { doRuleBasedFiltering(this->getIntitalNegRulesFile(), currentJuncs, passJuncs, failJuncs, "Creating initial negative set for training", resultMap); - const uint32_t longintronthreshold = L95 * 6; + const uint32_t longintronthreshold = mf.L95 * 6; cout << endl << "Adding junctions to negative set with MaxMMES < 10 and excessively long intron size of positive set (> L95 x 6 = " << longintronthreshold << ")..."; cout.flush(); @@ -343,9 +335,8 @@ void portcullis::JunctionFilter::filter() { training.insert(training.end(), pos.begin(), pos.end()); training.insert(training.end(), neg.begin(), neg.end()); - cout << "Training a random forest model using the initial positive and negative datasets" << endl; - shared_ptr forest = Train::trainInstance(training, output.string() + ".selftrain", DEFAULT_TRAIN_TREES, threads, true, true, L95, exon_model, intron_model, gmap); + shared_ptr forest = Train::trainInstance(training, output.string() + ".selftrain", DEFAULT_TRAIN_TREES, threads, true, true, mf); forest->saveToFile(); forest->writeOutput(&cout); @@ -366,9 +357,9 @@ void portcullis::JunctionFilter::filter() { JunctionList passJuncs; JunctionList failJuncs; - forestPredict(currentJuncs, passJuncs, failJuncs, L95, exon_model, intron_model, gmap); + forestPredict(currentJuncs, passJuncs, failJuncs, mf); - printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Random Forest filtering")); + printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Random Forest filtering results")); // Reset currentJuncs currentJuncs.clear(); @@ -442,7 +433,7 @@ void portcullis::JunctionFilter::filter() { } } - printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Post filtering (length and/or canonical)")); + printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Post filtering (length and/or canonical) results")); // Reset currentJuncs 
currentJuncs.clear(); @@ -451,6 +442,8 @@ void portcullis::JunctionFilter::filter() { } } + cout << endl; + JunctionSystem filteredJuncs; JunctionSystem refKeptJuncs; @@ -459,7 +452,7 @@ void portcullis::JunctionFilter::filter() { } else { - cout << "Recalculating junction grouping and distance stats based on new junction list that passed filters ..."; + cout << "Recalculating junction grouping and distance stats based on new junction list that passed filters ..."; cout.flush(); for(auto& j : currentJuncs) { @@ -489,7 +482,7 @@ void portcullis::JunctionFilter::filter() { discardedJuncs.getJunctions(), string("Overall results")); - cout << "Saving junctions passing filter to disk:" << endl; + cout << endl << "Saving junctions passing filter to disk:" << endl; filteredJuncs.saveAll(outputDir.string() + "/" + outputPrefix + ".pass", source + "_pass"); @@ -506,61 +499,15 @@ void portcullis::JunctionFilter::filter() { } } -void portcullis::JunctionFilter::trainMMs(const JunctionList& in, MarkovModel& exon_model, MarkovModel& intron_model, GenomeMapper& gmap) { - - vector exons; - vector introns; - for(auto& j : in) { - - int len = 0; - - string left_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start - 80, j->getIntron()->start, &len); - if (j->getConsensusStrand() == Strand::NEGATIVE) { - left_exon = SeqUtils::reverseComplement(left_exon); - } - exons.push_back(left_exon); - - string left_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->start+81, &len); - if (j->getConsensusStrand() == Strand::NEGATIVE) { - left_intron = SeqUtils::reverseComplement(left_intron); - } - introns.push_back(left_intron); - - string right_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end-80, j->getIntron()->end+1, &len); - if (j->getConsensusStrand() == Strand::NEGATIVE) { - right_intron = SeqUtils::reverseComplement(right_intron); - } - introns.push_back(right_intron); - - - string 
right_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end + 1, j->getIntron()->end + 80, &len); - if (j->getConsensusStrand() == Strand::NEGATIVE) { - right_exon = SeqUtils::reverseComplement(right_exon); - } - exons.push_back(right_exon); - } - - exon_model.train(exons); - intron_model.train(introns); -} - -uint32_t portcullis::JunctionFilter::calcIntronThreshold(const JunctionList& juncs) { - - vector intron_sizes; - for(auto& j : juncs) { - intron_sizes.push_back(j->getIntronSize()); - } - - std::sort(intron_sizes.begin(), intron_sizes.end()); + - return intron_sizes[intron_sizes.size() * 0.95]; -} + void portcullis::JunctionFilter::printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix) { // Output stats size_t diff = in.size() - pass.size(); - cout << prefix << endl + cout << endl << prefix << endl << "-------------------------" << endl << "Input contained " << in.size() << " junctions." << endl << "Output contains " << pass.size() << " junctions." 
<< endl @@ -617,13 +564,13 @@ void portcullis::JunctionFilter::doRuleBasedFiltering(const path& ruleFile, cons } -void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap) { +void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, ModelFeatures& mf) { if (verbose) { cerr << endl << "Preparing junction metrics into matrix" << endl; } - Data* testingData = Train::juncs2FeatureVectors(all, L95, exon, intron, gmap); + Data* testingData = Train::juncs2FeatureVectors(all, mf); // Load forest from disk diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index ff70137c..ec3767ff 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -53,6 +53,7 @@ using boost::filesystem::symbolic_link_exists; #include #include #include +#include using portcullis::bam::GenomeMapper; using portcullis::PortcullisFS; using portcullis::Intron; @@ -60,6 +61,7 @@ using portcullis::IntronHasher; using portcullis::Performance; using portcullis::JuncResultMap; using portcullis::MarkovModel; +using portcullis::ModelFeatures; namespace portcullis { @@ -275,7 +277,7 @@ class JunctionFilter { protected: - void forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap); + void forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, ModelFeatures& mf); shared_ptr calcPerformance(const JunctionList& pass, const JunctionList& fail) { return calcPerformance(pass, fail, false); @@ -285,11 +287,7 @@ class JunctionFilter { void printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix); void doRuleBasedFiltering(const path& ruleFile, const JunctionList& all, JunctionList& pass, JunctionList& fail, const string& 
prefix, JuncResultMap& resultMap); - - uint32_t calcIntronThreshold(const JunctionList& pass); - - void trainMMs(const JunctionList& in, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap); - + public: static string helpMessage() { diff --git a/src/train.cc b/src/train.cc index b65ef7c2..801af91f 100644 --- a/src/train.cc +++ b/src/train.cc @@ -68,7 +68,7 @@ portcullis::Train::Train(const path& _junctionFile, const path& _refFile){ verbose = false; } -Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap) { +Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, ModelFeatures& mf) { vector headers; headers.reserve( VAR_NAMES.size() + JO_NAMES.size() ); @@ -93,8 +93,8 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint3 //d[3 * x.size() + row] = ; d[4 * x.size() + row] = j->getReliable2RawRatio(); //d[5 * x.size() + row] = j->getMeanMismatches(); - d[5 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score - d[6 * x.size() + row] = exon.size() > 0 && intron.size() > 0 ? j->calcCodingPotential(gmap, exon, intron) : 0.0; // Coding potential score + d[5 * x.size() + row] = mf.L95 == 0 ? 0.0 : j->calcIntronScore(mf.L95); // Intron score + d[6 * x.size() + row] = mf.isCodingPotentialModelEmpty() ? 
0.0 : j->calcCodingPotential(mf.gmap, mf.exonModel, mf.intronModel); // Coding potential score d[7 * x.size() + row] = j->isGenuine(); // Junction overhang values at each position are first converted into deviation from expected distributions double half_read_length = (double)j->getMeanQueryLength() / 2.0; @@ -116,10 +116,10 @@ Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, const uint3 return data; } -shared_ptr portcullis::Train::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap) { +shared_ptr portcullis::Train::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, ModelFeatures& mf) { cout << "Creating feature vector" << endl; - Data* trainingData = juncs2FeatureVectors(x, L95, exon, intron, gmap); + Data* trainingData = juncs2FeatureVectors(x, mf); cout << "Initialising random forest" << endl; shared_ptr f = nullptr; diff --git a/src/train.hpp b/src/train.hpp index 6fedc455..7e2f009b 100644 --- a/src/train.hpp +++ b/src/train.hpp @@ -37,8 +37,9 @@ using boost::filesystem::path; #include #include #include +#include using portcullis::bam::GenomeMapper; - +using portcullis::ModelFeatures; namespace portcullis { @@ -235,23 +236,21 @@ class Train { static int main(int argc, char *argv[]); static Data* juncs2FeatureVectors(const JunctionList& x) { - GenomeMapper gmap; - MarkovModel m(1); - return juncs2FeatureVectors(x, 0, m, m, gmap); + ModelFeatures mf; + return juncs2FeatureVectors(x, mf); } - static Data* juncs2FeatureVectors(const JunctionList& x, const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap); + static Data* juncs2FeatureVectors(const JunctionList& x, ModelFeatures& modelFeatures); static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t 
threads, bool regressionMode, bool verbose) { - GenomeMapper gmap; - MarkovModel m(1); + ModelFeatures mf; return trainInstance(x, outputPrefix, trees, threads, regressionMode, verbose, - 0, m, m, gmap); + mf); } static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, - const uint32_t L95, MarkovModel& exon, MarkovModel& intron, GenomeMapper& gmap); + ModelFeatures& modelFeatures); protected: From c4625ed11ccf7722af1dc8e5c6cc8936dba343f6 Mon Sep 17 00:00:00 2001 From: Dan Date: Mon, 25 Apr 2016 13:40:33 +0100 Subject: [PATCH 06/15] Shifted creation of feature vector into model_features. --- lib/Makefile.am | 1 + lib/include/portcullis/model_features.hpp | 25 +++++++++++ lib/src/model_features.cc | 50 +++++++++++++++++++++ src/junction_filter.cc | 2 +- src/train.cc | 53 ++--------------------- src/train.hpp | 24 +--------- 6 files changed, 82 insertions(+), 73 deletions(-) diff --git a/lib/Makefile.am b/lib/Makefile.am index 529a4809..643163b7 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -46,6 +46,7 @@ library_include_HEADERS = \ libportcullis_la_CPPFLAGS = \ -isystem $(top_srcdir)/deps/htslib-1.3 \ + -isystem $(top_srcdir)/deps/ranger-0.3.8/include \ -isystem $(top_srcdir)/lib/include \ -DDATADIR=\"$(datadir)\" \ @AM_CPPFLAGS@ \ diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/model_features.hpp index 8179e57c..b3e5a917 100644 --- a/lib/include/portcullis/model_features.hpp +++ b/lib/include/portcullis/model_features.hpp @@ -20,12 +20,35 @@ #include using boost::filesystem::path; +#include + #include #include +#include using portcullis::bam::GenomeMapper; using portcullis::MarkovModel; +using portcullis::JunctionList; namespace portcullis { + +// List of variable names +const vector VAR_NAMES = { + //"M2-nb-reads", + //"M3-nb_dist_aln", + "nb_rel_aln", + //"M8-max_min_anc", + //"M9-dif_anc", + //"M10-dist_anc", + "entropy", + "maxmmes", + 
"min_hamming_score", + //"M14-hamming3p", + "rel2raw_ratio", + //"mean_mismatches", + "IntronScore", + "CodingPotential", + "Genuine" }; + class ModelFeatures { public: @@ -46,5 +69,7 @@ class ModelFeatures { uint32_t calcIntronThreshold(const JunctionList& juncs); void trainCodingPotentialModel(const JunctionList& in); + + Data* juncs2FeatureVectors(const JunctionList& x); }; } \ No newline at end of file diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 5bb344ad..8d40b880 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -19,6 +19,8 @@ using std::cout; using std::endl; +#include + #include using portcullis::Junction; @@ -85,4 +87,52 @@ void portcullis::ModelFeatures::trainCodingPotentialModel(const JunctionList& in exonModel.train(exons, 5); intronModel.train(introns, 5); +} + +Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { + + vector headers; + headers.reserve( VAR_NAMES.size() + JO_NAMES.size() ); + headers.insert( headers.end(), VAR_NAMES.begin(), VAR_NAMES.end() ); + headers.insert( headers.end(), JO_NAMES.begin()+14, JO_NAMES.end() ); + + // Convert junction list info to double* + double* d = new double[x.size() * headers.size()]; + + uint32_t row = 0; + for (const auto& j : x) { + + //d[0 * x.size() + row] = j->getNbJunctionAlignments(); + //d[0 * x.size() + row] = j->getNbDistinctAlignments(); + d[0 * x.size() + row] = j->getNbReliableAlignments(); + //d[3 * x.size() + row] = j->getMaxMinAnchor(); + //d[4 * x.size() + row] = j->getDiffAnchor(); + //d[5 * x.size() + row] = j->getNbDistinctAnchors(); + d[1 * x.size() + row] = j->getEntropy(); + d[2 * x.size() + row] = j->getMaxMMES(); + d[3 * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); + //d[3 * x.size() + row] = ; + d[4 * x.size() + row] = j->getReliable2RawRatio(); + //d[5 * x.size() + row] = j->getMeanMismatches(); + d[5 * x.size() + row] = L95 == 0 ? 
0.0 : j->calcIntronScore(L95); // Intron score + d[6 * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); // Coding potential score + d[7 * x.size() + row] = j->isGenuine(); + // Junction overhang values at each position are first converted into deviation from expected distributions + double half_read_length = (double)j->getMeanQueryLength() / 2.0; + for(size_t i = 14; i < JO_NAMES.size(); i++) { + double Ni = j->getJunctionOverhangs(i); // Actual count at this position + if (Ni == 0.0) Ni = 0.000000000001; // Ensure some value > 0 here otherwise we get -infinity later. + double Pi = 1.0 - ((double)i / half_read_length); // Likely scale at this position + double Ei = (double)j->getNbJunctionAlignments() * Pi; // Expected count at this position + double Xi = abs(log2(Ni / Ei)); // Log deviation + + d[(i-14 + 8) * x.size() + row] = Xi; + } + + row++; + } + + Data* data = new DataDouble(d, headers, x.size(), headers.size()); + data->setExternalData(false); // This causes 'd' to be deleted when 'data' is deleted + return data; } \ No newline at end of file diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 05e5ae26..526c5ad6 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -570,7 +570,7 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction cerr << endl << "Preparing junction metrics into matrix" << endl; } - Data* testingData = Train::juncs2FeatureVectors(all, mf); + Data* testingData = mf.juncs2FeatureVectors(all); // Load forest from disk diff --git a/src/train.cc b/src/train.cc index 801af91f..9c59ab63 100644 --- a/src/train.cc +++ b/src/train.cc @@ -68,58 +68,12 @@ portcullis::Train::Train(const path& _junctionFile, const path& _refFile){ verbose = false; } -Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x, ModelFeatures& mf) { - - vector headers; - headers.reserve( VAR_NAMES.size() + JO_NAMES.size() ); - headers.insert( 
headers.end(), VAR_NAMES.begin(), VAR_NAMES.end() ); - headers.insert( headers.end(), JO_NAMES.begin()+14, JO_NAMES.end() ); - - // Convert junction list info to double* - double* d = new double[x.size() * headers.size()]; - - uint32_t row = 0; - for (const auto& j : x) { - - //d[0 * x.size() + row] = j->getNbJunctionAlignments(); - //d[0 * x.size() + row] = j->getNbDistinctAlignments(); - d[0 * x.size() + row] = j->getNbReliableAlignments(); - //d[3 * x.size() + row] = j->getMaxMinAnchor(); - //d[4 * x.size() + row] = j->getDiffAnchor(); - //d[5 * x.size() + row] = j->getNbDistinctAnchors(); - d[1 * x.size() + row] = j->getEntropy(); - d[2 * x.size() + row] = j->getMaxMMES(); - d[3 * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); - //d[3 * x.size() + row] = ; - d[4 * x.size() + row] = j->getReliable2RawRatio(); - //d[5 * x.size() + row] = j->getMeanMismatches(); - d[5 * x.size() + row] = mf.L95 == 0 ? 0.0 : j->calcIntronScore(mf.L95); // Intron score - d[6 * x.size() + row] = mf.isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(mf.gmap, mf.exonModel, mf.intronModel); // Coding potential score - d[7 * x.size() + row] = j->isGenuine(); - // Junction overhang values at each position are first converted into deviation from expected distributions - double half_read_length = (double)j->getMeanQueryLength() / 2.0; - for(size_t i = 14; i < JO_NAMES.size(); i++) { - double Ni = j->getJunctionOverhangs(i); // Actual count at this position - if (Ni == 0.0) Ni = 0.000000000001; // Ensure some value > 0 here otherwise we get -infinity later. 
- double Pi = 1.0 - ((double)i / half_read_length); // Likely scale at this position - double Ei = (double)j->getNbJunctionAlignments() * Pi; // Expected count at this position - double Xi = abs(log2(Ni / Ei)); // Log deviation - - d[(i-14 + 8) * x.size() + row] = Xi; - } - - row++; - } - - Data* data = new DataDouble(d, headers, x.size(), headers.size()); - data->setExternalData(false); // This causes 'd' to be deleted when 'data' is deleted - return data; -} + shared_ptr portcullis::Train::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, ModelFeatures& mf) { cout << "Creating feature vector" << endl; - Data* trainingData = juncs2FeatureVectors(x, mf); + Data* trainingData = mf.juncs2FeatureVectors(x); cout << "Initialising random forest" << endl; shared_ptr f = nullptr; @@ -162,7 +116,8 @@ shared_ptr portcullis::Train::trainInstance(const JunctionList& x, strin void portcullis::Train::testInstance(shared_ptr f, const JunctionList& x) { // Convert testing set junctions into feature vector - Data* testingData = juncs2FeatureVectors(x); + ModelFeatures mf; + Data* testingData = mf.juncs2FeatureVectors(x); f->setPredictionMode(true); f->setData(testingData); diff --git a/src/train.hpp b/src/train.hpp index 7e2f009b..2525c188 100644 --- a/src/train.hpp +++ b/src/train.hpp @@ -53,23 +53,7 @@ const uint16_t DEFAULT_TRAIN_THREADS = 1; const double DEFAULT_TRAIN_FRACTION = 1.0; const int DEFAULT_SEED = 1234567; // To avoid non-deterministic behaviour -// List of variable names -const vector VAR_NAMES = { - //"M2-nb-reads", - //"M3-nb_dist_aln", - "nb_rel_aln", - //"M8-max_min_anc", - //"M9-dif_anc", - //"M10-dist_anc", - "entropy", - "maxmmes", - "min_hamming_score", - //"M14-hamming3p", - "rel2raw_ratio", - //"mean_mismatches", - "IntronScore", - "CodingPotential", - "Genuine" }; + // Derived from https://sureshamrita.wordpress.com/2011/08/24/c-implementation-of-k-fold-cross-validation/ 
template @@ -235,12 +219,6 @@ class Train { static int main(int argc, char *argv[]); - static Data* juncs2FeatureVectors(const JunctionList& x) { - ModelFeatures mf; - return juncs2FeatureVectors(x, mf); - } - static Data* juncs2FeatureVectors(const JunctionList& x, ModelFeatures& modelFeatures); - static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose) { ModelFeatures mf; From eff0ac4c46b63d213b7e80151e55368495b53da4 Mon Sep 17 00:00:00 2001 From: Dan Date: Mon, 25 Apr 2016 14:01:05 +0100 Subject: [PATCH 07/15] Removed filter tool dependence on the training tool. Training now handled in modelfeatures. --- lib/include/portcullis/model_features.hpp | 9 ++++ lib/src/model_features.cc | 51 +++++++++++++++++++++++ src/junction_filter.cc | 9 ++-- src/junction_filter.hpp | 1 + src/train.cc | 46 +------------------- src/train.hpp | 10 ----- 6 files changed, 66 insertions(+), 60 deletions(-) diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/model_features.hpp index b3e5a917..50f1245e 100644 --- a/lib/include/portcullis/model_features.hpp +++ b/lib/include/portcullis/model_features.hpp @@ -17,10 +17,14 @@ #pragma once +#include +using std::shared_ptr; + #include using boost::filesystem::path; #include +#include #include #include @@ -30,6 +34,8 @@ using portcullis::MarkovModel; using portcullis::JunctionList; namespace portcullis { + +typedef shared_ptr ForestPtr; // List of variable names const vector VAR_NAMES = { @@ -71,5 +77,8 @@ class ModelFeatures { void trainCodingPotentialModel(const JunctionList& in); Data* juncs2FeatureVectors(const JunctionList& x); + + ForestPtr trainInstance(const JunctionList& x, string outputPrefix, + uint16_t trees, uint16_t threads, bool regressionMode, bool verbose); }; } \ No newline at end of file diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 8d40b880..e6adfc9d 100644 --- 
a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -16,10 +16,15 @@ // ******************************************************************* #include +#include using std::cout; +using std::cerr; using std::endl; +using std::make_shared; #include +#include +#include #include using portcullis::Junction; @@ -135,4 +140,50 @@ Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { Data* data = new DataDouble(d, headers, x.size(), headers.size()); data->setExternalData(false); // This causes 'd' to be deleted when 'data' is deleted return data; +} + + + +portcullis::ForestPtr portcullis::ModelFeatures::trainInstance(const JunctionList& x, + string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose) { + + cout << "Creating feature vector" << endl; + Data* trainingData = juncs2FeatureVectors(x); + + cout << "Initialising random forest" << endl; + ForestPtr f = nullptr; + if (regressionMode) { + f = make_shared(); + } + else { + f = make_shared(); + } + + vector catVars; + + f->init( + "Genuine", // Dependant variable name + MEM_DOUBLE, // Memory mode + trainingData, // Data object + 0, // M Try (0 == use default) + outputPrefix, // Output prefix + trees, // Number of trees + 1236456789, // Use fixed seed to avoid non-deterministic behaviour as much as possible + threads, // Number of threads + IMP_GINI, // Importance measure + regressionMode ? 
DEFAULT_MIN_NODE_SIZE_REGRESSION : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size + "", // Status var name + false, // Prediction mode + true, // Replace + catVars, // Unordered categorical variable names (vector) + false, // Memory saving + DEFAULT_SPLITRULE, // Split rule + false, // predall + 1.0); // Sample fraction + + cout << "Training" << endl; + f->setVerboseOut(&cerr); + f->run(verbose); + + return f; } \ No newline at end of file diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 526c5ad6..7b5eaa35 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -68,9 +68,6 @@ using portcullis::eval; using portcullis::bam::GenomeMapper; using portcullis::MarkovModel; -#include "train.hpp" -using portcullis::Train; - #include "junction_filter.hpp" portcullis::JunctionFilter::JunctionFilter( const path& _junctionFile, @@ -336,7 +333,7 @@ void portcullis::JunctionFilter::filter() { training.insert(training.end(), neg.begin(), neg.end()); cout << "Training a random forest model using the initial positive and negative datasets" << endl; - shared_ptr forest = Train::trainInstance(training, output.string() + ".selftrain", DEFAULT_TRAIN_TREES, threads, true, true, mf); + shared_ptr forest = mf.trainInstance(training, output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); forest->saveToFile(); forest->writeOutput(&cout); @@ -594,8 +591,8 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction testingData, // Data object 0, // M Try (0 == use default) "", // Output prefix - DEFAULT_TRAIN_TREES, // Number of trees (will be overwritten when loading the model) - DEFAULT_SEED, + DEFAULT_SELFTRAIN_TREES, // Number of trees (will be overwritten when loading the model) + 1234567890, // Seed for random generator threads, // Number of threads IMP_GINI, // Importance measure train ? 
DEFAULT_MIN_NODE_SIZE_REGRESSION : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index ec3767ff..760bf45c 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -77,6 +77,7 @@ const string DEFAULT_FILTER_MODEL_FILE = "default_model.forest"; const string ST_IPOS_RULES_FILE = "selftrain_initial_pos.json"; const string ST_INEG_RULES_FILE = "selftrain_initial_neg.json"; const uint16_t DEFAULT_FILTER_THREADS = 1; +const uint16_t DEFAULT_SELFTRAIN_TREES = 100; class JunctionFilter { diff --git a/src/train.cc b/src/train.cc index 9c59ab63..689fa1c5 100644 --- a/src/train.cc +++ b/src/train.cc @@ -70,48 +70,6 @@ portcullis::Train::Train(const path& _junctionFile, const path& _refFile){ -shared_ptr portcullis::Train::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, ModelFeatures& mf) { - - cout << "Creating feature vector" << endl; - Data* trainingData = mf.juncs2FeatureVectors(x); - - cout << "Initialising random forest" << endl; - shared_ptr f = nullptr; - if (regressionMode) { - f = make_shared(); - } - else { - f = make_shared(); - } - - vector catVars; - - f->init( - "Genuine", // Dependant variable name - MEM_DOUBLE, // Memory mode - trainingData, // Data object - 0, // M Try (0 == use default) - outputPrefix, // Output prefix - trees, // Number of trees - DEFAULT_SEED, // Use fixed seed to avoid non-deterministic behaviour - threads, // Number of threads - IMP_GINI, // Importance measure - regressionMode ? 
DEFAULT_MIN_NODE_SIZE_REGRESSION : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size - "", // Status var name - false, // Prediction mode - true, // Replace - catVars, // Unordered categorical variable names (vector) - false, // Memory saving - DEFAULT_SPLITRULE, // Split rule - false, // predall - 1.0); // Sample fraction - - cout << "Training" << endl; - f->setVerboseOut(&cerr); - f->run(verbose); - - return f; -} void portcullis::Train::testInstance(shared_ptr f, const JunctionList& x) { @@ -198,7 +156,7 @@ void portcullis::Train::train() { cout << "Training on full dataset" << endl; - shared_ptr f = trainInstance(junctions, outputPrefix.string(), trees, threads, false, true); + ForestPtr f = ModelFeatures().trainInstance(junctions, outputPrefix.string(), trees, threads, false, true); f->saveToFile(); f->writeOutput(&cout); @@ -233,7 +191,7 @@ void portcullis::Train::train() { kf.getFold(i, back_inserter(train), back_inserter(test)); // Train on this particular set - shared_ptr f = trainInstance(train, outputPrefix.string(), trees, threads, false, false); + ForestPtr f = ModelFeatures().trainInstance(train, outputPrefix.string(), trees, threads, false, false); // Test model instance testInstance(f, test); diff --git a/src/train.hpp b/src/train.hpp index 2525c188..40fe2b34 100644 --- a/src/train.hpp +++ b/src/train.hpp @@ -219,16 +219,6 @@ class Train { static int main(int argc, char *argv[]); - static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, - uint16_t trees, uint16_t threads, bool regressionMode, bool verbose) { - ModelFeatures mf; - return trainInstance(x, outputPrefix, trees, threads, regressionMode, verbose, - mf); - } - - static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, - uint16_t trees, uint16_t threads, bool regressionMode, bool verbose, - ModelFeatures& modelFeatures); protected: From db1b54ce69a519dc17cd650ea8979d42436f1504 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Tue, 26 Apr 2016 
16:33:30 +0100 Subject: [PATCH 08/15] Implemented splicing positional weights scores, although it doesn't appear to improve performance. --- deps/ranger-0.3.8/include/ranger/Data.h | 178 +++++++------ deps/ranger-0.3.8/src/Data.cpp | 296 ++++++++++++---------- lib/include/portcullis/junction.hpp | 16 +- lib/include/portcullis/markov_model.hpp | 44 +++- lib/include/portcullis/model_features.hpp | 44 ++-- lib/include/portcullis/seq_utils.hpp | 16 +- lib/src/junction.cc | 66 +++-- lib/src/markov_model.cc | 57 ++++- lib/src/model_features.cc | 99 ++++++-- src/junction_filter.cc | 55 ++-- src/junction_filter.hpp | 2 - 11 files changed, 546 insertions(+), 327 deletions(-) diff --git a/deps/ranger-0.3.8/include/ranger/Data.h b/deps/ranger-0.3.8/include/ranger/Data.h index 0894fdb3..56dd3810 100644 --- a/deps/ranger-0.3.8/include/ranger/Data.h +++ b/deps/ranger-0.3.8/include/ranger/Data.h @@ -31,107 +31,131 @@ #include #include +#include +#include #include "globals.h" class Data { public: - Data(); - virtual ~Data(); + Data(); + virtual ~Data(); - virtual double get(size_t row, size_t col) const = 0; + virtual double get(size_t row, size_t col) const = 0; - size_t getVariableID(std::string variable_name); + size_t getVariableID(std::string variable_name); - virtual void reserveMemory() = 0; - virtual void set(size_t col, size_t row, double value, bool& error) = 0; + virtual void reserveMemory() = 0; + virtual void set(size_t col, size_t row, double value, bool& error) = 0; - void addSparseData(unsigned char* sparse_data, size_t num_cols_sparse); + void addSparseData(unsigned char* sparse_data, size_t num_cols_sparse); - bool loadFromFile(std::string filename); - bool loadFromFileWhitespace(std::ifstream& input_file, std::string header_line); - bool loadFromFileOther(std::ifstream& input_file, std::string header_line, char seperator); + bool loadFromFile(std::string filename); + bool loadFromFileWhitespace(std::ifstream& input_file, std::string header_line); + bool 
loadFromFileOther(std::ifstream& input_file, std::string header_line, char seperator); - void getAllValues(std::vector& all_values, std::vector& sampleIDs, size_t varID); + void getAllValues(std::vector& all_values, std::vector& sampleIDs, size_t varID); - size_t getIndex(size_t row, size_t col) const { - if (col < num_cols_no_sparse) { - return index_data[col * num_rows + row]; - } else { - // Get data out of sparse storage. -1 because of GenABEL coding. - size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; - size_t result = (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); + size_t getIndex(size_t row, size_t col) const { + if (col < num_cols_no_sparse) { + return index_data[col * num_rows + row]; + } else { + // Get data out of sparse storage. -1 because of GenABEL coding. + size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; + size_t result = (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); - // TODO: Better way to treat missing values? - if (result > 2) { - return 0; - } else { - return result; - } + // TODO: Better way to treat missing values? 
+ if (result > 2) { + return 0; + } else { + return result; + } + } } - } - - double getUniqueDataValue(size_t varID, size_t index) const { - if (varID < num_cols_no_sparse) { - return unique_data_values[varID][index]; - } else { - // For GWAS data the index is the value - return (index); + + double getUniqueDataValue(size_t varID, size_t index) const { + if (varID < num_cols_no_sparse) { + return unique_data_values[varID][index]; + } else { + // For GWAS data the index is the value + return (index); + } + } + + size_t getNumUniqueDataValues(size_t varID) const { + if (varID < num_cols_no_sparse) { + return unique_data_values[varID].size(); + } else { + // For GWAS data 0,1,2 + return (3); + } + } + + void sort(); + + const std::vector& getVariableNames() const { + return variable_names; + } + + size_t getNumCols() const { + return num_cols; } - } - - size_t getNumUniqueDataValues(size_t varID) const { - if (varID < num_cols_no_sparse) { - return unique_data_values[varID].size(); - } else { - // For GWAS data 0,1,2 - return (3); + + size_t getNumRows() const { + return num_rows; + } + + size_t getMaxNumUniqueValues() const { + if (sparse_data == 0 || max_num_unique_values > 3) { + // If no sparse data or one variable with more than 3 unique values, return that value + return max_num_unique_values; + } else { + // If sparse data and no variable with more than 3 unique values, return 3 + return 3; + } + } + + void setExternalData(bool external) { + this->externalData = external; + } + + void saveToTSV(const std::string& file); + + std::string getRow(size_t row) const { + std::stringstream ss; + ss << this->get(row, 0); + for (size_t j = 1; j < num_cols; j++) { + ss << "\t" << this->get(row, j); + } + return ss.str(); } - } - - void sort(); - - const std::vector& getVariableNames() const { - return variable_names; - } - size_t getNumCols() const { - return num_cols; - } - size_t getNumRows() const { - return num_rows; - } - - size_t getMaxNumUniqueValues() const { - 
if (sparse_data == 0 || max_num_unique_values > 3) { - // If no sparse data or one variable with more than 3 unique values, return that value - return max_num_unique_values; - } else { - // If sparse data and no variable with more than 3 unique values, return 3 - return 3; + + std::string getHeader() const { + std::stringstream ss; + ss << variable_names[0]; + for (size_t i = 1; i < variable_names.size(); i++) { + ss << "\t" << variable_names[i]; + } + return ss.str(); } - } - - void setExternalData(bool external) { - this->externalData = external; - } protected: - std::vector variable_names; - size_t num_rows; - size_t num_rows_rounded; - size_t num_cols; + std::vector variable_names; + size_t num_rows; + size_t num_rows_rounded; + size_t num_cols; - unsigned char* sparse_data; - size_t num_cols_no_sparse; + unsigned char* sparse_data; + size_t num_cols_no_sparse; - bool externalData; + bool externalData; - size_t* index_data; - std::vector> unique_data_values; - size_t max_num_unique_values; + size_t* index_data; + std::vector> unique_data_values; + size_t max_num_unique_values; private: - DISALLOW_COPY_AND_ASSIGN(Data); + DISALLOW_COPY_AND_ASSIGN(Data); }; #endif /* DATA_H_ */ diff --git a/deps/ranger-0.3.8/src/Data.cpp b/deps/ranger-0.3.8/src/Data.cpp index 03b09203..712afb81 100644 --- a/deps/ranger-0.3.8/src/Data.cpp +++ b/deps/ranger-0.3.8/src/Data.cpp @@ -36,181 +36,201 @@ #include Data::Data() : - num_rows(0), num_rows_rounded(0), num_cols(0), sparse_data(0), num_cols_no_sparse(0), externalData(true), index_data( - 0), max_num_unique_values(0) { +num_rows(0), num_rows_rounded(0), num_cols(0), sparse_data(0), num_cols_no_sparse(0), externalData(true), index_data( +0), max_num_unique_values(0) { } Data::~Data() { - if (index_data != 0) { - delete[] index_data; - } + if (index_data != 0) { + delete[] index_data; + } } size_t Data::getVariableID(std::string variable_name) { - std::vector::iterator it = std::find(variable_names.begin(), variable_names.end(), 
variable_name); - if (it == variable_names.end()) { - throw std::runtime_error("Variable " + variable_name + " not found."); - } - return (std::distance(variable_names.begin(), it)); + std::vector::iterator it = std::find(variable_names.begin(), variable_names.end(), variable_name); + if (it == variable_names.end()) { + throw std::runtime_error("Variable " + variable_name + " not found."); + } + return (std::distance(variable_names.begin(), it)); } void Data::addSparseData(unsigned char* sparse_data, size_t num_cols_sparse) { - num_cols = num_cols_no_sparse + num_cols_sparse; - num_rows_rounded = roundToNextMultiple(num_rows, 4); - this->sparse_data = sparse_data; + num_cols = num_cols_no_sparse + num_cols_sparse; + num_rows_rounded = roundToNextMultiple(num_rows, 4); + this->sparse_data = sparse_data; } bool Data::loadFromFile(std::string filename) { - bool result; - - // Open input file - std::ifstream input_file; - input_file.open(filename); - if (!input_file.good()) { - throw std::runtime_error("Could not open input file."); - } - - // Count number of rows - size_t line_count = 0; - std::string line; - while (getline(input_file, line)) { - ++line_count; - } - num_rows = line_count - 1; - input_file.close(); - input_file.open(filename); - - // Check if comma, semicolon or whitespace seperated - std::string header_line; - getline(input_file, header_line); - - // Find out if comma, semicolon or whitespace seperated and call appropriate method - if (header_line.find(",") != std::string::npos) { - result = loadFromFileOther(input_file, header_line, ','); - } else if (header_line.find(";") != std::string::npos) { - result = loadFromFileOther(input_file, header_line, ';'); - } else { - result = loadFromFileWhitespace(input_file, header_line); - } - - externalData = false; - input_file.close(); - return result; + bool result; + + // Open input file + std::ifstream input_file; + input_file.open(filename); + if (!input_file.good()) { + throw std::runtime_error("Could not 
open input file."); + } + + // Count number of rows + size_t line_count = 0; + std::string line; + while (getline(input_file, line)) { + ++line_count; + } + num_rows = line_count - 1; + input_file.close(); + input_file.open(filename); + + // Check if comma, semicolon or whitespace seperated + std::string header_line; + getline(input_file, header_line); + + // Find out if comma, semicolon or whitespace seperated and call appropriate method + if (header_line.find(",") != std::string::npos) { + result = loadFromFileOther(input_file, header_line, ','); + } else if (header_line.find(";") != std::string::npos) { + result = loadFromFileOther(input_file, header_line, ';'); + } else { + result = loadFromFileWhitespace(input_file, header_line); + } + + externalData = false; + input_file.close(); + return result; } bool Data::loadFromFileWhitespace(std::ifstream& input_file, std::string header_line) { - // Read header - std::string header_token; - std::stringstream header_line_stream(header_line); - while (header_line_stream >> header_token) { - variable_names.push_back(header_token); - } - num_cols = variable_names.size(); - num_cols_no_sparse = num_cols; - - // Read body - reserveMemory(); - bool error = false; - std::string line; - size_t row = 0; - while (getline(input_file, line)) { - double token; - std::stringstream line_stream(line); - size_t column = 0; - while (line_stream >> token) { - set(column, row, token, error); - ++column; + // Read header + std::string header_token; + std::stringstream header_line_stream(header_line); + while (header_line_stream >> header_token) { + variable_names.push_back(header_token); } - if (column > num_cols) { - throw std::runtime_error("Could not open input file. Too many columns in a row."); - } else if (column < num_cols) { - throw std::runtime_error("Could not open input file. Too few columns in a row. 
Are all values numeric?"); + num_cols = variable_names.size(); + num_cols_no_sparse = num_cols; + + // Read body + reserveMemory(); + bool error = false; + std::string line; + size_t row = 0; + while (getline(input_file, line)) { + double token; + std::stringstream line_stream(line); + size_t column = 0; + while (line_stream >> token) { + set(column, row, token, error); + ++column; + } + if (column > num_cols) { + throw std::runtime_error("Could not open input file. Too many columns in a row."); + } else if (column < num_cols) { + throw std::runtime_error("Could not open input file. Too few columns in a row. Are all values numeric?"); + } + ++row; } - ++row; - } - num_rows = row; - return error; + num_rows = row; + return error; } bool Data::loadFromFileOther(std::ifstream& input_file, std::string header_line, char seperator) { - // Read header - std::string header_token; - std::stringstream header_line_stream(header_line); - while (getline(header_line_stream, header_token, seperator)) { - variable_names.push_back(header_token); - } - num_cols = variable_names.size(); - num_cols_no_sparse = num_cols; - - // Read body - reserveMemory(); - bool error = false; - std::string line; - size_t row = 0; - while (getline(input_file, line)) { - std::string token_string; - double token; - std::stringstream line_stream(line); - size_t column = 0; - while (getline(line_stream, token_string, seperator)) { - std::stringstream token_stream(token_string); - token_stream >> token; - set(column, row, token, error); - ++column; + // Read header + std::string header_token; + std::stringstream header_line_stream(header_line); + while (getline(header_line_stream, header_token, seperator)) { + variable_names.push_back(header_token); + } + num_cols = variable_names.size(); + num_cols_no_sparse = num_cols; + + // Read body + reserveMemory(); + bool error = false; + std::string line; + size_t row = 0; + while (getline(input_file, line)) { + std::string token_string; + double token; + 
std::stringstream line_stream(line); + size_t column = 0; + while (getline(line_stream, token_string, seperator)) { + std::stringstream token_stream(token_string); + token_stream >> token; + set(column, row, token, error); + ++column; + } + ++row; } - ++row; - } - num_rows = row; - return error; + num_rows = row; + return error; } void Data::getAllValues(std::vector& all_values, std::vector& sampleIDs, size_t varID) { - // All values for varID (no duplicates) for given sampleIDs - if (varID < num_cols_no_sparse) { - - all_values.reserve(sampleIDs.size()); - for (size_t i = 0; i < sampleIDs.size(); ++i) { - all_values.push_back(get(sampleIDs[i], varID)); + // All values for varID (no duplicates) for given sampleIDs + if (varID < num_cols_no_sparse) { + + all_values.reserve(sampleIDs.size()); + for (size_t i = 0; i < sampleIDs.size(); ++i) { + all_values.push_back(get(sampleIDs[i], varID)); + } + std::sort(all_values.begin(), all_values.end()); + all_values.erase(unique(all_values.begin(), all_values.end()), all_values.end()); + } else { + // If GWA data just use 0, 1, 2 + all_values = std::vector({0, 1, 2}); } - std::sort(all_values.begin(), all_values.end()); - all_values.erase(unique(all_values.begin(), all_values.end()), all_values.end()); - } else { - // If GWA data just use 0, 1, 2 - all_values = std::vector( { 0, 1, 2 }); - } } void Data::sort() { - // Reserve memory - index_data = new size_t[num_cols_no_sparse * num_rows]; + // Reserve memory + index_data = new size_t[num_cols_no_sparse * num_rows]; + + // For all columns, get unique values and save index for each observation + for (size_t col = 0; col < num_cols_no_sparse; ++col) { + + // Get all unique values + std::vector unique_values(num_rows); + for (size_t row = 0; row < num_rows; ++row) { + unique_values[row] = get(row, col); + } + std::sort(unique_values.begin(), unique_values.end()); + unique_values.erase(unique(unique_values.begin(), unique_values.end()), unique_values.end()); + + // Get index of 
unique value + for (size_t row = 0; row < num_rows; ++row) { + size_t idx = std::lower_bound(unique_values.begin(), unique_values.end(), get(row, col)) - unique_values.begin(); + index_data[col * num_rows + row] = idx; + } + + // Save unique values + unique_data_values.push_back(unique_values); + if (unique_values.size() > max_num_unique_values) { + max_num_unique_values = unique_values.size(); + } + } +} - // For all columns, get unique values and save index for each observation - for (size_t col = 0; col < num_cols_no_sparse; ++col) { +void Data::saveToTSV(const std::string& file) { + std::ofstream fout(file.c_str(), std::ofstream::out); - // Get all unique values - std::vector unique_values(num_rows); - for (size_t row = 0; row < num_rows; ++row) { - unique_values[row] = get(row, col); + fout << variable_names[0]; + for (size_t i = 1; i < variable_names.size(); i++) { + fout << "\t" << variable_names[i]; } - std::sort(unique_values.begin(), unique_values.end()); - unique_values.erase(unique(unique_values.begin(), unique_values.end()), unique_values.end()); - - // Get index of unique value - for (size_t row = 0; row < num_rows; ++row) { - size_t idx = std::lower_bound(unique_values.begin(), unique_values.end(), get(row, col)) - unique_values.begin(); - index_data[col * num_rows + row] = idx; + fout << std::endl; + + for (size_t i = 0; i < num_rows; i++) { + fout << this->get(i, 0); + for (size_t j = 1; j < num_cols; j++) { + fout << "\t" << this->get(i, j); + } + fout << std::endl; } - // Save unique values - unique_data_values.push_back(unique_values); - if (unique_values.size() > max_num_unique_values) { - max_num_unique_values = unique_values.size(); - } - } + fout.close(); } diff --git a/lib/include/portcullis/junction.hpp b/lib/include/portcullis/junction.hpp index 73279b1c..4bd64869 100644 --- a/lib/include/portcullis/junction.hpp +++ b/lib/include/portcullis/junction.hpp @@ -46,6 +46,8 @@ using namespace portcullis::bam; #include "intron.hpp" #include 
"seq_utils.hpp" #include "markov_model.hpp" +using portcullis::KmerMarkovModel; +using portcullis::PosMarkovModel; using portcullis::Intron; @@ -843,7 +845,15 @@ class Junction { void setJunctionOverhangs(size_t index, uint32_t val) { junctionOverhangs[index] = val; } - + + double getJunctionOverhangLogDeviation(size_t i) const { + double Ni = junctionOverhangs[i]; // Actual count at this position + if (Ni == 0.0) Ni = 0.000000000001; // Ensure some value > 0 here otherwise we get -infinity later. + double Pi = 1.0 - ((double)i / (double)(this->meanQueryLength / 2.0)); // Likely scale at this position + double Ei = (double)this->getNbJunctionAlignments() * Pi; // Expected count at this position + double Xi = log2(Ni / Ei); + return Xi; + } bool isGenuine() const { return genuine; @@ -870,7 +880,9 @@ class Junction { } - double calcCodingPotential(GenomeMapper& gmap, MarkovModel& exon, MarkovModel& intron) const; + double calcCodingPotential(GenomeMapper& gmap, KmerMarkovModel& exon, KmerMarkovModel& intron) const; + + double calcPositionWeightScore(GenomeMapper& gmap, PosMarkovModel& donor, PosMarkovModel& acceptor) const; // **** Output methods **** diff --git a/lib/include/portcullis/markov_model.hpp b/lib/include/portcullis/markov_model.hpp index 1b524858..bcc4744e 100644 --- a/lib/include/portcullis/markov_model.hpp +++ b/lib/include/portcullis/markov_model.hpp @@ -30,18 +30,12 @@ namespace portcullis { typedef boost::error_info MMErrorInfo; struct MMException: virtual boost::exception, virtual std::exception {}; - - -/** - * Intended to be used as a map containing the probability of seeing a given character - */ -typedef unordered_map NTP; - /** * This datastructure consists of a map of kmers to a map of nucleotides with assoicated * probabilities. */ -typedef unordered_map MMU; +typedef unordered_map> KMMU; +typedef unordered_map> PMMU; /** * Simple Markov chain implementation derived originally from Truesight. 
@@ -51,10 +45,9 @@ typedef unordered_map MMU; */ class MarkovModel { -private: - portcullis::MMU model; +protected: uint16_t order; - + public: MarkovModel() : MarkovModel(1) {} @@ -72,18 +65,45 @@ class MarkovModel { train(input, order); } - void train(const vector& input, const uint32_t order); + virtual void train(const vector& input, const uint32_t order) = 0; uint16_t getOrder() const { return order; } + virtual double getScore(const string& seq) = 0; +}; + +class KmerMarkovModel : public portcullis::MarkovModel { +private: + portcullis::KMMU model; +public: + KmerMarkovModel() : MarkovModel(1) {} + KmerMarkovModel(const uint32_t _order) : MarkovModel(_order) {}; + KmerMarkovModel(const vector& input, const uint32_t _order) : MarkovModel(input, _order) {} + + void train(const vector& input, const uint32_t order); + double getScore(const string& seq); size_t size() const { return model.size(); } +}; + +class PosMarkovModel : public portcullis::MarkovModel { +private: + portcullis::PMMU model; +public: + PosMarkovModel() : MarkovModel(1) {} + PosMarkovModel(const uint32_t _order) : MarkovModel(_order) {}; + PosMarkovModel(const vector& input, const uint32_t _order) : MarkovModel(input, _order) {} + void train(const vector& input, const uint32_t order); double getScore(const string& seq); + size_t size() const { + return model.size(); + } }; + } diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/model_features.hpp index 50f1245e..28393f72 100644 --- a/lib/include/portcullis/model_features.hpp +++ b/lib/include/portcullis/model_features.hpp @@ -39,28 +39,32 @@ typedef shared_ptr ForestPtr; // List of variable names const vector VAR_NAMES = { - //"M2-nb-reads", - //"M3-nb_dist_aln", - "nb_rel_aln", - //"M8-max_min_anc", - //"M9-dif_anc", - //"M10-dist_anc", - "entropy", - "maxmmes", - "min_hamming_score", - //"M14-hamming3p", - "rel2raw_ratio", - //"mean_mismatches", - "IntronScore", - "CodingPotential", - "Genuine" }; + "Genuine", + 
//"M2-nb-reads", + //"M3-nb_dist_aln", + "nb_rel_aln", + //"M8-max_min_anc", + //"M9-dif_anc", + //"M10-dist_anc", + "entropy", + "maxmmes", + "min_hamming_score", + //"M14-hamming3p", + "rel2raw_ratio", + //"mean_mismatches", + "IntronScore", + //"CodingPotential" + "PWS" +}; class ModelFeatures { public: uint32_t L95; - MarkovModel exonModel; - MarkovModel intronModel; + KmerMarkovModel exonModel; + KmerMarkovModel intronModel; + PosMarkovModel donorPWModel; + PosMarkovModel acceptorPWModel; GenomeMapper gmap; ModelFeatures() : L95(0) { @@ -70,12 +74,18 @@ class ModelFeatures { return exonModel.size() == 0 || intronModel.size() == 0; } + bool isPWModelEmpty() { + return donorPWModel.size() == 0 || acceptorPWModel.size() == 0; + } + void initGenomeMapper(const path& genomeFile); uint32_t calcIntronThreshold(const JunctionList& juncs); void trainCodingPotentialModel(const JunctionList& in); + void trainPositionWeightModel(const JunctionList& in); + Data* juncs2FeatureVectors(const JunctionList& x); ForestPtr trainInstance(const JunctionList& x, string outputPrefix, diff --git a/lib/include/portcullis/seq_utils.hpp b/lib/include/portcullis/seq_utils.hpp index 09bcc0a8..867f8b37 100644 --- a/lib/include/portcullis/seq_utils.hpp +++ b/lib/include/portcullis/seq_utils.hpp @@ -40,12 +40,24 @@ typedef boost::error_info SeqUtilsErrorInfo; struct SeqUtilsException: virtual boost::exception, virtual std::exception { }; - - class SeqUtils { public: + static bool dnaNt(const char c) { + return c == 'A' || c == 'T' || c == 'G' || c == 'C'; + } + + static string makeClean(const string& s) { + string sequp = boost::to_upper_copy(s); + + for(size_t i = 0; i < sequp.size(); i++) { + sequp[i] = dnaNt(sequp[i]) ? 
sequp[i] : 'N'; + } + + return sequp; + } + static int16_t hammingDistance(const string& s1, const string& s2) { if (s1.size() != s2.size()) diff --git a/lib/src/junction.cc b/lib/src/junction.cc index f10b9f9b..e2237ac9 100644 --- a/lib/src/junction.cc +++ b/lib/src/junction.cc @@ -45,12 +45,6 @@ using portcullis::bam::CigarOp; using portcullis::bam::GenomeMapper; using portcullis::bam::Strand; -#include -#include -#include -using portcullis::Intron; -using portcullis::MarkovModel; - #include @@ -1228,38 +1222,64 @@ shared_ptr portcullis::Junction::parse(const string& line) return j; } -double portcullis::Junction::calcCodingPotential(GenomeMapper& gmap, MarkovModel& exon, MarkovModel& intron) const { +double portcullis::Junction::calcCodingPotential(GenomeMapper& gmap, KmerMarkovModel& exon, KmerMarkovModel& intron) const { int len = 0; const char* ref = this->intron->ref.name.c_str(); + const bool neg = getConsensusStrand() == Strand::NEGATIVE; - string left_exon = gmap.fetchBases(ref, this->intron->start - 80, this->intron->start, &len); - if (this->getConsensusStrand() == Strand::NEGATIVE) { + string left_exon = gmap.fetchBases(ref, this->intron->start - 82, this->intron->start-2, &len); + if (neg) { left_exon = SeqUtils::reverseComplement(left_exon); - } - - string left_intron = gmap.fetchBases(ref, this->intron->start, this->intron->start+81, &len); - if (this->getConsensusStrand() == Strand::NEGATIVE) { + } + + string left_intron = gmap.fetchBases(ref, this->intron->start, this->intron->start+80, &len); + if (neg) { left_intron = SeqUtils::reverseComplement(left_intron); - } + } - string right_intron = gmap.fetchBases(ref, this->intron->end-80, this->intron->end+1, &len); - if (this->getConsensusStrand() == Strand::NEGATIVE) { + string right_intron = gmap.fetchBases(ref, this->intron->end-80, this->intron->end, &len); + if (neg) { right_intron = SeqUtils::reverseComplement(right_intron); } - string right_exon = gmap.fetchBases(ref, this->intron->end + 1, 
this->intron->end + 80, &len); - if (this->getConsensusStrand() == Strand::NEGATIVE) { + string right_exon = gmap.fetchBases(ref, this->intron->end + 1, this->intron->end + 81, &len); + if (neg) { right_exon = SeqUtils::reverseComplement(right_exon); - } - - double score = exon.getScore(left_intron) + intron.getScore(left_exon) + } + /* + cout << "Left exon : " << this->consensusStrand << " : " << left_exon << endl; + cout << "Left intron : " << this->consensusStrand << " : " << left_intron << endl; + cout << "Right intron: " << this->consensusStrand << " : " << right_intron << endl; + cout << "Right exon : " << this->consensusStrand << " : " << right_exon << endl; + */ + double score =( exon.getScore(left_intron) + intron.getScore(left_exon) ) - ( intron.getScore(left_intron) + exon.getScore(left_exon) ) + ( intron.getScore(right_exon) + exon.getScore(right_intron) ) - ( exon.getScore(right_exon) + intron.getScore(right_intron) ); - //cout << score << endl; - return score; } + +double portcullis::Junction::calcPositionWeightScore(GenomeMapper& gmap, PosMarkovModel& donor, PosMarkovModel& acceptor) const { + + int len = 0; + const char* ref = this->intron->ref.name.c_str(); + const bool neg = getConsensusStrand() == Strand::NEGATIVE; + + string left = gmap.fetchBases(ref, intron->start - 3, intron->start + 20, &len); + if (neg) { + left = SeqUtils::reverseComplement(left); + } + + string right = gmap.fetchBases(ref, intron->start, intron->end, &len); + if (neg) { + right = SeqUtils::reverseComplement(right); + } + + string donorseq = neg ? right : left; + string acceptorseq = neg ? 
left : right; + + return donor.getScore(donorseq) + acceptor.getScore(acceptorseq); +} diff --git a/lib/src/markov_model.cc b/lib/src/markov_model.cc index 50d061db..18cd2d92 100644 --- a/lib/src/markov_model.cc +++ b/lib/src/markov_model.cc @@ -22,15 +22,18 @@ using std::endl; #include #include +#include +using portcullis::SeqUtils; + #include -void portcullis::MarkovModel::train(const vector& input, const uint32_t _order) { +void portcullis::KmerMarkovModel::train(const vector& input, const uint32_t _order) { order = _order; - MMU temp; + KMMU temp; for(auto& seq : input) { - string sequp = boost::to_upper_copy(seq); - for(uint16_t i = order; i < sequp.size(); i++) { - temp[sequp.substr(i-order, order)][sequp.substr(i, 1)]++; + string s = SeqUtils::makeClean(seq); + for(uint16_t i = order; i < s.size(); i++) { + temp[s.substr(i-order, order)][s.substr(i, 1)]++; } } model.clear(); @@ -47,14 +50,50 @@ void portcullis::MarkovModel::train(const vector& input, const uint32_t } -double portcullis::MarkovModel::getScore(const string& seq) { - string sequp = boost::to_upper_copy(seq); +double portcullis::KmerMarkovModel::getScore(const string& seq) { + string s = SeqUtils::makeClean(seq); double score = 1.0; - for(uint16_t i = order; i < sequp.size(); i++){ - score *= model[sequp.substr(i-order, order)][sequp.substr(i, 1)]; + for(uint16_t i = order; i < s.size(); i++){ + score *= model[s.substr(i-order, order)][s.substr(i, 1)]; } if(score == 0.0) { return -100.0; } return log(score); } + +void portcullis::PosMarkovModel::train(const vector& input, const uint32_t _order) { + order = _order; + PMMU temp; + for(auto& seq : input) { + string s = SeqUtils::makeClean(seq); + for(uint16_t i = order; i < s.size(); i++) { + temp[i][s.substr(i, 1)]++; + } + } + model.clear(); + for(auto& i : temp) { + double sum = 0; + for(auto& j : i.second) { + sum += j.second; + } + for(auto& j : i.second) { + //cout << i.first << " " << j.first << " " << j.second << endl; + 
model[i.first][j.first] = j.second / sum; + } + } +} + + +double portcullis::PosMarkovModel::getScore(const string& seq) { + string s = SeqUtils::makeClean(seq); + double score = 1.0; + for(uint16_t i = order; i < s.size(); i++){ + score *= model[i][s.substr(i, 1)]; + } + if(score == 0.0) { + return -300.0; + } + return log(score); +} + diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index e6adfc9d..f00bee43 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -15,11 +15,13 @@ // along with Portcullis. If not, see . // ******************************************************************* +#include #include #include using std::cout; using std::cerr; using std::endl; +using std::ofstream; using std::make_shared; #include @@ -64,26 +66,32 @@ void portcullis::ModelFeatures::trainCodingPotentialModel(const JunctionList& in int len = 0; - string left_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start - 80, j->getIntron()->start, &len); + string left_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start - 202, j->getIntron()->start - 2, &len); if (j->getConsensusStrand() == Strand::NEGATIVE) { left_exon = SeqUtils::reverseComplement(left_exon); } exons.push_back(left_exon); - string left_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->start+81, &len); + /*string left_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->start+80, &len); if (j->getConsensusStrand() == Strand::NEGATIVE) { left_intron = SeqUtils::reverseComplement(left_intron); } introns.push_back(left_intron); - string right_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end-80, j->getIntron()->end+1, &len); + string right_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end-80, j->getIntron()->end, &len); if (j->getConsensusStrand() == Strand::NEGATIVE) { right_intron = 
SeqUtils::reverseComplement(right_intron); } - introns.push_back(right_intron); + introns.push_back(right_intron);*/ + + string intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->end, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + intron = SeqUtils::reverseComplement(intron); + } + introns.push_back(intron); - string right_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end + 1, j->getIntron()->end + 80, &len); + string right_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end + 1, j->getIntron()->end + 201, &len); if (j->getConsensusStrand() == Strand::NEGATIVE) { right_exon = SeqUtils::reverseComplement(right_exon); } @@ -94,44 +102,73 @@ void portcullis::ModelFeatures::trainCodingPotentialModel(const JunctionList& in intronModel.train(introns, 5); } +void portcullis::ModelFeatures::trainPositionWeightModel(const JunctionList& in) { + + vector donors; + vector acceptors; + for(auto& j : in) { + + int len = 0; + + string left = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start - 3, j->getIntron()->start + 20, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + left = SeqUtils::reverseComplement(left); + } + + string right = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->end, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + right = SeqUtils::reverseComplement(right); + } + + if (j->getConsensusStrand() == Strand::NEGATIVE) { + donors.push_back(right); + acceptors.push_back(left); + } + else { + donors.push_back(left); + acceptors.push_back(right); + } + + } + + donorPWModel.train(donors, 1); + acceptorPWModel.train(acceptors, 1); +} + Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { + const size_t JO_OFFSET = 10; vector headers; headers.reserve( VAR_NAMES.size() + JO_NAMES.size() ); headers.insert( headers.end(), VAR_NAMES.begin(), 
VAR_NAMES.end() ); - headers.insert( headers.end(), JO_NAMES.begin()+14, JO_NAMES.end() ); + headers.insert( headers.end(), JO_NAMES.begin()+JO_OFFSET-1, JO_NAMES.end() ); // Convert junction list info to double* double* d = new double[x.size() * headers.size()]; uint32_t row = 0; for (const auto& j : x) { - + d[0 * x.size() + row] = j->isGenuine(); + // //d[0 * x.size() + row] = j->getNbJunctionAlignments(); //d[0 * x.size() + row] = j->getNbDistinctAlignments(); - d[0 * x.size() + row] = j->getNbReliableAlignments(); + d[1 * x.size() + row] = j->getNbReliableAlignments(); //d[3 * x.size() + row] = j->getMaxMinAnchor(); //d[4 * x.size() + row] = j->getDiffAnchor(); //d[5 * x.size() + row] = j->getNbDistinctAnchors(); - d[1 * x.size() + row] = j->getEntropy(); - d[2 * x.size() + row] = j->getMaxMMES(); - d[3 * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); + d[2 * x.size() + row] = j->getEntropy(); + d[3 * x.size() + row] = j->getMaxMMES(); + d[4 * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); //d[3 * x.size() + row] = ; - d[4 * x.size() + row] = j->getReliable2RawRatio(); + d[5 * x.size() + row] = j->getReliable2RawRatio(); //d[5 * x.size() + row] = j->getMeanMismatches(); - d[5 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score - d[6 * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); // Coding potential score - d[7 * x.size() + row] = j->isGenuine(); - // Junction overhang values at each position are first converted into deviation from expected distributions - double half_read_length = (double)j->getMeanQueryLength() / 2.0; - for(size_t i = 14; i < JO_NAMES.size(); i++) { - double Ni = j->getJunctionOverhangs(i); // Actual count at this position - if (Ni == 0.0) Ni = 0.000000000001; // Ensure some value > 0 here otherwise we get -infinity later. 
- double Pi = 1.0 - ((double)i / half_read_length); // Likely scale at this position - double Ei = (double)j->getNbJunctionAlignments() * Pi; // Expected count at this position - double Xi = abs(log2(Ni / Ei)); // Log deviation - - d[(i-14 + 8) * x.size() + row] = Xi; + d[6 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score + //d[7 * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); + d[7 * x.size() + row] = isPWModelEmpty() ? 0.0 : j->calcPositionWeightScore(gmap, donorPWModel, acceptorPWModel); + + //Junction overhang values at each position are first converted into deviation from expected distributions + for(size_t i = JO_OFFSET; i <= JO_NAMES.size(); i++) { + d[(i-JO_OFFSET + 8) * x.size() + row] = j->getJunctionOverhangLogDeviation(i-1); } row++; @@ -150,6 +187,18 @@ portcullis::ForestPtr portcullis::ModelFeatures::trainInstance(const JunctionLis cout << "Creating feature vector" << endl; Data* trainingData = juncs2FeatureVectors(x); + path feature_file = outputPrefix + ".features"; + cout << "Saving feature vector to disk: " << feature_file << endl; + + ofstream fout(feature_file.c_str(), std::ofstream::out); + + fout << Intron::locationOutputHeader() << "\t" << trainingData->getHeader() << endl; + for(size_t i = 0; i < x.size(); i++) { + fout << *(x[i]->getIntron()) << "\t" << trainingData->getRow(i) << endl; + } + + fout.close(); + cout << "Initialising random forest" << endl; ForestPtr f = nullptr; if (regressionMode) { diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 7b5eaa35..232b807e 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -59,14 +59,12 @@ namespace po = boost::program_options; #include #include #include -#include using portcullis::PortcullisFS; using portcullis::Intron; using portcullis::IntronHasher; using portcullis::Performance; using portcullis::eval; using portcullis::bam::GenomeMapper; -using 
portcullis::MarkovModel; #include "junction_filter.hpp" @@ -220,19 +218,35 @@ void portcullis::JunctionFilter::filter() { JunctionSystem isp(passJuncs); isp.saveAll(output.string() + ".selftrain.initialset.pos", "portcullis_isp"); + // Analyse positive set to get L0.05 of intron size + cout << "Length of intron at 95th percentile of positive set (L95): " << mf.calcIntronThreshold(passJuncs) << endl << endl; + + const uint32_t poslongintronthreshold = mf.L95 * 2; + JunctionList pass2juncs; for(auto& j : passJuncs) { + if (j->getIntronSize() <= poslongintronthreshold) { + pass2juncs.push_back(j); + } + else { + failJuncs.push_back(j); + } + } + cout << "Removed " << (passJuncs.size() - pass2juncs.size()) << " junctions from positive set with introns > L95 x 2 = " << poslongintronthreshold << endl << endl; + + + for(auto& j : pass2juncs) { JunctionPtr copy = make_shared(*j); copy->setGenuine(true); pos.push_back(copy); } if (!genuineFile.empty()) { - shared_ptr p = calcPerformance(passJuncs, failJuncs); + shared_ptr p = calcPerformance(pass2juncs, failJuncs); cout << "Performance of initial positive set (High PPV is important):" << endl; cout << Performance::longHeader() << endl; cout << p->toLongString() << endl << endl; JunctionSystem invalidPos; - for(auto& j : passJuncs) { + for(auto& j : pass2juncs) { if (!j->isGenuine()) { invalidPos.addJunction(j); } @@ -251,16 +265,17 @@ void portcullis::JunctionFilter::filter() { missedPos.saveAll(output.string() + ".selftrain.initialset.missedpos", "portcullis_missed_isp"); } - - // Analyse positive set to get L0.05 of intron size - cout << "Length of intron at 95th percentile of positive set: " << mf.calcIntronThreshold(passJuncs) << endl << endl; - cout << "Training coding potential markov models on positive set ..."; cout.flush(); mf.trainCodingPotentialModel(pos); cout << " done." << endl; cout << "Exon model contains " << mf.exonModel.size() << " 5-mers." 
<< endl; - cout << "Intron model contains " << mf.intronModel.size() << " 5-mers." << endl; + cout << "Intron model contains " << mf.intronModel.size() << " 5-mers." << endl << endl; + + cout << "Training position weight markov models on positive set ..."; + cout.flush(); + mf.trainPositionWeightModel(pos); + cout << " done." << endl; passJuncs.clear(); failJuncs.clear(); @@ -268,14 +283,14 @@ void portcullis::JunctionFilter::filter() { doRuleBasedFiltering(this->getIntitalNegRulesFile(), currentJuncs, passJuncs, failJuncs, "Creating initial negative set for training", resultMap); - const uint32_t longintronthreshold = mf.L95 * 6; - cout << endl << "Adding junctions to negative set with MaxMMES < 10 and excessively long intron size of positive set (> L95 x 6 = " << longintronthreshold << ")..."; + const uint32_t neglongintronthreshold = mf.L95 * 6; + cout << endl << "Adding junctions to negative set with MaxMMES < 10 and excessively long intron size of positive set (> L95 x 6 = " << neglongintronthreshold << ")..."; cout.flush(); uint32_t nblongintrons = 0; JunctionList fail2Juncs; for(auto& j : failJuncs) { - if (j->getIntronSize() > longintronthreshold && j->getMaxMMES() < 10 ) { + if (j->getIntronSize() > neglongintronthreshold && j->getMaxMMES() < 10 ) { passJuncs.push_back(j); nblongintrons++; } @@ -332,8 +347,11 @@ void portcullis::JunctionFilter::filter() { training.insert(training.end(), pos.begin(), pos.end()); training.insert(training.end(), neg.begin(), neg.end()); + JunctionSystem trainingSystem(training); + trainingSystem.sort(); + cout << "Training a random forest model using the initial positive and negative datasets" << endl; - shared_ptr forest = mf.trainInstance(training, output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); + shared_ptr forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); forest->saveToFile(); forest->writeOutput(&cout); @@ 
-341,9 +359,7 @@ void portcullis::JunctionFilter::filter() { modelFile = output.string() + ".selftrain.forest"; cout << endl; } - - - + // Manage a junction system of all discarded junctions JunctionSystem discardedJuncs; @@ -497,9 +513,6 @@ void portcullis::JunctionFilter::filter() { } - - - void portcullis::JunctionFilter::printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix) { // Output stats size_t diff = in.size() - pass.size(); @@ -626,9 +639,11 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction path scorepath = output.string() + ".scores"; ofstream scoreStream(scorepath.c_str()); + scoreStream << "Score\t" << Intron::locationOutputHeader() << "\t" << testingData->getHeader() << endl; + for(size_t i = 0; i < all.size(); i++) { - scoreStream << f->getPredictions()[i][0] << endl; + scoreStream << f->getPredictions()[i][0] << "\t" << *(all[i]->getIntron()) << "\t" << testingData->getRow(i) << endl; if (f->getPredictions()[i][0] >= 0.1) { pass.push_back(all[i]); } diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index 760bf45c..dd970caf 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -52,7 +52,6 @@ using boost::filesystem::symbolic_link_exists; #include #include #include -#include #include using portcullis::bam::GenomeMapper; using portcullis::PortcullisFS; @@ -60,7 +59,6 @@ using portcullis::Intron; using portcullis::IntronHasher; using portcullis::Performance; using portcullis::JuncResultMap; -using portcullis::MarkovModel; using portcullis::ModelFeatures; From c95babb67efe942ea5a6a71f69dbff578e33fb5a Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Tue, 26 Apr 2016 17:16:57 +0100 Subject: [PATCH 09/15] Implemented ability to control filtering threshold value as a hidden command line argument. 
--- lib/include/portcullis/markov_model.hpp | 2 +- lib/include/portcullis/model_features.hpp | 7 +++---- lib/src/model_features.cc | 23 +++++++++++------------ src/junction_filter.cc | 7 ++++++- src/junction_filter.hpp | 10 ++++++++++ 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/lib/include/portcullis/markov_model.hpp b/lib/include/portcullis/markov_model.hpp index bcc4744e..613fb177 100644 --- a/lib/include/portcullis/markov_model.hpp +++ b/lib/include/portcullis/markov_model.hpp @@ -101,7 +101,7 @@ class PosMarkovModel : public portcullis::MarkovModel { double getScore(const string& seq); size_t size() const { return model.size(); - } + } }; diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/model_features.hpp index 28393f72..6451530f 100644 --- a/lib/include/portcullis/model_features.hpp +++ b/lib/include/portcullis/model_features.hpp @@ -41,7 +41,7 @@ typedef shared_ptr ForestPtr; const vector VAR_NAMES = { "Genuine", //"M2-nb-reads", - //"M3-nb_dist_aln", + "M3-nb_dist_aln", "nb_rel_aln", //"M8-max_min_anc", //"M9-dif_anc", @@ -49,11 +49,10 @@ const vector VAR_NAMES = { "entropy", "maxmmes", "min_hamming_score", - //"M14-hamming3p", "rel2raw_ratio", - //"mean_mismatches", + "mean_mismatches", "IntronScore", - //"CodingPotential" + "CodingPotential", "PWS" }; diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index f00bee43..30672ee2 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -151,24 +151,23 @@ Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { d[0 * x.size() + row] = j->isGenuine(); // //d[0 * x.size() + row] = j->getNbJunctionAlignments(); - //d[0 * x.size() + row] = j->getNbDistinctAlignments(); - d[1 * x.size() + row] = j->getNbReliableAlignments(); + d[1 * x.size() + row] = j->getNbDistinctAlignments(); + d[2 * x.size() + row] = j->getNbReliableAlignments(); //d[3 * x.size() + row] = j->getMaxMinAnchor(); //d[4 * x.size() + row] = 
j->getDiffAnchor(); //d[5 * x.size() + row] = j->getNbDistinctAnchors(); - d[2 * x.size() + row] = j->getEntropy(); - d[3 * x.size() + row] = j->getMaxMMES(); - d[4 * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); - //d[3 * x.size() + row] = ; - d[5 * x.size() + row] = j->getReliable2RawRatio(); - //d[5 * x.size() + row] = j->getMeanMismatches(); - d[6 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score - //d[7 * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); - d[7 * x.size() + row] = isPWModelEmpty() ? 0.0 : j->calcPositionWeightScore(gmap, donorPWModel, acceptorPWModel); + d[3 * x.size() + row] = j->getEntropy(); + d[4 * x.size() + row] = j->getMaxMMES(); + d[5 * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); + d[6 * x.size() + row] = j->getReliable2RawRatio(); + d[7 * x.size() + row] = j->getMeanMismatches(); + d[8 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score + d[9 * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); + d[10 * x.size() + row] = isPWModelEmpty() ? 
0.0 : j->calcPositionWeightScore(gmap, donorPWModel, acceptorPWModel); //Junction overhang values at each position are first converted into deviation from expected distributions for(size_t i = JO_OFFSET; i <= JO_NAMES.size(); i++) { - d[(i-JO_OFFSET + 8) * x.size() + row] = j->getJunctionOverhangLogDeviation(i-1); + d[(i-JO_OFFSET + 11) * x.size() + row] = j->getJunctionOverhangLogDeviation(i-1); } row++; diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 232b807e..a25f045e 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -85,6 +85,7 @@ portcullis::JunctionFilter::JunctionFilter( const path& _junctionFile, filterNovel = false; source = DEFAULT_FILTER_SOURCE; verbose = false; + threshold = DEFAULT_FILTER_THRESHOLD; } @@ -644,7 +645,7 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction for(size_t i = 0; i < all.size(); i++) { scoreStream << f->getPredictions()[i][0] << "\t" << *(all[i]->getIntron()) << "\t" << testingData->getRow(i) << endl; - if (f->getPredictions()[i][0] >= 0.1) { + if (f->getPredictions()[i][0] >= this->threshold) { pass.push_back(all[i]); } else { @@ -675,6 +676,7 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { int32_t max_length; string canonical; string source; + double threshold; bool verbose; bool help; @@ -718,6 +720,8 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { hidden_options.add_options() ("genome-file", po::value(&genomeFile), "Path to the genome file to process.") ("junction-file", po::value(&junctionFile), "Path to the junction file to process.") + ("threshold", po::value(&threshold)->default_value(DEFAULT_FILTER_THRESHOLD), + "The threshold score at which we determine a junction to be genuine or not.") ; // Positional option for the input bam file @@ -771,6 +775,7 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { } filter.setReferenceFile(referenceFile); filter.setGenomeFile(genomeFile); + 
filter.setThreshold(threshold); filter.filter(); diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index dd970caf..ce568a66 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -76,6 +76,7 @@ const string ST_IPOS_RULES_FILE = "selftrain_initial_pos.json"; const string ST_INEG_RULES_FILE = "selftrain_initial_neg.json"; const uint16_t DEFAULT_FILTER_THREADS = 1; const uint16_t DEFAULT_SELFTRAIN_TREES = 100; +const double DEFAULT_FILTER_THRESHOLD = 0.1; class JunctionFilter { @@ -97,6 +98,7 @@ class JunctionFilter { bool filterSemi; bool filterNovel; string source; + double threshold; bool verbose; @@ -139,6 +141,14 @@ class JunctionFilter { this->genomeFile = genomeFile; } + double getThreshold() const { + return threshold; + } + + void setThreshold(double threshold) { + this->threshold = threshold; + } + path getOutput() const { return output; From 6a2c46907da679592c42e038869c8a4dd92dc21d Mon Sep 17 00:00:00 2001 From: Dan Date: Tue, 26 Apr 2016 22:28:10 +0100 Subject: [PATCH 10/15] Added splicing signal feature and cleaned up some bugs. 
--- lib/include/portcullis/junction.hpp | 9 +++- lib/include/portcullis/model_features.hpp | 13 ++++-- lib/src/junction.cc | 21 ++++++--- lib/src/markov_model.cc | 6 ++- lib/src/model_features.cc | 47 ++++++++++++++++--- src/junction_filter.cc | 55 +++++++++-------------- 6 files changed, 97 insertions(+), 54 deletions(-) diff --git a/lib/include/portcullis/junction.hpp b/lib/include/portcullis/junction.hpp index 4bd64869..b18c3d2a 100644 --- a/lib/include/portcullis/junction.hpp +++ b/lib/include/portcullis/junction.hpp @@ -180,7 +180,10 @@ static string cssToString(CanonicalSS css) { return string("No"); } - +struct SplicingScores { + double positionWeighting = 0.0; + double splicingSignal = 0.0; +}; struct AlignmentInfo { BamAlignmentPtr ba; @@ -882,7 +885,9 @@ class Junction { double calcCodingPotential(GenomeMapper& gmap, KmerMarkovModel& exon, KmerMarkovModel& intron) const; - double calcPositionWeightScore(GenomeMapper& gmap, PosMarkovModel& donor, PosMarkovModel& acceptor) const; + SplicingScores calcSplicingScores(GenomeMapper& gmap, KmerMarkovModel& donorT, KmerMarkovModel& donorF, + KmerMarkovModel& acceptorT, KmerMarkovModel& acceptorF, + PosMarkovModel& donorP, PosMarkovModel& acceptorP) const; // **** Output methods **** diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/model_features.hpp index 6451530f..7d35f3ca 100644 --- a/lib/include/portcullis/model_features.hpp +++ b/lib/include/portcullis/model_features.hpp @@ -31,7 +31,10 @@ using boost::filesystem::path; #include using portcullis::bam::GenomeMapper; using portcullis::MarkovModel; +using portcullis::Junction; +using portcullis::JunctionPtr; using portcullis::JunctionList; +using portcullis::SplicingScores; namespace portcullis { @@ -53,15 +56,19 @@ const vector VAR_NAMES = { "mean_mismatches", "IntronScore", "CodingPotential", - "PWS" + "PWS", + "SS" }; - class ModelFeatures { public: uint32_t L95; KmerMarkovModel exonModel; KmerMarkovModel intronModel; + 
KmerMarkovModel donorTModel; + KmerMarkovModel donorFModel; + KmerMarkovModel acceptorTModel; + KmerMarkovModel acceptorFModel; PosMarkovModel donorPWModel; PosMarkovModel acceptorPWModel; GenomeMapper gmap; @@ -83,7 +90,7 @@ class ModelFeatures { void trainCodingPotentialModel(const JunctionList& in); - void trainPositionWeightModel(const JunctionList& in); + void trainSplicingModels(const JunctionList& pass, const JunctionList& fail); Data* juncs2FeatureVectors(const JunctionList& x); diff --git a/lib/src/junction.cc b/lib/src/junction.cc index e2237ac9..62f1db20 100644 --- a/lib/src/junction.cc +++ b/lib/src/junction.cc @@ -1254,15 +1254,18 @@ double portcullis::Junction::calcCodingPotential(GenomeMapper& gmap, KmerMarkovM cout << "Right intron: " << this->consensusStrand << " : " << right_intron << endl; cout << "Right exon : " << this->consensusStrand << " : " << right_exon << endl; */ - double score =( exon.getScore(left_intron) + intron.getScore(left_exon) ) - - ( intron.getScore(left_intron) + exon.getScore(left_exon) ) - + ( intron.getScore(right_exon) + exon.getScore(right_intron) ) - - ( exon.getScore(right_exon) + intron.getScore(right_intron) ); + double score =( exon.getScore(left_exon) - intron.getScore(left_exon) ) + + ( intron.getScore(left_intron) - exon.getScore(left_intron) ) + + ( intron.getScore(right_intron) - exon.getScore(right_intron) ) + + ( exon.getScore(right_exon) + intron.getScore(right_exon) ); return score; } -double portcullis::Junction::calcPositionWeightScore(GenomeMapper& gmap, PosMarkovModel& donor, PosMarkovModel& acceptor) const { + +portcullis::SplicingScores portcullis::Junction::calcSplicingScores(GenomeMapper& gmap, KmerMarkovModel& donorT, KmerMarkovModel& donorF, + KmerMarkovModel& acceptorT, KmerMarkovModel& acceptorF, + PosMarkovModel& donorP, PosMarkovModel& acceptorP) const { int len = 0; const char* ref = this->intron->ref.name.c_str(); @@ -1273,7 +1276,7 @@ double 
portcullis::Junction::calcPositionWeightScore(GenomeMapper& gmap, PosMark left = SeqUtils::reverseComplement(left); } - string right = gmap.fetchBases(ref, intron->start, intron->end, &len); + string right = gmap.fetchBases(ref, intron->end - 20, intron->end + 2, &len); if (neg) { right = SeqUtils::reverseComplement(right); } @@ -1281,5 +1284,9 @@ double portcullis::Junction::calcPositionWeightScore(GenomeMapper& gmap, PosMark string donorseq = neg ? right : left; string acceptorseq = neg ? left : right; - return donor.getScore(donorseq) + acceptor.getScore(acceptorseq); + SplicingScores ss; + ss.positionWeighting = donorP.getScore(donorseq) + acceptorP.getScore(acceptorseq); + ss.splicingSignal = (donorT.getScore(donorseq) - donorF.getScore(donorseq)) + + (acceptorT.getScore(acceptorseq) - acceptorF.getScore(acceptorseq)); + return ss; } diff --git a/lib/src/markov_model.cc b/lib/src/markov_model.cc index 18cd2d92..2e7828d7 100644 --- a/lib/src/markov_model.cc +++ b/lib/src/markov_model.cc @@ -32,8 +32,10 @@ void portcullis::KmerMarkovModel::train(const vector& input, const uint3 KMMU temp; for(auto& seq : input) { string s = SeqUtils::makeClean(seq); - for(uint16_t i = order; i < s.size(); i++) { - temp[s.substr(i-order, order)][s.substr(i, 1)]++; + if (s.size() > order+1) { + for(uint16_t i = order; i < s.size(); i++) { + temp[s.substr(i-order, order)][s.substr(i, 1)]++; + } } } model.clear(); diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 30672ee2..11f8af77 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -102,11 +102,11 @@ void portcullis::ModelFeatures::trainCodingPotentialModel(const JunctionList& in intronModel.train(introns, 5); } -void portcullis::ModelFeatures::trainPositionWeightModel(const JunctionList& in) { +void portcullis::ModelFeatures::trainSplicingModels(const JunctionList& pass, const JunctionList& fail) { vector donors; vector acceptors; - for(auto& j : in) { + for(auto& j : pass) { int len = 
0; @@ -115,7 +115,7 @@ void portcullis::ModelFeatures::trainPositionWeightModel(const JunctionList& in) left = SeqUtils::reverseComplement(left); } - string right = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->end, &len); + string right = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end - 20, j->getIntron()->end + 2, &len); if (j->getConsensusStrand() == Strand::NEGATIVE) { right = SeqUtils::reverseComplement(right); } @@ -133,6 +133,38 @@ void portcullis::ModelFeatures::trainPositionWeightModel(const JunctionList& in) donorPWModel.train(donors, 1); acceptorPWModel.train(acceptors, 1); + donorTModel.train(donors, 5); + acceptorTModel.train(acceptors, 5); + + donors.clear(); + acceptors.clear(); + + for(auto& j : fail) { + + int len = 0; + + string left = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start - 3, j->getIntron()->start + 20, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + left = SeqUtils::reverseComplement(left); + } + + string right = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end - 20, j->getIntron()->end + 2, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + right = SeqUtils::reverseComplement(right); + } + + if (j->getConsensusStrand() == Strand::NEGATIVE) { + donors.push_back(right); + acceptors.push_back(left); + } + else { + donors.push_back(left); + acceptors.push_back(right); + } + } + + donorFModel.train(donors, 5); + acceptorFModel.train(acceptors, 5); } Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { @@ -148,6 +180,9 @@ Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { uint32_t row = 0; for (const auto& j : x) { + SplicingScores ss = j->calcSplicingScores(gmap, donorTModel, donorFModel, acceptorTModel, acceptorFModel, + donorPWModel, acceptorPWModel); + d[0 * x.size() + row] = j->isGenuine(); // //d[0 * x.size() + row] = j->getNbJunctionAlignments(); 
@@ -163,11 +198,13 @@ Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { d[7 * x.size() + row] = j->getMeanMismatches(); d[8 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score d[9 * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); - d[10 * x.size() + row] = isPWModelEmpty() ? 0.0 : j->calcPositionWeightScore(gmap, donorPWModel, acceptorPWModel); + d[10 * x.size() + row] = isPWModelEmpty() ? 0.0 : ss.positionWeighting; + d[11 * x.size() + row] = isPWModelEmpty() ? 0.0 : ss.splicingSignal; + //Junction overhang values at each position are first converted into deviation from expected distributions for(size_t i = JO_OFFSET; i <= JO_NAMES.size(); i++) { - d[(i-JO_OFFSET + 11) * x.size() + row] = j->getJunctionOverhangLogDeviation(i-1); + d[(i-JO_OFFSET + 12) * x.size() + row] = j->getJunctionOverhangLogDeviation(i-1); } row++; diff --git a/src/junction_filter.cc b/src/junction_filter.cc index a25f045e..e0848f6d 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -266,18 +266,6 @@ void portcullis::JunctionFilter::filter() { missedPos.saveAll(output.string() + ".selftrain.initialset.missedpos", "portcullis_missed_isp"); } - cout << "Training coding potential markov models on positive set ..."; - cout.flush(); - mf.trainCodingPotentialModel(pos); - cout << " done." << endl; - cout << "Exon model contains " << mf.exonModel.size() << " 5-mers." << endl; - cout << "Intron model contains " << mf.intronModel.size() << " 5-mers." << endl << endl; - - cout << "Training position weight markov models on positive set ..."; - cout.flush(); - mf.trainPositionWeightModel(pos); - cout << " done." << endl; - passJuncs.clear(); failJuncs.clear(); resultMap.clear(); @@ -342,6 +330,14 @@ void portcullis::JunctionFilter::filter() { cout << "Initial training set consists of " << pos.size() << " positive and " << neg.size() << " negative junctions." 
<< endl << endl; + cout << "Training markov models ..."; + cout.flush(); + mf.trainCodingPotentialModel(pos); + mf.trainSplicingModels(pos, neg); + cout << " done." << endl << endl; + + + // Build the training set by combining the positive and negative sets JunctionList training; training.reserve(pos.size() + neg.size()); @@ -351,7 +347,8 @@ void portcullis::JunctionFilter::filter() { JunctionSystem trainingSystem(training); trainingSystem.sort(); - cout << "Training a random forest model using the initial positive and negative datasets" << endl; + cout << "Training a random forest model" << endl + << "------------------------------" << endl << endl; shared_ptr forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); forest->saveToFile(); @@ -366,7 +363,8 @@ void portcullis::JunctionFilter::filter() { // Do ML based filtering if requested if(!modelFile.empty() && exists(modelFile)){ - cout << "Predicting valid junctions using random forest model" << endl << endl; + cout << "Predicting valid junctions using random forest model" << endl + << "----------------------------------------------------" << endl << endl; JunctionList passJuncs; JunctionList failJuncs; @@ -577,17 +575,11 @@ void portcullis::JunctionFilter::doRuleBasedFiltering(const path& ruleFile, cons void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, ModelFeatures& mf) { - if (verbose) { - cerr << endl << "Preparing junction metrics into matrix" << endl; - } + cout << "Creating feature vector" << endl; Data* testingData = mf.juncs2FeatureVectors(all); - // Load forest from disk - - if (verbose) { - cerr << "Initialising random forest" << endl; - } + cout << "Initialising random forest" << endl; shared_ptr f = nullptr; if (train) { @@ -621,30 +613,23 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction f->setVerboseOut(&cerr); - if 
(verbose) { - cerr << "Loading forest model from disk" << endl; - } - // Load trees from saved model f->loadFromFile(modelFile.string()); - if (verbose) { - cerr << "Making predictions" << endl; - } + cout << "Making predictions" << endl; f->run(verbose); - if (verbose) { - cerr << "Separating original junction data into pass and fail categories." << endl; - } - path scorepath = output.string() + ".scores"; ofstream scoreStream(scorepath.c_str()); - scoreStream << "Score\t" << Intron::locationOutputHeader() << "\t" << testingData->getHeader() << endl; + scoreStream << "Score\t" << Intron::locationOutputHeader() << "\tStrand\tSS\t" << testingData->getHeader() << endl; for(size_t i = 0; i < all.size(); i++) { - scoreStream << f->getPredictions()[i][0] << "\t" << *(all[i]->getIntron()) << "\t" << testingData->getRow(i) << endl; + scoreStream << f->getPredictions()[i][0] << "\t" << *(all[i]->getIntron()) + << "\t" << strandToChar(all[i]->getConsensusStrand()) + << "\t" << cssToChar(all[i]->getSpliceSiteType()) + << "\t" << testingData->getRow(i) << endl; if (f->getPredictions()[i][0] >= this->threshold) { pass.push_back(all[i]); } From 1853cc2e9f8a3745773a71e701f65c5387b1d5a3 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Thu, 28 Apr 2016 18:10:48 +0100 Subject: [PATCH 11/15] Tidying up filtering code and trying to tweak performance. 
--- Makefile.am | 10 +- configure.ac | 1 + data/selftrain_initial_neg.layer1.json | 93 ++++ data/selftrain_initial_neg.layer2.json | 93 ++++ data/selftrain_initial_neg.layer3.json | 93 ++++ data/selftrain_initial_neg.layer4.json | 93 ++++ data/selftrain_initial_neg.layer5.json | 93 ++++ ...json => selftrain_initial_neg.layer6.json} | 20 +- ...json => selftrain_initial_pos.layer1.json} | 18 +- data/selftrain_initial_pos.layer2.json | 69 +++ lib/include/portcullis/model_features.hpp | 66 ++- lib/src/junction.cc | 2 +- lib/src/model_features.cc | 77 ++-- src/junction_filter.cc | 397 ++++++++++++------ src/junction_filter.hpp | 20 +- 15 files changed, 961 insertions(+), 184 deletions(-) create mode 100644 data/selftrain_initial_neg.layer1.json create mode 100644 data/selftrain_initial_neg.layer2.json create mode 100644 data/selftrain_initial_neg.layer3.json create mode 100644 data/selftrain_initial_neg.layer4.json create mode 100644 data/selftrain_initial_neg.layer5.json rename data/{selftrain_initial_neg.json => selftrain_initial_neg.layer6.json} (77%) rename data/{selftrain_initial_pos.json => selftrain_initial_pos.layer1.json} (79%) create mode 100644 data/selftrain_initial_pos.layer2.json diff --git a/Makefile.am b/Makefile.am index e78793b8..0d39c81c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -56,8 +56,14 @@ dist_bin_SCRIPTS = scripts/analyse_alignments.py \ configdir = $(datadir)/portcullis dist_config_DATA = \ data/default_filter.json \ - data/selftrain_initial_neg.json \ - data/selftrain_initial_pos.json + data/selftrain_initial_neg.layer1.json \ + data/selftrain_initial_neg.layer2.json \ + data/selftrain_initial_neg.layer3.json \ + data/selftrain_initial_neg.layer4.json \ + data/selftrain_initial_neg.layer5.json \ + data/selftrain_initial_neg.layer6.json \ + data/selftrain_initial_pos.layer1.json \ + data/selftrain_initial_pos.layer2.json # SRC DIRS make_dirs=deps/htslib-1.3 deps/ranger-0.3.8 lib src tests diff --git a/configure.ac b/configure.ac index 
346afb02..6996e2cd 100644 --- a/configure.ac +++ b/configure.ac @@ -143,6 +143,7 @@ AX_BOOST_CHRONO AX_BOOST_PROGRAM_OPTIONS AX_BOOST_TIMER + # Combine BOOST variables BOOST_LIBS="${BOOST_TIMER_STATIC_LIB} ${BOOST_CHRONO_STATIC_LIB} ${BOOST_FILESYSTEM_STATIC_LIB} ${BOOST_PROGRAM_OPTIONS_STATIC_LIB} ${BOOST_SYSTEM_STATIC_LIB}" AC_SUBST([BOOST_LIBS]) diff --git a/data/selftrain_initial_neg.layer1.json b/data/selftrain_initial_neg.layer1.json new file mode 100644 index 00000000..9b1a5844 --- /dev/null +++ b/data/selftrain_initial_neg.layer1.json @@ -0,0 +1,93 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + "M3-nb_dist_aln": { + "operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 7 + }, + "M12-maxmmes.2": { + "operator": "lt", + "value": 10 + }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + "Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + "M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + }, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 + } + }, + "expression": "( M20-nb_usrs & M21-nb_msrs & M12-maxmmes.2 & M22-rel2raw.2 )" +} diff 
--git a/data/selftrain_initial_neg.layer2.json b/data/selftrain_initial_neg.layer2.json new file mode 100644 index 00000000..e9a9fc33 --- /dev/null +++ b/data/selftrain_initial_neg.layer2.json @@ -0,0 +1,93 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + "M3-nb_dist_aln": { + "operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 7 + }, + "M12-maxmmes.2": { + "operator": "lt", + "value": 10 + }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + "Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + "M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + }, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 + } + }, + "expression": "( M20-nb_usrs.2 & M1-canonical_ss & M12-maxmmes.2 )" +} diff --git a/data/selftrain_initial_neg.layer3.json b/data/selftrain_initial_neg.layer3.json new file mode 100644 index 00000000..ec5cbc5d --- /dev/null +++ b/data/selftrain_initial_neg.layer3.json @@ -0,0 +1,93 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + 
"M3-nb_dist_aln": { + "operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 7 + }, + "M12-maxmmes.2": { + "operator": "lt", + "value": 10 + }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + "Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + "M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + }, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 + } + }, + "expression": "( Suspect & M11-entropy.2 )" +} diff --git a/data/selftrain_initial_neg.layer4.json b/data/selftrain_initial_neg.layer4.json new file mode 100644 index 00000000..44890918 --- /dev/null +++ b/data/selftrain_initial_neg.layer4.json @@ -0,0 +1,93 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + "M3-nb_dist_aln": { + "operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 7 + }, + "M12-maxmmes.2": { + 
"operator": "lt", + "value": 10 + }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + "Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + "M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + }, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 + } + }, + "expression": "( M12-maxmmes & M22-rel2raw )" +} diff --git a/data/selftrain_initial_neg.layer5.json b/data/selftrain_initial_neg.layer5.json new file mode 100644 index 00000000..945bd289 --- /dev/null +++ b/data/selftrain_initial_neg.layer5.json @@ -0,0 +1,93 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + "M3-nb_dist_aln": { + "operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 7 + }, + "M12-maxmmes.2": { + "operator": "lt", + "value": 10 + }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + "Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + 
"M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + }, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 + } + }, + "expression": "( M1-canonical_ss & M22-rel2raw.2 )" +} diff --git a/data/selftrain_initial_neg.json b/data/selftrain_initial_neg.layer6.json similarity index 77% rename from data/selftrain_initial_neg.json rename to data/selftrain_initial_neg.layer6.json index 4885e954..0265d1eb 100644 --- a/data/selftrain_initial_neg.json +++ b/data/selftrain_initial_neg.layer6.json @@ -24,14 +24,22 @@ "operator": "gt", "value": 2.0 }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, "M12-maxmmes": { "operator": "lt", "value": 7 }, "M12-maxmmes.2": { - "operator": "lte", + "operator": "lt", "value": 10 }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, "M8-max_min_anc": { "operator": "lt", "value": 16 @@ -71,7 +79,15 @@ "M21-nb_msrs": { "operator": "gte", "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 } }, - "expression": "( M20-nb_usrs & M21-nb_msrs & M12-maxmmes.2 ) | ( M20-nb_usrs.2 & M1-canonical_ss & M12-maxmmes.2 ) | ( Suspect & M11-entropy.2 ) | M12-maxmmes | ( M1-canonical_ss & M4-nb_rel_aln )" + "expression": "( PFP )" } diff --git a/data/selftrain_initial_pos.json b/data/selftrain_initial_pos.layer1.json similarity index 79% rename from data/selftrain_initial_pos.json rename to data/selftrain_initial_pos.layer1.json index 83539e6f..d351e911 100644 --- a/data/selftrain_initial_pos.json +++ b/data/selftrain_initial_pos.layer1.json @@ -4,6 +4,10 @@ "operator": "in", "value": ["C", "S"] }, + "M1-canonical_ss.2": { + "operator": "in", + 
"value": ["N"] + }, "M4-nb_rel_aln": { "operator": "gte", "value": 5 @@ -16,10 +20,18 @@ "operator": "gte", "value": 20 }, + "M12-maxmmes.2": { + "operator": "gte", + "value": 10 + }, "M11-entropy": { "operator": "gt", "value": 5.0 }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, "M13-hamming5p": { "operator": "gte", "value": 5 @@ -42,7 +54,7 @@ }, "M19-mean_mismatches.2": { "operator": "lt", - "value": 1.0 + "value": 0.2 }, "M20-nb_usrs": { "operator": "gte", @@ -50,8 +62,8 @@ }, "M22-rel2raw": { "operator": "gte", - "value": 0.1 + "value": 0.5 } }, - "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M22-rel2raw & ( ( M19-mean_mismatches & M20-nb_usrs ) | ( M12-maxmmes & M11-entropy & M19-mean_mismatches.2 ) )" + "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M22-rel2raw & M12-maxmmes.2" } diff --git a/data/selftrain_initial_pos.layer2.json b/data/selftrain_initial_pos.layer2.json new file mode 100644 index 00000000..d91483ac --- /dev/null +++ b/data/selftrain_initial_pos.layer2.json @@ -0,0 +1,69 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["C", "S"] + }, + "M1-canonical_ss.2": { + "operator": "in", + "value": ["N"] + }, + "M4-nb_rel_aln": { + "operator": "gte", + "value": 5 + }, + "M4-nb_rel_aln.2": { + "operator": "gte", + "value": 1 + }, + "M12-maxmmes": { + "operator": "gte", + "value": 20 + }, + "M12-maxmmes.2": { + "operator": "gte", + "value": 10 + }, + "M11-entropy": { + "operator": "gt", + "value": 5.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M13-hamming5p": { + "operator": "gte", + "value": 5 + }, + "M13-hamming5p.2": { + "operator": "gte", + "value": 7 + }, + "M14-hamming3p": { + "operator": "gte", + "value": 5 + }, + "M14-hamming3p.2": { + "operator": "gte", + "value": 7 + }, + "M19-mean_mismatches": { + "operator": "eq", + "value": 0 + }, + "M19-mean_mismatches.2": { + "operator": "lt", + "value": 0.2 + }, + "M20-nb_usrs": { + "operator": "gte", 
+ "value": 5 + }, + "M22-rel2raw": { + "operator": "gte", + "value": 0.5 + } + }, + "expression": "( ( M1-canonical_ss & M19-mean_mismatches & M20-nb_usrs ) | ( M1-canonical_ss & M12-maxmmes & M11-entropy & M19-mean_mismatches.2 ) | ( M1-canonical_ss.2 & M19-mean_mismatches & M12-maxmmes & M11-entropy ) )" +} diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/model_features.hpp index 7d35f3ca..cdc76d05 100644 --- a/lib/include/portcullis/model_features.hpp +++ b/lib/include/portcullis/model_features.hpp @@ -43,24 +43,29 @@ typedef shared_ptr ForestPtr; // List of variable names const vector VAR_NAMES = { "Genuine", - //"M2-nb-reads", - "M3-nb_dist_aln", - "nb_rel_aln", - //"M8-max_min_anc", - //"M9-dif_anc", - //"M10-dist_anc", - "entropy", - "maxmmes", - "min_hamming_score", - "rel2raw_ratio", - "mean_mismatches", - "IntronScore", - "CodingPotential", - "PWS", - "SS" + "rna_usrs", + "rna_dist", + "rna_rel", + "rna_entropy", + "rna_rel2raw", + "rna_maxminanc", + "rna_maxmmes", + "rna_missmatch", + "rna_intron", + "dna_minhamm", + "dna_coding", + "dna_pws", + "dna_ss" +}; + +struct Feature { + string name; + bool active; }; class ModelFeatures { +private: + size_t fi; public: uint32_t L95; KmerMarkovModel exonModel; @@ -72,10 +77,25 @@ class ModelFeatures { PosMarkovModel donorPWModel; PosMarkovModel acceptorPWModel; GenomeMapper gmap; + vector features; ModelFeatures() : L95(0) { + fi = 1; + features.clear(); + for(size_t i = 0; i < VAR_NAMES.size(); i++) { + Feature f; + f.name = VAR_NAMES[i]; + f.active = true; + features.push_back(f); + } + for(size_t i = 0; i < JO_NAMES.size(); i++) { + Feature f; + f.name = JO_NAMES[i]; + f.active = true; + features.push_back(f); + } } - + bool isCodingPotentialModelEmpty() { return exonModel.size() == 0 || intronModel.size() == 0; } @@ -96,5 +116,19 @@ class ModelFeatures { ForestPtr trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, 
bool verbose); + + void resetActiveFeatureIndex() { + fi = 0; + } + int16_t getNextActiveFeatureIndex() { + for(int16_t i = fi+1; i < features.size(); i++) { + if (features[i].active) { + fi = i; + return i; + } + } + return -1; + } + }; } \ No newline at end of file diff --git a/lib/src/junction.cc b/lib/src/junction.cc index 62f1db20..d791b272 100644 --- a/lib/src/junction.cc +++ b/lib/src/junction.cc @@ -1257,7 +1257,7 @@ double portcullis::Junction::calcCodingPotential(GenomeMapper& gmap, KmerMarkovM double score =( exon.getScore(left_exon) - intron.getScore(left_exon) ) + ( intron.getScore(left_intron) - exon.getScore(left_intron) ) + ( intron.getScore(right_intron) - exon.getScore(right_intron) ) - + ( exon.getScore(right_exon) + intron.getScore(right_exon) ); + + ( exon.getScore(right_exon) - intron.getScore(right_exon) ); return score; } diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 11f8af77..f5cf9a57 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -168,43 +168,72 @@ void portcullis::ModelFeatures::trainSplicingModels(const JunctionList& pass, co } Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { - - const size_t JO_OFFSET = 10; + vector headers; - headers.reserve( VAR_NAMES.size() + JO_NAMES.size() ); - headers.insert( headers.end(), VAR_NAMES.begin(), VAR_NAMES.end() ); - headers.insert( headers.end(), JO_NAMES.begin()+JO_OFFSET-1, JO_NAMES.end() ); + for(auto& f : features) { + if (f.active) { + headers.push_back(f.name); + } + } // Convert junction list info to double* double* d = new double[x.size() * headers.size()]; uint32_t row = 0; - for (const auto& j : x) { + for (const auto& j : x) { SplicingScores ss = j->calcSplicingScores(gmap, donorTModel, donorFModel, acceptorTModel, acceptorFModel, donorPWModel, acceptorPWModel); d[0 * x.size() + row] = j->isGenuine(); - // - //d[0 * x.size() + row] = j->getNbJunctionAlignments(); - d[1 * x.size() + row] = 
j->getNbDistinctAlignments(); - d[2 * x.size() + row] = j->getNbReliableAlignments(); - //d[3 * x.size() + row] = j->getMaxMinAnchor(); - //d[4 * x.size() + row] = j->getDiffAnchor(); - //d[5 * x.size() + row] = j->getNbDistinctAnchors(); - d[3 * x.size() + row] = j->getEntropy(); - d[4 * x.size() + row] = j->getMaxMMES(); - d[5 * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); - d[6 * x.size() + row] = j->getReliable2RawRatio(); - d[7 * x.size() + row] = j->getMeanMismatches(); - d[8 * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); // Intron score - d[9 * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); - d[10 * x.size() + row] = isPWModelEmpty() ? 0.0 : ss.positionWeighting; - d[11 * x.size() + row] = isPWModelEmpty() ? 0.0 : ss.splicingSignal; + uint16_t i = 1; + + if (features[1].active) { + d[i++ * x.size() + row] = j->getNbUniquelySplicedReads(); + } + if (features[2].active) { + d[i++ * x.size() + row] = j->getNbDistinctAlignments(); + } + if (features[3].active) { + d[i++ * x.size() + row] = j->getNbReliableAlignments(); + } + if (features[4].active) { + d[i++ * x.size() + row] = j->getEntropy(); + } + if (features[5].active) { + d[i++ * x.size() + row] = j->getReliable2RawRatio(); + } + if (features[6].active) { + d[i++ * x.size() + row] = j->getMaxMinAnchor(); + } + if (features[7].active) { + d[i++ * x.size() + row] = j->getMaxMMES(); + } + if (features[8].active) { + d[i++ * x.size() + row] = j->getMeanMismatches(); + } + if (features[9].active) { + d[i++ * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); + } + if (features[10].active) { + d[i++ * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p());; + } + if (features[11].active) { + d[i++ * x.size() + row] = isCodingPotentialModelEmpty() ? 
0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); + } + if (features[12].active) { + d[i++ * x.size() + row] = isPWModelEmpty() ? 0.0 : ss.positionWeighting; + } + if (features[13].active) { + d[i++ * x.size() + row] = isPWModelEmpty() ? 0.0 : ss.splicingSignal; + } + //Junction overhang values at each position are first converted into deviation from expected distributions - for(size_t i = JO_OFFSET; i <= JO_NAMES.size(); i++) { - d[(i-JO_OFFSET + 12) * x.size() + row] = j->getJunctionOverhangLogDeviation(i-1); + for(size_t joi = 0; joi < JO_NAMES.size(); joi++) { + if (features[joi + 14].active) { + d[i++ * x.size() + row] = j->getJunctionOverhangLogDeviation(joi); + } } row++; diff --git a/src/junction_filter.cc b/src/junction_filter.cc index e0848f6d..e4994a78 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -207,126 +207,11 @@ void portcullis::JunctionFilter::filter() { // The initial positive and negative sets JunctionList pos, neg; - cout << "Self training mode activated. Collecting initial positive and negative sets from input." << endl; + cout << "Self training mode activated." 
<< endl << endl; - JunctionList passJuncs; - JunctionList failJuncs; - JuncResultMap resultMap; - - doRuleBasedFiltering(this->getIntitalPosRulesFile(), currentJuncs, passJuncs, failJuncs, "Creating initial positive set for training", resultMap); - - cout << "Saving initial positive set:" << endl; - JunctionSystem isp(passJuncs); - isp.saveAll(output.string() + ".selftrain.initialset.pos", "portcullis_isp"); - - // Analyse positive set to get L0.05 of intron size - cout << "Length of intron at 95th percentile of positive set (L95): " << mf.calcIntronThreshold(passJuncs) << endl << endl; - - const uint32_t poslongintronthreshold = mf.L95 * 2; - JunctionList pass2juncs; - for(auto& j : passJuncs) { - if (j->getIntronSize() <= poslongintronthreshold) { - pass2juncs.push_back(j); - } - else { - failJuncs.push_back(j); - } - } - cout << "Removed " << (passJuncs.size() - pass2juncs.size()) << " junctions from positive set with introns > L95 x 2 = " << poslongintronthreshold << endl << endl; - - - for(auto& j : pass2juncs) { - JunctionPtr copy = make_shared(*j); - copy->setGenuine(true); - pos.push_back(copy); - } - if (!genuineFile.empty()) { - shared_ptr p = calcPerformance(pass2juncs, failJuncs); - cout << "Performance of initial positive set (High PPV is important):" << endl; - cout << Performance::longHeader() << endl; - cout << p->toLongString() << endl << endl; - - JunctionSystem invalidPos; - for(auto& j : pass2juncs) { - if (!j->isGenuine()) { - invalidPos.addJunction(j); - } - } - JunctionSystem missedPos; - for(auto& j : failJuncs) { - if (j->isGenuine()) { - missedPos.addJunction(j); - } - } - - cout << "Saving invalid junctions in initial positive set to disk:" << endl; - invalidPos.saveAll(output.string() + ".selftrain.initialset.invalidpos", "portcullis_invalid_isp"); - - cout << "Saving missed positive junctions to disk:" << endl; - missedPos.saveAll(output.string() + ".selftrain.initialset.missedpos", "portcullis_missed_isp"); - } - - passJuncs.clear(); - 
failJuncs.clear(); - resultMap.clear(); - - doRuleBasedFiltering(this->getIntitalNegRulesFile(), currentJuncs, passJuncs, failJuncs, "Creating initial negative set for training", resultMap); - - const uint32_t neglongintronthreshold = mf.L95 * 6; - cout << endl << "Adding junctions to negative set with MaxMMES < 10 and excessively long intron size of positive set (> L95 x 6 = " << neglongintronthreshold << ")..."; - cout.flush(); - - uint32_t nblongintrons = 0; - JunctionList fail2Juncs; - for(auto& j : failJuncs) { - if (j->getIntronSize() > neglongintronthreshold && j->getMaxMMES() < 10 ) { - passJuncs.push_back(j); - nblongintrons++; - } - else { - fail2Juncs.push_back(j); - } - } - - cout << " done." << endl - << "Found an additional " << nblongintrons << " junctions with long introns." << endl << endl; - - cout << "Saving initial negative set:" << endl; - JunctionSystem isn(passJuncs); - isn.saveAll(output.string() + ".selftrain.initialset.neg", "portcullis_isn"); - - for(auto& j : passJuncs) { - JunctionPtr copy = make_shared(*j); - copy->setGenuine(false); - neg.push_back(copy); - } + createPositiveSet(currentJuncs, pos, mf); - if (!genuineFile.empty()) { - shared_ptr p = calcPerformance(passJuncs, fail2Juncs, true); - cout << "Performance of initial negative set (High NPV is important):" << endl; - cout << Performance::longHeader() << endl; - cout << p->toLongString() << endl << endl; - - JunctionSystem invalidNeg; - JunctionSystem missedNeg; - - for(auto& j : passJuncs) { - if (j->isGenuine()) { - invalidNeg.addJunction(j); - } - } - for(auto& j : fail2Juncs) { - if (!j->isGenuine()) { - missedNeg.addJunction(j); - } - } - - cout << "Saving genuine valid junctions in initial negative set to disk:" << endl; - invalidNeg.saveAll(output.string() + ".selftrain.initialset.invalidneg", "portcullis_invalid_isn"); - - cout << "Saving missed negative junctions to disk:" << endl; - missedNeg.saveAll(output.string() + ".selftrain.initialset.missedneg", 
"portcullis_missed_isn"); - } + createNegativeSet(mf.L95, currentJuncs, neg); cout << "Initial training set consists of " << pos.size() << " positive and " << neg.size() << " negative junctions." << endl << endl; @@ -347,9 +232,47 @@ void portcullis::JunctionFilter::filter() { JunctionSystem trainingSystem(training); trainingSystem.sort(); - cout << "Training a random forest model" << endl - << "------------------------------" << endl << endl; - shared_ptr forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); + cout << "Training Random Forest" << endl + << "----------------------" << endl << endl; + bool done = false; + shared_ptr forest = nullptr; + while(!done) { + + forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); + const vector importance = forest->getVariableImportance(); + mf.resetActiveFeatureIndex(); + uint16_t min_importance_idx = 10000; + double min_importance_val = 100000.0; + uint16_t min_feature_idx = 10000; + // This approach to feature selection (i.e. removing worst features by importance) + // doesn't appear to have much impact on final results and is computationally + // expensive. Removing for now. 
+ /*for(size_t i = 0; i < importance.size(); i++) { + double imp = importance[i]; + int16_t j = mf.getNextActiveFeatureIndex(); + if (imp < 0.1 && imp < min_importance_val) { + min_importance_idx = i; + min_importance_val = imp; + min_feature_idx = j; + } + }*/ + if (min_importance_idx != 10000) { + mf.features[min_feature_idx].active = false; + cout << "Deactivating feature: " << mf.features[min_feature_idx].name << " - " << min_importance_val << endl; + } + else { + done = true; + } + } + + const vector importance = forest->getVariableImportance(); + bool foundIrrelevant = false; + mf.resetActiveFeatureIndex(); + cout << "Active features remaining:" << endl; + for(auto& i : importance) { + int16_t j = mf.getNextActiveFeatureIndex(); + cout << mf.features[j].name << " - " << i << endl; + } forest->saveToFile(); forest->writeOutput(&cout); @@ -511,6 +434,177 @@ void portcullis::JunctionFilter::filter() { } } +void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, JunctionList& pos, ModelFeatures& mf) { + JuncResultMap resultMap; + + cout << "Creating initial positive set for training" << endl + << "------------------------------------------" << endl << endl; + + if (!genuineFile.empty()) { + cout << "Performance of each positive filter layer (Low FPs is important):" << endl; + cout << "LAYER\t" << Performance::longHeader() << endl; + } + JunctionList p1, p2, f1, f2; + RuleFilter::filter(this->getIntitalPosRulesFile(1), all, p1, f1, "Creating initial positive set for training", resultMap); + if (!genuineFile.empty()) { + cout << "1\t" << calcPerformance(p1, f1, true)->toLongString() << endl; + } + RuleFilter::filter(this->getIntitalPosRulesFile(2), p1, p2, f2, "Creating initial positive set for training", resultMap); + if (!genuineFile.empty()) { + cout << "2\t" << calcPerformance(p2, f2, true)->toLongString() << endl; + } + + const uint32_t L95 = mf.calcIntronThreshold(p2); + const uint32_t L95x2 = L95 * 2; + + JunctionList passJuncs, 
failJuncs; + for(auto& j : p2) { + if (j->getIntronSize() <= L95x2) { + passJuncs.push_back(j); + } + else { + failJuncs.push_back(j); + } + } + if (!genuineFile.empty()) { + cout << "L95x2\t" << calcPerformance(passJuncs, failJuncs, true)->toLongString() << endl; + } + + // Analyse positive set to get L0.05 of intron size + cout << endl << "Length of intron at 95th percentile of positive set (L95): " << mf.calcIntronThreshold(passJuncs) << endl << endl; + + cout << "Saving initial positive set:" << endl; + JunctionSystem isp(passJuncs); + isp.saveAll(output.string() + ".selftrain.initialset.pos", "portcullis_isp"); + + + for(auto& j : passJuncs) { + JunctionPtr copy = make_shared(*j); + copy->setGenuine(true); + pos.push_back(copy); + } + if (!genuineFile.empty()) { + + JunctionSystem invalidPos; + for(auto& j : passJuncs) { + if (!j->isGenuine()) { + invalidPos.addJunction(j); + } + } + JunctionSystem missedPos; + for(auto& j : failJuncs) { + if (j->isGenuine()) { + missedPos.addJunction(j); + } + } + + cout << "Saving invalid junctions in initial positive set to disk:" << endl; + invalidPos.saveAll(output.string() + ".selftrain.initialset.invalidpos", "portcullis_invalid_isp"); + + cout << "Saving missed positive junctions to disk:" << endl; + missedPos.saveAll(output.string() + ".selftrain.initialset.missedpos", "portcullis_missed_isp"); + } +} + +void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg) { + + JuncResultMap resultMap; + + cout << "Creating initial negative set for training" << endl + << "------------------------------------------" << endl << endl; + + if (!genuineFile.empty()) { + cout << "Performance of each negative filter layer (Low FNs is important):" << endl; + cout << "LAYER\t" << Performance::longHeader() << endl; + } + JunctionList p1, p2, p3, p4, p5, p6, p7, f1, f2, f3, f4, f5, f6, f7; + RuleFilter::filter(this->getIntitalNegRulesFile(1), all, p1, f1, "Creating initial negative set for 
training", resultMap); + if (!genuineFile.empty()) { + cout << "1\t" << calcPerformance(p1, f1, true)->toLongString() << endl; + } + RuleFilter::filter(this->getIntitalNegRulesFile(2), f1, p2, f2, "Creating initial negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << "2\t" << calcPerformance(p2, f2, true)->toLongString() << endl; + } + RuleFilter::filter(this->getIntitalNegRulesFile(3), f2, p3, f3, "Creating initial negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << "3\t" << calcPerformance(p3, f3, true)->toLongString() << endl; + } + RuleFilter::filter(this->getIntitalNegRulesFile(4), f3, p4, f4, "Creating initial negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << "4\t" << calcPerformance(p4, f4, true)->toLongString() << endl; + } + RuleFilter::filter(this->getIntitalNegRulesFile(5), f4, p5, f5, "Creating initial negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << "5\t" << calcPerformance(p5, f5, true)->toLongString() << endl; + } + RuleFilter::filter(this->getIntitalNegRulesFile(6), f5, p6, f6, "Creating initial negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << "6\t" << calcPerformance(p6, f6, true)->toLongString() << endl; + } + + JunctionList passJuncs, failJuncs; + const uint32_t L95x5 = L95 * 5; + for(auto& j : f6) { + if (j->getIntronSize() > L95x5 && j->getMaxMMES() < 10 ) { + p7.push_back(j); + } + else { + failJuncs.push_back(j); + } + } + if (!genuineFile.empty()) { + cout << "L95x5\t" << calcPerformance(p7, failJuncs, true)->toLongString() << endl; + } + + cout << endl << "Concatenating TNs from each layer to create negative set" << endl << endl; + + passJuncs.insert(passJuncs.end(), p1.begin(), p1.end()); + passJuncs.insert(passJuncs.end(), p2.begin(), p2.end()); + passJuncs.insert(passJuncs.end(), p3.begin(), p3.end()); + passJuncs.insert(passJuncs.end(), p4.begin(), p4.end()); + 
passJuncs.insert(passJuncs.end(), p5.begin(), p5.end()); + passJuncs.insert(passJuncs.end(), p6.begin(), p6.end()); + passJuncs.insert(passJuncs.end(), p7.begin(), p7.end()); + + if (!genuineFile.empty()) { + cout << "Final\t" << calcPerformance(passJuncs, failJuncs, true)->toLongString() << endl; + } + + cout << endl << "Saving initial negative set:" << endl; + JunctionSystem isn(passJuncs); + isn.saveAll(output.string() + ".selftrain.initialset.neg", "portcullis_isn"); + + for(auto& j : passJuncs) { + JunctionPtr copy = make_shared(*j); + copy->setGenuine(false); + neg.push_back(copy); + } + + if (!genuineFile.empty()) { + + JunctionSystem invalidNeg; + JunctionSystem missedNeg; + + for(auto& j : passJuncs) { + if (j->isGenuine()) { + invalidNeg.addJunction(j); + } + } + for(auto& j : failJuncs) { + if (!j->isGenuine()) { + missedNeg.addJunction(j); + } + } + + cout << "Saving genuine valid junctions in initial negative set to disk:" << endl; + invalidNeg.saveAll(output.string() + ".selftrain.initialset.invalidneg", "portcullis_invalid_isn"); + + cout << "Saving missed negative junctions to disk:" << endl; + missedNeg.saveAll(output.string() + ".selftrain.initialset.missedneg", "portcullis_missed_isn"); + } +} + void portcullis::JunctionFilter::printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix) { // Output stats @@ -526,7 +620,7 @@ void portcullis::JunctionFilter::printFilteringResults(const JunctionList& in, c shared_ptr p = calcPerformance(pass, fail); cout << Performance::longHeader() << endl; cout << p->toLongString() << endl << endl; - } + } } shared_ptr portcullis::JunctionFilter::calcPerformance(const JunctionList& pass, const JunctionList& fail, bool invert) { @@ -625,27 +719,72 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction scoreStream << "Score\t" << Intron::locationOutputHeader() << "\tStrand\tSS\t" << testingData->getHeader() << endl; for(size_t i 
= 0; i < all.size(); i++) { - + scoreStream << f->getPredictions()[i][0] << "\t" << *(all[i]->getIntron()) << "\t" << strandToChar(all[i]->getConsensusStrand()) << "\t" << cssToChar(all[i]->getSpliceSiteType()) << "\t" << testingData->getRow(i) << endl; - if (f->getPredictions()[i][0] >= this->threshold) { - pass.push_back(all[i]); - } - else { - fail.push_back(all[i]); - } } scoreStream.close(); + + if (!genuineFile.empty() && exists(genuineFile)) { + vector thresholds; + for(double i = 0.05; i <= 0.5; i += 0.01) { + thresholds.push_back(i); + } + double max_mcc = 0.0; + double max_f1 = 0.0; + double best_t_mcc = 0.0; + double best_t_f1 = 0.0; + + cout << "Threshold\t" << Performance::longHeader() << endl; + for(auto& t : thresholds) { + JunctionList pjl; + JunctionList fjl; + categorise(f, all, pjl, fjl, t); + shared_ptr perf = calcPerformance(pjl, fjl); + double mcc = perf->getMCC(); + double f1 = perf->getF1Score(); + cout << t << "\t" << perf->toLongString() << endl; + if (mcc > max_mcc) { + max_mcc = mcc; + best_t_mcc = t; + } + if (f1 > max_f1) { + max_f1 = f1; + best_t_f1 = t; + } + } + + cout << "The best F1 score of " << max_f1 << " is achieved with threshold set at " << best_t_f1 << endl; + cout << "The best MCC score of " << max_mcc << " is achieved with threshold set at " << best_t_mcc << endl; + + categorise(f, all, pass, fail, best_t_mcc); + } + else { + categorise(f, all, pass, fail, threshold); + } + cout << "Saved junction scores to: " << scorepath << endl; delete testingData; } +void portcullis::JunctionFilter::categorise(shared_ptr f, const JunctionList& all, JunctionList& pass, JunctionList& fail, double t) { + + for(size_t i = 0; i < all.size(); i++) { + if (f->getPredictions()[i][0] >= t) { + pass.push_back(all[i]); + } + else { + fail.push_back(all[i]); + } + } +} + int portcullis::JunctionFilter::main(int argc, char *argv[]) { path junctionFile; diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index ce568a66..e0ebc948 100644 
--- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -72,11 +72,11 @@ const string DEFAULT_FILTER_OUTPUT = "portcullis_filter/portcullis"; const string DEFAULT_FILTER_SOURCE = "portcullis"; const string DEFAULT_FILTER_RULE_FILE = "default_filter.json"; const string DEFAULT_FILTER_MODEL_FILE = "default_model.forest"; -const string ST_IPOS_RULES_FILE = "selftrain_initial_pos.json"; -const string ST_INEG_RULES_FILE = "selftrain_initial_neg.json"; +const string ST_IPOS_RULES_FILE = "selftrain_initial_pos"; +const string ST_INEG_RULES_FILE = "selftrain_initial_neg"; const uint16_t DEFAULT_FILTER_THREADS = 1; const uint16_t DEFAULT_SELFTRAIN_TREES = 100; -const double DEFAULT_FILTER_THRESHOLD = 0.1; +const double DEFAULT_FILTER_THRESHOLD = 0.25; class JunctionFilter { @@ -273,12 +273,12 @@ class JunctionFilter { this->maxLength = maxLength; } - path getIntitalPosRulesFile() const { - return path(dataDir.string() + "/" + ST_IPOS_RULES_FILE); + path getIntitalPosRulesFile(uint16_t index) const { + return path(dataDir.string() + "/" + ST_IPOS_RULES_FILE + ".layer" + std::to_string(index) + ".json"); } - path getIntitalNegRulesFile() const { - return path(dataDir.string() + "/" + ST_INEG_RULES_FILE); + path getIntitalNegRulesFile(uint16_t index) const { + return path(dataDir.string() + "/" + ST_INEG_RULES_FILE + ".layer" + std::to_string(index) + ".json"); } void filter(); @@ -296,7 +296,13 @@ class JunctionFilter { void printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix); void doRuleBasedFiltering(const path& ruleFile, const JunctionList& all, JunctionList& pass, JunctionList& fail, const string& prefix, JuncResultMap& resultMap); + + void categorise(shared_ptr f, const JunctionList& all, JunctionList& pass, JunctionList& fail, double t); + void createPositiveSet(const JunctionList& all, JunctionList& pos, ModelFeatures& mf); + + void createNegativeSet(uint32_t L95, const JunctionList& all, 
JunctionList& neg); + public: static string helpMessage() { From d9ffcb223c9cfbe77bb822a6345ab082ed2252d9 Mon Sep 17 00:00:00 2001 From: Dan Date: Thu, 28 Apr 2016 22:49:34 +0100 Subject: [PATCH 12/15] Tidy up output from filtering tool. --- lib/src/model_features.cc | 2 +- src/junction_filter.cc | 42 +++++++++++++++++++++++++++++---------- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index f5cf9a57..49f1952d 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -216,7 +216,7 @@ Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { d[i++ * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); } if (features[10].active) { - d[i++ * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p());; + d[i++ * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); } if (features[11].active) { d[i++ * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); diff --git a/src/junction_filter.cc b/src/junction_filter.cc index e4994a78..c66d52d2 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -438,21 +438,30 @@ void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, Junc JuncResultMap resultMap; cout << "Creating initial positive set for training" << endl - << "------------------------------------------" << endl << endl; + << "------------------------------------------" << endl << endl + << "Applying a set of rule-based filters in " << dataDir.string() << " to create initial positive set." 
<< endl << endl; if (!genuineFile.empty()) { - cout << "Performance of each positive filter layer (Low FPs is important):" << endl; - cout << "LAYER\t" << Performance::longHeader() << endl; + cout << "Performance of each positive filter layer (Low FPs is important):" << endl; } + cout << "LAYER\t"; + if (!genuineFile.empty()) { + cout << Performance::longHeader(); + } + JunctionList p1, p2, f1, f2; + + cout << endl << "1\t"; RuleFilter::filter(this->getIntitalPosRulesFile(1), all, p1, f1, "Creating initial positive set for training", resultMap); if (!genuineFile.empty()) { - cout << "1\t" << calcPerformance(p1, f1, true)->toLongString() << endl; + cout << calcPerformance(p1, f1, true)->toLongString(); } + cout << endl << "2\t"; RuleFilter::filter(this->getIntitalPosRulesFile(2), p1, p2, f2, "Creating initial positive set for training", resultMap); if (!genuineFile.empty()) { - cout << "2\t" << calcPerformance(p2, f2, true)->toLongString() << endl; + cout << calcPerformance(p2, f2, true)->toLongString(); } + cout << endl; const uint32_t L95 = mf.calcIntronThreshold(p2); const uint32_t L95x2 = L95 * 2; @@ -511,37 +520,50 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL JuncResultMap resultMap; cout << "Creating initial negative set for training" << endl - << "------------------------------------------" << endl << endl; + << "------------------------------------------" << endl << endl + << "Applying a set of rule-based filters in " << dataDir.string() << " to create initial negative set." 
<< endl << endl; if (!genuineFile.empty()) { - cout << "Performance of each negative filter layer (Low FNs is important):" << endl; - cout << "LAYER\t" << Performance::longHeader() << endl; + cout << "Performance of each negative filter layer (Low FNs is important):" << endl; + } + cout << "LAYER\t"; + if (!genuineFile.empty()) { + cout << Performance::longHeader(); } + JunctionList p1, p2, p3, p4, p5, p6, p7, f1, f2, f3, f4, f5, f6, f7; + + cout << endl << "1\t"; RuleFilter::filter(this->getIntitalNegRulesFile(1), all, p1, f1, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << "1\t" << calcPerformance(p1, f1, true)->toLongString() << endl; + cout << calcPerformance(p1, f1, true)->toLongString(); } + cout << endl << "2\t"; RuleFilter::filter(this->getIntitalNegRulesFile(2), f1, p2, f2, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { cout << "2\t" << calcPerformance(p2, f2, true)->toLongString() << endl; } + cout << endl << "3\t"; RuleFilter::filter(this->getIntitalNegRulesFile(3), f2, p3, f3, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { cout << "3\t" << calcPerformance(p3, f3, true)->toLongString() << endl; } + cout << endl << "4\t"; RuleFilter::filter(this->getIntitalNegRulesFile(4), f3, p4, f4, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { cout << "4\t" << calcPerformance(p4, f4, true)->toLongString() << endl; } + cout << endl << "5\t"; RuleFilter::filter(this->getIntitalNegRulesFile(5), f4, p5, f5, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { cout << "5\t" << calcPerformance(p5, f5, true)->toLongString() << endl; } + cout << endl << "6\t"; RuleFilter::filter(this->getIntitalNegRulesFile(6), f5, p6, f6, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { cout << "6\t" << calcPerformance(p6, f6, true)->toLongString() << endl; } + 
cout << endl; JunctionList passJuncs, failJuncs; const uint32_t L95x5 = L95 * 5; @@ -557,7 +579,7 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL cout << "L95x5\t" << calcPerformance(p7, failJuncs, true)->toLongString() << endl; } - cout << endl << "Concatenating TNs from each layer to create negative set" << endl << endl; + cout << endl << "Concatenating negatives from each layer to create negative set" << endl << endl; passJuncs.insert(passJuncs.end(), p1.begin(), p1.end()); passJuncs.insert(passJuncs.end(), p2.begin(), p2.end()); From 999eafa71c16d6ea087ea8db4c790414ba6823ba Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Fri, 29 Apr 2016 11:14:08 +0100 Subject: [PATCH 13/15] Determined an optimal set of features for filtering. --- lib/include/portcullis/model_features.hpp | 2 +- lib/src/model_features.cc | 12 ++--- src/junction_filter.cc | 61 +++++++++++++++++------ src/junction_filter.hpp | 2 +- 4 files changed, 55 insertions(+), 22 deletions(-) diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/model_features.hpp index cdc76d05..1820cc01 100644 --- a/lib/include/portcullis/model_features.hpp +++ b/lib/include/portcullis/model_features.hpp @@ -115,7 +115,7 @@ class ModelFeatures { Data* juncs2FeatureVectors(const JunctionList& x); ForestPtr trainInstance(const JunctionList& x, string outputPrefix, - uint16_t trees, uint16_t threads, bool regressionMode, bool verbose); + uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose); void resetActiveFeatureIndex() { fi = 0; diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 49f1952d..902c7813 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -25,7 +25,7 @@ using std::ofstream; using std::make_shared; #include -#include +#include #include #include @@ -247,7 +247,7 @@ Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { portcullis::ForestPtr 
portcullis::ModelFeatures::trainInstance(const JunctionList& x, - string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose) { + string outputPrefix, uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose) { cout << "Creating feature vector" << endl; Data* trainingData = juncs2FeatureVectors(x); @@ -266,8 +266,8 @@ portcullis::ForestPtr portcullis::ModelFeatures::trainInstance(const JunctionLis cout << "Initialising random forest" << endl; ForestPtr f = nullptr; - if (regressionMode) { - f = make_shared(); + if (probabilityMode) { + f = make_shared(); } else { f = make_shared(); @@ -281,11 +281,11 @@ portcullis::ForestPtr portcullis::ModelFeatures::trainInstance(const JunctionLis trainingData, // Data object 0, // M Try (0 == use default) outputPrefix, // Output prefix - trees, // Number of trees + 250, //trees, // Number of trees 1236456789, // Use fixed seed to avoid non-deterministic behaviour as much as possible threads, // Number of threads IMP_GINI, // Importance measure - regressionMode ? DEFAULT_MIN_NODE_SIZE_REGRESSION : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size + probabilityMode ? 
DEFAULT_MIN_NODE_SIZE_PROBABILITY : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size "", // Status var name false, // Prediction mode true, // Replace diff --git a/src/junction_filter.cc b/src/junction_filter.cc index c66d52d2..aec33fc8 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -49,7 +49,7 @@ namespace po = boost::program_options; #include -#include +#include #include #include @@ -201,6 +201,38 @@ void portcullis::JunctionFilter::filter() { ModelFeatures mf; mf.initGenomeMapper(this->genomeFile); + mf.features[1].active=false; // NB USRS (BAD) + mf.features[2].active=false; // NB DISTRS (BAD) + //mf.features[3].active=false; // NB RELRS (GOOD) + mf.features[4].active = false; // ENTROPY (BAD - JO LOGDEV ARE BETTER) + //mf.features[5].active = false; // REL2RAW (GOOD) + mf.features[6].active=false; // MAXMINANC (BAD - MAXMMES IS BETTER) + //mf.features[7].active=false; // MAXMMES (GOOD) + //mf.features[8].active=false; // MEAN MISMATCH (GOOD) + //mf.features[9].active=false; // INTRON (GOOD) + //mf.features[10].active=false; // MIN_HAMM (GOOD) + mf.features[11].active=false; // CODING POTENTIAL (BAD) + //mf.features[12].active=false; // POS WEIGHTS (GOOD) + //mf.features[13].active=false; // SPLICE SIGNAL (GOOD) + /*mf.features[14].active=false; // JO LOGDEV FEATURES BETTER THAN ENTROPY + mf.features[15].active=false; + mf.features[16].active=false; + mf.features[17].active=false; + mf.features[18].active=false; + mf.features[19].active=false; + mf.features[20].active=false; + mf.features[21].active=false; + mf.features[22].active=false; + mf.features[23].active=false; + mf.features[24].active=false; + mf.features[25].active=false; + mf.features[26].active=false; + mf.features[27].active=false; + mf.features[28].active=false; + mf.features[29].active=false;*/ + + + if (train) { @@ -541,29 +573,29 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL cout << endl << "2\t"; 
RuleFilter::filter(this->getIntitalNegRulesFile(2), f1, p2, f2, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << "2\t" << calcPerformance(p2, f2, true)->toLongString() << endl; + cout << calcPerformance(p2, f2, true)->toLongString(); } cout << endl << "3\t"; RuleFilter::filter(this->getIntitalNegRulesFile(3), f2, p3, f3, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << "3\t" << calcPerformance(p3, f3, true)->toLongString() << endl; + cout << calcPerformance(p3, f3, true)->toLongString(); } cout << endl << "4\t"; RuleFilter::filter(this->getIntitalNegRulesFile(4), f3, p4, f4, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << "4\t" << calcPerformance(p4, f4, true)->toLongString() << endl; + cout << calcPerformance(p4, f4, true)->toLongString(); } cout << endl << "5\t"; RuleFilter::filter(this->getIntitalNegRulesFile(5), f4, p5, f5, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << "5\t" << calcPerformance(p5, f5, true)->toLongString() << endl; + cout << calcPerformance(p5, f5, true)->toLongString(); } cout << endl << "6\t"; RuleFilter::filter(this->getIntitalNegRulesFile(6), f5, p6, f6, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << "6\t" << calcPerformance(p6, f6, true)->toLongString() << endl; + cout << calcPerformance(p6, f6, true)->toLongString(); } - cout << endl; + cout << endl << "L95x5\t"; JunctionList passJuncs, failJuncs; const uint32_t L95x5 = L95 * 5; @@ -576,10 +608,10 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL } } if (!genuineFile.empty()) { - cout << "L95x5\t" << calcPerformance(p7, failJuncs, true)->toLongString() << endl; + cout << calcPerformance(p7, failJuncs, true)->toLongString(); } - cout << endl << "Concatenating negatives from each layer to create negative set" << endl << 
endl; + cout << endl << endl << "Concatenating negatives from each layer to create negative set" << endl << endl; passJuncs.insert(passJuncs.end(), p1.begin(), p1.end()); passJuncs.insert(passJuncs.end(), p2.begin(), p2.end()); @@ -699,7 +731,7 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction shared_ptr f = nullptr; if (train) { - f = make_shared(); + f = make_shared(); } else { f = make_shared(); @@ -713,11 +745,12 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction testingData, // Data object 0, // M Try (0 == use default) "", // Output prefix - DEFAULT_SELFTRAIN_TREES, // Number of trees (will be overwritten when loading the model) + 250, //DEFAULT_SELFTRAIN_TREES, // Number of trees (will be overwritten when loading the model) 1234567890, // Seed for random generator threads, // Number of threads IMP_GINI, // Importance measure - train ? DEFAULT_MIN_NODE_SIZE_REGRESSION : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size + train ? 
DEFAULT_MIN_NODE_SIZE_PROBABILITY : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size + //DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, "", // Status var name true, // Prediction mode true, // Replace @@ -753,7 +786,7 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction if (!genuineFile.empty() && exists(genuineFile)) { vector thresholds; - for(double i = 0.05; i <= 0.5; i += 0.01) { + for(double i = 0.0; i <= 1.0; i += 0.01) { thresholds.push_back(i); } double max_mcc = 0.0; @@ -798,7 +831,7 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction void portcullis::JunctionFilter::categorise(shared_ptr f, const JunctionList& all, JunctionList& pass, JunctionList& fail, double t) { for(size_t i = 0; i < all.size(); i++) { - if (f->getPredictions()[i][0] >= t) { + if (f->getPredictions()[i][0] <= t) { pass.push_back(all[i]); } else { diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index e0ebc948..d45edae9 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -76,7 +76,7 @@ const string ST_IPOS_RULES_FILE = "selftrain_initial_pos"; const string ST_INEG_RULES_FILE = "selftrain_initial_neg"; const uint16_t DEFAULT_FILTER_THREADS = 1; const uint16_t DEFAULT_SELFTRAIN_TREES = 100; -const double DEFAULT_FILTER_THRESHOLD = 0.25; +const double DEFAULT_FILTER_THRESHOLD = 0.75; class JunctionFilter { From 9404738c5233646ebf2152a74d0db1ccd4b87a14 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Fri, 29 Apr 2016 15:49:08 +0100 Subject: [PATCH 14/15] Additional improvements to the filtering tool. 
--- Makefile.am | 4 +- data/selftrain_initial_neg.layer7.json | 21 +++++++ data/selftrain_initial_pos.layer1.json | 52 +++-------------- data/selftrain_initial_pos.layer2.json | 52 +++++------------ data/selftrain_initial_pos.layer3.json | 77 +++++++++++++++++++++++++ src/junction_filter.cc | 78 ++++++++++++++++---------- src/junction_filter.hpp | 2 +- 7 files changed, 172 insertions(+), 114 deletions(-) create mode 100644 data/selftrain_initial_neg.layer7.json create mode 100644 data/selftrain_initial_pos.layer3.json diff --git a/Makefile.am b/Makefile.am index 0d39c81c..65431406 100644 --- a/Makefile.am +++ b/Makefile.am @@ -62,8 +62,10 @@ dist_config_DATA = \ data/selftrain_initial_neg.layer4.json \ data/selftrain_initial_neg.layer5.json \ data/selftrain_initial_neg.layer6.json \ + data/selftrain_initial_neg.layer7.json \ data/selftrain_initial_pos.layer1.json \ - data/selftrain_initial_pos.layer2.json + data/selftrain_initial_pos.layer2.json \ + data/selftrain_initial_pos.layer3.json # SRC DIRS make_dirs=deps/htslib-1.3 deps/ranger-0.3.8 lib src tests diff --git a/data/selftrain_initial_neg.layer7.json b/data/selftrain_initial_neg.layer7.json new file mode 100644 index 00000000..5b6bde7c --- /dev/null +++ b/data/selftrain_initial_neg.layer7.json @@ -0,0 +1,21 @@ +{ + "parameters": { + "M2-nb_reads": { + "operator": "gt", + "value": 100 + }, + "M22-rel2raw": { + "operator": "eq", + "value": 0.0 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 4 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 4 + } + }, + "expression": "( M2-nb_reads & M22-rel2raw & M13-hamming5p & M14-hamming3p )" +} diff --git a/data/selftrain_initial_pos.layer1.json b/data/selftrain_initial_pos.layer1.json index d351e911..1bc58557 100644 --- a/data/selftrain_initial_pos.layer1.json +++ b/data/selftrain_initial_pos.layer1.json @@ -1,69 +1,33 @@ { "parameters": { - "M1-canonical_ss": { - "operator": "in", - "value": ["C", "S"] - }, - "M1-canonical_ss.2": { - "operator": "in", 
- "value": ["N"] - }, "M4-nb_rel_aln": { - "operator": "gte", - "value": 5 - }, - "M4-nb_rel_aln.2": { "operator": "gte", "value": 1 }, "M12-maxmmes": { "operator": "gte", - "value": 20 - }, - "M12-maxmmes.2": { - "operator": "gte", - "value": 10 + "value": 8 }, - "M11-entropy": { - "operator": "gt", - "value": 5.0 - }, - "M11-entropy.2": { - "operator": "gt", - "value": 2.0 - }, - "M13-hamming5p": { + "M13-hamming5p": { "operator": "gte", "value": 5 }, - "M13-hamming5p.2": { - "operator": "gte", - "value": 7 - }, "M14-hamming3p": { "operator": "gte", "value": 5 }, - "M14-hamming3p.2": { - "operator": "gte", - "value": 7 - }, - "M19-mean_mismatches": { - "operator": "eq", - "value": 0 - }, - "M19-mean_mismatches.2": { - "operator": "lt", - "value": 0.2 + "M19-mean_mismatches": { + "operator": "lte", + "value": 1.5 }, "M20-nb_usrs": { "operator": "gte", - "value": 5 + "value": 1 }, "M22-rel2raw": { "operator": "gte", - "value": 0.5 + "value": 0.25 } }, - "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M22-rel2raw & M12-maxmmes.2" + "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M12-maxmmes & M20-nb_usrs & M19-mean_mismatches & M22-rel2raw" } diff --git a/data/selftrain_initial_pos.layer2.json b/data/selftrain_initial_pos.layer2.json index d91483ac..62c0273e 100644 --- a/data/selftrain_initial_pos.layer2.json +++ b/data/selftrain_initial_pos.layer2.json @@ -1,69 +1,45 @@ { "parameters": { - "M1-canonical_ss": { - "operator": "in", - "value": ["C", "S"] - }, - "M1-canonical_ss.2": { - "operator": "in", - "value": ["N"] - }, "M4-nb_rel_aln": { "operator": "gte", - "value": 5 + "value": 3 }, "M4-nb_rel_aln.2": { "operator": "gte", - "value": 1 + "value": 2 }, "M12-maxmmes": { "operator": "gte", - "value": 20 + "value": 25 }, "M12-maxmmes.2": { - "operator": "gte", - "value": 10 + "operator": "gt", + "value": 12 }, - "M11-entropy": { - "operator": "gt", - "value": 5.0 - }, - "M11-entropy.2": { - "operator": "gt", - "value": 2.0 - }, 
"M13-hamming5p": { "operator": "gte", - "value": 5 + "value": 8 }, "M13-hamming5p.2": { "operator": "gte", - "value": 7 + "value": 10 }, "M14-hamming3p": { "operator": "gte", - "value": 5 + "value": 8 }, "M14-hamming3p.2": { "operator": "gte", - "value": 7 + "value": 10 }, - "M19-mean_mismatches": { - "operator": "eq", + "M19-mean_mismatches": { + "operator": "lt", "value": 0 }, "M19-mean_mismatches.2": { "operator": "lt", - "value": 0.2 - }, - "M20-nb_usrs": { - "operator": "gte", - "value": 5 - }, - "M22-rel2raw": { - "operator": "gte", - "value": 0.5 - } + "value": 0.5 + } }, - "expression": "( ( M1-canonical_ss & M19-mean_mismatches & M20-nb_usrs ) | ( M1-canonical_ss & M12-maxmmes & M11-entropy & M19-mean_mismatches.2 ) | ( M1-canonical_ss.2 & M19-mean_mismatches & M12-maxmmes & M11-entropy ) )" + "expression": "( M4-nb_rel_aln & M12-maxmmes ) | ( M4-nb_rel_aln.2 & M12-maxmmes.2 & M13-hamming5p & M14-hamming3p & M19-mean_mismatches.2 ) | ( M13-hamming5p.2 & M14-hamming3p.2 & M19-mean_mismatches )" } diff --git a/data/selftrain_initial_pos.layer3.json b/data/selftrain_initial_pos.layer3.json new file mode 100644 index 00000000..349d1317 --- /dev/null +++ b/data/selftrain_initial_pos.layer3.json @@ -0,0 +1,77 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["C"] + }, + "M1-canonical_ss.2": { + "operator": "in", + "value": ["S"] + }, + "M1-canonical_ss.3": { + "operator": "in", + "value": ["N"] + }, + "M4-nb_rel_aln": { + "operator": "gte", + "value": 5 + }, + "M4-nb_rel_aln.2": { + "operator": "gte", + "value": 1 + }, + "M12-maxmmes": { + "operator": "gte", + "value": 20 + }, + "M12-maxmmes.2": { + "operator": "gte", + "value": 10 + }, + "M11-entropy": { + "operator": "gt", + "value": 5.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M13-hamming5p": { + "operator": "gte", + "value": 7 + }, + "M13-hamming5p.2": { + "operator": "gte", + "value": 8 + }, + "M14-hamming3p": { + "operator": "gte", + "value": 7 + }, 
+ "M14-hamming3p.2": { + "operator": "gte", + "value": 8 + }, + "M19-mean_mismatches": { + "operator": "eq", + "value": 0 + }, + "M19-mean_mismatches": { + "operator": "lt", + "value": 0.1 + }, + "M20-nb_usrs": { + "operator": "gte", + "value": 5 + }, + "M22-rel2raw": { + "operator": "gte", + "value": 0.5 + }, + "M22-rel2raw.2": { + "operator": "gte", + "value": 0.75 + } + }, + "expression": "( M1-canonical_ss ) | ( M1-canonical_ss.2 & M22-rel2raw & M13-hamming5p & M14-hamming3p ) | ( M1-canonical_ss.3 & M22-rel2raw.2 & M13-hamming5p.2 & M14-hamming3p.2 & M19-mean_mismatches & M11-entropy.2 )" +} diff --git a/src/junction_filter.cc b/src/junction_filter.cc index aec33fc8..b3e78942 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -237,13 +237,13 @@ void portcullis::JunctionFilter::filter() { if (train) { // The initial positive and negative sets - JunctionList pos, neg; + JunctionList pos, unlabelled, neg; cout << "Self training mode activated." << endl << endl; - createPositiveSet(currentJuncs, pos, mf); + createPositiveSet(currentJuncs, pos, unlabelled, mf); - createNegativeSet(mf.L95, currentJuncs, neg); + createNegativeSet(mf.L95, unlabelled, neg); cout << "Initial training set consists of " << pos.size() << " positive and " << neg.size() << " negative junctions." 
<< endl << endl; @@ -466,7 +466,7 @@ void portcullis::JunctionFilter::filter() { } } -void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, JunctionList& pos, ModelFeatures& mf) { +void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, JunctionList& pos, JunctionList& unlabelled, ModelFeatures& mf) { JuncResultMap resultMap; cout << "Creating initial positive set for training" << endl @@ -481,39 +481,47 @@ void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, Junc cout << Performance::longHeader(); } - JunctionList p1, p2, f1, f2; + JunctionList p1, p2, p3; cout << endl << "1\t"; - RuleFilter::filter(this->getIntitalPosRulesFile(1), all, p1, f1, "Creating initial positive set for training", resultMap); + RuleFilter::filter(this->getIntitalPosRulesFile(1), all, p1, unlabelled, "Creating initial positive set for training", resultMap); if (!genuineFile.empty()) { - cout << calcPerformance(p1, f1, true)->toLongString(); + cout << calcPerformance(p1, unlabelled)->toLongString(); } cout << endl << "2\t"; - RuleFilter::filter(this->getIntitalPosRulesFile(2), p1, p2, f2, "Creating initial positive set for training", resultMap); + RuleFilter::filter(this->getIntitalPosRulesFile(2), p1, p2, unlabelled, "Creating initial positive set for training", resultMap); if (!genuineFile.empty()) { - cout << calcPerformance(p2, f2, true)->toLongString(); + cout << calcPerformance(p2, unlabelled)->toLongString(); } - cout << endl; + cout << endl << "3\t"; + RuleFilter::filter(this->getIntitalPosRulesFile(3), p2, p3, unlabelled, "Creating initial positive set for training", resultMap); + if (!genuineFile.empty()) { + cout << calcPerformance(p3, unlabelled)->toLongString(); + } + cout << endl << "L95x1.5\t"; const uint32_t L95 = mf.calcIntronThreshold(p2); - const uint32_t L95x2 = L95 * 2; + const uint32_t pos_length_limit = L95 * 1.5; - JunctionList passJuncs, failJuncs; - for(auto& j : p2) { - if (j->getIntronSize() <= 
L95x2) { + JunctionList passJuncs; + for(auto& j : p3) { + if (j->getIntronSize() <= pos_length_limit) { passJuncs.push_back(j); } else { - failJuncs.push_back(j); + unlabelled.push_back(j); } } if (!genuineFile.empty()) { - cout << "L95x2\t" << calcPerformance(passJuncs, failJuncs, true)->toLongString() << endl; + cout << calcPerformance(passJuncs, unlabelled)->toLongString(); } - - // Analyse positive set to get L0.05 of intron size - cout << endl << "Length of intron at 95th percentile of positive set (L95): " << mf.calcIntronThreshold(passJuncs) << endl << endl; + + cout << endl << endl << "Found " << passJuncs.size() << " junctions for the positive set" << endl << endl; + // Analyse positive set to get L0.05 of intron size + cout << "Length of intron at 95th percentile of positive set (L95): " << mf.calcIntronThreshold(passJuncs) << endl << endl; + + cout << "Saving initial positive set:" << endl; JunctionSystem isp(passJuncs); isp.saveAll(output.string() + ".selftrain.initialset.pos", "portcullis_isp"); @@ -533,7 +541,7 @@ void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, Junc } } JunctionSystem missedPos; - for(auto& j : failJuncs) { + for(auto& j : unlabelled) { if (j->isGenuine()) { missedPos.addJunction(j); } @@ -563,7 +571,7 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL cout << Performance::longHeader(); } - JunctionList p1, p2, p3, p4, p5, p6, p7, f1, f2, f3, f4, f5, f6, f7; + JunctionList p1, p2, p3, p4, p5, p6, p7, p8, f1, f2, f3, f4, f5, f6, f7, f8; cout << endl << "1\t"; RuleFilter::filter(this->getIntitalNegRulesFile(1), all, p1, f1, "Creating initial negative set for training", resultMap); @@ -595,11 +603,16 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL if (!genuineFile.empty()) { cout << calcPerformance(p6, f6, true)->toLongString(); } + cout << endl << "7\t"; + RuleFilter::filter(this->getIntitalNegRulesFile(7), f6, p7, f7, "Creating initial 
negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << calcPerformance(p7, f7, true)->toLongString(); + } cout << endl << "L95x5\t"; JunctionList passJuncs, failJuncs; const uint32_t L95x5 = L95 * 5; - for(auto& j : f6) { + for(auto& j : f7) { if (j->getIntronSize() > L95x5 && j->getMaxMMES() < 10 ) { p7.push_back(j); } @@ -608,7 +621,7 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL } } if (!genuineFile.empty()) { - cout << calcPerformance(p7, failJuncs, true)->toLongString(); + cout << calcPerformance(p8, failJuncs, true)->toLongString(); } cout << endl << endl << "Concatenating negatives from each layer to create negative set" << endl << endl; @@ -620,16 +633,21 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL passJuncs.insert(passJuncs.end(), p5.begin(), p5.end()); passJuncs.insert(passJuncs.end(), p6.begin(), p6.end()); passJuncs.insert(passJuncs.end(), p7.begin(), p7.end()); + passJuncs.insert(passJuncs.end(), p8.begin(), p8.end()); + // This will remove any duplicates + JunctionSystem isn(passJuncs); + if (!genuineFile.empty()) { - cout << "Final\t" << calcPerformance(passJuncs, failJuncs, true)->toLongString() << endl; + cout << "Final\t" << calcPerformance(isn.getJunctions(), failJuncs, true)->toLongString() << endl << endl; } - + + cout << "Found " << isn.getJunctions().size() << " junctions for the negative set" << endl << endl; + cout << endl << "Saving initial negative set:" << endl; - JunctionSystem isn(passJuncs); isn.saveAll(output.string() + ".selftrain.initialset.neg", "portcullis_isn"); - for(auto& j : passJuncs) { + for(auto& j : isn.getJunctions()) { JunctionPtr copy = make_shared(*j); copy->setGenuine(false); neg.push_back(copy); @@ -640,7 +658,7 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL JunctionSystem invalidNeg; JunctionSystem missedNeg; - for(auto& j : passJuncs) { + for(auto& j : isn.getJunctions()) { if 
(j->isGenuine()) { invalidNeg.addJunction(j); } @@ -815,8 +833,8 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction cout << "The best F1 score of " << max_f1 << " is achieved with threshold set at " << best_t_f1 << endl; cout << "The best MCC score of " << max_mcc << " is achieved with threshold set at " << best_t_mcc << endl; - - categorise(f, all, pass, fail, best_t_mcc); + cout << "Actual threshold set at " << threshold << endl; + categorise(f, all, pass, fail, best_t_f1); } else { categorise(f, all, pass, fail, threshold); diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index d45edae9..302a5d86 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -299,7 +299,7 @@ class JunctionFilter { void categorise(shared_ptr f, const JunctionList& all, JunctionList& pass, JunctionList& fail, double t); - void createPositiveSet(const JunctionList& all, JunctionList& pos, ModelFeatures& mf); + void createPositiveSet(const JunctionList& all, JunctionList& pos, JunctionList& unlabelled, ModelFeatures& mf); void createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg); From e99e9ec2851a0aab3e858e98e789aad29d37deb6 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Fri, 29 Apr 2016 15:58:27 +0100 Subject: [PATCH 15/15] Changed the default filtering threshold level. 
--- src/junction_filter.hpp | 2 +- tests/Makefile.am | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index 302a5d86..d21b14e2 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -76,7 +76,7 @@ const string ST_IPOS_RULES_FILE = "selftrain_initial_pos"; const string ST_INEG_RULES_FILE = "selftrain_initial_neg"; const uint16_t DEFAULT_FILTER_THREADS = 1; const uint16_t DEFAULT_SELFTRAIN_TREES = 100; -const double DEFAULT_FILTER_THRESHOLD = 0.75; +const double DEFAULT_FILTER_THRESHOLD = 0.625; class JunctionFilter { diff --git a/tests/Makefile.am b/tests/Makefile.am index b3a42b38..d6d27253 100755 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -32,6 +32,7 @@ check_unit_tests_SOURCES = bam_tests.cpp \ check_unit_tests_CXXFLAGS = -O0 @AM_CXXFLAGS@ @CXXFLAGS@ check_unit_tests_CPPFLAGS = -isystem $(top_srcdir)/deps/htslib-1.3 \ + -isystem $(top_srcdir)/deps/ranger-0.3.8/include \ -I$(top_srcdir)/lib/include \ -DRESOURCESDIR=\"$(top_srcdir)/tests/resources\" \ -DDATADIR=\"$(datadir)\" \ @@ -40,12 +41,14 @@ check_unit_tests_CPPFLAGS = -isystem $(top_srcdir)/deps/htslib-1.3 \ check_unit_tests_LDFLAGS = -L../lib \ -L../deps/htslib-1.3 \ + -L../deps/ranger-0.3.8 \ @AM_LDFLAGS@ \ @LDFLAGS@ check_unit_tests_LDADD = libgtest.la \ -lportcullis \ -lphts \ + -lranger \ @AM_LIBS@ clean-local: clean-local-check