diff --git a/configure.ac b/configure.ac index 6996e2cd..cb6d580d 100644 --- a/configure.ac +++ b/configure.ac @@ -4,7 +4,7 @@ # Autoconf setup AC_PREREQ([2.68]) -AC_INIT([portcullis],[0.16.0],[daniel.mapleson@tgac.ac.uk],[portcullis],[http://www.tgac.ac.uk]) +AC_INIT([portcullis],[0.16.1],[daniel.mapleson@tgac.ac.uk],[portcullis],[http://www.tgac.ac.uk]) AC_CONFIG_SRCDIR([src/portcullis.cc]) AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_MACRO_DIR([m4]) diff --git a/data/selftrain_initial_neg.layer3.json b/data/selftrain_initial_neg.layer3.json index ec5cbc5d..ec8c8b81 100644 --- a/data/selftrain_initial_neg.layer3.json +++ b/data/selftrain_initial_neg.layer3.json @@ -4,90 +4,14 @@ "operator": "in", "value": ["N", "S"] }, - "M2-nb_reads": { - "operator": "lte", - "value": 1 - }, - "M3-nb_dist_aln": { - "operator": "gte", - "value": 2 - }, - "M4-nb_rel_aln": { - "operator": "eq", - "value": 0 - }, "M11-entropy": { - "operator": "lt", - "value": 1.0 - }, - "M11-entropy.2": { "operator": "gt", - "value": 2.0 - }, - "M11-entropy.3": { - "operator": "eq", - "value": 0.0 - }, - "M12-maxmmes": { - "operator": "lt", - "value": 7 - }, - "M12-maxmmes.2": { - "operator": "lt", - "value": 10 - }, - "M12-maxmmes.3": { - "operator": "lt", - "value": 20 - }, - "M8-max_min_anc": { - "operator": "lt", - "value": 16 - }, - "Suspect": { - "operator": "eq", - "value": 1 + "value": 3.0 }, - "PFP": { + "Suspect": { "operator": "eq", "value": 1 - }, - "M13-hamming5p": { - "operator": "lte", - "value": 2 - }, - "M14-hamming3p": { - "operator": "lte", - "value": 2 - }, - "M19-mean_mismatches": { - "operator": "gte", - "value": 5.0 - }, - "M19-mean_mismatches.2": { - "operator": "gte", - "value": 2.0 - }, - "M20-nb_usrs": { - "operator": "eq", - "value": 0 - }, - "M20-nb_usrs.2": { - "operator": "eq", - "value": 1 - }, - "M21-nb_msrs": { - "operator": "gte", - "value": 1 - }, - "M22-rel2raw": { - "operator": "lt", - "value": 0.9 - }, - "M22-rel2raw.2": { - "operator": "eq", - "value": 0.0 - } + 
} }, - "expression": "( Suspect & M11-entropy.2 )" + "expression": "( M1-canonical_ss & Suspect & M11-entropy )" } diff --git a/data/selftrain_initial_neg.layer4.json b/data/selftrain_initial_neg.layer4.json index 44890918..bab2560d 100644 --- a/data/selftrain_initial_neg.layer4.json +++ b/data/selftrain_initial_neg.layer4.json @@ -1,93 +1,13 @@ { "parameters": { - "M1-canonical_ss": { - "operator": "in", - "value": ["N", "S"] - }, - "M2-nb_reads": { - "operator": "lte", - "value": 1 - }, - "M3-nb_dist_aln": { - "operator": "gte", - "value": 2 - }, - "M4-nb_rel_aln": { - "operator": "eq", - "value": 0 - }, - "M11-entropy": { - "operator": "lt", - "value": 1.0 - }, - "M11-entropy.2": { - "operator": "gt", - "value": 2.0 - }, - "M11-entropy.3": { - "operator": "eq", - "value": 0.0 - }, "M12-maxmmes": { "operator": "lt", "value": 7 }, - "M12-maxmmes.2": { - "operator": "lt", - "value": 10 - }, - "M12-maxmmes.3": { - "operator": "lt", - "value": 20 - }, - "M8-max_min_anc": { - "operator": "lt", - "value": 16 - }, - "Suspect": { - "operator": "eq", - "value": 1 - }, - "PFP": { - "operator": "eq", - "value": 1 - }, - "M13-hamming5p": { - "operator": "lte", - "value": 2 - }, - "M14-hamming3p": { - "operator": "lte", - "value": 2 - }, - "M19-mean_mismatches": { - "operator": "gte", - "value": 5.0 - }, - "M19-mean_mismatches.2": { - "operator": "gte", - "value": 2.0 - }, - "M20-nb_usrs": { - "operator": "eq", - "value": 0 - }, - "M20-nb_usrs.2": { - "operator": "eq", - "value": 1 - }, - "M21-nb_msrs": { - "operator": "gte", - "value": 1 - }, "M22-rel2raw": { "operator": "lt", - "value": 0.9 - }, - "M22-rel2raw.2": { - "operator": "eq", - "value": 0.0 + "value": 0.5 } }, - "expression": "( M12-maxmmes & M22-rel2raw )" + "expression": "( M12-maxmmes & M22-rel2raw )" } diff --git a/data/selftrain_initial_neg.layer5.json b/data/selftrain_initial_neg.layer5.json index 945bd289..ef96164c 100644 --- a/data/selftrain_initial_neg.layer5.json +++ 
b/data/selftrain_initial_neg.layer5.json @@ -2,92 +2,12 @@ "parameters": { "M1-canonical_ss": { "operator": "in", - "value": ["N", "S"] + "value": ["N"] }, - "M2-nb_reads": { - "operator": "lte", - "value": 1 - }, - "M3-nb_dist_aln": { - "operator": "gte", - "value": 2 - }, - "M4-nb_rel_aln": { - "operator": "eq", - "value": 0 - }, - "M11-entropy": { - "operator": "lt", - "value": 1.0 - }, - "M11-entropy.2": { - "operator": "gt", - "value": 2.0 - }, - "M11-entropy.3": { - "operator": "eq", - "value": 0.0 - }, - "M12-maxmmes": { - "operator": "lt", - "value": 7 - }, - "M12-maxmmes.2": { - "operator": "lt", - "value": 10 - }, - "M12-maxmmes.3": { - "operator": "lt", - "value": 20 - }, - "M8-max_min_anc": { - "operator": "lt", - "value": 16 - }, - "Suspect": { - "operator": "eq", - "value": 1 - }, - "PFP": { - "operator": "eq", - "value": 1 - }, - "M13-hamming5p": { - "operator": "lte", - "value": 2 - }, - "M14-hamming3p": { - "operator": "lte", - "value": 2 - }, - "M19-mean_mismatches": { - "operator": "gte", - "value": 5.0 - }, - "M19-mean_mismatches.2": { - "operator": "gte", - "value": 2.0 - }, - "M20-nb_usrs": { - "operator": "eq", - "value": 0 - }, - "M20-nb_usrs.2": { - "operator": "eq", - "value": 1 - }, - "M21-nb_msrs": { - "operator": "gte", - "value": 1 - }, "M22-rel2raw": { - "operator": "lt", - "value": 0.9 - }, - "M22-rel2raw.2": { "operator": "eq", "value": 0.0 } }, - "expression": "( M1-canonical_ss & M22-rel2raw.2 )" + "expression": "( M1-canonical_ss & M22-rel2raw )" } diff --git a/data/selftrain_initial_neg.layer6.json b/data/selftrain_initial_neg.layer6.json index 0265d1eb..ea5b4298 100644 --- a/data/selftrain_initial_neg.layer6.json +++ b/data/selftrain_initial_neg.layer6.json @@ -3,91 +3,11 @@ "M1-canonical_ss": { "operator": "in", "value": ["N", "S"] - }, - "M2-nb_reads": { - "operator": "lte", - "value": 1 - }, - "M3-nb_dist_aln": { - "operator": "gte", - "value": 2 - }, - "M4-nb_rel_aln": { - "operator": "eq", - "value": 0 - }, - 
"M11-entropy": { - "operator": "lt", - "value": 1.0 - }, - "M11-entropy.2": { - "operator": "gt", - "value": 2.0 - }, - "M11-entropy.3": { - "operator": "eq", - "value": 0.0 - }, - "M12-maxmmes": { - "operator": "lt", - "value": 7 - }, - "M12-maxmmes.2": { - "operator": "lt", - "value": 10 - }, - "M12-maxmmes.3": { - "operator": "lt", - "value": 20 - }, - "M8-max_min_anc": { - "operator": "lt", - "value": 16 - }, - "Suspect": { - "operator": "eq", - "value": 1 - }, + }, "PFP": { "operator": "eq", "value": 1 - }, - "M13-hamming5p": { - "operator": "lte", - "value": 2 - }, - "M14-hamming3p": { - "operator": "lte", - "value": 2 - }, - "M19-mean_mismatches": { - "operator": "gte", - "value": 5.0 - }, - "M19-mean_mismatches.2": { - "operator": "gte", - "value": 2.0 - }, - "M20-nb_usrs": { - "operator": "eq", - "value": 0 - }, - "M20-nb_usrs.2": { - "operator": "eq", - "value": 1 - }, - "M21-nb_msrs": { - "operator": "gte", - "value": 1 - }, - "M22-rel2raw": { - "operator": "lt", - "value": 0.9 - }, - "M22-rel2raw.2": { - "operator": "eq", - "value": 0.0 - } + } }, - "expression": "( PFP )" + "expression": "( M1-canonical_ss & PFP )" } diff --git a/data/selftrain_initial_neg.layer7.json b/data/selftrain_initial_neg.layer7.json index 5b6bde7c..fb0352f3 100644 --- a/data/selftrain_initial_neg.layer7.json +++ b/data/selftrain_initial_neg.layer7.json @@ -10,11 +10,11 @@ }, "M13-hamming5p": { "operator": "lte", - "value": 4 + "value": 3 }, "M14-hamming3p": { "operator": "lte", - "value": 4 + "value": 3 } }, "expression": "( M2-nb_reads & M22-rel2raw & M13-hamming5p & M14-hamming3p )" diff --git a/data/selftrain_initial_pos.layer1.json b/data/selftrain_initial_pos.layer1.json index 1bc58557..bd1fd95d 100644 --- a/data/selftrain_initial_pos.layer1.json +++ b/data/selftrain_initial_pos.layer1.json @@ -1,33 +1,37 @@ { - "parameters": { - "M4-nb_rel_aln": { - "operator": "gte", - "value": 1 - }, - "M12-maxmmes": { - "operator": "gte", - "value": 8 - }, - "M13-hamming5p": { - 
"operator": "gte", - "value": 5 - }, - "M14-hamming3p": { - "operator": "gte", - "value": 5 - }, - "M19-mean_mismatches": { - "operator": "lte", - "value": 1.5 - }, - "M20-nb_usrs": { - "operator": "gte", - "value": 1 - }, - "M22-rel2raw": { - "operator": "gte", - "value": 0.25 - } - }, - "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M12-maxmmes & M20-nb_usrs & M19-mean_mismatches & M22-rel2raw" + "parameters": { + "M4-nb_rel_aln": { + "operator": "gte", + "value": 1 + }, + "M12-maxmmes": { + "operator": "gte", + "value": 8 + }, + "M11-entropy": { + "operator": "gt", + "value": 1.0 + }, + "M13-hamming5p": { + "operator": "gte", + "value": 4 + }, + "M14-hamming3p": { + "operator": "gte", + "value": 4 + }, + "M19-mean_mismatches": { + "operator": "lte", + "value": 1.0 + }, + "M20-nb_usrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "gte", + "value": 0.25 + } + }, + "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M12-maxmmes & M20-nb_usrs & M19-mean_mismatches & M22-rel2raw" } diff --git a/data/selftrain_initial_pos.layer2.json b/data/selftrain_initial_pos.layer2.json index 62c0273e..a4c2b0b4 100644 --- a/data/selftrain_initial_pos.layer2.json +++ b/data/selftrain_initial_pos.layer2.json @@ -1,45 +1,45 @@ { - "parameters": { - "M4-nb_rel_aln": { - "operator": "gte", - "value": 3 - }, - "M4-nb_rel_aln.2": { - "operator": "gte", - "value": 2 - }, - "M12-maxmmes": { - "operator": "gte", - "value": 25 - }, - "M12-maxmmes.2": { - "operator": "gt", - "value": 12 - }, - "M13-hamming5p": { - "operator": "gte", - "value": 8 - }, - "M13-hamming5p.2": { - "operator": "gte", - "value": 10 - }, - "M14-hamming3p": { - "operator": "gte", - "value": 8 - }, - "M14-hamming3p.2": { - "operator": "gte", - "value": 10 - }, - "M19-mean_mismatches": { - "operator": "lt", - "value": 0 - }, - "M19-mean_mismatches.2": { - "operator": "lt", - "value": 0.5 - } - }, - "expression": "( M4-nb_rel_aln & M12-maxmmes ) | ( M4-nb_rel_aln.2 
& M12-maxmmes.2 & M13-hamming5p & M14-hamming3p & M19-mean_mismatches.2 ) | ( M13-hamming5p.2 & M14-hamming3p.2 & M19-mean_mismatches )" + "parameters": { + "M4-nb_rel_aln": { + "operator": "gte", + "value": 5 + }, + "M4-nb_rel_aln.2": { + "operator": "gte", + "value": 3 + }, + "M12-maxmmes": { + "operator": "gte", + "value": 20 + }, + "M12-maxmmes.2": { + "operator": "gt", + "value": 12 + }, + "M13-hamming5p": { + "operator": "gte", + "value": 7 + }, + "M13-hamming5p.2": { + "operator": "gte", + "value": 9 + }, + "M14-hamming3p": { + "operator": "gte", + "value": 7 + }, + "M14-hamming3p.2": { + "operator": "gte", + "value": 9 + }, + "M19-mean_mismatches": { + "operator": "lte", + "value": 0 + }, + "M19-mean_mismatches.2": { + "operator": "lt", + "value": 0.33 + } + }, + "expression": "( M4-nb_rel_aln & M12-maxmmes ) | ( M4-nb_rel_aln.2 & M12-maxmmes.2 & M13-hamming5p & M14-hamming3p & M19-mean_mismatches.2 ) | ( M13-hamming5p.2 & M14-hamming3p.2 & M19-mean_mismatches )" } diff --git a/data/selftrain_initial_pos.layer3.json b/data/selftrain_initial_pos.layer3.json index 349d1317..18464d1e 100644 --- a/data/selftrain_initial_pos.layer3.json +++ b/data/selftrain_initial_pos.layer3.json @@ -1,77 +1,77 @@ { - "parameters": { - "M1-canonical_ss": { - "operator": "in", - "value": ["C"] - }, - "M1-canonical_ss.2": { - "operator": "in", - "value": ["S"] - }, - "M1-canonical_ss.3": { - "operator": "in", - "value": ["N"] - }, - "M4-nb_rel_aln": { - "operator": "gte", - "value": 5 - }, - "M4-nb_rel_aln.2": { - "operator": "gte", - "value": 1 - }, - "M12-maxmmes": { - "operator": "gte", - "value": 20 - }, - "M12-maxmmes.2": { - "operator": "gte", - "value": 10 - }, - "M11-entropy": { - "operator": "gt", - "value": 5.0 - }, - "M11-entropy.2": { - "operator": "gt", - "value": 2.0 - }, - "M13-hamming5p": { - "operator": "gte", - "value": 7 - }, - "M13-hamming5p.2": { - "operator": "gte", - "value": 8 - }, - "M14-hamming3p": { - "operator": "gte", - "value": 7 - }, - 
"M14-hamming3p.2": { - "operator": "gte", - "value": 8 - }, - "M19-mean_mismatches": { - "operator": "eq", - "value": 0 - }, - "M19-mean_mismatches": { - "operator": "lt", - "value": 0.1 - }, - "M20-nb_usrs": { - "operator": "gte", - "value": 5 - }, - "M22-rel2raw": { - "operator": "gte", - "value": 0.5 - }, - "M22-rel2raw.2": { - "operator": "gte", - "value": 0.75 - } - }, - "expression": "( M1-canonical_ss ) | ( M1-canonical_ss.2 & M22-rel2raw & M13-hamming5p & M14-hamming3p ) | ( M1-canonical_ss.3 & M22-rel2raw.2 & M13-hamming5p.2 & M14-hamming3p.2 & M19-mean_mismatches & M11-entropy.2 )" + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["C"] + }, + "M1-canonical_ss.2": { + "operator": "in", + "value": ["S"] + }, + "M1-canonical_ss.3": { + "operator": "in", + "value": ["N"] + }, + "M4-nb_rel_aln": { + "operator": "gte", + "value": 5 + }, + "M4-nb_rel_aln.2": { + "operator": "gte", + "value": 1 + }, + "M12-maxmmes": { + "operator": "gte", + "value": 20 + }, + "M12-maxmmes.2": { + "operator": "gte", + "value": 10 + }, + "M11-entropy": { + "operator": "gt", + "value": 3.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 1.5 + }, + "M13-hamming5p": { + "operator": "gte", + "value": 6 + }, + "M13-hamming5p.2": { + "operator": "gte", + "value": 7 + }, + "M14-hamming3p": { + "operator": "gte", + "value": 6 + }, + "M14-hamming3p.2": { + "operator": "gte", + "value": 7 + }, + "M19-mean_mismatches": { + "operator": "eq", + "value": 0 + }, + "M19-mean_mismatches": { + "operator": "lt", + "value": 0.1 + }, + "M20-nb_usrs": { + "operator": "gte", + "value": 5 + }, + "M22-rel2raw": { + "operator": "gte", + "value": 0.5 + }, + "M22-rel2raw.2": { + "operator": "gte", + "value": 0.75 + } + }, + "expression": "( M1-canonical_ss ) | ( M1-canonical_ss.2 & M22-rel2raw & M13-hamming5p & M14-hamming3p ) | ( M1-canonical_ss.3 & M22-rel2raw.2 & M13-hamming5p.2 & M14-hamming3p.2 & M19-mean_mismatches & M11-entropy.2 )" } diff --git 
a/deps/ranger-0.3.8/include/ranger/Data.h b/deps/ranger-0.3.8/include/ranger/Data.h index 56dd3810..cc6d8c64 100644 --- a/deps/ranger-0.3.8/include/ranger/Data.h +++ b/deps/ranger-0.3.8/include/ranger/Data.h @@ -39,13 +39,24 @@ class Data { public: Data(); + Data(std::vector variable_names, size_t num_rows, size_t num_cols); virtual ~Data(); virtual double get(size_t row, size_t col) const = 0; + void getRow(size_t row, std::vector& output) const { + for(size_t col = 0; col < num_cols; col++) { + output.push_back(get(row, col)); + } + } size_t getVariableID(std::string variable_name); - virtual void reserveMemory() = 0; + virtual void reserveMemoryInternal() = 0; + + void reserveMemory() { + reserveMemoryInternal(); + } + virtual void set(size_t col, size_t row, double value, bool& error) = 0; void addSparseData(unsigned char* sparse_data, size_t num_cols_sparse); @@ -115,12 +126,8 @@ class Data { } } - void setExternalData(bool external) { - this->externalData = external; - } - void saveToTSV(const std::string& file); - + std::string getRow(size_t row) const { std::stringstream ss; ss << this->get(row, 0); @@ -129,7 +136,7 @@ class Data { } return ss.str(); } - + std::string getHeader() const { std::stringstream ss; ss << variable_names[0]; @@ -138,7 +145,7 @@ class Data { } return ss.str(); } - + protected: std::vector variable_names; size_t num_rows; @@ -148,8 +155,6 @@ class Data { unsigned char* sparse_data; size_t num_cols_no_sparse; - bool externalData; - size_t* index_data; std::vector> unique_data_values; size_t max_num_unique_values; diff --git a/deps/ranger-0.3.8/include/ranger/DataChar.h b/deps/ranger-0.3.8/include/ranger/DataChar.h index 4c0fc5f5..be428870 100644 --- a/deps/ranger-0.3.8/include/ranger/DataChar.h +++ b/deps/ranger-0.3.8/include/ranger/DataChar.h @@ -34,40 +34,46 @@ wright@imbs.uni-luebeck.de #include "globals.h" #include "Data.h" -class DataChar: public Data { +class DataChar : public Data { public: - DataChar(); - DataChar(double* 
data_double, std::vector variable_names, size_t num_rows, size_t num_cols, bool& error); - virtual ~DataChar(); - - double get(size_t row, size_t col) const { - if (col < num_cols_no_sparse) { - return data[col * num_rows + row]; - } else { - // Get data out of sparse storage. -1 because of GenABEL coding. - size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; - return (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); - } - } + DataChar(); + + DataChar(std::vector variable_names, size_t num_rows, size_t num_cols) : Data(variable_names, num_rows, num_cols) { + reserveMemory(); + }; + DataChar(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols, bool& error); + virtual ~DataChar(); - void reserveMemory() { - data = new char[num_cols * num_rows]; - } + double get(size_t row, size_t col) const { + if (col < num_cols_no_sparse) { + return data[col * num_rows + row]; + } else { + // Get data out of sparse storage. -1 because of GenABEL coding. 
+ size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; + return (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); + } + } + + - void set(size_t col, size_t row, double value, bool& error) { - if (value > CHAR_MAX || value < CHAR_MIN) { - error = true; + void reserveMemoryInternal() { + data = new char[num_cols * num_rows]; } - if (floor(value) != ceil(value)) { - error = true; + + void set(size_t col, size_t row, double value, bool& error) { + if (value > CHAR_MAX || value < CHAR_MIN) { + error = true; + } + if (floor(value) != ceil(value)) { + error = true; + } + data[col * num_rows + row] = (char) value; } - data[col * num_rows + row] = (char) value; - } private: - char* data; + char* data; - DISALLOW_COPY_AND_ASSIGN(DataChar); + DISALLOW_COPY_AND_ASSIGN(DataChar); }; #endif /* DATACHAR_H_ */ diff --git a/deps/ranger-0.3.8/include/ranger/DataDouble.h b/deps/ranger-0.3.8/include/ranger/DataDouble.h index f722de8e..eaed6376 100644 --- a/deps/ranger-0.3.8/include/ranger/DataDouble.h +++ b/deps/ranger-0.3.8/include/ranger/DataDouble.h @@ -33,41 +33,39 @@ wright@imbs.uni-luebeck.de #include "utility.h" #include "Data.h" -class DataDouble: public Data { +class DataDouble : public Data { public: - DataDouble(); - DataDouble(double* data, std::vector variable_names, size_t num_rows, size_t num_cols) : - data(data) { - this->variable_names = variable_names; - this->num_rows = num_rows; - this->num_cols = num_cols; - this->num_cols_no_sparse = num_cols; - } - virtual ~DataDouble(); + DataDouble(); - double get(size_t row, size_t col) const { - if (col < num_cols_no_sparse) { - return data[col * num_rows + row]; - } else { - // Get data out of sparse storage. -1 because of GenABEL coding. 
- size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; - double result = (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); - return result; - } - } + DataDouble(std::vector variable_names, size_t num_rows, size_t num_cols) : Data(variable_names, num_rows, num_cols) { + reserveMemory(); + }; + DataDouble(double* data, std::vector variable_names, size_t num_rows, size_t num_cols); + virtual ~DataDouble(); - void reserveMemory() { - data = new double[num_cols * num_rows]; - } + double get(size_t row, size_t col) const { + if (col < num_cols_no_sparse) { + return data[col * num_rows + row]; + } else { + // Get data out of sparse storage. -1 because of GenABEL coding. + size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; + double result = (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); + return result; + } + } + + void reserveMemoryInternal() { + data = new double[num_cols * num_rows]; + } - void set(size_t col, size_t row, double value, bool& error) { - data[col * num_rows + row] = value; - } + void set(size_t col, size_t row, double value, bool& error) { + data[col * num_rows + row] = value; + } private: - double* data; + double* data; - DISALLOW_COPY_AND_ASSIGN(DataDouble); + DISALLOW_COPY_AND_ASSIGN(DataDouble); }; #endif /* DATADOUBLE_H_ */ diff --git a/deps/ranger-0.3.8/include/ranger/DataFloat.h b/deps/ranger-0.3.8/include/ranger/DataFloat.h index ed26f7da..b56c3595 100644 --- a/deps/ranger-0.3.8/include/ranger/DataFloat.h +++ b/deps/ranger-0.3.8/include/ranger/DataFloat.h @@ -32,34 +32,38 @@ wright@imbs.uni-luebeck.de #include "globals.h" #include "Data.h" -class DataFloat: public Data { +class DataFloat : public Data { public: - DataFloat(); - DataFloat(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols); - virtual ~DataFloat(); + DataFloat(); - double get(size_t row, size_t col) const { - if (col < num_cols_no_sparse) { - return data[col * num_rows + row]; - } else { 
- // Get data out of sparse storage. -1 because of GenABEL coding. - size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; - return (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); + DataFloat(std::vector variable_names, size_t num_rows, size_t num_cols) : Data(variable_names, num_rows, num_cols) { + reserveMemory(); + }; + DataFloat(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols); + virtual ~DataFloat(); + + double get(size_t row, size_t col) const { + if (col < num_cols_no_sparse) { + return data[col * num_rows + row]; + } else { + // Get data out of sparse storage. -1 because of GenABEL coding. + size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; + return (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); + } } - } - void reserveMemory() { - data = new float[num_cols * num_rows]; - } + void reserveMemoryInternal() { + data = new float[num_cols * num_rows]; + } - void set(size_t col, size_t row, double value, bool& error) { - data[col * num_rows + row] = (float) value; - } + void set(size_t col, size_t row, double value, bool& error) { + data[col * num_rows + row] = (float) value; + } private: - float* data; + float* data; - DISALLOW_COPY_AND_ASSIGN(DataFloat); + DISALLOW_COPY_AND_ASSIGN(DataFloat); }; #endif /* DATAFLOAT_H_ */ diff --git a/deps/ranger-0.3.8/include/ranger/Forest.h b/deps/ranger-0.3.8/include/ranger/Forest.h index c9351c8e..6497d3f1 100644 --- a/deps/ranger-0.3.8/include/ranger/Forest.h +++ b/deps/ranger-0.3.8/include/ranger/Forest.h @@ -46,221 +46,247 @@ class Forest { public: - Forest(); - virtual ~Forest(); - - // Init from c++ main or Rcpp from R - void initCpp(std::string dependent_variable_name, MemoryMode memory_mode, std::string input_file, uint mtry, - std::string output_prefix, uint num_trees, std::ostream* verbose_out, uint seed, uint num_threads, - std::string load_forest_filename, ImportanceMode importance_mode, uint min_node_size, - 
std::string split_select_weights_file, std::vector& always_split_variable_names, - std::string status_variable_name, bool sample_with_replacement, - std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, - std::string case_weights_file, bool predict_all, double sample_fraction); - void initR(std::string dependent_variable_name, Data* input_data, uint mtry, uint num_trees, - std::ostream* verbose_out, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, - std::vector>& split_select_weights, std::vector& always_split_variable_names, - std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, - std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, - std::vector& case_weights, bool predict_all, bool keep_inbag, double sample_fraction); - void init(std::string dependent_variable_name, MemoryMode memory_mode, Data* input_data, uint mtry, - std::string output_prefix, uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, - uint min_node_size, std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, - std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, - bool predict_all, double sample_fraction); - virtual void initInternal(std::string status_variable_name) = 0; - - // Grow or predict - void run(bool verbose); - - // Write results to output files - void writeOutput(std::ostream* out); - virtual void writeOutputInternal() = 0; - virtual void writeConfusionFile() = 0; - virtual void writePredictionFile() = 0; - void writeImportanceFile(); - - // Save forest to file - void saveToFile(); - virtual void saveToFileInternal(std::ofstream& outfile) = 0; - - std::vector>>getChildNodeIDs() { - std::vector>> result; - for (auto& tree : trees) { - result.push_back(tree->getChildNodeIDs()); + Forest(); + virtual ~Forest(); + + // Init from c++ main or Rcpp from R + void 
initCpp(std::string dependent_variable_name, MemoryMode memory_mode, std::string input_file, uint mtry, + std::string output_prefix, uint num_trees, std::ostream* verbose_out, uint seed, uint num_threads, + std::string load_forest_filename, ImportanceMode importance_mode, uint min_node_size, + std::string split_select_weights_file, std::vector& always_split_variable_names, + std::string status_variable_name, bool sample_with_replacement, + std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, + std::string case_weights_file, bool predict_all, double sample_fraction); + void initR(std::string dependent_variable_name, Data* input_data, uint mtry, uint num_trees, + std::ostream* verbose_out, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, + std::vector>&split_select_weights, std::vector& always_split_variable_names, + std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, + std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, + std::vector& case_weights, bool predict_all, bool keep_inbag, double sample_fraction); + void init(MemoryMode memory_mode, uint mtry, std::string output_prefix, + uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, + uint min_node_size, bool prediction_mode, bool sample_with_replacement, bool memory_saving_splitting, + SplitRule splitrule, bool predict_all, double sample_fraction); + void init(std::string dependent_variable_name, MemoryMode memory_mode, Data* input_data, uint mtry, + std::string output_prefix, uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, + uint min_node_size, std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, + std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, + bool predict_all, double sample_fraction); + virtual void initInternal(std::string 
status_variable_name) = 0; + + // Grow or predict + void run(bool verbose); + + // Write results to output files + void writeOutput(std::ostream* out); + virtual void writeOutputInternal() = 0; + virtual void writeConfusionFile() = 0; + virtual void writePredictionFile() = 0; + void writeImportanceFile(); + + // Save forest to file + void saveToFile(); + void saveToFile(std::string filename); + virtual void saveToFileInternal(std::ofstream& outfile) = 0; + + std::vector>>getChildNodeIDs() { + std::vector>> result; + for (auto& tree : trees) { + result.push_back(tree->getChildNodeIDs()); + } + return result; } - return result; - } - std::vector> getSplitVarIDs() { - std::vector> result; - for (auto& tree : trees) { - result.push_back(tree->getSplitVarIDs()); + + std::vector> getSplitVarIDs() { + std::vector> result; + for (auto& tree : trees) { + result.push_back(tree->getSplitVarIDs()); + } + return result; + } + + std::vector> getSplitValues() { + std::vector> result; + for (auto& tree : trees) { + result.push_back(tree->getSplitValues()); + } + return result; + } + + const std::vector& getVariableImportance() const { + return variable_importance; + } + + double getOverallPredictionError() const { + return overall_prediction_error; + } + + const std::vector >& getPredictions() const { + return predictions; + } + + const double getPrediction(int i) const { + double sum = 0.0; + for (size_t j = 0; j < predictions[i].size(); j++) { + sum += predictions[i][j]; + } + + return sum / predictions[i].size(); + } + + double makePrediction(int i) const; + + size_t getDependentVarId() const { + return dependent_varID; + } + + size_t getNumTrees() const { + return num_trees; } - return result; - } - std::vector> getSplitValues() { - std::vector> result; - for (auto& tree : trees) { - result.push_back(tree->getSplitValues()); + + uint getMtry() const { + return mtry; } - return result; - } - const std::vector& getVariableImportance() const { - return variable_importance; - } - 
double getOverallPredictionError() const { - return overall_prediction_error; - } - const std::vector >& getPredictions() const { - return predictions; - } - size_t getDependentVarId() const { - return dependent_varID; - } - size_t getNumTrees() const { - return num_trees; - } - uint getMtry() const - { - return mtry; - } - uint getMinNodeSize() const - { - return min_node_size; - } - size_t getNumIndependentVariables() const - { - return num_independent_variables; - } - const std::vector& getIsOrderedVariable() const - { - return is_ordered_variable; - } - - std::vector> getInbagCounts() const { - std::vector> result; - for (auto& tree : trees) { - result.push_back(tree->getInbagCounts()); + + uint getMinNodeSize() const { + return min_node_size; } - return result; - } - + + size_t getNumIndependentVariables() const { + return num_independent_variables; + } + + const std::vector& getIsOrderedVariable() const { + return is_ordered_variable; + } + void setPredictionMode(bool prediction_mode) { this->prediction_mode = prediction_mode; } - void setData(Data* data) { - this->data = data; - // Assume we want to reset predictions too - this->predictions.clear(); - } - + /** + * Resets the data to a new dataset. Probably shouldn't use this unless you + * know what you are doing. Best to use "init" method instead. 
+ * @param data + */ + void setData(Data* data, std::string dependent_variable_name, std::string status_variable_name, std::vector& unordered_variable_names); + void setVerboseOut(std::ostream* verbose_out) { this->verbose_out = verbose_out; } -void loadFromFile(std::string filename); - - + void loadFromFile(std::string filename); + + void grow(bool verbose); + void predict(); + + const std::vector& getCaseWeights() const { + return case_weights; + } + + void setCaseWeights(std::vector case_weights) { + this->case_weights = case_weights; + } + + + protected: - void grow(bool verbose); - virtual void growInternal() = 0; + virtual void growInternal() = 0; - // Predict using existing tree from file and data as prediction data - void predict(); - virtual void predictInternal() = 0; + // Predict using existing tree from file and data as prediction data + virtual void predictInternal() = 0; - void computePredictionError(); - virtual void computePredictionErrorInternal() = 0; + void computePredictionError(); + virtual void computePredictionErrorInternal() = 0; - void computePermutationImportance(); + void computePermutationImportance(); - // Multithreading methods for growing/prediction/importance, called by each thread - void growTreesInThread(uint thread_idx, std::vector* variable_importance); - void predictTreesInThread(uint thread_idx, const Data* prediction_data, bool oob_prediction); - void computeTreePermutationImportanceInThread(uint thread_idx, std::vector* importance, std::vector* variance); + // Multithreading methods for growing/prediction/importance, called by each thread + void growTreesInThread(uint thread_idx, std::vector* variable_importance); + void predictTreesInThread(uint thread_idx, const Data* prediction_data, bool oob_prediction); + void computeTreePermutationImportanceInThread(uint thread_idx, std::vector* importance, std::vector* variance); - // Load forest from file - virtual void loadFromFileInternal(std::ifstream& infile) = 0; + // Load forest 
from file + virtual void loadFromFileInternal(std::ifstream& infile) = 0; - // Set split select weights and variables to be always considered for splitting - void setSplitWeightVector(std::vector>& split_select_weights); - void setAlwaysSplitVariables(std::vector& always_split_variable_names); + // Set split select weights and variables to be always considered for splitting + void setSplitWeightVector(std::vector>&split_select_weights); + void setAlwaysSplitVariables(std::vector& always_split_variable_names); - // Show progress every few seconds + // Show progress every few seconds #ifdef WIN_R_BUILD - void showProgress(std::string operation, clock_t start_time, clock_t& lap_time); + void showProgress(std::string operation, clock_t start_time, clock_t& lap_time); #else - void showProgress(std::string operation); + void showProgress(std::string operation); #endif - // Verbose output stream, cout if verbose==true, logfile if not - std::ostream* verbose_out; - - size_t num_trees; - uint mtry; - uint min_node_size; - size_t num_variables; - size_t num_independent_variables; - uint seed; - size_t dependent_varID; - size_t num_samples; - bool prediction_mode; - MemoryMode memory_mode; - bool sample_with_replacement; - bool memory_saving_splitting; - SplitRule splitrule; - bool predict_all; - bool keep_inbag; - double sample_fraction; - - // For each varID true if ordered - std::vector is_ordered_variable; - - // Variable to not split at (only dependent_varID for non-survival forests) - std::vector no_split_variables; - - // Multithreading - uint num_threads; - std::vector thread_ranges; + // Verbose output stream, cout if verbose==true, logfile if not + std::ostream* verbose_out; + + size_t num_trees; + uint mtry; + uint min_node_size; + size_t num_variables; + size_t num_independent_variables; + uint seed; + size_t dependent_varID; + size_t num_samples; + bool prediction_mode; + MemoryMode memory_mode; + bool sample_with_replacement; + bool memory_saving_splitting; + 
SplitRule splitrule; + bool predict_all; + bool keep_inbag; + double sample_fraction; + + // For each varID true if ordered + std::vector is_ordered_variable; + + // Variable to not split at (only dependent_varID for non-survival forests) + std::vector no_split_variables; + + // Multithreading + uint num_threads; + std::vector thread_ranges; #ifndef WIN_R_BUILD - std::mutex mutex; - std::condition_variable condition_variable; + std::mutex mutex; + std::condition_variable condition_variable; #endif - std::vector trees; - Data* data; + std::vector trees; + Data* data; - std::vector> predictions; - double overall_prediction_error; + std::vector> predictions; + double overall_prediction_error; - // Weight vector for selecting possible split variables, one weight between 0 (never select) and 1 (always select) for each variable - // Deterministic variables are always selected - std::vector deterministic_varIDs; - std::vector split_select_varIDs; - std::vector> split_select_weights; + // Weight vector for selecting possible split variables, one weight between 0 (never select) and 1 (always select) for each variable + // Deterministic variables are always selected + std::vector deterministic_varIDs; + std::vector split_select_varIDs; + std::vector> split_select_weights; - // Bootstrap weights - std::vector case_weights; + // Bootstrap weights + std::vector case_weights; - // Random number generator - std::mt19937_64 random_number_generator; + // Random number generator + std::mt19937_64 random_number_generator; - std::string output_prefix; - ImportanceMode importance_mode; + std::string output_prefix; + ImportanceMode importance_mode; - // Variable importance for all variables in forest - std::vector variable_importance; + // Variable importance for all variables in forest + std::vector variable_importance; - // Computation progress (finished trees) - size_t progress; + // Computation progress (finished trees) + size_t progress; #ifdef R_BUILD - size_t aborted_threads; - 
bool aborted; + size_t aborted_threads; + bool aborted; #endif private: - DISALLOW_COPY_AND_ASSIGN(Forest); + DISALLOW_COPY_AND_ASSIGN(Forest); }; #endif /* FOREST_H_ */ diff --git a/deps/ranger-0.3.8/src/Data.cpp b/deps/ranger-0.3.8/src/Data.cpp index 712afb81..220dfbad 100644 --- a/deps/ranger-0.3.8/src/Data.cpp +++ b/deps/ranger-0.3.8/src/Data.cpp @@ -36,10 +36,17 @@ #include Data::Data() : -num_rows(0), num_rows_rounded(0), num_cols(0), sparse_data(0), num_cols_no_sparse(0), externalData(true), index_data( +num_rows(0), num_rows_rounded(0), num_cols(0), sparse_data(0), num_cols_no_sparse(0), index_data( 0), max_num_unique_values(0) { } +Data::Data(std::vector variable_names, size_t num_rows, size_t num_cols) : Data() { + this->variable_names = variable_names; + this->num_rows = num_rows; + this->num_cols = num_cols; + this->num_cols_no_sparse = num_cols; +} + Data::~Data() { if (index_data != 0) { delete[] index_data; @@ -94,7 +101,6 @@ bool Data::loadFromFile(std::string filename) { result = loadFromFileWhitespace(input_file, header_line); } - externalData = false; input_file.close(); return result; } diff --git a/deps/ranger-0.3.8/src/DataChar.cpp b/deps/ranger-0.3.8/src/DataChar.cpp index 737527dd..40028b0c 100644 --- a/deps/ranger-0.3.8/src/DataChar.cpp +++ b/deps/ranger-0.3.8/src/DataChar.cpp @@ -34,35 +34,28 @@ wright@imbs.uni-luebeck.de #include DataChar::DataChar() : - data(0) { +data(0) { } -DataChar::DataChar(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols, - bool& error) { - this->variable_names = variable_names; - this->num_rows = num_rows; - this->num_cols = num_cols; - this->num_cols_no_sparse = num_cols; - - reserveMemory(); - - // Save data and report errors - for (size_t i = 0; i < num_cols; ++i) { - for (size_t j = 0; j < num_rows; ++j) { - double value = data_double[i * num_rows + j]; - if (value > CHAR_MAX || value < CHAR_MIN) { - error = true; - } - if (floor(value) != ceil(value)) { - error = true; - } - 
data[i * num_rows + j] = (char) value; +DataChar::DataChar(double* data_double, std::vector variable_names, + size_t num_rows, size_t num_cols, bool& error) : + DataChar(variable_names, num_rows, num_cols) { + + // Save data and report errors + for (size_t i = 0; i < num_cols; ++i) { + for (size_t j = 0; j < num_rows; ++j) { + double value = data_double[i * num_rows + j]; + if (value > CHAR_MAX || value < CHAR_MIN) { + error = true; + } + if (floor(value) != ceil(value)) { + error = true; + } + data[i * num_rows + j] = (char) value; + } } - } } DataChar::~DataChar() { - if (!externalData) { delete[] data; - } } diff --git a/deps/ranger-0.3.8/src/DataDouble.cpp b/deps/ranger-0.3.8/src/DataDouble.cpp index af657bf6..862a85fa 100644 --- a/deps/ranger-0.3.8/src/DataDouble.cpp +++ b/deps/ranger-0.3.8/src/DataDouble.cpp @@ -29,12 +29,19 @@ wright@imbs.uni-luebeck.de #include DataDouble::DataDouble() : - data(0) { +data(0) { +} + +DataDouble::DataDouble(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols) : + DataDouble(variable_names, num_rows, num_cols) { + + for (size_t i = 0; i < num_cols; ++i) { + for (size_t j = 0; j < num_rows; ++j) { + data[i * num_rows + j] = data_double[i * num_rows + j]; + } + } } DataDouble::~DataDouble() { - if (!externalData) { delete[] data; - } } - diff --git a/deps/ranger-0.3.8/src/DataFloat.cpp b/deps/ranger-0.3.8/src/DataFloat.cpp index 9df28dc2..bcb64506 100644 --- a/deps/ranger-0.3.8/src/DataFloat.cpp +++ b/deps/ranger-0.3.8/src/DataFloat.cpp @@ -32,26 +32,19 @@ wright@imbs.uni-luebeck.de #include DataFloat::DataFloat() : - data(0) { +data(0) { } -DataFloat::DataFloat(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols) { - this->variable_names = variable_names; - this->num_rows = num_rows; - this->num_cols = num_cols; - this->num_cols_no_sparse = num_cols; - - reserveMemory(); - for (size_t i = 0; i < num_cols; ++i) { - for (size_t j = 0; j < num_rows; ++j) { - data[i * 
num_rows + j] = (float) data_double[i * num_rows + j]; +DataFloat::DataFloat(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols) : + DataFloat(variable_names, num_rows, num_cols) { + for (size_t i = 0; i < num_cols; ++i) { + for (size_t j = 0; j < num_rows; ++j) { + data[i * num_rows + j] = (float) data_double[i * num_rows + j]; + } } - } } DataFloat::~DataFloat() { - if (!externalData) { delete[] data; - } } diff --git a/deps/ranger-0.3.8/src/Forest.cpp b/deps/ranger-0.3.8/src/Forest.cpp index f58558f3..2682c571 100644 --- a/deps/ranger-0.3.8/src/Forest.cpp +++ b/deps/ranger-0.3.8/src/Forest.cpp @@ -45,826 +45,866 @@ Forest::Forest() : verbose_out(0), num_trees(DEFAULT_NUM_TREE), mtry(0), min_node_size(0), num_variables(0), num_independent_variables( - 0), seed(0), dependent_varID(0), num_samples(0), prediction_mode(false), memory_mode(MEM_DOUBLE), sample_with_replacement( - true), memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), predict_all(false), keep_inbag(false), sample_fraction(1), num_threads( - DEFAULT_NUM_THREADS), data(0), overall_prediction_error(0), importance_mode(DEFAULT_IMPORTANCE_MODE), progress( - 0) { +0), seed(0), dependent_varID(0), num_samples(0), prediction_mode(false), memory_mode(MEM_DOUBLE), sample_with_replacement( +true), memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), predict_all(false), keep_inbag(false), sample_fraction(1), num_threads( +DEFAULT_NUM_THREADS), data(0), overall_prediction_error(0), importance_mode(DEFAULT_IMPORTANCE_MODE), progress( +0) { } Forest::~Forest() { - for (auto& tree : trees) { - delete tree; - } + for (auto& tree : trees) { + delete tree; + } } - void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode, std::string input_file, uint mtry, - std::string output_prefix, uint num_trees, std::ostream* verbose_out, uint seed, uint num_threads, - std::string load_forest_filename, ImportanceMode importance_mode, uint min_node_size, - 
std::string split_select_weights_file, std::vector& always_split_variable_names, - std::string status_variable_name, bool sample_with_replacement, std::vector& unordered_variable_names, - bool memory_saving_splitting, SplitRule splitrule, std::string case_weights_file, bool predict_all, - double sample_fraction) { - - this->verbose_out = verbose_out; - - // Initialize data with memmode - switch (memory_mode) { - case MEM_DOUBLE: - data = new DataDouble(); - break; - case MEM_FLOAT: - data = new DataFloat(); - break; - case MEM_CHAR: - data = new DataChar(); - break; - } - - // Load data - if (verbose_out) *verbose_out << "Loading input file: " << input_file << "." << std::endl; - bool rounding_error = data->loadFromFile(input_file); - if (rounding_error && verbose_out) { - *verbose_out << "Warning: Rounding or Integer overflow occurred. Use FLOAT or DOUBLE precision to avoid this." - << std::endl; - } - - // Set prediction mode - bool prediction_mode = false; - if (!load_forest_filename.empty()) { - prediction_mode = true; - } - - // Call other init function - init(dependent_variable_name, memory_mode, data, mtry, output_prefix, num_trees, seed, num_threads, importance_mode, - min_node_size, status_variable_name, prediction_mode, sample_with_replacement, unordered_variable_names, - memory_saving_splitting, splitrule, predict_all, sample_fraction); - - if (prediction_mode) { - loadFromFile(load_forest_filename); - } - // Set variables to be always considered for splitting - if (!always_split_variable_names.empty()) { - setAlwaysSplitVariables(always_split_variable_names); - } - - // TODO: Read 2d weights for tree-wise split select weights - // Load split select weights from file - if (!split_select_weights_file.empty()) { - std::vector> split_select_weights; - split_select_weights.resize(1); - loadDoubleVectorFromFile(split_select_weights[0], split_select_weights_file); - if (split_select_weights[0].size() != num_variables - 1) { - throw std::runtime_error("Number 
of split select weights is not equal to number of independent variables."); - } - setSplitWeightVector(split_select_weights); - } - - // Load case weights from file - if (!case_weights_file.empty()) { - loadDoubleVectorFromFile(case_weights, case_weights_file); - if (case_weights.size() != num_samples - 1) { - throw std::runtime_error("Number of case weights is not equal to number of samples."); - } - } - - // Check if all catvars are coded in integers starting at 1 - if (!unordered_variable_names.empty()) { - std::string error_message = checkUnorderedVariables(data, unordered_variable_names); - if (!error_message.empty()) { - throw std::runtime_error(error_message); - } - } + std::string output_prefix, uint num_trees, std::ostream* verbose_out, uint seed, uint num_threads, + std::string load_forest_filename, ImportanceMode importance_mode, uint min_node_size, + std::string split_select_weights_file, std::vector& always_split_variable_names, + std::string status_variable_name, bool sample_with_replacement, std::vector& unordered_variable_names, + bool memory_saving_splitting, SplitRule splitrule, std::string case_weights_file, bool predict_all, + double sample_fraction) { + + this->verbose_out = verbose_out; + + // Initialize data with memmode + switch (memory_mode) { + case MEM_DOUBLE: + data = new DataDouble(); + break; + case MEM_FLOAT: + data = new DataFloat(); + break; + case MEM_CHAR: + data = new DataChar(); + break; + } + + // Load data + if (verbose_out) *verbose_out << "Loading input file: " << input_file << "." << std::endl; + bool rounding_error = data->loadFromFile(input_file); + if (rounding_error && verbose_out) { + *verbose_out << "Warning: Rounding or Integer overflow occurred. Use FLOAT or DOUBLE precision to avoid this." 
+ << std::endl; + } + + // Set prediction mode + bool prediction_mode = false; + if (!load_forest_filename.empty()) { + prediction_mode = true; + } + + // Call other init function + init(dependent_variable_name, memory_mode, data, mtry, output_prefix, num_trees, seed, num_threads, importance_mode, + min_node_size, status_variable_name, prediction_mode, sample_with_replacement, unordered_variable_names, + memory_saving_splitting, splitrule, predict_all, sample_fraction); + + if (prediction_mode) { + loadFromFile(load_forest_filename); + } + // Set variables to be always considered for splitting + if (!always_split_variable_names.empty()) { + setAlwaysSplitVariables(always_split_variable_names); + } + + // TODO: Read 2d weights for tree-wise split select weights + // Load split select weights from file + if (!split_select_weights_file.empty()) { + std::vector> split_select_weights; + split_select_weights.resize(1); + loadDoubleVectorFromFile(split_select_weights[0], split_select_weights_file); + if (split_select_weights[0].size() != num_variables - 1) { + throw std::runtime_error("Number of split select weights is not equal to number of independent variables."); + } + setSplitWeightVector(split_select_weights); + } + + // Load case weights from file + if (!case_weights_file.empty()) { + loadDoubleVectorFromFile(case_weights, case_weights_file); + if (case_weights.size() != num_samples - 1) { + throw std::runtime_error("Number of case weights is not equal to number of samples."); + } + } + + // Check if all catvars are coded in integers starting at 1 + if (!unordered_variable_names.empty()) { + std::string error_message = checkUnorderedVariables(data, unordered_variable_names); + if (!error_message.empty()) { + throw std::runtime_error(error_message); + } + } } void Forest::initR(std::string dependent_variable_name, Data* input_data, uint mtry, uint num_trees, - std::ostream* verbose_out, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, 
- std::vector>& split_select_weights, std::vector& always_split_variable_names, - std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, - std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, - std::vector& case_weights, bool predict_all, bool keep_inbag, double sample_fraction) { - - this->verbose_out = verbose_out; - - // Call other init function - init(dependent_variable_name, MEM_DOUBLE, input_data, mtry, "", num_trees, seed, num_threads, importance_mode, - min_node_size, status_variable_name, prediction_mode, sample_with_replacement, unordered_variable_names, - memory_saving_splitting, splitrule, predict_all, sample_fraction); - - // Set variables to be always considered for splitting - if (!always_split_variable_names.empty()) { - setAlwaysSplitVariables(always_split_variable_names); - } - - // Set split select weights - if (!split_select_weights.empty()) { - setSplitWeightVector(split_select_weights); - } - - // Set case weights - if (!case_weights.empty()) { - if (case_weights.size() != num_samples) { - throw std::runtime_error("Number of case weights not equal to number of samples."); - } - this->case_weights = case_weights; - } - - // Keep inbag counts - this->keep_inbag = keep_inbag; + std::ostream* verbose_out, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, + std::vector>&split_select_weights, std::vector& always_split_variable_names, + std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, + std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, + std::vector& case_weights, bool predict_all, bool keep_inbag, double sample_fraction) { + + this->verbose_out = verbose_out; + + // Call other init function + init(dependent_variable_name, MEM_DOUBLE, input_data, mtry, "", num_trees, seed, num_threads, importance_mode, + min_node_size, status_variable_name, prediction_mode, 
sample_with_replacement, unordered_variable_names, + memory_saving_splitting, splitrule, predict_all, sample_fraction); + + // Set variables to be always considered for splitting + if (!always_split_variable_names.empty()) { + setAlwaysSplitVariables(always_split_variable_names); + } + + // Set split select weights + if (!split_select_weights.empty()) { + setSplitWeightVector(split_select_weights); + } + + // Set case weights + if (!case_weights.empty()) { + if (case_weights.size() != num_samples) { + throw std::runtime_error("Number of case weights not equal to number of samples."); + } + this->case_weights = case_weights; + } + + // Keep inbag counts + this->keep_inbag = keep_inbag; } -void Forest::init(std::string dependent_variable_name, MemoryMode memory_mode, Data* input_data, uint mtry, - std::string output_prefix, uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, - uint min_node_size, std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, - std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, - bool predict_all, double sample_fraction) { - - // Initialize data with memmode - this->data = input_data; - - // Initialize random number generator and set seed - if (seed == 0) { - std::random_device random_device; - random_number_generator.seed(random_device()); - } else { - random_number_generator.seed(seed); - } - - // Set number of threads - if (num_threads == DEFAULT_NUM_THREADS) { +void Forest::init(MemoryMode memory_mode, uint mtry, std::string output_prefix, + uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, + uint min_node_size, bool prediction_mode, bool sample_with_replacement, bool memory_saving_splitting, + SplitRule splitrule, bool predict_all, double sample_fraction) { + + // Initialize random number generator and set seed + if (seed == 0) { + std::random_device random_device; + random_number_generator.seed(random_device()); + } 
else { + random_number_generator.seed(seed); + } + + // Set number of threads + if (num_threads == DEFAULT_NUM_THREADS) { #ifdef WIN_R_BUILD - this->num_threads = 1; + this->num_threads = 1; #else - this->num_threads = std::thread::hardware_concurrency(); + this->num_threads = std::thread::hardware_concurrency(); #endif - } else { - this->num_threads = num_threads; - } - - // Set member variables - this->num_trees = num_trees; - this->mtry = mtry; - this->seed = seed; - this->output_prefix = output_prefix; - this->importance_mode = importance_mode; - this->min_node_size = min_node_size; - this->memory_mode = memory_mode; - this->prediction_mode = prediction_mode; - this->sample_with_replacement = sample_with_replacement; - this->memory_saving_splitting = memory_saving_splitting; - this->splitrule = splitrule; - this->predict_all = predict_all; - this->sample_fraction = sample_fraction; - - // Set number of samples and variables - num_samples = data->getNumRows(); - num_variables = data->getNumCols(); - - // Convert dependent variable name to ID - if (!prediction_mode && !dependent_variable_name.empty()) { - dependent_varID = data->getVariableID(dependent_variable_name); - } - - // Set unordered factor variables - if (!prediction_mode) { - is_ordered_variable.resize(num_variables, true); - for (auto& variable_name : unordered_variable_names) { - size_t varID = data->getVariableID(variable_name); - is_ordered_variable[varID] = false; - } - } - - no_split_variables.push_back(dependent_varID); - - initInternal(status_variable_name); - - num_independent_variables = num_variables - no_split_variables.size(); - - // Sort no split variables in ascending order - std::sort(no_split_variables.begin(), no_split_variables.end()); - - // Init split select weights - split_select_weights.push_back(std::vector()); - - // Check if mtry is in valid range - if (this->mtry > num_variables - 1) { - throw std::runtime_error("mtry can not be larger than number of variables in data."); - } 
- - // Check if any observations samples - if ((size_t) num_samples * sample_fraction < 1) { - throw std::runtime_error("sample_fraction too small, no observations sampled."); - } + } else { + this->num_threads = num_threads; + } + + // Set member variables + this->num_trees = num_trees; + this->mtry = mtry; + this->seed = seed; + this->output_prefix = output_prefix; + this->importance_mode = importance_mode; + this->min_node_size = min_node_size; + this->memory_mode = memory_mode; + this->prediction_mode = prediction_mode; + this->sample_with_replacement = sample_with_replacement; + this->memory_saving_splitting = memory_saving_splitting; + this->splitrule = splitrule; + this->predict_all = predict_all; + this->sample_fraction = sample_fraction; +} + +void Forest::init(std::string dependent_variable_name, MemoryMode memory_mode, Data* input_data, uint mtry, + std::string output_prefix, uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, + uint min_node_size, std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, + std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, + bool predict_all, double sample_fraction) { + + this->init(memory_mode, mtry, output_prefix, num_trees, seed, num_threads, + importance_mode, min_node_size, prediction_mode, sample_with_replacement, + memory_saving_splitting, splitrule, predict_all, sample_fraction); + + this->setData(input_data, dependent_variable_name, status_variable_name, unordered_variable_names); } void Forest::run(bool verbose) { - if (prediction_mode) { - if (verbose && verbose_out) { - *verbose_out << "Predicting .." << std::endl; - } - predict(); - } else { - if (verbose && verbose_out) { - *verbose_out << "Growing trees .." << std::endl; - } + if (prediction_mode) { + if (verbose && verbose_out) { + *verbose_out << "Predicting .." 
<< std::endl; + } + predict(); + } else { + if (verbose && verbose_out) { + *verbose_out << "Growing trees .." << std::endl; + } - grow(verbose); + grow(verbose); - if (verbose && verbose_out) { - *verbose_out << "Computing prediction error .." << std::endl; - } - computePredictionError(); + if (verbose && verbose_out) { + *verbose_out << "Computing prediction error .." << std::endl; + } + computePredictionError(); - if (importance_mode > IMP_GINI) { - if (verbose && verbose_out) { - *verbose_out << "Computing permutation variable importance .." << std::endl; - } - computePermutationImportance(); + if (importance_mode > IMP_GINI) { + if (verbose && verbose_out) { + *verbose_out << "Computing permutation variable importance .." << std::endl; + } + computePermutationImportance(); + } } - } +} + +/** +* Resets the data to a new dataset. Probably shouldn't use this unless you +* know what you are doing. Best to use "init" method instead. +* @param data +*/ +void Forest::setData(Data* data, std::string dependent_variable_name, std::string status_variable_name, std::vector& unordered_variable_names) { + + // Record the pointer to the data object + this->data = data; + + // Set number of samples and variables + this->num_samples = data->getNumRows(); + this->num_variables = data->getNumCols(); + + // Convert dependent variable name to ID + dependent_varID = data->getVariableID(dependent_variable_name); + + // Set unordered factor variables + is_ordered_variable.resize(num_variables, true); + for (auto& variable_name : unordered_variable_names) { + size_t varID = data->getVariableID(variable_name); + is_ordered_variable[varID] = false; + } + + no_split_variables.push_back(dependent_varID); + + initInternal(status_variable_name); + + num_independent_variables = num_variables - no_split_variables.size(); + + // Sort no split variables in ascending order + std::sort(no_split_variables.begin(), no_split_variables.end()); + + // Init split select weights + 
split_select_weights.push_back(std::vector()); + + // Check if mtry is in valid range + if (this->mtry > num_variables - 1) { + throw std::runtime_error("mtry can not be larger than number of variables in data."); + } + + // Check if any observations samples + if ((size_t) num_samples * sample_fraction < 1) { + throw std::runtime_error("sample_fraction too small, no observations sampled."); + } } void Forest::writeOutput(std::ostream* out) { - *out << std::endl; - writeOutputInternal(); - *out << "Dependent variable name: " << data->getVariableNames()[dependent_varID] << std::endl; - *out << "Dependent variable ID: " << dependent_varID << std::endl; - *out << "Number of trees: " << num_trees << std::endl; - *out << "Sample size: " << num_samples << std::endl; - *out << "Number of independent variables: " << num_independent_variables << std::endl; - *out << "Mtry: " << mtry << std::endl; - *out << "Target node size: " << min_node_size << std::endl; - *out << "Variable importance mode: " << importance_mode << std::endl; - *out << "Memory mode: " << memory_mode << std::endl; - *out << "Seed: " << seed << std::endl; - *out << "Number of threads: " << num_threads << std::endl; - *out << std::endl; - - if (prediction_mode) { - writePredictionFile(); - } else { - *out << "Overall OOB prediction error: " << overall_prediction_error << std::endl; + *out << std::endl; + writeOutputInternal(); + *out << "Dependent variable name: " << data->getVariableNames()[dependent_varID] << std::endl; + *out << "Dependent variable ID: " << dependent_varID << std::endl; + *out << "Number of trees: " << num_trees << std::endl; + *out << "Sample size: " << num_samples << std::endl; + *out << "Number of independent variables: " << num_independent_variables << std::endl; + *out << "Mtry: " << mtry << std::endl; + *out << "Target node size: " << min_node_size << std::endl; + *out << "Variable importance mode: " << importance_mode << std::endl; + *out << "Memory mode: " << memory_mode << 
std::endl; + *out << "Seed: " << seed << std::endl; + *out << "Number of threads: " << num_threads << std::endl; *out << std::endl; - if (!split_select_weights.empty() & !split_select_weights[0].empty()) { - *out - << "Warning: Split select weights used. Variable importance measures are only comparable for variables with equal weights." - << std::endl; - } + if (prediction_mode) { + writePredictionFile(); + } else { + *out << "Overall OOB prediction error: " << overall_prediction_error << std::endl; + *out << std::endl; - if (importance_mode != IMP_NONE) { - writeImportanceFile(); - } + if (!split_select_weights.empty() & !split_select_weights[0].empty()) { + *out + << "Warning: Split select weights used. Variable importance measures are only comparable for variables with equal weights." + << std::endl; + } + + if (importance_mode != IMP_NONE) { + writeImportanceFile(); + } - writeConfusionFile(); - } + writeConfusionFile(); + } } void Forest::writeImportanceFile() { - // Open importance file for writing - std::string filename = output_prefix + ".importance"; - std::ofstream importance_file; - importance_file.open(filename, std::ios::out); - if (!importance_file.good()) { - throw std::runtime_error("Could not write to importance file: " + filename + "."); - } - - // Write importance to file - for (size_t i = 0; i < variable_importance.size(); ++i) { - size_t varID = i; - for (auto& skip : no_split_variables) { - if (varID >= skip) { - ++varID; - } - } - std::string variable_name = data->getVariableNames()[varID]; - importance_file << variable_name << ": " << variable_importance[i] << std::endl; - } - - importance_file.close(); - if (verbose_out) *verbose_out << "Saved variable importance to file " << filename << "." 
<< std::endl; + // Open importance file for writing + std::string filename = output_prefix + ".importance"; + std::ofstream importance_file; + importance_file.open(filename, std::ios::out); + if (!importance_file.good()) { + throw std::runtime_error("Could not write to importance file: " + filename + "."); + } + + // Write importance to file + for (size_t i = 0; i < variable_importance.size(); ++i) { + size_t varID = i; + for (auto& skip : no_split_variables) { + if (varID >= skip) { + ++varID; + } + } + std::string variable_name = data->getVariableNames()[varID]; + importance_file << variable_name << ": " << variable_importance[i] << std::endl; + } + + importance_file.close(); + if (verbose_out) *verbose_out << "Saved variable importance to file " << filename << "." << std::endl; } void Forest::saveToFile() { + saveToFile(output_prefix + ".forest"); +} + +void Forest::saveToFile(std::string filename) { - // Open file for writing - std::string filename = output_prefix + ".forest"; - std::ofstream outfile; - outfile.open(filename, std::ios::binary); - if (!outfile.good()) { - throw std::runtime_error("Could not write to output file: " + filename + "."); - } + // Open file for writing + std::ofstream outfile; + outfile.open(filename, std::ios::binary); + if (!outfile.good()) { + throw std::runtime_error("Could not write to output file: " + filename + "."); + } - // Write dependent_varID - outfile.write((char*) &dependent_varID, sizeof(dependent_varID)); + // Write dependent_varID + outfile.write((char*) &dependent_varID, sizeof (dependent_varID)); - // Write num_trees - outfile.write((char*) &num_trees, sizeof(num_trees)); + // Write num_trees + outfile.write((char*) &num_trees, sizeof (num_trees)); - // Write is_ordered_variable - saveVector1D(is_ordered_variable, outfile); + // Write is_ordered_variable + saveVector1D(is_ordered_variable, outfile); - saveToFileInternal(outfile); + saveToFileInternal(outfile); - // Write tree data for each tree - for (auto& tree : 
trees) { - tree->appendToFile(outfile); - } + // Write tree data for each tree + for (auto& tree : trees) { + tree->appendToFile(outfile); + } - // Close file - outfile.close(); - if (verbose_out) *verbose_out << "Saved forest to file " << filename << "." << std::endl; + // Close file + outfile.close(); + if (verbose_out) *verbose_out << "Saved forest to file " << filename << "." << std::endl; } void Forest::grow(bool verbose) { - // Create thread ranges - equalSplit(thread_ranges, 0, num_trees - 1, num_threads); + // Create thread ranges + equalSplit(thread_ranges, 0, num_trees - 1, num_threads); - // Call special grow functions of subclasses. There trees must be created. - growInternal(); + // Call special grow functions of subclasses. There trees must be created. + growInternal(); - // Init trees, create a seed for each tree, based on main seed - std::uniform_int_distribution udist; - for (size_t i = 0; i < num_trees; ++i) { - uint tree_seed; - if (seed == 0) { - tree_seed = udist(random_number_generator); - } else { - tree_seed = (i + 1) * seed; - } + // Init trees, create a seed for each tree, based on main seed + std::uniform_int_distribution udist; + for (size_t i = 0; i < num_trees; ++i) { + uint tree_seed; + if (seed == 0) { + tree_seed = udist(random_number_generator); + } else { + tree_seed = (i + 1) * seed; + } - // Get split select weights for tree - std::vector* tree_split_select_weights; - if (split_select_weights.size() > 1) { - tree_split_select_weights = &split_select_weights[i]; - } else { - tree_split_select_weights = &split_select_weights[0]; - } + // Get split select weights for tree + std::vector* tree_split_select_weights; + if (split_select_weights.size() > 1) { + tree_split_select_weights = &split_select_weights[i]; + } else { + tree_split_select_weights = &split_select_weights[0]; + } - trees[i]->init(data, mtry, dependent_varID, num_samples, tree_seed, &deterministic_varIDs, &split_select_varIDs, - tree_split_select_weights, 
importance_mode, min_node_size, &no_split_variables, sample_with_replacement, - &is_ordered_variable, memory_saving_splitting, splitrule, &case_weights, keep_inbag, sample_fraction); - } + trees[i]->init(data, mtry, dependent_varID, num_samples, tree_seed, &deterministic_varIDs, &split_select_varIDs, + tree_split_select_weights, importance_mode, min_node_size, &no_split_variables, sample_with_replacement, + &is_ordered_variable, memory_saving_splitting, splitrule, &case_weights, keep_inbag, sample_fraction); + } - // Init variable importance - variable_importance.resize(num_independent_variables, 0); + // Init variable importance + variable_importance.resize(num_independent_variables, 0); - // Grow trees in multiple threads + // Grow trees in multiple threads #ifdef WIN_R_BUILD - progress = 0; - clock_t start_time = clock(); - clock_t lap_time = clock(); - for (size_t i = 0; i < num_trees; ++i) { - trees[i]->grow(&variable_importance); - progress++; - showProgress("Growing trees..", start_time, lap_time); - } + progress = 0; + clock_t start_time = clock(); + clock_t lap_time = clock(); + for (size_t i = 0; i < num_trees; ++i) { + trees[i]->grow(&variable_importance); + progress++; + showProgress("Growing trees..", start_time, lap_time); + } #else - progress = 0; + progress = 0; #ifdef R_BUILD - aborted = false; - aborted_threads = 0; + aborted = false; + aborted_threads = 0; #endif - std::vector threads; - threads.reserve(num_threads); + std::vector threads; + threads.reserve(num_threads); - // Initailize importance per thread - std::vector> variable_importance_threads(num_threads); + // Initailize importance per thread + std::vector> variable_importance_threads(num_threads); - for (uint i = 0; i < num_threads; ++i) { - if (importance_mode == IMP_GINI) { - variable_importance_threads[i].resize(num_independent_variables, 0); + for (uint i = 0; i < num_threads; ++i) { + if (importance_mode == IMP_GINI) { + 
variable_importance_threads[i].resize(num_independent_variables, 0); + } + + threads.push_back(std::thread(&Forest::growTreesInThread, this, i, &(variable_importance_threads[i]))); + } + if (verbose) showProgress("Growing trees.."); + for (auto &thread : threads) { + thread.join(); } - threads.push_back(std::thread(&Forest::growTreesInThread, this, i, &(variable_importance_threads[i]))); - } - if (verbose) showProgress("Growing trees.."); - for (auto &thread : threads) { - thread.join(); - } #ifdef R_BUILD - if (aborted_threads > 0) { - throw std::runtime_error("User interrupt."); - } + if (aborted_threads > 0) { + throw std::runtime_error("User interrupt."); + } #endif - // Sum thread importances - if (importance_mode == IMP_GINI) { - variable_importance.resize(num_independent_variables, 0); - for (size_t i = 0; i < num_independent_variables; ++i) { - for (uint j = 0; j < num_threads; ++j) { - variable_importance[i] += variable_importance_threads[j][i]; - } + // Sum thread importances + if (importance_mode == IMP_GINI) { + variable_importance.resize(num_independent_variables, 0); + for (size_t i = 0; i < num_independent_variables; ++i) { + for (uint j = 0; j < num_threads; ++j) { + variable_importance[i] += variable_importance_threads[j][i]; + } + } + variable_importance_threads.clear(); } - variable_importance_threads.clear(); - } #endif - // Divide importance by number of trees - if (importance_mode == IMP_GINI) { - for (auto& v : variable_importance) { - v /= num_trees; + // Divide importance by number of trees + if (importance_mode == IMP_GINI) { + for (auto& v : variable_importance) { + v /= num_trees; + } } - } } void Forest::predict() { - // Predict trees in multiple threads and join the threads with the main thread + // Predict trees in multiple threads and join the threads with the main thread #ifdef WIN_R_BUILD - progress = 0; - clock_t start_time = clock(); - clock_t lap_time = clock(); - for (size_t i = 0; i < num_trees; ++i) { - 
trees[i]->predict(data, false); - progress++; - showProgress("Predicting..", start_time, lap_time); - } + progress = 0; + clock_t start_time = clock(); + clock_t lap_time = clock(); + for (size_t i = 0; i < num_trees; ++i) { + trees[i]->predict(data, false); + progress++; + showProgress("Predicting..", start_time, lap_time); + } #else - progress = 0; + progress = 0; #ifdef R_BUILD - aborted = false; - aborted_threads = 0; + aborted = false; + aborted_threads = 0; #endif - std::vector threads; - threads.reserve(num_threads); - for (uint i = 0; i < num_threads; ++i) { - threads.push_back(std::thread(&Forest::predictTreesInThread, this, i, data, false)); - } - showProgress("Predicting.."); - for (auto &thread : threads) { - thread.join(); - } + std::vector threads; + threads.reserve(num_threads); + for (uint i = 0; i < num_threads; ++i) { + threads.push_back(std::thread(&Forest::predictTreesInThread, this, i, data, false)); + } + showProgress("Predicting.."); + for (auto &thread : threads) { + thread.join(); + } #ifdef R_BUILD - if (aborted_threads > 0) { - throw std::runtime_error("User interrupt."); - } + if (aborted_threads > 0) { + throw std::runtime_error("User interrupt."); + } #endif #endif - // Call special functions for subclasses - predictInternal(); + // Call special functions for subclasses + predictInternal(); +} + +double Forest::makePrediction(int i) const { + double sum = std::accumulate(predictions[i].begin(), predictions[i].end(), 0.0); + double mean = sum / predictions[i].size(); + + double sq_sum = std::inner_product(predictions[i].begin(), predictions[i].end(), predictions[i].begin(), 0.0); + double stdev = std::sqrt(sq_sum / predictions[i].size() - mean * mean); + + std::random_device rd; + std::mt19937 gen(rd()); + + // values near the mean are the most likely + // standard deviation affects the dispersion of generated values from the mean + std::normal_distribution ndist(mean, stdev); + + // Return the new prediction based on the distribution + 
return std::abs(std::round(ndist(gen))); } void Forest::computePredictionError() { - // Predict trees in multiple threads + // Predict trees in multiple threads #ifdef WIN_R_BUILD - progress = 0; - clock_t start_time = clock(); - clock_t lap_time = clock(); - for (size_t i = 0; i < num_trees; ++i) { - trees[i]->predict(data, true); - progress++; - showProgress("Predicting..", start_time, lap_time); - } + progress = 0; + clock_t start_time = clock(); + clock_t lap_time = clock(); + for (size_t i = 0; i < num_trees; ++i) { + trees[i]->predict(data, true); + progress++; + showProgress("Predicting..", start_time, lap_time); + } #else - std::vector threads; - threads.reserve(num_threads); - for (uint i = 0; i < num_threads; ++i) { - threads.push_back(std::thread(&Forest::predictTreesInThread, this, i, data, true)); - } - showProgress("Computing prediction error.."); - for (auto &thread : threads) { - thread.join(); - } + std::vector threads; + threads.reserve(num_threads); + for (uint i = 0; i < num_threads; ++i) { + threads.push_back(std::thread(&Forest::predictTreesInThread, this, i, data, true)); + } + showProgress("Computing prediction error.."); + for (auto &thread : threads) { + thread.join(); + } #ifdef R_BUILD - if (aborted_threads > 0) { - throw std::runtime_error("User interrupt."); - } + if (aborted_threads > 0) { + throw std::runtime_error("User interrupt."); + } #endif #endif - // Call special function for subclasses - computePredictionErrorInternal(); + // Call special function for subclasses + computePredictionErrorInternal(); } void Forest::computePermutationImportance() { - // Compute tree permutation importance in multiple threads + // Compute tree permutation importance in multiple threads #ifdef WIN_R_BUILD - progress = 0; - clock_t start_time = clock(); - clock_t lap_time = clock(); - - // Initailize importance and variance - variable_importance.resize(num_independent_variables, 0); - std::vector variance; - if (importance_mode == IMP_PERM_BREIMAN 
|| importance_mode == IMP_PERM_LIAW) { - variance.resize(num_independent_variables, 0); - } - - // Compute importance - for (size_t i = 0; i < num_trees; ++i) { - trees[i]->computePermutationImportance(&variable_importance, &variance); - progress++; - showProgress("Computing permutation importance..", start_time, lap_time); - } + progress = 0; + clock_t start_time = clock(); + clock_t lap_time = clock(); + + // Initailize importance and variance + variable_importance.resize(num_independent_variables, 0); + std::vector variance; + if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { + variance.resize(num_independent_variables, 0); + } + + // Compute importance + for (size_t i = 0; i < num_trees; ++i) { + trees[i]->computePermutationImportance(&variable_importance, &variance); + progress++; + showProgress("Computing permutation importance..", start_time, lap_time); + } #else - progress = 0; + progress = 0; #ifdef R_BUILD - aborted = false; - aborted_threads = 0; + aborted = false; + aborted_threads = 0; #endif - std::vector threads; - threads.reserve(num_threads); + std::vector threads; + threads.reserve(num_threads); - // Initailize importance and variance - std::vector> variable_importance_threads(num_threads); - std::vector> variance_threads(num_threads); + // Initailize importance and variance + std::vector> variable_importance_threads(num_threads); + std::vector> variance_threads(num_threads); - // Compute importance - for (uint i = 0; i < num_threads; ++i) { - variable_importance_threads[i].resize(num_independent_variables, 0); - if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { - variance_threads[i].resize(num_independent_variables, 0); + // Compute importance + for (uint i = 0; i < num_threads; ++i) { + variable_importance_threads[i].resize(num_independent_variables, 0); + if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { + variance_threads[i].resize(num_independent_variables, 
0); + } + threads.push_back( + std::thread(&Forest::computeTreePermutationImportanceInThread, this, i, &(variable_importance_threads[i]), + &(variance_threads[i]))); + } + showProgress("Computing permutation importance.."); + for (auto &thread : threads) { + thread.join(); } - threads.push_back( - std::thread(&Forest::computeTreePermutationImportanceInThread, this, i, &(variable_importance_threads[i]), - &(variance_threads[i]))); - } - showProgress("Computing permutation importance.."); - for (auto &thread : threads) { - thread.join(); - } #ifdef R_BUILD - if (aborted_threads > 0) { - throw std::runtime_error("User interrupt."); - } + if (aborted_threads > 0) { + throw std::runtime_error("User interrupt."); + } #endif - // Sum thread importances - variable_importance.resize(num_independent_variables, 0); - for (size_t i = 0; i < num_independent_variables; ++i) { - for (uint j = 0; j < num_threads; ++j) { - variable_importance[i] += variable_importance_threads[j][i]; + // Sum thread importances + variable_importance.resize(num_independent_variables, 0); + for (size_t i = 0; i < num_independent_variables; ++i) { + for (uint j = 0; j < num_threads; ++j) { + variable_importance[i] += variable_importance_threads[j][i]; + } } - } - variable_importance_threads.clear(); + variable_importance_threads.clear(); - // Sum thread variances - std::vector variance(num_independent_variables, 0); - if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { - for (size_t i = 0; i < num_independent_variables; ++i) { - for (uint j = 0; j < num_threads; ++j) { - variance[i] += variance_threads[j][i]; - } + // Sum thread variances + std::vector variance(num_independent_variables, 0); + if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { + for (size_t i = 0; i < num_independent_variables; ++i) { + for (uint j = 0; j < num_threads; ++j) { + variance[i] += variance_threads[j][i]; + } + } + variance_threads.clear(); } - variance_threads.clear(); 
- } #endif - for (size_t i = 0; i < variable_importance.size(); ++i) { - variable_importance[i] /= num_trees; + for (size_t i = 0; i < variable_importance.size(); ++i) { + variable_importance[i] /= num_trees; - // Normalize by variance for scaled permutation importance - if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { - if (variance[i] != 0) { - variance[i] = variance[i] / num_trees - variable_importance[i] * variable_importance[i]; - variable_importance[i] /= sqrt(variance[i] / num_trees); - } + // Normalize by variance for scaled permutation importance + if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { + if (variance[i] != 0) { + variance[i] = variance[i] / num_trees - variable_importance[i] * variable_importance[i]; + variable_importance[i] /= sqrt(variance[i] / num_trees); + } + } } - } } #ifndef WIN_R_BUILD + void Forest::growTreesInThread(uint thread_idx, std::vector* variable_importance) { - if (thread_ranges.size() > thread_idx + 1) { - for (size_t i = thread_ranges[thread_idx]; i < thread_ranges[thread_idx + 1]; ++i) { - trees[i]->grow(variable_importance); + if (thread_ranges.size() > thread_idx + 1) { + for (size_t i = thread_ranges[thread_idx]; i < thread_ranges[thread_idx + 1]; ++i) { + trees[i]->grow(variable_importance); - // Check for user interrupt + // Check for user interrupt #ifdef R_BUILD - if (aborted) { - std::unique_lock lock(mutex); - ++aborted_threads; - condition_variable.notify_one(); - return; - } + if (aborted) { + std::unique_lock lock(mutex); + ++aborted_threads; + condition_variable.notify_one(); + return; + } #endif - // Increase progress by 1 tree - std::unique_lock lock(mutex); - ++progress; - condition_variable.notify_one(); + // Increase progress by 1 tree + std::unique_lock lock(mutex); + ++progress; + condition_variable.notify_one(); + } } - } } void Forest::predictTreesInThread(uint thread_idx, const Data* prediction_data, bool oob_prediction) { - if 
(thread_ranges.size() > thread_idx + 1) { - for (size_t i = thread_ranges[thread_idx]; i < thread_ranges[thread_idx + 1]; ++i) { - trees[i]->predict(prediction_data, oob_prediction); + if (thread_ranges.size() > thread_idx + 1) { + for (size_t i = thread_ranges[thread_idx]; i < thread_ranges[thread_idx + 1]; ++i) { + trees[i]->predict(prediction_data, oob_prediction); - // Check for user interrupt + // Check for user interrupt #ifdef R_BUILD - if (aborted) { - std::unique_lock lock(mutex); - ++aborted_threads; - condition_variable.notify_one(); - return; - } + if (aborted) { + std::unique_lock lock(mutex); + ++aborted_threads; + condition_variable.notify_one(); + return; + } #endif - // Increase progress by 1 tree - std::unique_lock lock(mutex); - ++progress; - condition_variable.notify_one(); + // Increase progress by 1 tree + std::unique_lock lock(mutex); + ++progress; + condition_variable.notify_one(); + } } - } } void Forest::computeTreePermutationImportanceInThread(uint thread_idx, std::vector* importance, - std::vector* variance) { - if (thread_ranges.size() > thread_idx + 1) { - for (size_t i = thread_ranges[thread_idx]; i < thread_ranges[thread_idx + 1]; ++i) { - trees[i]->computePermutationImportance(importance, variance); + std::vector* variance) { + if (thread_ranges.size() > thread_idx + 1) { + for (size_t i = thread_ranges[thread_idx]; i < thread_ranges[thread_idx + 1]; ++i) { + trees[i]->computePermutationImportance(importance, variance); - // Check for user interrupt + // Check for user interrupt #ifdef R_BUILD - if (aborted) { - std::unique_lock lock(mutex); - ++aborted_threads; - condition_variable.notify_one(); - return; - } + if (aborted) { + std::unique_lock lock(mutex); + ++aborted_threads; + condition_variable.notify_one(); + return; + } #endif - // Increase progress by 1 tree - std::unique_lock lock(mutex); - ++progress; - condition_variable.notify_one(); + // Increase progress by 1 tree + std::unique_lock lock(mutex); + ++progress; + 
condition_variable.notify_one(); + } } - } } #endif void Forest::loadFromFile(std::string filename) { - if (verbose_out) *verbose_out << "Loading forest from file " << filename << "." << std::endl; + if (verbose_out) *verbose_out << "Loading forest from file " << filename << "." << std::endl; - // Open file for reading - std::ifstream infile; - infile.open(filename, std::ios::binary); - if (!infile.good()) { - throw std::runtime_error("Could not read from input file: " + filename + "."); - } + // Open file for reading + std::ifstream infile; + infile.open(filename, std::ios::binary); + if (!infile.good()) { + throw std::runtime_error("Could not read from input file: " + filename + "."); + } - // Read dependent_varID and num_trees - infile.read((char*) &dependent_varID, sizeof(dependent_varID)); - infile.read((char*) &num_trees, sizeof(num_trees)); + // Read dependent_varID and num_trees + infile.read((char*) &dependent_varID, sizeof (dependent_varID)); + infile.read((char*) &num_trees, sizeof (num_trees)); - // Read is_ordered_variable - readVector1D(is_ordered_variable, infile); + // Read is_ordered_variable + readVector1D(is_ordered_variable, infile); - // Read tree data. This is different for tree types -> virtual function - loadFromFileInternal(infile); + // Read tree data. 
This is different for tree types -> virtual function + loadFromFileInternal(infile); - infile.close(); + infile.close(); - // Create thread ranges - equalSplit(thread_ranges, 0, num_trees - 1, num_threads); + // Create thread ranges + equalSplit(thread_ranges, 0, num_trees - 1, num_threads); } -void Forest::setSplitWeightVector(std::vector>& split_select_weights) { - - // Size should be 1 x num_independent_variables or num_trees x num_independent_variables - if (split_select_weights.size() != 1 && split_select_weights.size() != num_trees) { - throw std::runtime_error("Size of split select weights not equal to 1 or number of trees."); - } - - // Reserve space - if (split_select_weights.size() == 1) { - this->split_select_weights[0].resize(num_independent_variables); - } else { - this->split_select_weights.clear(); - this->split_select_weights.resize(num_trees, std::vector(num_independent_variables)); - } - this->split_select_varIDs.resize(num_independent_variables); - deterministic_varIDs.reserve(num_independent_variables); - - // Split up in deterministic and weighted variables, ignore zero weights - for (size_t i = 0; i < split_select_weights.size(); ++i) { +void Forest::setSplitWeightVector(std::vector>&split_select_weights) { // Size should be 1 x num_independent_variables or num_trees x num_independent_variables - if (split_select_weights[i].size() != num_independent_variables) { - throw std::runtime_error("Number of split select weights not equal to number of independent variables."); + if (split_select_weights.size() != 1 && split_select_weights.size() != num_trees) { + throw std::runtime_error("Size of split select weights not equal to 1 or number of trees."); } - for (size_t j = 0; j < split_select_weights[i].size(); ++j) { - double weight = split_select_weights[i][j]; + // Reserve space + if (split_select_weights.size() == 1) { + this->split_select_weights[0].resize(num_independent_variables); + } else { + this->split_select_weights.clear(); + 
this->split_select_weights.resize(num_trees, std::vector(num_independent_variables)); + } + this->split_select_varIDs.resize(num_independent_variables); + deterministic_varIDs.reserve(num_independent_variables); - if (i == 0) { - size_t varID = j; - for (auto& skip : no_split_variables) { - if (varID >= skip) { - ++varID; - } - } + // Split up in deterministic and weighted variables, ignore zero weights + for (size_t i = 0; i < split_select_weights.size(); ++i) { - if (weight == 1) { - deterministic_varIDs.push_back(varID); - } else if (weight < 1 && weight > 0) { - this->split_select_varIDs[j] = varID; - this->split_select_weights[i][j] = weight; - } else if (weight < 0 || weight > 1) { - throw std::runtime_error("One or more split select weights not in range [0,1]."); + // Size should be 1 x num_independent_variables or num_trees x num_independent_variables + if (split_select_weights[i].size() != num_independent_variables) { + throw std::runtime_error("Number of split select weights not equal to number of independent variables."); } - } else { - if (weight < 1 && weight > 0) { - this->split_select_weights[i][j] = weight; - } else if (weight < 0 || weight > 1) { - throw std::runtime_error("One or more split select weights not in range [0,1]."); + for (size_t j = 0; j < split_select_weights[i].size(); ++j) { + double weight = split_select_weights[i][j]; + + if (i == 0) { + size_t varID = j; + for (auto& skip : no_split_variables) { + if (varID >= skip) { + ++varID; + } + } + + if (weight == 1) { + deterministic_varIDs.push_back(varID); + } else if (weight < 1 && weight > 0) { + this->split_select_varIDs[j] = varID; + this->split_select_weights[i][j] = weight; + } else if (weight < 0 || weight > 1) { + throw std::runtime_error("One or more split select weights not in range [0,1]."); + } + + } else { + if (weight < 1 && weight > 0) { + this->split_select_weights[i][j] = weight; + } else if (weight < 0 || weight > 1) { + throw std::runtime_error("One or more split 
select weights not in range [0,1]."); + } + } } - } } - } - if (deterministic_varIDs.size() > this->mtry) { - throw std::runtime_error("Number of ones in split select weights cannot be larger than mtry."); - } - if (deterministic_varIDs.size() + split_select_varIDs.size() < mtry) { - throw std::runtime_error("Too many zeros in split select weights. Need at least mtry variables to split at."); - } + if (deterministic_varIDs.size() > this->mtry) { + throw std::runtime_error("Number of ones in split select weights cannot be larger than mtry."); + } + if (deterministic_varIDs.size() + split_select_varIDs.size() < mtry) { + throw std::runtime_error("Too many zeros in split select weights. Need at least mtry variables to split at."); + } } void Forest::setAlwaysSplitVariables(std::vector& always_split_variable_names) { - deterministic_varIDs.reserve(num_independent_variables); + deterministic_varIDs.reserve(num_independent_variables); - for (auto& variable_name : always_split_variable_names) { - size_t varID = data->getVariableID(variable_name); - deterministic_varIDs.push_back(varID); - } + for (auto& variable_name : always_split_variable_names) { + size_t varID = data->getVariableID(variable_name); + deterministic_varIDs.push_back(varID); + } - if (deterministic_varIDs.size() + this->mtry > num_independent_variables) { - throw std::runtime_error( - "Number of variables to be always considered for splitting plus mtry cannot be larger than number of independent variables."); - } + if (deterministic_varIDs.size() + this->mtry > num_independent_variables) { + throw std::runtime_error( + "Number of variables to be always considered for splitting plus mtry cannot be larger than number of independent variables."); + } } #ifdef WIN_R_BUILD + void Forest::showProgress(std::string operation, clock_t start_time, clock_t& lap_time) { - // Check for user interrupt - if (checkInterrupt()) { - throw std::runtime_error("User interrupt."); - } - - double elapsed_time = (clock() - 
lap_time) / CLOCKS_PER_SEC; - if (elapsed_time > STATUS_INTERVAL) { - double relative_progress = (double) progress / (double) num_trees; - double time_from_start = (clock() - start_time) / CLOCKS_PER_SEC; - uint remaining_time = (1 / relative_progress - 1) * time_from_start; - *verbose_out << operation << " Progress: " << round(100 * relative_progress) << "%. Estimated remaining time: " - << beautifyTime(remaining_time) << "." << std::endl; - lap_time = clock(); - } + // Check for user interrupt + if (checkInterrupt()) { + throw std::runtime_error("User interrupt."); + } + + double elapsed_time = (clock() - lap_time) / CLOCKS_PER_SEC; + if (elapsed_time > STATUS_INTERVAL) { + double relative_progress = (double) progress / (double) num_trees; + double time_from_start = (clock() - start_time) / CLOCKS_PER_SEC; + uint remaining_time = (1 / relative_progress - 1) * time_from_start; + *verbose_out << operation << " Progress: " << round(100 * relative_progress) << "%. Estimated remaining time: " + << beautifyTime(remaining_time) << "." 
<< std::endl; + lap_time = clock(); + } } #else + void Forest::showProgress(std::string operation) { - using std::chrono::steady_clock; - using std::chrono::duration_cast; - using std::chrono::seconds; + using std::chrono::steady_clock; + using std::chrono::duration_cast; + using std::chrono::seconds; - steady_clock::time_point start_time = steady_clock::now(); - steady_clock::time_point last_time = steady_clock::now(); - std::unique_lock lock(mutex); + steady_clock::time_point start_time = steady_clock::now(); + steady_clock::time_point last_time = steady_clock::now(); + std::unique_lock lock(mutex); - // Wait for message from threads and show output if enough time elapsed - while (progress < num_trees) { - condition_variable.wait(lock); - seconds elapsed_time = duration_cast(steady_clock::now() - last_time); + // Wait for message from threads and show output if enough time elapsed + while (progress < num_trees) { + condition_variable.wait(lock); + seconds elapsed_time = duration_cast(steady_clock::now() - last_time); - // Check for user interrupt + // Check for user interrupt #ifdef R_BUILD - if (!aborted && checkInterrupt()) { - aborted = true; - } - if (aborted && aborted_threads >= num_threads) { - return; - } + if (!aborted && checkInterrupt()) { + aborted = true; + } + if (aborted && aborted_threads >= num_threads) { + return; + } #endif - if (progress > 0 && elapsed_time.count() > STATUS_INTERVAL) { - double relative_progress = (double) progress / (double) num_trees; - seconds time_from_start = duration_cast(steady_clock::now() - start_time); - uint remaining_time = (1 / relative_progress - 1) * time_from_start.count(); - *verbose_out << operation << " Progress: " << round(100 * relative_progress) << "%. Estimated remaining time: " - << beautifyTime(remaining_time) << "." 
<< std::endl; - last_time = steady_clock::now(); + if (progress > 0 && elapsed_time.count() > STATUS_INTERVAL) { + double relative_progress = (double) progress / (double) num_trees; + seconds time_from_start = duration_cast(steady_clock::now() - start_time); + uint remaining_time = (1 / relative_progress - 1) * time_from_start.count(); + *verbose_out << operation << " Progress: " << round(100 * relative_progress) << "%. Estimated remaining time: " + << beautifyTime(remaining_time) << "." << std::endl; + last_time = steady_clock::now(); + } } - } } #endif diff --git a/doc/source/conf.py b/doc/source/conf.py index 419f9401..7d753103 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -53,9 +53,9 @@ # built documents. # # The short X.Y version. -version = '0.16.0' +version = '0.16.1' # The full version, including alpha/beta/rc tags. -release = '0.16.0' +release = '0.16.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/lib/Makefile.am b/lib/Makefile.am index 643163b7..bf45e80c 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -21,7 +21,12 @@ libportcullis_la_SOURCES = \ src/intron.cc \ src/junction.cc \ src/junction_system.cc \ - src/rule_parser.cc + src/rule_parser.cc \ + src/performance.cc \ + src/ss_forest.cc \ + src/knn.cc \ + src/enn.cc \ + src/smote.cc library_includedir=$(includedir)/portcullis-@PACKAGE_VERSION@/portcullis PI = include/portcullis @@ -32,17 +37,23 @@ library_include_HEADERS = \ $(PI)/bam/bam_writer.hpp \ $(PI)/bam/depth_parser.hpp \ $(PI)/bam/genome_mapper.hpp \ + $(PI)/ml/markov_model.hpp \ + $(PI)/ml/model_features.hpp \ + $(PI)/ml/performance.hpp \ + $(PI)/ml/k_fold.hpp \ + $(PI)/ml/knn.hpp \ + $(PI)/ml/enn.hpp \ + $(PI)/ml/smote.hpp \ + $(PI)/ml/ss_forest.hpp \ $(PI)/kmer.hpp \ - $(PI)/markov_model.hpp \ - $(PI)/model_features.hpp \ $(PI)/python_exe.hpp \ $(PI)/intron.hpp \ $(PI)/junction.hpp \ $(PI)/junction_system.hpp \ $(PI)/portcullis_fs.hpp \ $(PI)/seq_utils.hpp \ - $(PI)/rule_parser.hpp \ - $(PI)/performance.hpp + $(PI)/rule_parser.hpp + libportcullis_la_CPPFLAGS = \ -isystem $(top_srcdir)/deps/htslib-1.3 \ diff --git a/lib/include/portcullis/junction.hpp b/lib/include/portcullis/junction.hpp index b18c3d2a..7da0112e 100644 --- a/lib/include/portcullis/junction.hpp +++ b/lib/include/portcullis/junction.hpp @@ -43,11 +43,12 @@ typedef std::unordered_map SplicedAlignmentMap; #include "bam/genome_mapper.hpp" using namespace portcullis::bam; +#include "ml/markov_model.hpp" +using portcullis::ml::KmerMarkovModel; +using portcullis::ml::PosMarkovModel; + #include "intron.hpp" #include "seq_utils.hpp" -#include "markov_model.hpp" -using portcullis::KmerMarkovModel; -using portcullis::PosMarkovModel; using portcullis::Intron; diff --git a/lib/include/portcullis/ml/enn.hpp b/lib/include/portcullis/ml/enn.hpp new file mode 100644 index 00000000..96b8ece1 --- /dev/null +++ b/lib/include/portcullis/ml/enn.hpp @@ -0,0 +1,104 @@ +// 
******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + + +#pragma once + +#include +#include +#include +using std::string; +using std::thread; +using std::vector; + +#include +#include +using boost::timer::auto_cpu_timer; + +#include + +namespace portcullis { +namespace ml { + +typedef boost::error_info ENNErrorInfo; +struct ENNException: virtual boost::exception, virtual std::exception { }; + +const uint16_t ENN_THRESHOLD = 5; + +/** + * An parallel implementation of Wilson's Edited nearest neighbour algorithm. + * Uses KNN to identify nearest neighbours for each sample, then marks entries + * to be removed or to be kept if more than half of the KNNs come from a different + * class. 
+ * KNN logic originally derived from OpenCV, modified to work with ranger data + * types + */ +class ENN { +protected: + uint16_t k; + uint16_t threshold; + uint16_t threads; + bool verbose; + + double* data; + size_t rows; + size_t cols; + + vector labels; + +public: + + ENN(uint16_t defaultK, uint16_t _threads, double* _data, size_t _rows, size_t _cols, vector& _labels); + + uint16_t getK() const { + return k; + } + + void setK(uint16_t k) { + this->k = k; + } + + uint16_t getThreads() const { + return threads; + } + + void setThreads(uint16_t threads) { + this->threads = threads; + } + + bool isVerbose() const { + return verbose; + } + + void setVerbose(bool verbose) { + this->verbose = verbose; + } + + void setThreshold(uint16_t threshold) { + this->threshold = threshold; + } + + uint16_t getThreshold() const { + return threshold; + } + + + uint32_t execute( vector& results ) const; +}; + +} +} diff --git a/lib/include/portcullis/ml/icote.hpp b/lib/include/portcullis/ml/icote.hpp new file mode 100644 index 00000000..d0c5f3fd --- /dev/null +++ b/lib/include/portcullis/ml/icote.hpp @@ -0,0 +1,106 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#pragma once +#include +#include +using std::ostream; +using std::string; + +#include + +namespace portcullis { +namespace ml { + +typedef boost::error_info IcoteErrorInfo; +struct IcoteException: virtual boost::exception, virtual std::exception { }; + + +/** + * Object to perform classification on balanced ensembled selected from random sampling. + * + * It is based on the idea presented in the paper "Exploratory Undersampling + * Class-Imbalance Learning" by Liu et al. + */ +class Icote { +private: + uint16_t k; + uint16_t duplications; + uint16_t threads; + bool verbose; + + double* data; + size_t rows; + size_t cols; + + +protected: + + bool isConstant(size_t col); + + void fselect(); + +public: + + Smote(uint16_t defaultK, uint16_t _smoteness, uint16_t _threads, double* _data, size_t _rows, size_t _cols); + + ~Smote() { + delete[] synthetic; + } + + uint16_t getK() const { + return k; + } + + void setK(uint16_t k) { + this->k = k; + } + + uint16_t getThreads() const { + return threads; + } + + void setThreads(uint16_t threads) { + this->threads = threads; + } + + bool isVerbose() const { + return verbose; + } + + void setVerbose(bool verbose) { + this->verbose = verbose; + } + + double* getSynthetic() { + return synthetic; + } + + size_t getNbSynthRows() const { + return s_rows; + } + + double getSynth(size_t row, size_t col) const { + return synthetic[(row * cols) + col]; + } + + void execute(); + + void print(ostream& out) const; +}; +} +} \ No newline at end of file diff --git a/lib/include/portcullis/ml/k_fold.hpp b/lib/include/portcullis/ml/k_fold.hpp new file mode 100644 index 00000000..92d2597b --- /dev/null +++ b/lib/include/portcullis/ml/k_fold.hpp @@ -0,0 +1,80 @@ +// ******************************************************************** +// This file is part of Portcullis. 
+// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#pragma once + +#include +#include +#include +using std::vector; +using std::random_shuffle; + +#include + +namespace portcullis { +namespace ml { + +typedef boost::error_info KFoldErrorInfo; +struct KFoldException: virtual boost::exception, virtual std::exception { }; + +// Derived from https://sureshamrita.wordpress.com/2011/08/24/c-implementation-of-k-fold-cross-validation/ +template +class KFold { +public: + KFold(int k, In _beg, In _end) : + beg(_beg), end(_end), K(k) { + if (K <= 0) + BOOST_THROW_EXCEPTION(KFoldException() << KFoldErrorInfo(string( + "The supplied value of K is =") + lexical_cast(K) + + ". 
One cannot create " + lexical_cast(K) + "no of folds")); + + //create the vector of integers + int foldNo = 0; + for (In i = beg; i != end; i++) { + whichFoldToGo.push_back(++foldNo); + if (foldNo == K) + foldNo = 0; + } + if (!K) + BOOST_THROW_EXCEPTION(KFoldException() << KFoldErrorInfo(string( + "With this value of k (=") + lexical_cast(K) + + ")Equal division of the data is not possible")); + + random_shuffle(whichFoldToGo.begin(), whichFoldToGo.end()); + } + + template + void getFold(int foldNo, Out training, Out testing) { + int k = 0; + In i = beg; + while (i != end) { + if (whichFoldToGo[k++] == foldNo) { + *testing++ = *i++; + } else + *training++ = *i++; + } + } + +private: + In beg; + In end; + int K; //how many folds in this + vector whichFoldToGo; +}; + +} +} \ No newline at end of file diff --git a/lib/include/portcullis/ml/knn.hpp b/lib/include/portcullis/ml/knn.hpp new file mode 100644 index 00000000..2b16e2f5 --- /dev/null +++ b/lib/include/portcullis/ml/knn.hpp @@ -0,0 +1,104 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + + +#pragma once + +#include +#include +#include +#include +#include +using std::ostream; +using std::string; +using std::thread; +using std::vector; +using std::shared_ptr; + +#include +#include +using boost::timer::auto_cpu_timer; + +#include + +namespace portcullis { +namespace ml { + +typedef boost::error_info KNNErrorInfo; +struct KNNException: virtual boost::exception, virtual std::exception { }; + +/** + * An parallel implementation of K Nearest Neighbour. + * Logic originally derived from OpenCV + */ +class KNN { +protected: + uint16_t k; + uint16_t threads; + bool verbose; + + double* data; + size_t rows; + size_t cols; + + vector>> results; + + void doSlice( uint16_t slice ); + +public: + + KNN(uint16_t defaultK, uint16_t _threads, double* _data, size_t _rows, size_t _cols); + + uint16_t getK() const { + return k; + } + + void setK(uint16_t k) { + this->k = k; + } + + uint16_t getThreads() const { + return threads; + } + + void setThreads(uint16_t threads) { + this->threads = threads; + } + + bool isVerbose() const { + return verbose; + } + + void setVerbose(bool verbose) { + this->verbose = verbose; + } + + const vector>>& getResults() const { + return results; + } + + const vector& getNNs(size_t index) const { + return *results[index]; + } + + void execute(); + + void print(ostream& out); +}; + +} +} diff --git a/lib/include/portcullis/markov_model.hpp b/lib/include/portcullis/ml/markov_model.hpp similarity index 93% rename from lib/include/portcullis/markov_model.hpp rename to lib/include/portcullis/ml/markov_model.hpp index 613fb177..7318bba4 100644 --- a/lib/include/portcullis/markov_model.hpp +++ b/lib/include/portcullis/ml/markov_model.hpp @@ -26,6 +26,7 @@ using std::vector; namespace portcullis { +namespace ml { typedef boost::error_info MMErrorInfo; struct MMException: virtual boost::exception, virtual std::exception {}; @@ -74,9 +75,9 @@ class MarkovModel { virtual double 
getScore(const string& seq) = 0; }; -class KmerMarkovModel : public portcullis::MarkovModel { +class KmerMarkovModel : public portcullis::ml::MarkovModel { private: - portcullis::KMMU model; + portcullis::ml::KMMU model; public: KmerMarkovModel() : MarkovModel(1) {} KmerMarkovModel(const uint32_t _order) : MarkovModel(_order) {}; @@ -89,9 +90,9 @@ class KmerMarkovModel : public portcullis::MarkovModel { } }; -class PosMarkovModel : public portcullis::MarkovModel { +class PosMarkovModel : public portcullis::ml::MarkovModel { private: - portcullis::PMMU model; + portcullis::ml::PMMU model; public: PosMarkovModel() : MarkovModel(1) {} PosMarkovModel(const uint32_t _order) : MarkovModel(_order) {}; @@ -104,6 +105,6 @@ class PosMarkovModel : public portcullis::MarkovModel { } }; - +} } diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/ml/model_features.hpp similarity index 89% rename from lib/include/portcullis/model_features.hpp rename to lib/include/portcullis/ml/model_features.hpp index 1820cc01..a6c762df 100644 --- a/lib/include/portcullis/model_features.hpp +++ b/lib/include/portcullis/ml/model_features.hpp @@ -27,16 +27,17 @@ using boost::filesystem::path; #include #include -#include +#include #include using portcullis::bam::GenomeMapper; -using portcullis::MarkovModel; +using portcullis::ml::MarkovModel; using portcullis::Junction; using portcullis::JunctionPtr; using portcullis::JunctionList; using portcullis::SplicingScores; namespace portcullis { +namespace ml { typedef shared_ptr ForestPtr; @@ -66,6 +67,9 @@ struct Feature { class ModelFeatures { private: size_t fi; +protected: + void setRow(Data* d, size_t row, JunctionPtr j, bool labelled); + public: uint32_t L95; KmerMarkovModel exonModel; @@ -113,9 +117,11 @@ class ModelFeatures { void trainSplicingModels(const JunctionList& pass, const JunctionList& fail); Data* juncs2FeatureVectors(const JunctionList& x); + Data* juncs2FeatureVectors(const JunctionList& xl, const 
JunctionList& xu); + - ForestPtr trainInstance(const JunctionList& x, string outputPrefix, - uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose); + ForestPtr trainInstance(const JunctionList& pos, const JunctionList& neg, string outputPrefix, + uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose, bool smote, bool enn); void resetActiveFeatureIndex() { fi = 0; @@ -131,4 +137,5 @@ class ModelFeatures { } }; +} } \ No newline at end of file diff --git a/lib/include/portcullis/performance.hpp b/lib/include/portcullis/ml/performance.hpp similarity index 79% rename from lib/include/portcullis/performance.hpp rename to lib/include/portcullis/ml/performance.hpp index c219f1f7..53ea35e5 100644 --- a/lib/include/portcullis/performance.hpp +++ b/lib/include/portcullis/ml/performance.hpp @@ -17,15 +17,25 @@ #pragma once +#include #include +#include +#include +#include +#include +using std::ostream; using std::stringstream; +using std::shared_ptr; +using std::string; +using std::vector; #include #include using boost::filesystem::path; namespace portcullis { - +namespace ml { + class Performance { protected: uint32_t tp; @@ -207,6 +217,8 @@ class Performance { return getPrecision() + getNPV() - 100.0; } + string toShortString() const; + string toLongString() const; static string shortHeader() { return "TP\tTN\tFP\tFN\tREC\tPRC\tF1"; @@ -222,54 +234,34 @@ class Performance { return out.str(); } + static void loadGenuine(path& genuineFile, vector& results); + +}; + +class PerformanceList { +public: + + void clear() { + this->scores.clear(); + } - string toShortString() const { - vector parts; - parts.push_back(std::to_string(tp)); - parts.push_back(std::to_string(tn)); - parts.push_back(std::to_string(fp)); - parts.push_back(std::to_string(fn)); - parts.push_back(to_2dp_string(getRecall())); - parts.push_back(to_2dp_string(getPrecision())); - parts.push_back(to_2dp_string(getF1Score())); - return boost::algorithm::join(parts, "\t"); - } - - 
string toLongString() const { - vector parts; - parts.push_back(std::to_string(tp)); - parts.push_back(std::to_string(tn)); - parts.push_back(std::to_string(fp)); - parts.push_back(std::to_string(fn)); - parts.push_back(to_2dp_string(getPrevalence())); - parts.push_back(to_2dp_string(getBias())); - parts.push_back(to_2dp_string(getSensitivity())); - parts.push_back(to_2dp_string(getSpecificity())); - parts.push_back(to_2dp_string(getPrecision())); - parts.push_back(to_2dp_string(getNPV())); - parts.push_back(to_2dp_string(getF1Score())); - parts.push_back(to_2dp_string(getAccuracy())); - parts.push_back(to_2dp_string(getInformedness())); - parts.push_back(to_2dp_string(getMarkedness())); - parts.push_back(to_2dp_string(getMCC())); - return boost::algorithm::join(parts, "\t"); - } - - - static void loadGenuine(path& genuineFile, vector& results) { - - // Load reference data - std::ifstream refs(genuineFile.string()); - string line; - uint32_t lineNb = 0; - while (std::getline(refs, line)) { - std::istringstream iss(line); - bool res; - iss >> res; - results.push_back(res); - } - refs.close(); + shared_ptr operator [](int i) const { + return this->scores[i]; } + + void add(shared_ptr p) { + this->scores.push_back(p); + } + + void outputMeanPerformance(std::ostream& resout); + +protected: + + vector> scores; + + void outputMeanScore(const vector& scores, const string& score_type, std::ostream& resout); + }; +} } \ No newline at end of file diff --git a/lib/include/portcullis/ml/smote.hpp b/lib/include/portcullis/ml/smote.hpp new file mode 100644 index 00000000..86929675 --- /dev/null +++ b/lib/include/portcullis/ml/smote.hpp @@ -0,0 +1,102 @@ +// ******************************************************************** +// This file is part of Portcullis. 
+// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#pragma once +#include +#include +using std::ostream; +using std::string; + +#include + +namespace portcullis { +namespace ml { + +typedef boost::error_info SmoteErrorInfo; +struct SmoteException: virtual boost::exception, virtual std::exception { }; + + +/** + * Object to perform classification on balanced ensembled selected from random sampling. + * + * It is based on the idea presented in the paper "Exploratory Undersampling + * Class-Imbalance Learning" by Liu et al. 
+ */ +class Smote { +private: + uint16_t k; + uint16_t smoteness; + uint16_t threads; + bool verbose; + + double* data; + size_t rows; + size_t cols; + + double* synthetic; + size_t s_rows; + +public: + + Smote(uint16_t defaultK, uint16_t _smoteness, uint16_t _threads, double* _data, size_t _rows, size_t _cols); + + ~Smote() { + delete[] synthetic; + } + + uint16_t getK() const { + return k; + } + + void setK(uint16_t k) { + this->k = k; + } + + uint16_t getThreads() const { + return threads; + } + + void setThreads(uint16_t threads) { + this->threads = threads; + } + + bool isVerbose() const { + return verbose; + } + + void setVerbose(bool verbose) { + this->verbose = verbose; + } + + double* getSynthetic() { + return synthetic; + } + + size_t getNbSynthRows() const { + return s_rows; + } + + double getSynth(size_t row, size_t col) const { + return synthetic[(row * cols) + col]; + } + + void execute(); + + void print(ostream& out) const; +}; +} +} \ No newline at end of file diff --git a/lib/include/portcullis/ml/ss_forest.hpp b/lib/include/portcullis/ml/ss_forest.hpp new file mode 100644 index 00000000..14d3d444 --- /dev/null +++ b/lib/include/portcullis/ml/ss_forest.hpp @@ -0,0 +1,82 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#pragma once + +#include +using std::shared_ptr; + +#include +using boost::filesystem::path; + +#include +#include + +#include +#include +using portcullis::ml::ModelFeatures; +using portcullis::ml::MarkovModel; +using portcullis::ml::ForestPtr; + +#include +using portcullis::bam::GenomeMapper; + +using portcullis::Junction; +using portcullis::JunctionPtr; +using portcullis::JunctionList; +using portcullis::SplicingScores; + +namespace portcullis { +namespace ml { + +typedef boost::error_info SSRFErrorInfo; +struct SSRFException: virtual boost::exception, virtual std::exception {}; + +const uint16_t REPEAT_LIMIT = 3; + +class SemiSupervisedForest { +private: + uint16_t trees; + Data* labelled; + Data* unlabelled; + Data* all; + ModelFeatures mf; + ForestPtr forest; + bool verbose; + uint16_t threads; + string outputPrefix; + double contribution; // How much of a contribution unlabelled data should make to the decision making + + +public: + SemiSupervisedForest(ModelFeatures& _mf, const JunctionList& labelled, const JunctionList& unlabelled, + string outputPrefix, uint16_t trees, uint16_t threads, double contribution, bool verbose); + + virtual ~SemiSupervisedForest(); + + ForestPtr train(); + + ForestPtr getForest() { return forest; }; + + static Data* juncs2FeatureVectors(const JunctionList& x); + + + + +}; +} +} \ No newline at end of file diff --git a/lib/src/enn.cc b/lib/src/enn.cc new file mode 100644 index 00000000..e82580d4 --- /dev/null +++ b/lib/src/enn.cc @@ -0,0 +1,89 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#include +using std::cout; +using std::endl; + +#include +using portcullis::ml::KNN; + +#include + +portcullis::ml::ENN::ENN(uint16_t defaultK, uint16_t _threads, double* _data, size_t _rows, size_t _cols, vector& _labels) { + if (_rows != _labels.size()) { + BOOST_THROW_EXCEPTION(ENNException() << ENNErrorInfo(string( + "The supplied number of rows does not match the number of labels"))); + } + + data = _data; + rows = _rows; + cols = _cols; + labels = _labels; + if (_rows < defaultK && _rows < 100) + k = _rows; + else + k = defaultK; + threads = _threads; + verbose = false; + + threshold = k / 2; +} + +uint32_t portcullis::ml::ENN::execute(vector& results) const { + + auto_cpu_timer timer(1, "ENN Time taken: %ws\n\n"); + + KNN knn(k, threads, data, rows, cols); + knn.setVerbose(verbose); + knn.execute(); + + if (verbose) { + cout << "Finding outliers" << endl; + } + + results.clear(); + results.resize(rows); + uint32_t discard = 0; + + const uint16_t threshold = this->getThreshold(); + + for(size_t i = 0; i < rows; i++) { + uint16_t pos_count = 0; + uint16_t neg_count = 0; + bool pos = labels[i]; + const vector nn = knn.getNNs(i); + for (size_t j = 0; j < k; j++) { + uint32_t index = nn[j]; + if (labels[index]) { + pos_count++; + } else { + neg_count++; + } + } + results[i] = ((pos && pos_count >= threshold) || (!pos && neg_count >= threshold)); + if (!results[i]) discard++; + } + + uint32_t keep = results.size() - discard; + + if (verbose) { + cout << "Marked " << keep << " to be kept and " << discard << " to be discarded." 
<< endl; + } + + return discard; +} diff --git a/lib/src/icote.cc b/lib/src/icote.cc new file mode 100644 index 00000000..127f8f29 --- /dev/null +++ b/lib/src/icote.cc @@ -0,0 +1,174 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#include +using std::cout; +using std::endl; + +#include +using portcullis::ml::KNN; + +#include + +portcullis::ml::Icote::Icote(uint16_t defaultK, uint16_t _duplications, uint16_t _threads, double* _data, size_t _rows, size_t _cols) { + + data = _data; + rows = _rows; + cols = _cols; + if (_rows < defaultK && _rows < 100) + k = _rows; + else + k = defaultK; + duplications = _duplications < 1 ? 
1 : _duplications; + threads = _threads; + verbose = false; + + s_rows = duplications * rows; + +} + +void portcullis::ml::Smote::execute() { + + auto_cpu_timer timer(1, "ICOTE Time taken: %ws\n\n"); + + if (verbose) { + cout << "Starting Immume Centroids Oversampling Technique (ICOTE)" << endl; + } + + // Attribute selection + // To reduce computational cost remove constant attributes + if (verbose) cout << "Selecting non-constant features" << endl; + + vector is_const(cols); + vector const_val(cols); + uint16_t nb_const_col = 0; + for(size_t k = 0; k < cols; k++) { + is_const[k] = this->isConstant(k); + if (is_const[k]) { + nb_const_col++; + const_val[k] = data[k]; + } + } + uint16_t sel_col = cols - nb_const_col; + + if (verbose) cout << "Ignoring " << const_col << " columns as values are constant across all samples" << endl; + + double* ag = new double[rows * sel_col]; + for(size_t i = 0; i < rows; i++) { + size_t new_col = 0; + for(size_t j = 0; j < cols; j++) { + if (!is_const[j]) { + ag[(i * sel_col) + new_col++] = data[(i*cols) + j]; + } + } + } + + if (verbose) cout << "Created non-constant feature matrix" << endl; + + + // Normalisation + if (verbose) cout << "Normalising feature matrix" << endl; + vector min_col(sel_col); + vector max_col(sel_col); + for(size_t j = 0; j < sel_col; j++) { + double min_v = std::numeric_limits::max(); + double max_v = std::numeric_limits::min(); + for(size_t i = 0; i < rows; i++) { + double x = ag[(i * sel_col) + j]; + min_v = std::min(min_v, x); + max_v = std::max(max_v, x); + } + min_col[j] = min_v; + max_col[j] = max_v; + } + + for(size_t j = 0; j < sel_col; j++) { + double min_x = min_col[j]; + double max_x = max_col[j]; + for(size_t i = 0; i < rows; i++) { + double x = ag[(i * sel_col) + j]; + ag[(i * sel_col) + j] = (x - min_x) / (max_x - min_x); + } + } + if (verbose) cout << "Normalised feature matrix" << endl; + + if (verbose) cout << "Generating Random Antibodies" << endl; + vector ab(k * sel_col); + std::mt19937 
rng(12345); + std::uniform_real_distribution dgen(0.0, 1.0); + + for(size_t i = 0; i < k; i++) { + for(size_t j = 0; j < sel_col; j++) { + ab[(i*sel_col) + j] = dgen(rng); + } + } + if (verbose) cout << "Random Antibodies generated" << endl; + + uint16_t N = 0; + while (N < 10) { + vector M(duplications * rows * sel_col); + + for(size_t i = 0; i < rows; i++) { + vector dist(rows * duplications * sel_col); + for(size_t j = 0; j < duplications; j++) { + double s = 0.0; + for (size_t c = 0; c < sel_cols; c++) { + s += std::pow(ag[i * sel_cols + c] - ab[i * sel_cols + c], 2.0); + } + dist[(i * sel_cols) + j] = std::sqrt(s); + } + + // Clone K antibodies in proportion to antigen-antibody affinity + + } + + if (verbose) cout << "Mutating antibodies" << endl; + + + double s = 0.0; + for (size_t j = 0; j < sel_cols; j++) { + s += std::pow(antibodies[i * sel_cols + j] - attr_sel[i * sel_cols + j], 2.0); + } + double ak = (1.0 / sel_cols) * std::sqrt(ak); + } + + if (verbose) cout << "Antibodies mutated" << endl; + + } + + +} + +void portcullis::ml::Smote::print(ostream& out) const { + + out << "Input:" << endl; + for(size_t i = 0; i < rows; i++) { + for(size_t j = 0; j < cols; j++) { + out << data[(i * cols) + j] << " "; + } + out << endl; + } + + out << endl << "Synthetic:" << endl; + for(size_t i = 0; i < s_rows; i++) { + for(size_t j = 0; j < cols; j++) { + out << synthetic[(i * cols) + j] << " "; + } + out << endl; + } + out << endl; +} \ No newline at end of file diff --git a/lib/src/junction.cc b/lib/src/junction.cc index d791b272..1dc1bf70 100644 --- a/lib/src/junction.cc +++ b/lib/src/junction.cc @@ -192,6 +192,7 @@ portcullis::Junction::Junction(shared_ptr _location, int32_t _leftAncSta meanQueryLength = 0; suspicious = false; pfp = false; + genuine = false; canonicalSpliceSites = NO; maxMinAnchor = intron->minAnchorLength(_leftAncStart, _rightAncEnd); diffAnchor = 0; @@ -247,6 +248,7 @@ portcullis::Junction::Junction(const Junction& j, bool withAlignments) { 
meanQueryLength = j.meanQueryLength; suspicious = j.suspicious; pfp = j.pfp; + genuine = j.genuine; canonicalSpliceSites = j.canonicalSpliceSites; maxMinAnchor = j.maxMinAnchor; diffAnchor = j.diffAnchor; diff --git a/lib/src/knn.cc b/lib/src/knn.cc new file mode 100644 index 00000000..2c2fd43a --- /dev/null +++ b/lib/src/knn.cc @@ -0,0 +1,128 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#include +#include +using std::ostream; +using std::cout; +using std::endl; +using std::make_shared; + +#include + +portcullis::ml::KNN::KNN(uint16_t defaultK, uint16_t _threads, double* _data, size_t _rows, size_t _cols) { + + data = _data; + rows = _rows; + cols = _cols; + if (_rows < defaultK && _rows < 100) + k = _rows; + else + k = defaultK; + threads = _threads; + verbose = false; + results.resize(_rows); +} + +void portcullis::ml::KNN::doSlice(uint16_t slice) { + + // Get coordinates of entries to search through in this slice + uint32_t slice_size = rows / threads; + uint32_t start = slice_size * slice; + uint32_t end = (slice_size * slice) + slice_size - 1; + if (slice == threads - 1 && end <= rows) { + end = rows - 1; // Make sure we get the last few entries + } + uint32_t testcount = end - start + 1; + + vector dbuf(testcount*k, std::numeric_limits::max()); + vector rbuf(testcount*k, 0); + + // Get feature space values for KNN for each sample in slice + for (uint32_t baseidx = 0; baseidx < rows; baseidx++) { + for (uint32_t testidx = start; testidx <= end; testidx++) { + + const uint32_t ri = (testidx - start) * k; + + // Get sum of squared differences (no need to do the sqrt to get + // the Euclidean distance... this saves about 20% runtime) + double s = 0.0; + for (uint16_t i = 0; i < cols; i++) { + s += std::pow(data[baseidx * cols + i] - data[testidx * cols + i], 2.0); + } + + // Find position to add entry... + int i = k; + for (; i > 0; i--) + if (s >= dbuf[ri + i - 1]) + break; + + // ... 
or skip if not if this pair is not in the current set of KNN + if (i >= k) + continue; + + // Shift back entries after i + for (int j = k - 2; j >= i; j--) { + dbuf[ri + j + 1] = dbuf[ri + j]; + rbuf[ri + j + 1] = rbuf[ri + j]; + } + + // Record KNN and index + dbuf[ri + i] = s; + rbuf[ri + i] = baseidx; + } + } + + // Store final results + for (uint32_t testidx = start; testidx <= end; testidx++) { + shared_ptr> knn = make_shared>(); + + for(size_t i = 0; i < k; i++) { + uint32_t index = rbuf[((testidx - start) * k) + i]; + knn->push_back(index); + } + + results[testidx] = knn; + } +} + +void portcullis::ml::KNN::execute() { + + auto_cpu_timer timer(1, " Time taken: %ws\n"); + + if (verbose) { + cout << "Performing K Nearest Neighbour (KNN) ..."; + cout.flush(); + } + + vector t(threads); + for (uint16_t i = 0; i < threads; i++) { + t[i] = thread(&KNN::doSlice, this, i); + } + for (uint16_t i = 0; i < threads; i++) { + t[i].join(); + } +} + +void portcullis::ml::KNN::print(ostream& out) { + for(auto& i : results) { + for(auto j : *i) { + out << j << " "; + } + out << endl; + } +} \ No newline at end of file diff --git a/lib/src/markov_model.cc b/lib/src/markov_model.cc index 2e7828d7..5af054a7 100644 --- a/lib/src/markov_model.cc +++ b/lib/src/markov_model.cc @@ -25,9 +25,10 @@ using std::endl; #include using portcullis::SeqUtils; -#include +#include +using portcullis::ml::KMMU; -void portcullis::KmerMarkovModel::train(const vector& input, const uint32_t _order) { +void portcullis::ml::KmerMarkovModel::train(const vector& input, const uint32_t _order) { order = _order; KMMU temp; for(auto& seq : input) { @@ -52,19 +53,30 @@ void portcullis::KmerMarkovModel::train(const vector& input, const uint3 } -double portcullis::KmerMarkovModel::getScore(const string& seq) { +double portcullis::ml::KmerMarkovModel::getScore(const string& seq) { string s = SeqUtils::makeClean(seq); double score = 1.0; + uint32_t no_count = 0; for(uint16_t i = order; i < s.size(); i++){ - score 
*= model[s.substr(i-order, order)][s.substr(i, 1)]; + double m = model[s.substr(i-order, order)][s.substr(i, 1)]; + if (m != 0.0) { + score *= m; + } + else { + no_count++; + } } if(score == 0.0) { return -100.0; } + else if (no_count > 2) { + // Add a penalty for situations where we repeatedly don't find a kmer in the tranining set + score /= ((double)no_count * 0.5); + } return log(score); } -void portcullis::PosMarkovModel::train(const vector& input, const uint32_t _order) { +void portcullis::ml::PosMarkovModel::train(const vector& input, const uint32_t _order) { order = _order; PMMU temp; for(auto& seq : input) { @@ -87,7 +99,7 @@ void portcullis::PosMarkovModel::train(const vector& input, const uint32 } -double portcullis::PosMarkovModel::getScore(const string& seq) { +double portcullis::ml::PosMarkovModel::getScore(const string& seq) { string s = SeqUtils::makeClean(seq); double score = 1.0; for(uint16_t i = order; i < s.size(); i++){ diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 902c7813..72e929e8 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -28,14 +28,21 @@ using std::make_shared; #include #include +#include +#include +using portcullis::ml::ENN; +using portcullis::ml::Smote; + #include using portcullis::Junction; -#include +#include + +#include "portcullis/junction_system.hpp" -void portcullis::ModelFeatures::initGenomeMapper(const path& genomeFile) { +void portcullis::ml::ModelFeatures::initGenomeMapper(const path& genomeFile) { // Initialise gmap.setGenomeFile(genomeFile); @@ -44,7 +51,7 @@ void portcullis::ModelFeatures::initGenomeMapper(const path& genomeFile) { gmap.loadFastaIndex(); } -uint32_t portcullis::ModelFeatures::calcIntronThreshold(const JunctionList& juncs) { +uint32_t portcullis::ml::ModelFeatures::calcIntronThreshold(const JunctionList& juncs) { vector intron_sizes; for(auto& j : juncs) { @@ -58,7 +65,7 @@ uint32_t portcullis::ModelFeatures::calcIntronThreshold(const JunctionList& 
junc return L95; } -void portcullis::ModelFeatures::trainCodingPotentialModel(const JunctionList& in) { +void portcullis::ml::ModelFeatures::trainCodingPotentialModel(const JunctionList& in) { vector exons; vector introns; @@ -102,7 +109,7 @@ void portcullis::ModelFeatures::trainCodingPotentialModel(const JunctionList& in intronModel.train(introns, 5); } -void portcullis::ModelFeatures::trainSplicingModels(const JunctionList& pass, const JunctionList& fail) { +void portcullis::ml::ModelFeatures::trainSplicingModels(const JunctionList& pass, const JunctionList& fail) { vector donors; vector acceptors; @@ -167,7 +174,64 @@ void portcullis::ModelFeatures::trainSplicingModels(const JunctionList& pass, co acceptorFModel.train(acceptors, 5); } -Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { +void portcullis::ml::ModelFeatures::setRow(Data* d, size_t row, JunctionPtr j, bool labelled) { + SplicingScores ss = j->calcSplicingScores(gmap, donorTModel, donorFModel, acceptorTModel, acceptorFModel, + donorPWModel, acceptorPWModel); + + bool error = false; + d->set(0, row, j->isGenuine(), error); + + uint16_t i = 1; + + if (features[1].active) { + d->set(i++, row, j->getNbUniquelySplicedReads(), error); + } + if (features[2].active) { + d->set(i++, row, j->getNbDistinctAlignments(), error); + } + if (features[3].active) { + d->set(i++, row, j->getNbReliableAlignments(), error); + } + if (features[4].active) { + d->set(i++, row, j->getEntropy(), error); + } + if (features[5].active) { + d->set(i++, row, j->getReliable2RawRatio(), error); + } + if (features[6].active) { + d->set(i++, row, j->getMaxMinAnchor(), error); + } + if (features[7].active) { + d->set(i++, row, j->getMaxMMES(), error); + } + if (features[8].active) { + d->set(i++, row, j->getMeanMismatches(), error); + } + if (features[9].active) { + d->set(i++, row, L95 == 0 ? 
0.0 : j->calcIntronScore(L95), error); + } + if (features[10].active) { + d->set(i++, row, std::min(j->getHammingDistance5p(), j->getHammingDistance3p()), error); + } + if (features[11].active) { + d->set(i++, row, isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel), error); + } + if (features[12].active) { + d->set(i++, row, isPWModelEmpty() ? 0.0 : ss.positionWeighting, error); + } + if (features[13].active) { + d->set(i++, row, isPWModelEmpty() ? 0.0 : ss.splicingSignal, error); + } + + //Junction overhang values at each position are first converted into deviation from expected distributions + for(size_t joi = 0; joi < JO_NAMES.size(); joi++) { + if (features[joi + 14].active) { + d->set(i++, row, j->getJunctionOverhangLogDeviation(joi), error); + } + } +} + +Data* portcullis::ml::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { vector headers; for(auto& f : features) { @@ -177,94 +241,234 @@ Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { } // Convert junction list info to double* - double* d = new double[x.size() * headers.size()]; + Data* d = new DataDouble(headers, x.size(), headers.size()); uint32_t row = 0; - for (const auto& j : x) { - SplicingScores ss = j->calcSplicingScores(gmap, donorTModel, donorFModel, acceptorTModel, acceptorFModel, - donorPWModel, acceptorPWModel); - - d[0 * x.size() + row] = j->isGenuine(); - - uint16_t i = 1; + for (const auto& j : x) { + setRow(d, row, j, true); + row++; + } + + return d; +} + +Data* portcullis::ml::ModelFeatures::juncs2FeatureVectors(const JunctionList& xl, const JunctionList& xu) { - if (features[1].active) { - d[i++ * x.size() + row] = j->getNbUniquelySplicedReads(); - } - if (features[2].active) { - d[i++ * x.size() + row] = j->getNbDistinctAlignments(); - } - if (features[3].active) { - d[i++ * x.size() + row] = j->getNbReliableAlignments(); - } - if (features[4].active) { - d[i++ * x.size() + row] = j->getEntropy(); - } - if 
(features[5].active) { - d[i++ * x.size() + row] = j->getReliable2RawRatio(); - } - if (features[6].active) { - d[i++ * x.size() + row] = j->getMaxMinAnchor(); - } - if (features[7].active) { - d[i++ * x.size() + row] = j->getMaxMMES(); - } - if (features[8].active) { - d[i++ * x.size() + row] = j->getMeanMismatches(); + vector headers; + for(auto& f : features) { + if (f.active) { + headers.push_back(f.name); } - if (features[9].active) { - d[i++ * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); + } + + // Convert junction list info to double* + Data* d = new DataDouble(headers, xl.size() + xu.size(), headers.size()); + + uint32_t row = 0; + for (const auto& j : xl) { + setRow(d, row, j, true); + row++; + } + + for (const auto& j : xu) { + setRow(d, row, j, false); + row++; + } + + return d; +} + + + +portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const JunctionList& pos, const JunctionList& neg, + string outputPrefix, uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose, bool smote, bool enn) { + + // Work out number of times to duplicate negative set + const int N = (pos.size() / neg.size()) - 1; + + // Duplicate pointers to negative set + JunctionList neg2; + neg2.reserve(neg.size()); + neg2.insert(neg2.end(), neg.begin(), neg.end()); + + uint32_t smote_rows = 0; + double* smote_data = 0; + if (N > 0 && smote) { + + cout << "Oversampling negative set to balance with positive set using SMOTE" << endl; + Data* negData = juncs2FeatureVectors(neg); + const int SC = negData->getNumCols() - 1; + + size_t nelements = negData->getNumRows() * SC ; + double* nm = new double[nelements]; + for( uint32_t baseidx = 0; baseidx < negData->getNumRows(); baseidx++ ) { + double* r = &nm[baseidx * SC]; + for( size_t c = 1; c < negData->getNumCols(); c++) { + r[c - 1] = negData->get(baseidx, c); + //cout << r[c-1]; + } + //cout << endl; } - if (features[10].active) { - d[i++ * x.size() + row] = std::min(j->getHammingDistance5p(), 
j->getHammingDistance3p()); + + Smote smote(5, N, threads, nm, negData->getNumRows(), negData->getNumCols()-1); + smote.execute(); + smote_rows = smote.getNbSynthRows(); + smote_data = new double[smote_rows * SC]; + double* sd = smote.getSynthetic(); + for(size_t i = 0; i < smote_rows * SC; i++) { + smote_data[i] = sd[i]; } - if (features[11].active) { - d[i++ * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); + cout << "Number of synthesized entries: " << smote.getNbSynthRows() << endl; + } + else if (N <= 0 && smote) { + + cout << "Undersampling negative set to balance with positive set" << endl; + std::mt19937 rng(12345); + while(neg2.size() > pos.size()) { + std::uniform_int_distribution gen(0, neg2.size()); // uniform, unbiased + int i = gen(rng); + neg2.erase(neg2.begin()+i); } - if (features[12].active) { - d[i++ * x.size() + row] = isPWModelEmpty() ? 0.0 : ss.positionWeighting; + } + + + if (verbose) cout << endl << "Combining positive, negative " << (N > 0 ? "and synthetic negative " : "") << "datasets." << endl; + + JunctionList training; + training.reserve(pos.size() + neg2.size()); + training.insert(training.end(), pos.begin(), pos.end()); + training.insert(training.end(), neg2.begin(), neg2.end()); + + JunctionSystem trainingSystem(training); + trainingSystem.sort(); + JunctionList x = trainingSystem.getJunctions(); + + + Data* otd = juncs2FeatureVectors(x); + + // Create data to correct size + Data* trainingData = N > 0 && smote ? + new DataDouble( + otd->getVariableNames(), + x.size() + smote_rows, + otd->getNumCols()) + : otd; + + if (N > 0 && smote) { + const int SC = trainingData->getNumCols() - 1; + bool error = false; + for(size_t i = 0; i < otd->getNumRows(); i++) { + for(size_t j = 0; j < trainingData->getNumCols(); j++) { + trainingData->set(j, i, otd->get(i, j), error); + } } - if (features[13].active) { - d[i++ * x.size() + row] = isPWModelEmpty() ? 
0.0 : ss.splicingSignal; + + size_t k = x.size(); + for(size_t i = 0; i < smote_rows; i++) { + trainingData->set(0, k, 0.0, error); // Set genuine (i.e. not genuine) flag + for(size_t j = 1; j < trainingData->getNumCols(); j++) { + trainingData->set(j, k, smote_data[(i * SC) + j-1], error); + } + k++; } + delete[] smote_data; + } + + vector results; - - //Junction overhang values at each position are first converted into deviation from expected distributions - for(size_t joi = 0; joi < JO_NAMES.size(); joi++) { - if (features[joi + 14].active) { - d[i++ * x.size() + row] = j->getJunctionOverhangLogDeviation(joi); + if (enn) { + if (verbose) cout << endl << "Converting training data for ENN" << endl; + size_t elements = trainingData->getNumRows()*(trainingData->getNumCols()-1); + double* m = new double[elements]; + for( uint32_t baseidx = 0; baseidx < trainingData->getNumRows(); baseidx++ ) { + double* r = &m[baseidx * (trainingData->getNumCols()-1)]; + for( size_t c = 1; c < trainingData->getNumCols(); c++) { + r[c - 1] = trainingData->get(baseidx, c); + } + } + + if (verbose) cout << "Extracting labels for ENN" << endl; + vector labels; + uint32_t p = 0, n = 0, o = 0; + for(size_t i = 0; i < trainingData->getNumRows(); i++) { + labels.push_back(trainingData->get(i, 0) == 1.0); + if (trainingData->get(i, 0) == 1.0) { + p++; + } + else if (trainingData->get(i, 0) == 0.0) { + n++; + } + else { + o++; } } - - row++; - } - - Data* data = new DataDouble(d, headers, x.size(), headers.size()); - data->setExternalData(false); // This causes 'd' to be deleted when 'data' is deleted - return data; -} + cout << "P: " << p << "; N: " << n << "; O: " << o << endl; + cout << endl << "Starting Wilson's Edited Nearest Neighbour (ENN) to clean decision region" << endl; + ENN enn(3, threads, m, trainingData->getNumRows(), trainingData->getNumCols()-1, labels); + enn.setThreshold(3); + enn.setVerbose(true); + uint32_t count = enn.execute(results); + delete[] m; -portcullis::ForestPtr 
portcullis::ModelFeatures::trainInstance(const JunctionList& x, - string outputPrefix, uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose) { - - cout << "Creating feature vector" << endl; - Data* trainingData = juncs2FeatureVectors(x); + uint32_t pcount = 0, ncount = 0; + JunctionList x2; + for(size_t i = 0; i < trainingData->getNumRows(); i++) { + if (trainingData->get(i, 0) == 1.0 && !results[i]) { + pcount++; + } + else if (trainingData->get(i, 0) == 0.0 && !results[i]) { + ncount++; + } + } + + cout << "Should discard " << pcount << " + entries and " << ncount << " - entries (Total=" << count << ")" << endl << endl; + } - path feature_file = outputPrefix + ".features"; - cout << "Saving feature vector to disk: " << feature_file << endl; + uint32_t pcount = 0, ncount = 0; - ofstream fout(feature_file.c_str(), std::ofstream::out); + Data* trainingData2 = enn ? + new DataDouble( + trainingData->getVariableNames(), + trainingData->getNumRows() - pcount, + trainingData->getNumCols()) : + trainingData; - fout << Intron::locationOutputHeader() << "\t" << trainingData->getHeader() << endl; - for(size_t i = 0; i < x.size(); i++) { - fout << *(x[i]->getIntron()) << "\t" << trainingData->getRow(i) << endl; + if (enn) { + size_t new_index = 0; + for(size_t i = 0; i < trainingData->getNumRows(); i++) { + if (results[i]) { // || trainingData->get(i, 0) == 0.0) { + bool error = false; + for(size_t j = 0; j < trainingData->getNumCols(); j++) { + trainingData2->set(j, new_index, trainingData->get(i, j), error); + } + new_index++; + if (trainingData->get(i, 0) == 1) { + pcount++; + } + else { + ncount++; + } + } + } + + delete trainingData; + + cout << "Final training set contains " << pcount << " positive entries and " << ncount << " negative entries" << endl; } - fout.close(); - - cout << "Initialising random forest" << endl; + /*path feature_file = outputPrefix + ".features"; + if (verbose) cout << "Saving feature vector to disk: " << feature_file << endl; + 
+ ofstream fout(feature_file.c_str(), std::ofstream::out); + fout << Intron::locationOutputHeader() << "\t" << trainingData2->getHeader() << endl; + for(size_t i = 0; i < x2.size(); i++) { + fout << *(x2[i]->getIntron()) << "\t" << trainingData2->getRow(i) << endl; + } + fout.close();*/ + + if (verbose) cout << "Initialising random forest" << endl; ForestPtr f = nullptr; if (probabilityMode) { f = make_shared(); @@ -278,26 +482,29 @@ portcullis::ForestPtr portcullis::ModelFeatures::trainInstance(const JunctionLis f->init( "Genuine", // Dependant variable name MEM_DOUBLE, // Memory mode - trainingData, // Data object + trainingData2, // Data object 0, // M Try (0 == use default) outputPrefix, // Output prefix - 250, //trees, // Number of trees + trees, // Number of trees 1236456789, // Use fixed seed to avoid non-deterministic behaviour as much as possible threads, // Number of threads IMP_GINI, // Importance measure probabilityMode ? DEFAULT_MIN_NODE_SIZE_PROBABILITY : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size "", // Status var name false, // Prediction mode - true, // Replace + false, // Replace catVars, // Unordered categorical variable names (vector) false, // Memory saving - DEFAULT_SPLITRULE, // Split rule + AUC, //DEFAULT_SPLITRULE, // Split rule false, // predall 1.0); // Sample fraction - cout << "Training" << endl; + if (verbose) cout << "Training" << endl; f->setVerboseOut(&cerr); f->run(verbose); + cout << "OOBE: " << f->getOverallPredictionError() << endl; + + delete trainingData2; return f; } \ No newline at end of file diff --git a/lib/src/performance.cc b/lib/src/performance.cc new file mode 100644 index 00000000..4a468930 --- /dev/null +++ b/lib/src/performance.cc @@ -0,0 +1,124 @@ +// ******************************************************************** +// This file is part of Portcullis. 
+// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#include +#include +using std::cout; +using std::endl; + +#include + + +string portcullis::ml::Performance::toShortString() const { + vector parts; + parts.push_back(std::to_string(tp)); + parts.push_back(std::to_string(tn)); + parts.push_back(std::to_string(fp)); + parts.push_back(std::to_string(fn)); + parts.push_back(Performance::to_2dp_string(getRecall())); + parts.push_back(Performance::to_2dp_string(getPrecision())); + parts.push_back(Performance::to_2dp_string(getF1Score())); + return boost::algorithm::join(parts, "\t"); +} + +string portcullis::ml::Performance::toLongString() const { + vector parts; + parts.push_back(std::to_string(tp)); + parts.push_back(std::to_string(tn)); + parts.push_back(std::to_string(fp)); + parts.push_back(std::to_string(fn)); + parts.push_back(Performance::to_2dp_string(getPrevalence())); + parts.push_back(Performance::to_2dp_string(getBias())); + parts.push_back(Performance::to_2dp_string(getSensitivity())); + parts.push_back(Performance::to_2dp_string(getSpecificity())); + parts.push_back(Performance::to_2dp_string(getPrecision())); + parts.push_back(Performance::to_2dp_string(getNPV())); + parts.push_back(Performance::to_2dp_string(getF1Score())); + parts.push_back(Performance::to_2dp_string(getAccuracy())); + 
parts.push_back(Performance::to_2dp_string(getInformedness())); + parts.push_back(Performance::to_2dp_string(getMarkedness())); + parts.push_back(Performance::to_2dp_string(getMCC())); + return boost::algorithm::join(parts, "\t"); +} + + +void portcullis::ml::Performance::loadGenuine(path& genuineFile, vector& results) { + + // Load reference data + std::ifstream refs(genuineFile.string()); + string line; + uint32_t lineNb = 0; + while (std::getline(refs, line)) { + std::istringstream iss(line); + bool res; + iss >> res; + results.push_back(res); + } + refs.close(); +} + +void portcullis::ml::PerformanceList::outputMeanPerformance(std::ostream& resout) { + + vector prevs; + vector biases; + vector recs; + vector prcs; + vector spcs; + vector f1s; + vector infs; + vector mrks; + vector accs; + vector mccs; + + for(auto& p : this->scores) { + prevs.push_back(p->getPrevalence()); + biases.push_back(p->getBias()); + recs.push_back(p->getRecall()); + prcs.push_back(p->getPrecision()); + spcs.push_back(p->getSpecificity()); + f1s.push_back(p->getF1Score()); + infs.push_back(p->getInformedness()); + mrks.push_back(p->getMarkedness()); + accs.push_back(p->getAccuracy()); + mccs.push_back(p->getMCC()); + } + + outputMeanScore(prevs, "prevalence", resout); + outputMeanScore(biases, "bias", resout); + outputMeanScore(recs, "recall", resout); + outputMeanScore(prcs, "precision", resout); + outputMeanScore(f1s, "F1", resout); + outputMeanScore(spcs, "specificity", resout); + outputMeanScore(accs, "accuracy", resout); + outputMeanScore(infs, "informedness", resout); + outputMeanScore(mrks, "markededness", resout); + outputMeanScore(mccs, "MCC", resout); +} + +void portcullis::ml::PerformanceList::outputMeanScore(const vector& scores, const string& score_type, std::ostream& resout) { + double sum = std::accumulate(scores.begin(), scores.end(), 0.0); + double mean = sum / scores.size(); + double sq_sum = std::inner_product(scores.begin(), scores.end(), scores.begin(), 0.0); + 
double stdev = std::sqrt(sq_sum / scores.size() - mean * mean); + + stringstream msg; + msg << "Mean " << std::left << std::setw(13) << score_type << ": " << std::fixed << std::setprecision(2) << mean << "% (+/- " << stdev << "%)" << endl; + + cout << msg.str(); + resout << msg.str(); +} + diff --git a/lib/src/smote.cc b/lib/src/smote.cc new file mode 100644 index 00000000..c611e527 --- /dev/null +++ b/lib/src/smote.cc @@ -0,0 +1,99 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#include +using std::cout; +using std::endl; + +#include +using portcullis::ml::KNN; + +#include + +portcullis::ml::Smote::Smote(uint16_t defaultK, uint16_t _smoteness, uint16_t _threads, double* _data, size_t _rows, size_t _cols) { + + data = _data; + rows = _rows; + cols = _cols; + if (_rows < defaultK && _rows < 100) + k = _rows; + else + k = defaultK; + smoteness = _smoteness < 1 ? 
1 : _smoteness; + threads = _threads; + verbose = false; + + s_rows = smoteness * rows; + synthetic = new double[s_rows * cols]; +} + +void portcullis::ml::Smote::execute() { + + auto_cpu_timer timer(1, "SMOTE Time taken: %ws\n\n"); + + if (verbose) { + cout << "Starting Synthetic Minority Oversampling Technique (SMOTE)" << endl; + } + + uint32_t new_index = 0; + + KNN knn(k, threads, data, rows, cols); + knn.setVerbose(verbose); + knn.execute(); + + std::mt19937 rng(12345); + std::uniform_int_distribution igen(0, k); + std::uniform_real_distribution dgen(0, 1); + + + for(size_t i = 0; i < rows; i++) { + uint16_t N = smoteness; + while(N > 0) { + const vector nns = knn.getNNs(i); + uint32_t nn = nns[igen(rng)]; // Nearest neighbour row index + + for(size_t j = 0; j < cols; j++) { + double dif = data[(nn * cols) + j] - data[(i * cols) + j]; + double gap = dgen(rng); + synthetic[(new_index * cols) + j] = data[(i * cols) + j] + gap * dif; + } + + new_index++; + N--; + } + } +} + +void portcullis::ml::Smote::print(ostream& out) const { + + out << "Input:" << endl; + for(size_t i = 0; i < rows; i++) { + for(size_t j = 0; j < cols; j++) { + out << data[(i * cols) + j] << " "; + } + out << endl; + } + + out << endl << "Synthetic:" << endl; + for(size_t i = 0; i < s_rows; i++) { + for(size_t j = 0; j < cols; j++) { + out << synthetic[(i * cols) + j] << " "; + } + out << endl; + } + out << endl; +} \ No newline at end of file diff --git a/lib/src/ss_forest.cc b/lib/src/ss_forest.cc new file mode 100644 index 00000000..b5f0a0ba --- /dev/null +++ b/lib/src/ss_forest.cc @@ -0,0 +1,227 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#include +using std::cout; +using std::cerr; +using std::endl; +using std::ostream; + +#include +#include + +#include + +#include +using portcullis::SeqUtils; + +#include + +portcullis::ml::SemiSupervisedForest::SemiSupervisedForest(ModelFeatures& mf, + const JunctionList& _labelled, const JunctionList& _unlabelled, + string _outputPrefix, uint16_t _trees, uint16_t _threads, double _contribution, bool _verbose) { + + verbose = _verbose; + contribution = _contribution; + + labelled = mf.juncs2FeatureVectors(_labelled); + if (verbose) cout << "Created labelled FV with " << labelled->getNumRows() << " entries." << endl; + + unlabelled = mf.juncs2FeatureVectors(_unlabelled); + if (verbose) cout << "Created unlabelled FV with " << unlabelled->getNumRows() << " entries." << endl; + + // Combine labelled and unlabelled + // This is inefficient but it makes life easier... hopefully it doesn't take too + // much memory + all = mf.juncs2FeatureVectors(_labelled, _unlabelled); + if (verbose) cout << "Created combined FV with " << all->getNumRows() << " entries." 
<< endl; + + + outputPrefix = _outputPrefix; + threads = _threads; + trees = _trees; + +} + +portcullis::ml::SemiSupervisedForest::~SemiSupervisedForest() { + delete labelled; + delete unlabelled; + delete all; +} + +ForestPtr portcullis::ml::SemiSupervisedForest::train() { + + if (verbose) cout << "Initialising random forest" << endl; + ForestPtr l = make_shared(); + + vector catVars; + + l->init( + "Genuine", // Dependant variable name + MEM_DOUBLE, // Memory mode + labelled, // Data object + 0, // M Try (0 == use default) + outputPrefix, // Output prefix + trees, // Number of trees + 1236456789, // Use fixed seed to avoid non-deterministic behaviour as much as possible + threads, // Number of threads + IMP_GINI, // Importance measure + DEFAULT_MIN_NODE_SIZE_PROBABILITY, // Min node size + "", // Status var name (not required unless using survival) + false, // Prediction mode + true, // Sample with replacement + catVars, // Unordered categorical variable names (vector) + false, // Memory saving + DEFAULT_SPLITRULE, // Split rule + true, // Retain results for all trees (predict all) + 1.0); // Sample fraction + + vector case_weights = {0.7, 0.3}; + //l->setCaseWeights(case_weights); + l->setVerboseOut(&cout); + + if (verbose) cout << "Training on labelled data" << endl; + l->run(verbose); + + string orig_forest = outputPrefix+".ssrf.0.forest"; + l->saveToFile(orig_forest); + + vector oobe; + + // Store out of box prediction error for first run on just labelled data + oobe.push_back(l->getOverallPredictionError()); + cout << "OOBE: " << l->getOverallPredictionError() << endl; + + l->setPredictionMode(true); + l->setData(unlabelled, "Genuine", "", catVars); + l->run(verbose); + + + ForestPtr u = make_shared(); + u->init( + MEM_DOUBLE, // Memory mode + 0, // M Try (0 == use default) + outputPrefix, // Output prefix + trees, // Number of trees + 1236456789, // Use fixed seed to avoid non-deterministic behaviour as much as possible + threads, // Number of threads + 
IMP_GINI, // Importance measure + DEFAULT_MIN_NODE_SIZE_PROBABILITY, // Min node size + false, // Prediction mode + true, // Replace + false, // Memory saving + DEFAULT_SPLITRULE, // Split rule + true, // predall + 1.0); // Sample fraction + u->setData(all, "Genuine", "", catVars); + + + // Loop until no improvement using deterministic annealing + bool initOnly = false; + bool first = true; + bool improved = true; + uint16_t repeat = 1; + uint16_t it = 1; + string best_forest; + while(improved || repeat <= REPEAT_LIMIT) { + + // If OOBE improved during the last iteration make new predictions on + // unlabelled data with the current model + if (improved && !first) { + if (verbose) cout << "Making predictions on the unlabelled set using current model" << endl; + u->setPredictionMode(true); + u->run(verbose); + if (verbose) cout << "Made predictions." << endl; + } + + // For the unlabelled set draw random labels using the actual probability + // distributions of each tree in the forest and reassign genuine flag + for(size_t i = 0; i < unlabelled->getNumRows(); i++) { + // Note we are assuming field 0 contains the label + // Override field 0 with the new label + bool error = false; + double pred = first ? 
l->makePrediction(i) : u->makePrediction(labelled->getNumRows() + i); + //cout << pred << endl; + all->set(0, labelled->getNumRows() + i, pred, error); + if (error) { + BOOST_THROW_EXCEPTION(SSRFException() << SSRFErrorInfo(string( + "Error setting label for initially unlabelled junction ") + std::to_string(i))); + } + } + + if (verbose) cout << "Re-training newly labelled data" << endl; + u->setPredictionMode(false); + u->run(verbose); + + cout << "OOBE: " << u->getOverallPredictionError() << endl; + + double error_delta = oobe.back() - u->getOverallPredictionError(); + if (error_delta <= 0.0 && !first) { + improved = false; + repeat++; + cout << "No improvement with this permutation" << endl; + } + else { + oobe.push_back(u->getOverallPredictionError()); + improved = true; + repeat = 1; + forest = u; + if (!first) cout << "Improvement of " << error_delta << " to OOBE with this iteration" << endl; + best_forest = outputPrefix+".ssrf." + std::to_string(it++) + ".forest"; + u->saveToFile(best_forest); + } + + first = false; + } + + if (oobe.back() - oobe.front() >= 0.0) { + initOnly = true; + cout << "Using unlabelled data does not appear to have improved the model. Reverting to original model derived only from initally labelled data." << endl; + } + else { + for(size_t j = labelled->getNumRows(); j < labelled->getNumRows()+unlabelled->getNumRows(); j++) { + bool error = false; + all->set(0, j, all->get(j, 0), error); + } + } + + // Revert to the best forest + ForestPtr b = make_shared(); + b->init( + "Genuine", // Dependant variable name + MEM_DOUBLE, // Memory mode + initOnly ? 
labelled : all, // Data object + 0, // M Try (0 == use default) + outputPrefix, // Output prefix + trees, // Number of trees + 1236456789, // Use fixed seed to avoid non-deterministic behaviour as much as possible + threads, // Number of threads + IMP_GINI, // Importance measure + DEFAULT_MIN_NODE_SIZE_PROBABILITY, // Min node size + "", // Status var name + false, // Prediction mode + true, // Replace + catVars, // Unordered categorical variable names (vector) + false, // Memory saving + DEFAULT_SPLITRULE, // Split rule + true, // predall + 1.0); // Sample fraction + + b->run(verbose); + + return b; +} diff --git a/scripts/hpc.json b/scripts/hpc.json index 65ef3300..c38ec995 100755 --- a/scripts/hpc.json +++ b/scripts/hpc.json @@ -1,52 +1,92 @@ { - "__default__": { - "threads": "4", - "memory": "5000", - "queue": "Test" - }, - "align_gsnap": { - "memory": "30000" - }, - "align_star": { - "memory": "60000" - }, - "align_hisat": { - "memory": "20000" - }, - "bam_sort": { - "memory": "10000" - }, - "asm_cufflinks": { - "memory": "30000" - }, - "asm_stringtie": { - "memory": "25000" - }, - "asm_class": { - "memory": "30000" - }, - "portcullis_junc": { - "memory": "25000" - }, - "truesight": { - "memory": "60000" - }, - "finesplice": { - "memory": "20000" - }, - "spanki": { - "memory": "20000" - }, - "simgen_model": { - "memory": "30000" - }, - "align_tophat_wref": { - "memory": "20000" - }, - "simgen_bowtie_align": { - "memory": "20000" - }, - "fastqc": { - "memory": "20000" - } + "__default__" : + { + "threads" : "4", + "memory" : "5000" + }, + "align_gsnap_index" : + { + "memory" : "20000" + }, + "align_gsnap" : + { + "memory" : "30000" + }, + "align_star_index" : + { + "memory" : "60000" + }, + "align_star" : + { + "memory" : "60000" + }, + "align_hisat" : + { + "memory" : "20000" + }, + "bam_sort" : + { + "memory" : "10000" + }, + "asm_cufflinks" : + { + "memory" : "30000" + }, + "asm_stringtie" : + { + "memory" : "25000" + }, + "asm_class" : + { + "memory" : 
"30000" + }, + "portcullis_prep" : + { + "memory" : "10000" + }, + "portcullis_junc" : + { + "memory" : "25000" + }, + "portcullis_junc2" : + { + "memory" : "25000" + }, + "portcullis_prep2" : + { + "memory" : "10000" + }, + "truesight" : + { + "memory": "60000" + }, + "soapsplice" : + { + "memory": "60000" + }, + "finesplice" : + { + "memory": "20000" + }, + "spanki" : + { + "memory": "20000" + }, + "simgen_model" : + { + "memory": "30000" + }, + "align_tophat_wref" : + { + "memory": "20000" + }, + "simgen_bowtie_align" : + { + "memory": "20000" + }, + "fastqc" : + { + "memory": "20000" + } + } diff --git a/scripts/predict.snakefile b/scripts/predict.snakefile index 99f780df..cf44867d 100644 --- a/scripts/predict.snakefile +++ b/scripts/predict.snakefile @@ -38,6 +38,7 @@ LOAD_PORTCULLIS = config["load_portcullis"] LOAD_SPANKI = config["load_spanki"] LOAD_FINESPLICE = config["load_finesplice"] LOAD_TRUESIGHT = config["load_truesight"] +LOAD_SOAPSPLICE = config["load_soapsplice"] LOAD_PYTHON3 = config["load_python3"] LOAD_GT = config["load_gt"] LOAD_MIKADO = config["load_mikado"] @@ -72,6 +73,8 @@ JUNC_DIR = OUT_DIR + "/junctions" JUNC_DIR_FULL = os.path.abspath(JUNC_DIR) PORTCULLIS_DIR = JUNC_DIR + "/portcullis" PORTCULLIS_DIR_FULL = os.path.abspath(PORTCULLIS_DIR) +PORTCULLIS_DIR2 = JUNC_DIR + "/portcullis2" +PORTCULLIS_DIR2_FULL = os.path.abspath(PORTCULLIS_DIR2) CWD = os.getcwd() @@ -85,7 +88,7 @@ ISOFORM_FRACTION = {"permissive": 0.01, "semipermissive": 0.03, "default": 0.1, ######################### Rules -localrules: all, truesight2bed +localrules: all, truesight2bed, soapsplice2bed # Define rule all: @@ -95,7 +98,9 @@ rule all: expand(JUNC_DIR+"/output/tophat-{reads}-spanki.bed", reads=INPUT_SETS), expand(JUNC_DIR+"/output/tophat-{reads}-finesplice.bed", reads=INPUT_SETS), expand(JUNC_DIR+"/output/truesight-{reads}-truesight.bed", reads=INPUT_SETS), - expand(ASM_DIR+"/output/{asm_method}_{asm_mode}-{aln_method}-{reads}.bed", asm_method=ASSEMBLY_METHODS, 
asm_mode=ASSEMBLY_MODES, aln_method=ALIGNMENT_METHODS, reads=INPUT_SETS) + expand(JUNC_DIR+"/output/soapsplice-{reads}-soapsplice.bed", reads=INPUT_SETS), + expand(ASM_DIR+"/output/{asm_method}_{asm_mode}-{aln_method}-{reads}.bed", asm_method=ASSEMBLY_METHODS, asm_mode=ASSEMBLY_MODES, aln_method=ALIGNMENT_METHODS, reads=INPUT_SETS), + #expand(PORTCULLIS_DIR2+"/{aln_method}-{reads}/junc/{aln_method}-{reads}.junctions.tab", aln_method=ALIGNMENT_METHODS, reads=INPUT_SETS) # expand(ASM_DIR+"/output/{asm_method}_{asm_mode}-{aln_method}-{reads}.stats", asm_method=ASSEMBLY_METHODS, asm_mode=ASSEMBLY_MODES, aln_method=ALIGNMENT_METHODS, reads=INPUT_SETS) @@ -269,7 +274,9 @@ rule portcullis_prep: ref=REF, bam=rules.bam_sort.output, idx=rules.bam_index.output - output: PORTCULLIS_DIR+"/{aln_method}-{reads}/prep/portcullis.sorted.alignments.bam.bai" + output: + bai=PORTCULLIS_DIR+"/{aln_method}-{reads}/prep/portcullis.sorted.alignments.bam.bai", + fa=PORTCULLIS_DIR+"/{aln_method}-{reads}/prep/portcullis.genome.fa" params: outdir=PORTCULLIS_DIR+"/{aln_method}-{reads}/prep", load=LOAD_PORTCULLIS @@ -278,7 +285,7 @@ rule portcullis_prep: message: "Using portcullis to prepare: {input}" run: strand = PORTCULLIS_STRAND if wildcards.reads.startswith("real") else "unstranded" - shell("{params.load} && portcullis prep -o {params.outdir} -l --strandedness={strand} -t {threads} {input.ref} {input.bam} > {log} 2>&1") + shell("{params.load} && portcullis prep -o {params.outdir} --strandedness={strand} -t {threads} {input.ref} {input.bam} > {log} 2>&1") rule portcullis_junc: @@ -294,11 +301,12 @@ rule portcullis_junc: message: "Using portcullis to analyse potential junctions: {input}" run: strand = PORTCULLIS_STRAND if wildcards.reads.startswith("real") else "unstranded" - shell("{params.load} && {RUN_TIME} portcullis junc -o {params.outdir} -p {wildcards.aln_method}-{wildcards.reads} --strandedness={strand} -t {threads} {params.prepdir} > {log} 2>&1") + shell("{params.load} && {RUN_TIME} 
portcullis junc -o {params.outdir}/{wildcards.aln_method}-{wildcards.reads} --strandedness={strand} -t {threads} {params.prepdir} > {log} 2>&1") rule portcullis_filter: - input: rules.portcullis_junc.output + input: tab=rules.portcullis_junc.output, + ref=rules.portcullis_prep.output.fa output: link=JUNC_DIR+"/output/{aln_method}-{reads}-portcullis.bed", unfilt_link=JUNC_DIR+"/output/{aln_method}-{reads}-all.bed", @@ -311,7 +319,7 @@ rule portcullis_filter: log: PORTCULLIS_DIR+"/{aln_method}-{reads}-filter.log" threads: int(THREADS) message: "Using portcullis to filter invalid junctions: {input}" - shell: "{params.load} && portcullis filter -t {threads} -o {params.outdir} -p {wildcards.aln_method}-{wildcards.reads} {input} > {log} 2>&1 && ln -sf {params.bed} {output.link} && touch -h {output.link} && ln -sf {params.unfilt_bed} {output.unfilt_link} && touch -h {output.unfilt_link}" + shell: "{params.load} && portcullis filter -t {threads} -o {params.outdir}/{wildcards.aln_method}-{wildcards.reads} {input.ref} {input.tab} > {log} 2>&1 && ln -sf {params.bed} {output.link} && touch -h {output.link} && ln -sf {params.unfilt_bed} {output.unfilt_link} && touch -h {output.unfilt_link}" rule portcullis_bamfilt: @@ -327,6 +335,43 @@ rule portcullis_bamfilt: message: "Using portcullis to filter alignments containing invalid junctions: {input.tab}" shell: "{params.load} && portcullis bamfilt --output={output.bam} {input.tab} {input.bam} > {log} 2>&1" +''' +rule portcullis_prep2: + input: + ref=REF, + bam=rules.bam_sort.output, + idx=rules.bam_index.output + output: PORTCULLIS_DIR2+"/{aln_method}-{reads}/prep/portcullis.sorted.alignments.bam.bai" + params: + outdir=PORTCULLIS_DIR2+"/{aln_method}-{reads}/prep", + load=LOAD_PORTCULLIS2 + log: PORTCULLIS_DIR2+"/{aln_method}-{reads}-prep.log" + threads: int(THREADS) + message: "Using portcullis to prepare: {input}" + run: + strand = PORTCULLIS_STRAND if wildcards.reads.startswith("real") else "unstranded" + shell("{params.load} && 
portcullis prep -o {params.outdir} --strandedness={strand} -t {threads} {input.ref} {input.bam} > {log} 2>&1") + + +rule portcullis_junc2: + input: + bai=rules.portcullis_prep2.output + output: PORTCULLIS_DIR2+"/{aln_method}-{reads}/junc/{aln_method}-{reads}.junctions.tab" + params: + prepdir=PORTCULLIS_DIR2+"/{aln_method}-{reads}/prep", + outdir=PORTCULLIS_DIR2+"/{aln_method}-{reads}/junc", + load=LOAD_PORTCULLIS2 + log: PORTCULLIS_DIR2+"/{aln_method}-{reads}-junc.log" + threads: int(THREADS) + message: "Using portcullis to analyse potential junctions: {input}" + run: + strand = PORTCULLIS_STRAND if wildcards.reads.startswith("real") else "unstranded" + shell("{params.load} && {RUN_TIME} portcullis junc -o {params.outdir} -p {wildcards.aln_method}-{wildcards.reads} --strandedness={strand} -t {threads} {params.prepdir} > {log} 2>&1") +''' + + + + rule spanki: input: bam=rules.bam_sort.output, @@ -408,7 +453,7 @@ rule truesight: log: JUNC_DIR_FULL+"/truesight/{reads}-truesight.log" threads: int(THREADS) message: "Using Truesight to find junctions" - shell: "{params.load_fs} && cd {params.outdir} && {RUN_TIME} truesight_pair.pl -i {MIN_INTRON} -I {MAX_INTRON} -v 1 -r {params.index} -p {threads} -o . -f {params.r1} {params.r2} > {log} 2>&1 && cd {CWD}" + shell: "{params.load_fs} && cd {params.outdir} && {RUN_TIME} truesight_pair.pl -i {MIN_INTRON} -I {MAX_INTRON} -v 1 -r {params.index} -p {threads} -o . 
-f {params.r1} {params.r2} > {log} 2>&1 && rm GapAli.sam && cd {CWD}" rule truesight2bed: input: rules.truesight.output @@ -422,6 +467,43 @@ rule truesight2bed: message: "Creating bed file from truesight output: {input}" shell: "{params.load_portcullis} && truesight2bed.py {input} > {output.bed} && ln -sf {params.bed} {output.link} && touch -h {output.link}" +rule soapsplice_index: + input: fa=REF + output: JUNC_DIR + "/soapsplice/index/"+NAME+".index.bwt" + log: JUNC_DIR + "/soapsplice/index.log" + params: + load_ss=LOAD_SOAPSPLICE, + index=JUNC_DIR + "/soapsplice/index/"+NAME + threads: 1 + message: "Creating index for soapsplice" + shell: "{params.load_ss} && 2bwt-builder {input.fa} {params.index} > {log} 2>&1" + +rule soapsplice: + input: + r1=READS_DIR+"/{reads}.R1.fq", + r2=READS_DIR+"/{reads}.R2.fq", + idx=rules.soapsplice_index.output + output: JUNC_DIR+"/soapsplice/{reads}/ss-{reads}.junc" + params: + load_ss=LOAD_SOAPSPLICE, + index=JUNC_DIR + "/soapsplice/index/"+NAME+".index", + outdir=JUNC_DIR+"/soapsplice/{reads}" + log: JUNC_DIR+"/soapsplice/{reads}-soapsplice.log" + threads: int(THREADS) + message: "Using soapsplice to find junctions" + shell: "{params.load_ss} && {RUN_TIME} soapsplice -d {params.index} -1 {input.r1} -2 {input.r2} -I 450 -o {params.outdir}/ss-{wildcards.reads} -p {threads} -t {MAX_INTRON} -c 0 -f 2 -L {MAX_INTRON} -l {MIN_INTRON} > {log} 2>&1 && rm -f {params.outdir}/ss-{wildcards.reads}.sam" + +rule soapsplice2bed: + input: rules.soapsplice.output + output: + link=JUNC_DIR+"/output/soapsplice-{reads}-soapsplice.bed", + bed=JUNC_DIR+"/soapsplice/{reads}/soapsplice-{reads}-soapsplice.bed" + params: + load_portcullis=LOAD_PORTCULLIS, + bed="../soapsplice/{reads}/soapsplice-{reads}-soapsplice.bed" + threads: 1 + message: "Creating bed file from soapsplice output: {input}" + shell: "{params.load_portcullis} && soapsplice2bed.py {input} > {output.bed} && ln -sf {params.bed} {output.link} && touch -h {output.link}" ### @@ -489,13 +571,9 
@@ rule gtf_2_bed: output: bed=ASM_DIR+"/output/{asm_method}_{asm_mode}-{aln_method}-{reads}.bed" params: - load_gt=LOAD_GT, - load_cuff=LOAD_CUFFLINKS, load_p=LOAD_PORTCULLIS, - gff=ASM_DIR+"/output/{asm_method}_{asm_mode}-{aln_method}-{reads}.gff3", - gffi=ASM_DIR+"/output/{asm_method}_{asm_mode}-{aln_method}-{reads}.introns.gff3" message: "Converting GTF to BED for: {input.gtf}" - shell: "{params.load_gt} && {params.load_cuff} && gffread -E {input.gtf} -o- > {params.gff} && gt gff3 -sort -tidy -addintrons -force -o {params.gffi} {params.gff} && {params.load_p} && gff2bed.py {params.gffi} > {output.bed} && rm {params.gff} {params.gffi}" + shell: "{params.load_p} && gtf2bed.py {input.gtf} > {output.bed}" rule gtf_stats: input: diff --git a/scripts/read_gen.snakefile b/scripts/read_gen.snakefile index d2ece61b..44d5fd50 100755 --- a/scripts/read_gen.snakefile +++ b/scripts/read_gen.snakefile @@ -35,6 +35,7 @@ LOAD_SAMTOOLS = config["load_samtools"] LOAD_CUFFLINKS = config["load_cufflinks"] LOAD_STRINGTIE = config["load_stringtie"] LOAD_PORTCULLIS = config["load_portcullis"] +LOAD_PORTCULLIS2 = config["load_portcullis2"] LOAD_PYTHON3 = config["load_python3"] INDEX_STAR_EXTRA = config["index_star_extra"] @@ -261,11 +262,11 @@ rule sim_var_portcullis_prep: output: PORT_DIR+"/prep/portcullis.sorted.alignments.bam.bai" params: outdir=PORT_DIR+"/prep", - load=LOAD_PORTCULLIS + load=LOAD_PORTCULLIS2 log: PORT_DIR+"/prep.log" threads: int(THREADS) message: "Using portcullis to prepare: {input}" - shell: "{params.load} && portcullis prep -o {params.outdir} -l --strandedness={PORTCULLIS_STRAND} -t {threads} {input.ref} {input.bam} > {log} 2>&1" + shell: "{params.load} && portcullis prep -o {params.outdir} --strandedness={PORTCULLIS_STRAND} -t {threads} {input.ref} {input.bam} > {log} 2>&1" rule sim_var_portcullis_junc: @@ -275,10 +276,10 @@ rule sim_var_portcullis_junc: params: prepdir=PORT_DIR+"/prep", outdir=PORT_DIR+"/junc", - load=LOAD_PORTCULLIS + load=LOAD_PORTCULLIS2 
log: PORT_DIR+"/junc.log" threads: int(THREADS) message: "Using portcullis to analyse potential junctions: {input}" - shell: "{params.load} && portcullis junc -o {params.outdir} -p portcullis_sim_var -t {threads} {params.prepdir} > {log} 2>&1" + shell: "{params.load} && portcullis junc -o {params.outdir}/portcullis_sim_var -t {threads} {params.prepdir} > {log} 2>&1" diff --git a/src/junction_builder.cc b/src/junction_builder.cc index 9b44b641..1835797a 100644 --- a/src/junction_builder.cc +++ b/src/junction_builder.cc @@ -93,6 +93,11 @@ void portcullis::JunctionBuilder::process() { } } + if (!bfs::exists(prepData.getSortedBamFilePath())) { + BOOST_THROW_EXCEPTION(JunctionBuilderException() << JunctionBuilderErrorInfo(string( + "Could not find prepared BAM file at: ") + prepData.getSortedBamFilePath().string())); + } + // Test if we have all the required data if (!prepData.valid(useCsi)) { BOOST_THROW_EXCEPTION(JunctionBuilderException() << JunctionBuilderErrorInfo(string( diff --git a/src/junction_filter.cc b/src/junction_filter.cc index b3e78942..5860ad54 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include #include using std::boolalpha; @@ -33,6 +34,7 @@ using std::map; using std::unordered_map; using std::unordered_set; using std::vector; +using std::to_string; using std::cout; using std::cerr; @@ -52,26 +54,26 @@ namespace po = boost::program_options; #include #include -#include +#include +using portcullis::ml::SemiSupervisedForest; + #include #include #include #include -#include #include using portcullis::PortcullisFS; using portcullis::Intron; using portcullis::IntronHasher; -using portcullis::Performance; using portcullis::eval; -using portcullis::bam::GenomeMapper; #include "junction_filter.hpp" +#include "prepare.hpp" -portcullis::JunctionFilter::JunctionFilter( const path& _junctionFile, - const path& _output) { +portcullis::JunctionFilter::JunctionFilter(const path& _prepDir, 
const path& _junctionFile, + const path& _output) { junctionFile = _junctionFile; - genomeFile = ""; + prepData = PreparedFiles(_prepDir); modelFile = ""; genuineFile = ""; output = _output; @@ -86,43 +88,49 @@ portcullis::JunctionFilter::JunctionFilter( const path& _junctionFile, source = DEFAULT_FILTER_SOURCE; verbose = false; threshold = DEFAULT_FILTER_THRESHOLD; + smote = true; + enn = true; } - - + void portcullis::JunctionFilter::filter() { path outputDir = output.parent_path(); string outputPrefix = output.leaf().string(); - + if (outputDir.empty()) { outputDir = "."; } - + // Test if provided genome exists if (!exists(junctionFile)) { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not find junction file at: ") + junctionFile.string())); + "Could not find junction file at: ") + junctionFile.string())); + } + + if (!bfs::exists(prepData.getGenomeFilePath())) { + BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( + "Could not find prepared genome file at: ") + prepData.getGenomeFilePath().string())); } // Test if provided filter config file exists if (!modelFile.empty() && !exists(modelFile)) { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not find filter model file at: ") + modelFile.string())); + "Could not find filter model file at: ") + modelFile.string())); } - + if (!filterFile.empty() && !exists(filterFile)) { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not find filter configuration file at: ") + filterFile.string())); + "Could not find filter configuration file at: ") + filterFile.string())); } - + if (!genuineFile.empty() && !exists(genuineFile)) { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not find file containing marked junction labels at: ") + genuineFile.string())); + "Could not find file containing marked junction labels at: ") + genuineFile.string())); } - + if 
(!referenceFile.empty() && !exists(referenceFile)) { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not find reference BED file at: ") + referenceFile.string())); + "Could not find reference BED file at: ") + referenceFile.string())); } if (!exists(outputDir)) { @@ -130,12 +138,11 @@ void portcullis::JunctionFilter::filter() { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( "Could not create output directory at: ") + outputDir.string())); } - } - else if (!bfs::is_directory(outputDir)) { + } else if (!bfs::is_directory(outputDir)) { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "File exists with name of suggested output directory: ") + outputDir.string())); + "File exists with name of suggested output directory: ") + outputDir.string())); } - + cout << "Loading junctions from " << junctionFile.string() << " ..."; cout.flush(); @@ -143,75 +150,75 @@ void portcullis::JunctionFilter::filter() { JunctionSystem originalJuncs(junctionFile); cout << " done." << endl - << "Found " << originalJuncs.getJunctions().size() << " junctions." << endl << endl; + << "Found " << originalJuncs.getJunctions().size() << " junctions." 
<< endl << endl; unordered_set ref; - if (!referenceFile.empty()) { + if (!referenceFile.empty()) { ifstream ifs(referenceFile.c_str()); string line; // Loop through until end of file or we move onto the next ref seq - while ( std::getline(ifs, line) ) { + while (std::getline(ifs, line)) { boost::trim(line); vector parts; // #2: Search for tokens - boost::split( parts, line, boost::is_any_of("\t"), boost::token_compress_on ); + boost::split(parts, line, boost::is_any_of("\t"), boost::token_compress_on); // Ignore any non-entry lines if (parts.size() == 12) { - int end = std::stoi(parts[7]) - 1; // -1 to get from BED to portcullis coords for end pos + int end = std::stoi(parts[7]) - 1; // -1 to get from BED to portcullis coords for end pos string key = parts[0] + "(" + parts[6] + "," + std::to_string(end) + ")" + parts[5]; ref.insert(key); } } } - + vector genuine; if (!genuineFile.empty()) { - + cout << "Loading list of correct predictions of performance analysis ..."; cout.flush(); - + Performance::loadGenuine(genuineFile, genuine); - + if (genuine.size() != originalJuncs.getJunctions().size()) { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( "Genuine file contains ") + lexical_cast(genuine.size()) + - " entries. Junction file contains " + lexical_cast(originalJuncs.getJunctions().size()) + + " entries. Junction file contains " + lexical_cast(originalJuncs.getJunctions().size()) + " junctions. The number of entries in both files must be the same to assess performance.")); } - + // Copy over results into junction list - for(size_t i = 0; i < originalJuncs.getJunctions().size(); i++) { + for (size_t i = 0; i < originalJuncs.getJunctions().size(); i++) { originalJuncs.getJunctionAt(i)->setGenuine(genuine[i]); } - + cout << " done." 
<< endl << endl; } - + // Also keep the current list of junctions JunctionList currentJuncs; - + // Copy everything into passJunc to begin with - for(auto& j : originalJuncs.getJunctions()) { + for (auto& j : originalJuncs.getJunctions()) { currentJuncs.push_back(j); } - + // To be overridden if we are training ModelFeatures mf; - mf.initGenomeMapper(this->genomeFile); - - mf.features[1].active=false; // NB USRS (BAD) - mf.features[2].active=false; // NB DISTRS (BAD) + mf.initGenomeMapper(prepData.getGenomeFilePath()); + + mf.features[1].active = false; // NB USRS (BAD) + mf.features[2].active = false; // NB DISTRS (BAD) //mf.features[3].active=false; // NB RELRS (GOOD) - mf.features[4].active = false; // ENTROPY (BAD - JO LOGDEV ARE BETTER) + mf.features[4].active = false; // ENTROPY (BAD - JO LOGDEV ARE BETTER) //mf.features[5].active = false; // REL2RAW (GOOD) - mf.features[6].active=false; // MAXMINANC (BAD - MAXMMES IS BETTER) + mf.features[6].active = false; // MAXMINANC (BAD - MAXMMES IS BETTER) //mf.features[7].active=false; // MAXMMES (GOOD) //mf.features[8].active=false; // MEAN MISMATCH (GOOD) //mf.features[9].active=false; // INTRON (GOOD) //mf.features[10].active=false; // MIN_HAMM (GOOD) - mf.features[11].active=false; // CODING POTENTIAL (BAD) + mf.features[11].active = false; // CODING POTENTIAL (BAD) //mf.features[12].active=false; // POS WEIGHTS (GOOD) //mf.features[13].active=false; // SPLICE SIGNAL (GOOD) /*mf.features[14].active=false; // JO LOGDEV FEATURES BETTER THAN ENTROPY @@ -229,23 +236,29 @@ void portcullis::JunctionFilter::filter() { mf.features[26].active=false; mf.features[27].active=false; mf.features[28].active=false; - mf.features[29].active=false;*/ - - - - + mf.features[29].active=false; + */ + + + + double ratio = 0.0; + if (train) { - + // The initial positive and negative sets - JunctionList pos, unlabelled, neg; - + JunctionList pos, unlabelled, neg, unlabelled2; + cout << "Self training mode activated." 
<< endl << endl; - + createPositiveSet(currentJuncs, pos, unlabelled, mf); - - createNegativeSet(mf.L95, unlabelled, neg); - + + createNegativeSet(mf.L95, unlabelled, neg, unlabelled2); + cout << "Initial training set consists of " << pos.size() << " positive and " << neg.size() << " negative junctions." << endl << endl; + + ratio = 1.0 - ((double)pos.size() / (double)(pos.size() + neg.size())); + + cout << "Pos to neg ratio: " << ratio << endl << endl; cout << "Training markov models ..."; cout.flush(); @@ -253,132 +266,111 @@ void portcullis::JunctionFilter::filter() { mf.trainSplicingModels(pos, neg); cout << " done." << endl << endl; - - + // Balance models for training + /*if (pos.size() > neg.size()) { + undersample(pos, neg.size()); + } + else if (pos.size() < neg.size()) { + undersample(neg, pos.size()); + } + cout << "Balanced datasets to size of smallest set: " << pos.size() << endl << endl;*/ + // Build the training set by combining the positive and negative sets - JunctionList training; - training.reserve(pos.size() + neg.size()); - training.insert(training.end(), pos.begin(), pos.end()); - training.insert(training.end(), neg.begin(), neg.end()); - JunctionSystem trainingSystem(training); - trainingSystem.sort(); + JunctionSystem posSystem(pos); + posSystem.sort(); + JunctionSystem negSystem(neg); + negSystem.sort(); + cout << "Training Random Forest" << endl - << "----------------------" << endl << endl; + << "----------------------" << endl << endl; bool done = false; - shared_ptr forest = nullptr; - while(!done) { - - forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); - const vector importance = forest->getVariableImportance(); - mf.resetActiveFeatureIndex(); - uint16_t min_importance_idx = 10000; - double min_importance_val = 100000.0; - uint16_t min_feature_idx = 10000; - // This approach to feature selection (i.e. 
removing worst features by importance) - // doesn't appear to have much impact on final results and is computationally - // expensive. Removing for now. - /*for(size_t i = 0; i < importance.size(); i++) { - double imp = importance[i]; - int16_t j = mf.getNextActiveFeatureIndex(); - if (imp < 0.1 && imp < min_importance_val) { - min_importance_idx = i; - min_importance_val = imp; - min_feature_idx = j; - } - }*/ - if (min_importance_idx != 10000) { - mf.features[min_feature_idx].active = false; - cout << "Deactivating feature: " << mf.features[min_feature_idx].name << " - " << min_importance_val << endl; - } - else { - done = true; - } - } - + shared_ptr forest = mf.trainInstance(posSystem.getJunctions(), negSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true, smote, enn); + /*SemiSupervisedForest ssf(mf, trainingSystem.getJunctions(), unlabelled2, output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, 0.1, true); + shared_ptr forest = ssf.train();*/ + /* const vector importance = forest->getVariableImportance(); - bool foundIrrelevant = false; mf.resetActiveFeatureIndex(); - cout << "Active features remaining:" << endl; + cout << "Feature importance:" << endl; for(auto& i : importance) { int16_t j = mf.getNextActiveFeatureIndex(); cout << mf.features[j].name << " - " << i << endl; - } - + }*/ + forest->saveToFile(); - forest->writeOutput(&cout); - + //forest->writeOutput(&cout); + modelFile = output.string() + ".selftrain.forest"; cout << endl; } - + // Manage a junction system of all discarded junctions JunctionSystem discardedJuncs; - + // Do ML based filtering if requested - if(!modelFile.empty() && exists(modelFile)){ - cout << "Predicting valid junctions using random forest model" << endl - << "----------------------------------------------------" << endl << endl; - + if (!modelFile.empty() && exists(modelFile)) { + cout << "Predicting valid junctions using random forest model" << endl + << 
"----------------------------------------------------" << endl << endl; + JunctionList passJuncs; JunctionList failJuncs; - + forestPredict(currentJuncs, passJuncs, failJuncs, mf); - + printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Random Forest filtering results")); - + // Reset currentJuncs currentJuncs.clear(); - for(auto& j : passJuncs) { + for (auto& j : passJuncs) { currentJuncs.push_back(j); } - - for(auto& j : failJuncs) { + + for (auto& j : failJuncs) { discardedJuncs.addJunction(j); - } + } } - - + + // Do rule based filtering if requested - if (!filterFile.empty() && exists(filterFile)) { - + if (!filterFile.empty() && exists(filterFile)) { + JunctionList passJuncs; JunctionList failJuncs; JuncResultMap resultMap; - + doRuleBasedFiltering(filterFile, currentJuncs, passJuncs, failJuncs, "Rule-based filtering", resultMap); - + RuleFilter::saveResults(path(output.string() + ".rule_filtering.results"), originalJuncs, resultMap); - - printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Rule-based filtering")); - + + printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Rule-based filtering")); + // Reset currentJuncs currentJuncs.clear(); - for(auto& j : passJuncs) { + for (auto& j : passJuncs) { currentJuncs.push_back(j); } - + // Add to discarded - for(auto& j : failJuncs) { + for (auto& j : failJuncs) { discardedJuncs.addJunction(j); - } + } } - + if (maxLength > 0 || this->doCanonicalFiltering()) { - + JunctionList passJuncs; JunctionList failJuncs; - - for(auto& j : currentJuncs) { - + + for (auto& j : currentJuncs) { + bool pass = true; if (maxLength > 0) { if (j->getIntronSize() > maxLength) { pass = false; } } - + if (pass && this->doCanonicalFiltering()) { if (this->filterNovel && j->getSpliceSiteType() == CanonicalSS::NO) { pass = false; @@ -393,41 +385,39 @@ void portcullis::JunctionFilter::filter() { if (pass) { passJuncs.push_back(j); - } - else { + } else { failJuncs.push_back(j); 
discardedJuncs.addJunction(j); } } - + printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Post filtering (length and/or canonical) results")); - + // Reset currentJuncs currentJuncs.clear(); - for(auto& j : passJuncs) { + for (auto& j : passJuncs) { currentJuncs.push_back(j); } } - + cout << endl; - + JunctionSystem filteredJuncs; JunctionSystem refKeptJuncs; - + if (currentJuncs.empty()) { cout << "WARNING: Filters discarded all junctions from input." << endl; - } - else { - + } else { + cout << "Recalculating junction grouping and distance stats based on new junction list that passed filters ..."; cout.flush(); - for(auto& j : currentJuncs) { + for (auto& j : currentJuncs) { filteredJuncs.addJunction(j); } - + if (!referenceFile.empty()) { - for(auto& j : discardedJuncs.getJunctions()) { + for (auto& j : discardedJuncs.getJunctions()) { if (ref.count(j->locationAsString()) > 0) { filteredJuncs.addJunction(j); refKeptJuncs.addJunction(j); @@ -438,26 +428,26 @@ void portcullis::JunctionFilter::filter() { filteredJuncs.calcJunctionStats(); cout << " done." << endl << endl; - + if (!referenceFile.empty()) { cout << "Brought back " << refKeptJuncs.size() << " junctions that were discarded by filters but were present in reference file." 
<< endl << endl; } } - - printFilteringResults( originalJuncs.getJunctions(), - filteredJuncs.getJunctions(), - discardedJuncs.getJunctions(), - string("Overall results")); + + printFilteringResults(originalJuncs.getJunctions(), + filteredJuncs.getJunctions(), + discardedJuncs.getJunctions(), + string("Overall results")); cout << endl << "Saving junctions passing filter to disk:" << endl; filteredJuncs.saveAll(outputDir.string() + "/" + outputPrefix + ".pass", source + "_pass"); - + if (saveBad) { cout << "Saving junctions failing filter to disk:" << endl; discardedJuncs.saveAll(outputDir.string() + "/" + outputPrefix + ".fail", source + "_fail"); - + if (!referenceFile.empty()) { cout << "Saving junctions failing filters but present in reference:" << endl; @@ -466,82 +456,106 @@ void portcullis::JunctionFilter::filter() { } } +void portcullis::JunctionFilter::undersample(JunctionList& jl, size_t size) { + + std::mt19937 rng(12345); + + while(jl.size() > size) { + std::uniform_int_distribution gen(0, jl.size()); // uniform, unbiased + int i = gen(rng); + jl.erase(jl.begin()+i); + } +} + void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, JunctionList& pos, JunctionList& unlabelled, ModelFeatures& mf) { JuncResultMap resultMap; - + cout << "Creating initial positive set for training" << endl - << "------------------------------------------" << endl << endl - << "Applying a set of rule-based filters in " << dataDir.string() << " to create initial positive set." << endl << endl; + << "------------------------------------------" << endl << endl + << "Applying a set of rule-based filters in " << dataDir.string() << " to create initial positive set." 
<< endl << endl; if (!genuineFile.empty()) { - cout << "Performance of each positive filter layer (Low FPs is important):" << endl; - } + cout << "Performance of each positive filter layer (Low FPs is important):" << endl; + } cout << "LAYER\t"; if (!genuineFile.empty()) { cout << Performance::longHeader(); + } else { + cout << "PASS\tFAIL"; } - + JunctionList p1, p2, p3; - + cout << endl << "1\t"; RuleFilter::filter(this->getIntitalPosRulesFile(1), all, p1, unlabelled, "Creating initial positive set for training", resultMap); if (!genuineFile.empty()) { cout << calcPerformance(p1, unlabelled)->toLongString(); + } else { + cout << p1.size() << "\t" << unlabelled.size(); } - cout << endl << "2\t"; + cout << endl << "2\t"; RuleFilter::filter(this->getIntitalPosRulesFile(2), p1, p2, unlabelled, "Creating initial positive set for training", resultMap); if (!genuineFile.empty()) { cout << calcPerformance(p2, unlabelled)->toLongString(); + } else { + cout << p2.size() << "\t" << unlabelled.size(); } - cout << endl << "3\t"; + cout << endl << "3\t"; RuleFilter::filter(this->getIntitalPosRulesFile(3), p2, p3, unlabelled, "Creating initial positive set for training", resultMap); if (!genuineFile.empty()) { cout << calcPerformance(p3, unlabelled)->toLongString(); + } else { + cout << p3.size() << "\t" << unlabelled.size(); } - cout << endl << "L95x1.5\t"; - const uint32_t L95 = mf.calcIntronThreshold(p2); - const uint32_t pos_length_limit = L95 * 1.5; - + cout << endl << "L95x1.2\t"; + + const uint32_t L95 = mf.calcIntronThreshold(p3); + const uint32_t pos_length_limit = L95 * 1.2; + JunctionList passJuncs; - for(auto& j : p3) { + for (auto& j : p3) { if (j->getIntronSize() <= pos_length_limit) { passJuncs.push_back(j); - } - else { + } else { unlabelled.push_back(j); } } if (!genuineFile.empty()) { cout << calcPerformance(passJuncs, unlabelled)->toLongString(); + } else { + cout << passJuncs.size() << "\t" << unlabelled.size(); } - - cout << endl << endl << "Found " << 
passJuncs.size() << " junctions for the positive set" << endl << endl; - // Analyse positive set to get L0.05 of intron size - cout << "Length of intron at 95th percentile of positive set (L95): " << mf.calcIntronThreshold(passJuncs) << endl << endl; - + JunctionSystem isp(passJuncs); + isp.sort(); + cout << endl << endl << "Found " << isp.size() << " junctions for the positive set" << endl << endl; + + // Analyse positive set to get L95 (95th percentile) of intron size + cout << "Length of intron at 95th percentile of positive set (L95): " << mf.calcIntronThreshold(isp.getJunctions()) << endl << endl; + + cout << "Saving initial positive set:" << endl; - JunctionSystem isp(passJuncs); + isp.saveAll(output.string() + ".selftrain.initialset.pos", "portcullis_isp"); - for(auto& j : passJuncs) { + for (auto& j : isp.getJunctions()) { JunctionPtr copy = make_shared(*j); copy->setGenuine(true); - pos.push_back(copy); + pos.push_back(copy); } if (!genuineFile.empty()) { - + JunctionSystem invalidPos; - for(auto& j : passJuncs) { + for (auto& j : passJuncs) { if (!j->isGenuine()) { invalidPos.addJunction(j); } } JunctionSystem missedPos; - for(auto& j : unlabelled) { + for (auto& j : unlabelled) { if (j->isGenuine()) { missedPos.addJunction(j); } @@ -555,77 +569,94 @@ void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, Junc } } -void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg) { - +void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg, JunctionList& failJuncs) { + JuncResultMap resultMap; - cout << "Creating initial negative set for training" << endl - << "------------------------------------------" << endl << endl - << "Applying a set of rule-based filters in " << dataDir.string() << " to create initial negative set."
<< endl << endl; + << "------------------------------------------" << endl << endl + << "Applying a set of rule-based filters in " << dataDir.string() << " to create initial negative set." << endl << endl; if (!genuineFile.empty()) { - cout << "Performance of each negative filter layer (Low FNs is important):" << endl; + cout << "Performance of each negative filter layer (Low FNs is important):" << endl; } cout << "LAYER\t"; if (!genuineFile.empty()) { cout << Performance::longHeader(); + } else { + cout << "PASS\tFAIL"; } - + JunctionList p1, p2, p3, p4, p5, p6, p7, p8, f1, f2, f3, f4, f5, f6, f7, f8; - + cout << endl << "1\t"; RuleFilter::filter(this->getIntitalNegRulesFile(1), all, p1, f1, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << calcPerformance(p1, f1, true)->toLongString(); + cout << calcPerformance(p1, f1, true)->toLongString(); + } else { + cout << p1.size() << "\t" << f1.size(); } - cout << endl << "2\t"; + cout << endl << "2\t"; RuleFilter::filter(this->getIntitalNegRulesFile(2), f1, p2, f2, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << calcPerformance(p2, f2, true)->toLongString(); + cout << calcPerformance(p2, f2, true)->toLongString(); + } else { + cout << p2.size() << "\t" << f2.size(); } cout << endl << "3\t"; RuleFilter::filter(this->getIntitalNegRulesFile(3), f2, p3, f3, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << calcPerformance(p3, f3, true)->toLongString(); + cout << calcPerformance(p3, f3, true)->toLongString(); + } else { + cout << p3.size() << "\t" << f3.size(); } cout << endl << "4\t"; RuleFilter::filter(this->getIntitalNegRulesFile(4), f3, p4, f4, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << calcPerformance(p4, f4, true)->toLongString(); + cout << calcPerformance(p4, f4, true)->toLongString(); + } else { + cout << p4.size() << "\t" << 
f4.size(); } cout << endl << "5\t"; RuleFilter::filter(this->getIntitalNegRulesFile(5), f4, p5, f5, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << calcPerformance(p5, f5, true)->toLongString(); + cout << calcPerformance(p5, f5, true)->toLongString(); + } else { + cout << p5.size() << "\t" << f5.size(); } cout << endl << "6\t"; RuleFilter::filter(this->getIntitalNegRulesFile(6), f5, p6, f6, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << calcPerformance(p6, f6, true)->toLongString(); + cout << calcPerformance(p6, f6, true)->toLongString(); + } else { + cout << p6.size() << "\t" << f6.size(); } cout << endl << "7\t"; RuleFilter::filter(this->getIntitalNegRulesFile(7), f6, p7, f7, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << calcPerformance(p7, f7, true)->toLongString(); + cout << calcPerformance(p7, f7, true)->toLongString(); + } else { + cout << p7.size() << "\t" << f7.size(); } - cout << endl << "L95x5\t"; - - JunctionList passJuncs, failJuncs; - const uint32_t L95x5 = L95 * 5; - for(auto& j : f7) { - if (j->getIntronSize() > L95x5 && j->getMaxMMES() < 10 ) { - p7.push_back(j); - } - else { - failJuncs.push_back(j); - } + cout << endl << "L95x10\t"; + + JunctionList passJuncs; + const uint32_t L95x10 = L95 * 10; + for (auto& j : f7) { + if (j->getIntronSize() > L95x10 && j->getMaxMMES() < 12) { + p8.push_back(j); + } else { + failJuncs.push_back(j); + } } if (!genuineFile.empty()) { - cout << calcPerformance(p8, failJuncs, true)->toLongString(); + cout << calcPerformance(p8, failJuncs, true)->toLongString(); + } else { + cout << p8.size() << "\t" << failJuncs.size(); } - + cout << endl << endl << "Concatenating negatives from each layer to create negative set" << endl << endl; - + passJuncs.insert(passJuncs.end(), p1.begin(), p1.end()); passJuncs.insert(passJuncs.end(), p2.begin(), p2.end()); 
passJuncs.insert(passJuncs.end(), p3.begin(), p3.end()); @@ -637,183 +668,183 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL // This will remove any duplicates JunctionSystem isn(passJuncs); - + isn.sort(); + + cout << "Final\t"; if (!genuineFile.empty()) { - cout << "Final\t" << calcPerformance(isn.getJunctions(), failJuncs, true)->toLongString() << endl << endl; + cout << calcPerformance(isn.getJunctions(), failJuncs, true)->toLongString(); + } + else { + cout << isn.getJunctions().size() << "\t" << failJuncs.size(); } - cout << "Found " << isn.getJunctions().size() << " junctions for the negative set" << endl << endl; + cout << endl << endl << "Found " << isn.getJunctions().size() << " junctions for the negative set" << endl << endl; cout << endl << "Saving initial negative set:" << endl; isn.saveAll(output.string() + ".selftrain.initialset.neg", "portcullis_isn"); - for(auto& j : isn.getJunctions()) { - JunctionPtr copy = make_shared(*j); - copy->setGenuine(false); - neg.push_back(copy); + for (auto& j : isn.getJunctions()) { + JunctionPtr copy = make_shared(*j); + copy->setGenuine(false); + neg.push_back(copy); } if (!genuineFile.empty()) { - - JunctionSystem invalidNeg; - JunctionSystem missedNeg; - - for(auto& j : isn.getJunctions()) { - if (j->isGenuine()) { - invalidNeg.addJunction(j); - } - } - for(auto& j : failJuncs) { - if (!j->isGenuine()) { - missedNeg.addJunction(j); - } - } - - cout << "Saving genuine valid junctions in initial negative set to disk:" << endl; - invalidNeg.saveAll(output.string() + ".selftrain.initialset.invalidneg", "portcullis_invalid_isn"); - - cout << "Saving missed negative junctions to disk:" << endl; - missedNeg.saveAll(output.string() + ".selftrain.initialset.missedneg", "portcullis_missed_isn"); + + JunctionSystem invalidNeg; + JunctionSystem missedNeg; + + for (auto& j : isn.getJunctions()) { + if (j->isGenuine()) { + invalidNeg.addJunction(j); + } + } + for (auto& j : failJuncs) { + if 
(!j->isGenuine()) { + missedNeg.addJunction(j); + } + } + + cout << "Saving genuine valid junctions in initial negative set to disk:" << endl; + invalidNeg.saveAll(output.string() + ".selftrain.initialset.invalidneg", "portcullis_invalid_isn"); + + cout << "Saving missed negative junctions to disk:" << endl; + missedNeg.saveAll(output.string() + ".selftrain.initialset.missedneg", "portcullis_missed_isn"); } } - void portcullis::JunctionFilter::printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix) { // Output stats size_t diff = in.size() - pass.size(); cout << endl << prefix << endl - << "-------------------------" << endl - << "Input contained " << in.size() << " junctions." << endl - << "Output contains " << pass.size() << " junctions." << endl - << "Filtered out " << diff << " junctions." << endl; - + << "-------------------------" << endl + << "Input contained " << in.size() << " junctions." << endl + << "Output contains " << pass.size() << " junctions." << endl + << "Filtered out " << diff << " junctions." 
<< endl; + if (!genuineFile.empty() && exists(genuineFile)) { shared_ptr p = calcPerformance(pass, fail); cout << Performance::longHeader() << endl; cout << p->toLongString() << endl << endl; - } + } } shared_ptr portcullis::JunctionFilter::calcPerformance(const JunctionList& pass, const JunctionList& fail, bool invert) { - + uint32_t tp = 0, tn = 0, fp = 0, fn = 0; if (invert) { - for(auto& j : pass) { - if (!j->isGenuine()) tn++; else fn++; + for (auto& j : pass) { + if (!j->isGenuine()) tn++; + else fn++; } - for(auto& j : fail) { - if (j->isGenuine()) tp++; else fp++; - } - } - else { - for(auto& j : pass) { - if (j->isGenuine()) tp++; else fp++; + for (auto& j : fail) { + if (j->isGenuine()) tp++; + else fp++; + } + } else { + for (auto& j : pass) { + if (j->isGenuine()) tp++; + else fp++; } - for(auto& j : fail) { - if (!j->isGenuine()) tn++; else fn++; + for (auto& j : fail) { + if (!j->isGenuine()) tn++; + else fn++; } } - + return make_shared(tp, tn, fp, fn); } - void portcullis::JunctionFilter::doRuleBasedFiltering(const path& ruleFile, const JunctionList& all, JunctionList& pass, JunctionList& fail, const string& prefix, JuncResultMap& resultMap) { cout << "Loading JSON rule-based filtering config file: " << ruleFile.string() << endl; cout << "Filtering junctions ..."; cout.flush(); - map filterCounts = RuleFilter::filter(ruleFile, all, pass, fail, prefix, resultMap); + map filterCounts = RuleFilter::filter(ruleFile, all, pass, fail, prefix, resultMap); cout << " done." 
<< endl << endl - << "Number of junctions failing for each filter: " << endl; + << "Number of junctions failing for each filter: " << endl; - for(map::iterator iterator = filterCounts.begin(); iterator != filterCounts.end(); iterator++) { + for (map::iterator iterator = filterCounts.begin(); iterator != filterCounts.end(); iterator++) { cout << iterator->first << ": " << iterator->second << endl; } } - + void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, ModelFeatures& mf) { - + cout << "Creating feature vector" << endl; - + Data* testingData = mf.juncs2FeatureVectors(all); - + cout << "Initialising random forest" << endl; - - shared_ptr f = nullptr; - if (train) { - f = make_shared(); - } - else { - f = make_shared(); - } + + shared_ptr f = make_shared(); vector catVars; - + f->init( - "Genuine", // Dependant variable name - MEM_DOUBLE, // Memory mode - testingData, // Data object - 0, // M Try (0 == use default) - "", // Output prefix - 250, //DEFAULT_SELFTRAIN_TREES, // Number of trees (will be overwritten when loading the model) - 1234567890, // Seed for random generator - threads, // Number of threads - IMP_GINI, // Importance measure - train ? DEFAULT_MIN_NODE_SIZE_PROBABILITY : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size - //DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, - "", // Status var name - true, // Prediction mode - true, // Replace - catVars, // Unordered categorical variable names (vector) - false, // Memory saving - DEFAULT_SPLITRULE, // Split rule - false, // predall - 1.0); // Sample fraction - + "Genuine", // Dependant variable name + MEM_DOUBLE, // Memory mode + testingData, // Data object + 0, // M Try (0 == use default) + "", // Output prefix + 250, //DEFAULT_SELFTRAIN_TREES, // Number of trees (will be overwritten when loading the model) + 1234567890, // Seed for random generator + threads, // Number of threads + IMP_GINI, // Importance measure + train ? 
DEFAULT_MIN_NODE_SIZE_PROBABILITY : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size + //DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, + "", // Status var name + true, // Prediction mode + true, // Replace + catVars, // Unordered categorical variable names (vector) + false, // Memory saving + DEFAULT_SPLITRULE, // Split rule + false, // predall + 1.0); // Sample fraction + f->setVerboseOut(&cerr); - + // Load trees from saved model f->loadFromFile(modelFile.string()); - + cout << "Making predictions" << endl; f->run(verbose); - + path scorepath = output.string() + ".scores"; ofstream scoreStream(scorepath.c_str()); - + scoreStream << "Score\t" << Intron::locationOutputHeader() << "\tStrand\tSS\t" << testingData->getHeader() << endl; - - for(size_t i = 0; i < all.size(); i++) { - scoreStream << f->getPredictions()[i][0] << "\t" << *(all[i]->getIntron()) - << "\t" << strandToChar(all[i]->getConsensusStrand()) - << "\t" << cssToChar(all[i]->getSpliceSiteType()) - << "\t" << testingData->getRow(i) << endl; + for (size_t i = 0; i < all.size(); i++) { + + scoreStream << (1.0 - f->getPredictions()[i][0]) << "\t" << *(all[i]->getIntron()) + << "\t" << strandToChar(all[i]->getConsensusStrand()) + << "\t" << cssToChar(all[i]->getSpliceSiteType()) + << "\t" << testingData->getRow(i) << endl; } - + scoreStream.close(); - - + + if (!genuineFile.empty() && exists(genuineFile)) { vector thresholds; - for(double i = 0.0; i <= 1.0; i += 0.01) { + for (double i = 0.0; i <= 1.0; i += 0.01) { thresholds.push_back(i); } double max_mcc = 0.0; double max_f1 = 0.0; double best_t_mcc = 0.0; double best_t_f1 = 0.0; - + cout << "Threshold\t" << Performance::longHeader() << endl; - for(auto& t : thresholds) { + for (auto& t : thresholds) { JunctionList pjl; JunctionList fjl; categorise(f, all, pjl, fjl, t); @@ -830,38 +861,54 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction best_t_f1 = t; } } - + cout << "The best F1 score of " << max_f1 << " is achieved with 
threshold set at " << best_t_f1 << endl; cout << "The best MCC score of " << max_mcc << " is achieved with threshold set at " << best_t_mcc << endl; - cout << "Actual threshold set at " << threshold << endl; - categorise(f, all, pass, fail, best_t_f1); - } - else { - categorise(f, all, pass, fail, threshold); + //threshold = best_t_mcc; } - cout << "Saved junction scores to: " << scorepath << endl; + //threshold = calcGoodThreshold(f, all); + cout << "Threshold set at " << threshold << endl; + categorise(f, all, pass, fail, threshold); + cout << "Saved junction scores to: " << scorepath << endl; + + delete testingData; } void portcullis::JunctionFilter::categorise(shared_ptr f, const JunctionList& all, JunctionList& pass, JunctionList& fail, double t) { - - for(size_t i = 0; i < all.size(); i++) { - if (f->getPredictions()[i][0] <= t) { + + for (size_t i = 0; i < all.size(); i++) { + if ((1.0 - f->getPredictions()[i][0]) >= t) { pass.push_back(all[i]); - } - else { + } else { fail.push_back(all[i]); } } } +double portcullis::JunctionFilter::calcGoodThreshold(shared_ptr f, const JunctionList& all) { + + uint32_t pos = 0; + uint32_t neg = 0; + + for (auto& p : f->getPredictions()) { + if ((1.0 - p[0]) >= 0.5) { + pos++; + } else { + neg++; + } + } + + return (double)pos / (double)(pos+neg); +} + int portcullis::JunctionFilter::main(int argc, char *argv[]) { path junctionFile; - path genomeFile; + path prepDir; path modelFile; path genuineFile; path filterFile; @@ -873,41 +920,47 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { int32_t max_length; string canonical; string source; + bool no_smote; + bool enn; double threshold; bool verbose; bool help; - + struct winsize w; ioctl(STDOUT_FILENO, TIOCGWINSZ, &w); // Declare the supported options. 
- po::options_description generic_options(helpMessage(), w.ws_col, w.ws_col/1.7); + po::options_description generic_options(helpMessage(), w.ws_col, w.ws_col / 1.7); generic_options.add_options() - ("output,o", po::value(&output)->default_value(DEFAULT_FILTER_OUTPUT), - "Output prefix for files generated by this program.") - ("filter_file,f", po::value(&filterFile), - "If you wish to custom rule-based filter the junctions file, use this option to provide a list of the rules you wish to use. By default we don't filter using a rule-based method, we instead filter via a self-trained random forest model. See manual for more details.") - ("model_file,m", po::value(&modelFile), - "If you wish to use a custom random forest model to filter the junctions file, rather than self-training on the input dataset use this option to. See manual for more details.") + ("output,o", po::value(&output)->default_value(DEFAULT_FILTER_OUTPUT), + "Output prefix for files generated by this program.") + ("filter_file,f", po::value(&filterFile), + "If you wish to custom rule-based filter the junctions file, use this option to provide a list of the rules you wish to use. By default we don't filter using a rule-based method, we instead filter via a self-trained random forest model. See manual for more details.") + ("model_file,m", po::value(&modelFile), + "If you wish to use a custom random forest model to filter the junctions file, rather than self-training on the input dataset use this option to. 
See manual for more details.") ("genuine,g", po::value(&genuineFile), - "If you have a list of line separated boolean values in a file, indicating whether each junction in your input is genuine or not, then we can use that information here to gauge the accuracy of the predictions.") + "If you have a list of line separated boolean values in a file, indicating whether each junction in your input is genuine or not, then we can use that information here to gauge the accuracy of the predictions.") ("reference,r", po::value(&referenceFile), - "Reference annotation of junctions in BED format. Any junctions found by the junction analysis tool will be preserved if found in this reference file regardless of any other filtering criteria. If you need to convert a reference annotation from GTF or GFF to BED format portcullis contains scripts for this.") + "Reference annotation of junctions in BED format. Any junctions found by the junction analysis tool will be preserved if found in this reference file regardless of any other filtering criteria. If you need to convert a reference annotation from GTF or GFF to BED format portcullis contains scripts for this.") ("no_ml,n", po::bool_switch(&no_ml)->default_value(false), - "Disables machine learning filtering") + "Disables machine learning filtering") ("save_bad,b", po::bool_switch(&saveBad)->default_value(false), - "Saves bad junctions (i.e. junctions that fail the filter), as well as good junctions (those that pass)") + "Saves bad junctions (i.e. junctions that fail the filter), as well as good junctions (those that pass)") ("source", po::value(&source)->default_value(DEFAULT_FILTER_SOURCE), - "The value to enter into the \"source\" field in GFF files.") + "The value to enter into the \"source\" field in GFF files.") ("max_length,l", po::value(&max_length)->default_value(0), - "Filter junctions longer than this value. Default (0) is to not filter based on length.") + "Filter junctions longer than this value. 
 Default (0) is to not filter based on length.") ("canonical,c", po::value(&canonical)->default_value("OFF"), - "Keep junctions based on their splice site status. Valid options: OFF,C,S,N. Where C = Canonical junctions (GU-AG), S = Semi-canonical junctions (AT-AC, or GT-AG), N = Non-canonical. OFF means, keep all junctions (i.e. don't filter by canonical status). User can separate options by a comma to keep two categories.") - ("threads,t", po::value(&threads)->default_value(DEFAULT_FILTER_THREADS), - "The number of threads to use during testing (only applies if using forest model).") - ("verbose,v", po::bool_switch(&verbose)->default_value(false), - "Print extra information") + "Keep junctions based on their splice site status. Valid options: OFF,C,S,N. Where C = Canonical junctions (GU-AG), S = Semi-canonical junctions (AT-AC, or GT-AG), N = Non-canonical. OFF means, keep all junctions (i.e. don't filter by canonical status). User can separate options by a comma to keep two categories.") + ("threads,t", po::value(&threads)->default_value(DEFAULT_FILTER_THREADS), + "The number of threads to use during testing (only applies if using forest model).") + ("enn", po::bool_switch(&enn)->default_value(false), + "Use this flag to enable Edited Nearest Neighbour to clean decision region") + ("threshold", po::value(&threshold)->default_value(DEFAULT_FILTER_THRESHOLD), + "The threshold score at which we determine a junction to be genuine or not. Increase value towards 1.0 to increase precision, decrease towards 0.0 to increase sensitivity. 
We generally find that increasing sensitivity helps when using high coverage data, or when the aligner has already performed some form of junction filtering.") + ("verbose,v", po::bool_switch(&verbose)->default_value(false), + "Print extra information") ("help", po::bool_switch(&help)->default_value(false), "Produce help message") ; @@ -915,16 +968,16 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { // in config file, but will not be shown to the user. po::options_description hidden_options("Hidden options"); hidden_options.add_options() - ("genome-file", po::value(&genomeFile), "Path to the genome file to process.") - ("junction-file", po::value(&junctionFile), "Path to the junction file to process.") - ("threshold", po::value(&threshold)->default_value(DEFAULT_FILTER_THRESHOLD), - "The threshold score at which we determine a junction to be genuine or not.") + ("prep_data_dir,i", po::value(&prepDir), "Path to directory containing prepared data.") + ("junction_file", po::value(&junctionFile), "Path to the junction file to process.") + ("no_smote", po::bool_switch(&no_smote)->default_value(false), + "Use this flag to disable synthetic oversampling") ; // Positional option for the input bam file po::positional_options_description p; - p.add("genome-file", 1); - p.add("junction-file", 1); + p.add("prep_data_dir", 1); + p.add("junction_file", 1); // Combine non-positional options @@ -942,38 +995,36 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { return 1; } - - - auto_cpu_timer timer(1, "\nPortcullis junction filter completed.\nTotal runtime: %ws\n\n"); + auto_cpu_timer timer(1, "\nPortcullis junction filter completed.\nTotal runtime: %ws\n\n"); cout << "Running portcullis in junction filter mode" << endl - << "------------------------------------------" << endl << endl; + << "------------------------------------------" << endl << endl; // Create the prepare class - JunctionFilter filter(junctionFile, output); + JunctionFilter 
filter(prepDir, junctionFile, output); filter.setSaveBad(saveBad); filter.setSource(source); filter.setVerbose(verbose); filter.setThreads(threads); filter.setMaxLength(max_length); filter.setCanonical(canonical); - + // Only set the filter rules if specified. filter.setFilterFile(filterFile); filter.setGenuineFile(genuineFile); if (modelFile.empty() && !no_ml) { - filter.setTrain(true); - } - else { + filter.setTrain(true); + } else { filter.setTrain(false); if (!no_ml) { filter.setModelFile(modelFile); } } filter.setReferenceFile(referenceFile); - filter.setGenomeFile(genomeFile); filter.setThreshold(threshold); - + filter.setSmote(!no_smote); + filter.setENN(enn); + filter.filter(); return 0; diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index d21b14e2..34698e04 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -47,19 +47,24 @@ using boost::filesystem::create_directory; using boost::filesystem::symbolic_link_exists; #include +using portcullis::bam::GenomeMapper; + +#include +#include +using portcullis::ml::Performance; +using portcullis::ml::ModelFeatures; + #include #include #include -#include #include -#include -using portcullis::bam::GenomeMapper; using portcullis::PortcullisFS; using portcullis::Intron; using portcullis::IntronHasher; -using portcullis::Performance; using portcullis::JuncResultMap; -using portcullis::ModelFeatures; + +#include "prepare.hpp" +using portcullis::PreparedFiles; namespace portcullis { @@ -76,7 +81,7 @@ const string ST_IPOS_RULES_FILE = "selftrain_initial_pos"; const string ST_INEG_RULES_FILE = "selftrain_initial_neg"; const uint16_t DEFAULT_FILTER_THREADS = 1; const uint16_t DEFAULT_SELFTRAIN_TREES = 100; -const double DEFAULT_FILTER_THRESHOLD = 0.625; +const double DEFAULT_FILTER_THRESHOLD = 0.5; class JunctionFilter { @@ -84,7 +89,7 @@ class JunctionFilter { private: path junctionFile; - path genomeFile; + PreparedFiles prepData; path modelFile; path filterFile; path genuineFile; @@ -99,6 
+104,8 @@ class JunctionFilter { bool filterNovel; string source; double threshold; + bool smote; + bool enn; bool verbose; @@ -107,7 +114,8 @@ class JunctionFilter { static path scriptsDir; static path dataDir; - JunctionFilter( const path& _junctionFile, + JunctionFilter( const path& _prepDir, + const path& _junctionFile, const path& _output); virtual ~JunctionFilter() { @@ -133,14 +141,6 @@ class JunctionFilter { this->junctionFile = junctionFile; } - path getGenomeFile() const { - return genomeFile; - } - - void setGenomeFile(path genomeFile) { - this->genomeFile = genomeFile; - } - double getThreshold() const { return threshold; } @@ -222,6 +222,23 @@ class JunctionFilter { this->threads = threads; } + bool isENN() const { + return enn; + } + + void setENN(bool enn) { + this->enn = enn; + } + + bool isSmote() const { + return smote; + } + + void setSmote(bool smote) { + this->smote = smote; + } + + void setCanonical(const string& canonical) { vector modes; boost::split( modes, canonical, boost::is_any_of(","), boost::token_compress_on ); @@ -301,7 +318,11 @@ class JunctionFilter { void createPositiveSet(const JunctionList& all, JunctionList& pos, JunctionList& unlabelled, ModelFeatures& mf); - void createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg); + void createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg, JunctionList& failJuncs); + + double calcGoodThreshold(shared_ptr f, const JunctionList& all); + + void undersample(JunctionList& jl, size_t size); public: @@ -309,11 +330,12 @@ class JunctionFilter { return string("\nPortcullis Filter Mode Help.\n\n") + "Filters out junctions that are unlikely to be genuine or that have too little\n" + "supporting evidence. The user can control three stages of the filtering\n" + - "process. First the user can perform filtering based on a pre-defined random\n" + - "forest model. Second the user can specify a configuration file described a\n" + + "process. 
First the user can perform filtering based on a random forest model\n" + + "self-trained on the provided data, alternatively the user can provide a pre-\n" + + "trained model. Second the user can specify a configuration file describing a\n" + "set of filtering rules to apply. Third, the user can directly through the\n" + "command line filter based on junction (intron) length, or the canonical label.\n\n" + - "Usage: portcullis filter [options] \n\n" + + "Usage: portcullis filter [options] \n\n" + "Options"; } diff --git a/src/portcullis.cc b/src/portcullis.cc index 74af944f..56cfd57c 100644 --- a/src/portcullis.cc +++ b/src/portcullis.cc @@ -264,7 +264,7 @@ int mainFull(int argc, char *argv[]) { path juncOut = juncDir.string() + "/portcullis_all"; // Identify junctions and calculate metrics - JunctionBuilder jb(prepDir.string(), juncOut); + JunctionBuilder jb(prepDir, juncOut); jb.setThreads(threads); jb.setExtra(false); // Run in fast mode jb.setSeparate(false); // Run in fast mode @@ -284,7 +284,7 @@ int mainFull(int argc, char *argv[]) { path filtOut = outputDir.string() + "/3-filt/portcullis_filtered"; path juncTab = juncDir.string() + "/portcullis_all.junctions.tab"; - JunctionFilter filter(juncTab, filtOut); + JunctionFilter filter(prepDir, juncTab, filtOut); filter.setVerbose(verbose); filter.setSource(source); filter.setMaxLength(max_length); diff --git a/src/train.cc b/src/train.cc index 689fa1c5..7a02e3dc 100644 --- a/src/train.cc +++ b/src/train.cc @@ -26,7 +26,7 @@ using std::vector; using std::cout; using std::cerr; using std::endl; -using std::unique_ptr; +using std::shared_ptr; using std::make_shared; #include @@ -46,17 +46,20 @@ namespace po = boost::program_options; #include #include +#include +#include +using portcullis::ml::Performance; +using portcullis::ml::PerformanceList; +using portcullis::ml::KFold; + #include -#include using portcullis::JunctionSystem; using portcullis::JunctionList; -using portcullis::Performance; #include "train.hpp" 
-using portcullis::KFold; using portcullis::Train; -portcullis::Train::Train(const path& _junctionFile, const path& _refFile){ +portcullis::Train::Train(const path& _junctionFile, const path& _refFile) { junctionFile = _junctionFile; refFile = _refFile; outputPrefix = ""; @@ -76,9 +79,9 @@ void portcullis::Train::testInstance(shared_ptr f, const JunctionList& x // Convert testing set junctions into feature vector ModelFeatures mf; Data* testingData = mf.juncs2FeatureVectors(x); - - f->setPredictionMode(true); - f->setData(testingData); + vector catvars; + f->setPredictionMode(true); + f->setData(testingData, "Genuine", "", catvars); f->run(false); delete testingData; @@ -156,7 +159,7 @@ void portcullis::Train::train() { cout << "Training on full dataset" << endl; - ForestPtr f = ModelFeatures().trainInstance(junctions, outputPrefix.string(), trees, threads, false, true); + ForestPtr f = ModelFeatures().trainInstance(junctions, JunctionList(), outputPrefix.string(), trees, threads, false, true, false, false); f->saveToFile(); f->writeOutput(&cout); @@ -170,7 +173,7 @@ void portcullis::Train::train() { KFold kf(folds, junctions.begin(), junctions.end()); JunctionList test, train; - vector> perfs; + PerformanceList perfs; cout << endl << "Starting " << folds << "-fold cross validation" << endl; @@ -191,7 +194,7 @@ void portcullis::Train::train() { kf.getFold(i, back_inserter(train), back_inserter(test)); // Train on this particular set - ForestPtr f = ModelFeatures().trainInstance(train, outputPrefix.string(), trees, threads, false, false); + ForestPtr f = ModelFeatures().trainInstance(train, JunctionList(), outputPrefix.string(), trees, threads, false, false, false, false); // Test model instance testInstance(f, test); @@ -213,12 +216,12 @@ void portcullis::Train::train() { } } - unique_ptr p( new Performance(tp, tn, fp, fn) ); + shared_ptr p = make_shared(tp, tn, fp, fn); cout << p->toLongString() << endl; resout << p->toLongString() << endl; - 
perfs.push_back(std::move(p)); + perfs.add(p); // Clear the train and test vectors in preparation for the next step train.clear(); @@ -227,7 +230,7 @@ void portcullis::Train::train() { cout << "Cross validation completed" << endl << endl; - outputMeanPerformance(perfs, resout); + perfs.outputMeanPerformance(resout); resout.close(); cout << endl << "Saved cross validation results to file " << outputPrefix.string() << ".cv_results" << endl; @@ -237,57 +240,6 @@ void portcullis::Train::train() { } -void portcullis::Train::outputMeanPerformance(const vector>& scores, std::ofstream& resout) { - - vector prevs; - vector biases; - vector recs; - vector prcs; - vector spcs; - vector f1s; - vector infs; - vector mrks; - vector accs; - vector mccs; - - for(auto& p : scores) { - prevs.push_back(p->getPrevalence()); - biases.push_back(p->getBias()); - recs.push_back(p->getRecall()); - prcs.push_back(p->getPrecision()); - spcs.push_back(p->getSpecificity()); - f1s.push_back(p->getF1Score()); - infs.push_back(p->getInformedness()); - mrks.push_back(p->getMarkedness()); - accs.push_back(p->getAccuracy()); - mccs.push_back(p->getMCC()); - } - - outputMeanScore(prevs, "prevalence", resout); - outputMeanScore(biases, "bias", resout); - outputMeanScore(recs, "recall", resout); - outputMeanScore(prcs, "precision", resout); - outputMeanScore(f1s, "F1", resout); - outputMeanScore(spcs, "specificity", resout); - outputMeanScore(accs, "accuracy", resout); - outputMeanScore(infs, "informedness", resout); - outputMeanScore(mrks, "markededness", resout); - outputMeanScore(mccs, "MCC", resout); -} - -void portcullis::Train::outputMeanScore(const vector& scores, const string& score_type, std::ofstream& resout) { - double sum = std::accumulate(scores.begin(), scores.end(), 0.0); - double mean = sum / scores.size(); - double sq_sum = std::inner_product(scores.begin(), scores.end(), scores.begin(), 0.0); - double stdev = std::sqrt(sq_sum / scores.size() - mean * mean); - - stringstream msg; - msg 
<< "Mean " << std::left << std::setw(13) << score_type << ": " << std::fixed << std::setprecision(2) << mean << "% (+/- " << stdev << "%)" << endl; - - cout << msg.str(); - resout << msg.str(); -} - void portcullis::Train::getRandomSubset(const JunctionList& in, JunctionList& out) { // Create a list of randomly shuffled indices with maximum of "in". diff --git a/src/train.hpp b/src/train.hpp index 40fe2b34..0a1c3538 100644 --- a/src/train.hpp +++ b/src/train.hpp @@ -35,11 +35,10 @@ using std::unique_ptr; using boost::filesystem::path; #include -#include -#include -#include -using portcullis::bam::GenomeMapper; -using portcullis::ModelFeatures; + +#include +using portcullis::ml::ModelFeatures; +using portcullis::ml::ForestPtr; namespace portcullis { @@ -54,53 +53,6 @@ const double DEFAULT_TRAIN_FRACTION = 1.0; const int DEFAULT_SEED = 1234567; // To avoid non-deterministic behaviour - -// Derived from https://sureshamrita.wordpress.com/2011/08/24/c-implementation-of-k-fold-cross-validation/ -template -class KFold { -public: - KFold(int k, In _beg, In _end) : - beg(_beg), end(_end), K(k) { - if (K <= 0) - BOOST_THROW_EXCEPTION(TrainException() << TrainErrorInfo(string( - "The supplied value of K is =") + lexical_cast(K) + - ". 
One cannot create " + lexical_cast(K) + "no of folds")); - - //create the vector of integers - int foldNo = 0; - for (In i = beg; i != end; i++) { - whichFoldToGo.push_back(++foldNo); - if (foldNo == K) - foldNo = 0; - } - if (!K) - BOOST_THROW_EXCEPTION(TrainException() << TrainErrorInfo(string( - "With this value of k (=") + lexical_cast(K) + - ")Equal division of the data is not possible")); - - random_shuffle(whichFoldToGo.begin(), whichFoldToGo.end()); - } - - template - void getFold(int foldNo, Out training, Out testing) { - int k = 0; - In i = beg; - while (i != end) { - if (whichFoldToGo[k++] == foldNo) { - *testing++ = *i++; - } else - *training++ = *i++; - } - } - -private: - In beg; - In end; - int K; //how many folds in this - vector whichFoldToGo; -}; - - class Train { @@ -223,13 +175,9 @@ class Train { protected: - void testInstance(shared_ptr f, const JunctionList& y); - - void getRandomSubset(const JunctionList& in, JunctionList& out); - - void outputMeanPerformance(const vector>& scores, std::ofstream& resout); + void testInstance(ForestPtr f, const JunctionList& y); - void outputMeanScore(const vector& scores, const string& score_type, std::ofstream& resout); + void getRandomSubset(const JunctionList& in, JunctionList& out); }; } diff --git a/tests/Makefile.am b/tests/Makefile.am index d6d27253..a3040b86 100755 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -25,6 +25,7 @@ check_PROGRAMS = check_unit_tests check_unit_tests_SOURCES = bam_tests.cpp \ seq_utils_tests.cpp \ kmer_tests.cpp \ + smote_tests.cpp \ intron_tests.cpp \ junction_tests.cpp \ check_portcullis.cc diff --git a/tests/smote_tests.cpp b/tests/smote_tests.cpp new file mode 100644 index 00000000..0c8b9056 --- /dev/null +++ b/tests/smote_tests.cpp @@ -0,0 +1,60 @@ +// ******************************************************************** +// This file is part of Portcullis. 
+// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#include + +#include +#include +#include +using std::cout; +using std::cerr; +using std::endl; +using std::stringstream; + + +#include +#include +#include +namespace bfs = boost::filesystem; +using bfs::path; + +#include +using portcullis::ml::Smote; + + +TEST(smote, simple) { + + double data[] = { + 0.2, 0.4, 1.5, 2.3, 0.0, + 0.3, 0.3, 2.6, 5.2, 0.1, + 0.3, 0.3, 2.4, 5.2, 0.1, + 0.1, 0.2, 0.5, 3.1, 0.3, + 0.3, 0.3, 2.6, 5.2, 0.9, + 0.3, 0.3, 2.6, 5.2, 0.1, + 0.3, 0.7, 2.6, 5.2, 0.1, + 0.2, 0.3, 2.6, 4.2, 0.1, + 0.3, 0.8, 2.6, 2.2, 0.1, + 1.3, 1.3, 2.6, 8.2, 0.8 + }; + + Smote smote(3, 2, 1, data, 10, 5); + smote.execute(); + //smote.print(cerr); + + EXPECT_EQ(smote.getNbSynthRows(), 20); +} +