diff --git a/Makefile.am b/Makefile.am index 59ddf4de..65431406 100644 --- a/Makefile.am +++ b/Makefile.am @@ -39,6 +39,7 @@ dist_bin_SCRIPTS = scripts/analyse_alignments.py \ scripts/predict.snakefile \ scripts/read_gen.snakefile \ scripts/snakey.py \ + scripts/soapsplice2bed.py \ scripts/spanki2bed.py \ scripts/spanki_filter.py \ scripts/split_error_classes.py \ @@ -55,8 +56,16 @@ dist_bin_SCRIPTS = scripts/analyse_alignments.py \ configdir = $(datadir)/portcullis dist_config_DATA = \ data/default_filter.json \ - data/selftrain_initial_neg.json \ - data/selftrain_initial_pos.json + data/selftrain_initial_neg.layer1.json \ + data/selftrain_initial_neg.layer2.json \ + data/selftrain_initial_neg.layer3.json \ + data/selftrain_initial_neg.layer4.json \ + data/selftrain_initial_neg.layer5.json \ + data/selftrain_initial_neg.layer6.json \ + data/selftrain_initial_neg.layer7.json \ + data/selftrain_initial_pos.layer1.json \ + data/selftrain_initial_pos.layer2.json \ + data/selftrain_initial_pos.layer3.json # SRC DIRS make_dirs=deps/htslib-1.3 deps/ranger-0.3.8 lib src tests diff --git a/configure.ac b/configure.ac index 94687932..6996e2cd 100644 --- a/configure.ac +++ b/configure.ac @@ -4,7 +4,7 @@ # Autoconf setup AC_PREREQ([2.68]) -AC_INIT([portcullis],[0.15.0],[daniel.mapleson@tgac.ac.uk],[portcullis],[http://www.tgac.ac.uk]) +AC_INIT([portcullis],[0.16.0],[daniel.mapleson@tgac.ac.uk],[portcullis],[http://www.tgac.ac.uk]) AC_CONFIG_SRCDIR([src/portcullis.cc]) AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_MACRO_DIR([m4]) @@ -143,6 +143,7 @@ AX_BOOST_CHRONO AX_BOOST_PROGRAM_OPTIONS AX_BOOST_TIMER + # Combine BOOST variables BOOST_LIBS="${BOOST_TIMER_STATIC_LIB} ${BOOST_CHRONO_STATIC_LIB} ${BOOST_FILESYSTEM_STATIC_LIB} ${BOOST_PROGRAM_OPTIONS_STATIC_LIB} ${BOOST_SYSTEM_STATIC_LIB}" AC_SUBST([BOOST_LIBS]) diff --git a/data/selftrain_initial_neg.json b/data/selftrain_initial_neg.json.old similarity index 65% rename from data/selftrain_initial_neg.json rename to 
data/selftrain_initial_neg.json.old index 23e962fb..86eb3f59 100644 --- a/data/selftrain_initial_neg.json +++ b/data/selftrain_initial_neg.json.old @@ -9,20 +9,24 @@ "value": 1 }, "M3-nb_dist_aln": { - "operator": "eq", - "value": 1 + "operator": "gte", + "value": 2 }, "M4-nb_rel_aln": { "operator": "eq", "value": 0 }, "M11-entropy": { - "operator": "eq", - "value": 0.0 + "operator": "lt", + "value": 1.0 }, "M12-maxmmes": { "operator": "lt", - "value": 8 + "value": 9 + }, + "M12-maxmmes.2": { + "operator": "lte", + "value": 12 }, "M8-max_min_anc": { "operator": "lt", @@ -38,16 +42,16 @@ }, "M13-hamming5p": { "operator": "lte", - "value": 1 + "value": 2 }, "M14-hamming3p": { "operator": "lte", - "value": 1 + "value": 2 }, "M19-mean_mismatches": { "operator": "gte", "value": 5.0 } }, - "expression": "M2-nb_reads | M12-maxmmes | Suspect" + "expression": "M19-mean_mismatches | ( M1-canonical_ss & M11-entropy ) | ( M4-nb_rel_aln & M11-entropy ) | ( M12-maxmmes & M11-entropy ) | ( M12-maxmmes.2 & M2-nb_reads ) | ( M13-hamming5p & M14-hamming3p ) | PFP | ( Suspect & M2-nb_reads )" } diff --git a/data/selftrain_initial_neg.layer1.json b/data/selftrain_initial_neg.layer1.json new file mode 100644 index 00000000..9b1a5844 --- /dev/null +++ b/data/selftrain_initial_neg.layer1.json @@ -0,0 +1,93 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + "M3-nb_dist_aln": { + "operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 7 + }, + "M12-maxmmes.2": { + "operator": "lt", + "value": 10 + }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + 
"Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + "M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + }, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 + } + }, + "expression": "( M20-nb_usrs & M21-nb_msrs & M12-maxmmes.2 & M22-rel2raw.2 )" +} diff --git a/data/selftrain_initial_neg.layer2.json b/data/selftrain_initial_neg.layer2.json new file mode 100644 index 00000000..e9a9fc33 --- /dev/null +++ b/data/selftrain_initial_neg.layer2.json @@ -0,0 +1,93 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + "M3-nb_dist_aln": { + "operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 7 + }, + "M12-maxmmes.2": { + "operator": "lt", + "value": 10 + }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + "Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + "M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + 
}, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 + } + }, + "expression": "( M20-nb_usrs.2 & M1-canonical_ss & M12-maxmmes.2 )" +} diff --git a/data/selftrain_initial_neg.layer3.json b/data/selftrain_initial_neg.layer3.json new file mode 100644 index 00000000..ec5cbc5d --- /dev/null +++ b/data/selftrain_initial_neg.layer3.json @@ -0,0 +1,93 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + "M3-nb_dist_aln": { + "operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 7 + }, + "M12-maxmmes.2": { + "operator": "lt", + "value": 10 + }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + "Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + "M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + }, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 + } + }, + "expression": "( Suspect & M11-entropy.2 )" +} diff --git 
a/data/selftrain_initial_neg.layer4.json b/data/selftrain_initial_neg.layer4.json new file mode 100644 index 00000000..44890918 --- /dev/null +++ b/data/selftrain_initial_neg.layer4.json @@ -0,0 +1,93 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + "M3-nb_dist_aln": { + "operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 7 + }, + "M12-maxmmes.2": { + "operator": "lt", + "value": 10 + }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + "Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + "M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + }, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 + } + }, + "expression": "( M12-maxmmes & M22-rel2raw )" +} diff --git a/data/selftrain_initial_neg.layer5.json b/data/selftrain_initial_neg.layer5.json new file mode 100644 index 00000000..945bd289 --- /dev/null +++ b/data/selftrain_initial_neg.layer5.json @@ -0,0 +1,93 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + "M3-nb_dist_aln": { + 
"operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 7 + }, + "M12-maxmmes.2": { + "operator": "lt", + "value": 10 + }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + "Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + "M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + }, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 + } + }, + "expression": "( M1-canonical_ss & M22-rel2raw.2 )" +} diff --git a/data/selftrain_initial_neg.layer6.json b/data/selftrain_initial_neg.layer6.json new file mode 100644 index 00000000..0265d1eb --- /dev/null +++ b/data/selftrain_initial_neg.layer6.json @@ -0,0 +1,93 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "M2-nb_reads": { + "operator": "lte", + "value": 1 + }, + "M3-nb_dist_aln": { + "operator": "gte", + "value": 2 + }, + "M4-nb_rel_aln": { + "operator": "eq", + "value": 0 + }, + "M11-entropy": { + "operator": "lt", + "value": 1.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M11-entropy.3": { + "operator": "eq", + "value": 0.0 + }, + "M12-maxmmes": { + "operator": "lt", + "value": 7 + }, + "M12-maxmmes.2": { + "operator": "lt", + 
"value": 10 + }, + "M12-maxmmes.3": { + "operator": "lt", + "value": 20 + }, + "M8-max_min_anc": { + "operator": "lt", + "value": 16 + }, + "Suspect": { + "operator": "eq", + "value": 1 + }, + "PFP": { + "operator": "eq", + "value": 1 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 2 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 2 + }, + "M19-mean_mismatches": { + "operator": "gte", + "value": 5.0 + }, + "M19-mean_mismatches.2": { + "operator": "gte", + "value": 2.0 + }, + "M20-nb_usrs": { + "operator": "eq", + "value": 0 + }, + "M20-nb_usrs.2": { + "operator": "eq", + "value": 1 + }, + "M21-nb_msrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "lt", + "value": 0.9 + }, + "M22-rel2raw.2": { + "operator": "eq", + "value": 0.0 + } + }, + "expression": "( PFP )" +} diff --git a/data/selftrain_initial_neg.layer7.json b/data/selftrain_initial_neg.layer7.json new file mode 100644 index 00000000..5b6bde7c --- /dev/null +++ b/data/selftrain_initial_neg.layer7.json @@ -0,0 +1,21 @@ +{ + "parameters": { + "M2-nb_reads": { + "operator": "gt", + "value": 100 + }, + "M22-rel2raw": { + "operator": "eq", + "value": 0.0 + }, + "M13-hamming5p": { + "operator": "lte", + "value": 4 + }, + "M14-hamming3p": { + "operator": "lte", + "value": 4 + } + }, + "expression": "( M2-nb_reads & M22-rel2raw & M13-hamming5p & M14-hamming3p )" +} diff --git a/data/selftrain_initial_pos.json b/data/selftrain_initial_pos.json.old similarity index 70% rename from data/selftrain_initial_pos.json rename to data/selftrain_initial_pos.json.old index d850bd63..c4552792 100644 --- a/data/selftrain_initial_pos.json +++ b/data/selftrain_initial_pos.json.old @@ -10,7 +10,7 @@ }, "M12-maxmmes": { "operator": "gte", - "value": 20 + "value": 18 }, "M11-entropy": { "operator": "gt", @@ -18,16 +18,20 @@ }, "M13-hamming5p": { "operator": "gte", - "value": 8 + "value": 7 }, "M14-hamming3p": { "operator": "gte", - "value": 8 + "value": 7 }, "M19-mean_mismatches": { - 
"operator": "lte", - "value": 1.0 - } + "operator": "eq", + "value": 0.0 + }, + "M20-nb_usrs": { + "operator": "gte", + "value": 5 + } }, - "expression": "M1-canonical_ss & M4-nb_rel_aln & M11-entropy & M12-maxmmes & M13-hamming5p & M14-hamming3p & M19-mean_mismatches" + "expression": "M1-canonical_ss & M4-nb_rel_aln & M11-entropy & M12-maxmmes & M13-hamming5p & M14-hamming3p & M19-mean_mismatches & M20-nb_usrs" } diff --git a/data/selftrain_initial_pos.layer1.json b/data/selftrain_initial_pos.layer1.json new file mode 100644 index 00000000..1bc58557 --- /dev/null +++ b/data/selftrain_initial_pos.layer1.json @@ -0,0 +1,33 @@ +{ + "parameters": { + "M4-nb_rel_aln": { + "operator": "gte", + "value": 1 + }, + "M12-maxmmes": { + "operator": "gte", + "value": 8 + }, + "M13-hamming5p": { + "operator": "gte", + "value": 5 + }, + "M14-hamming3p": { + "operator": "gte", + "value": 5 + }, + "M19-mean_mismatches": { + "operator": "lte", + "value": 1.5 + }, + "M20-nb_usrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "gte", + "value": 0.25 + } + }, + "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M12-maxmmes & M20-nb_usrs & M19-mean_mismatches & M22-rel2raw" +} diff --git a/data/selftrain_initial_pos.layer2.json b/data/selftrain_initial_pos.layer2.json new file mode 100644 index 00000000..62c0273e --- /dev/null +++ b/data/selftrain_initial_pos.layer2.json @@ -0,0 +1,45 @@ +{ + "parameters": { + "M4-nb_rel_aln": { + "operator": "gte", + "value": 3 + }, + "M4-nb_rel_aln.2": { + "operator": "gte", + "value": 2 + }, + "M12-maxmmes": { + "operator": "gte", + "value": 25 + }, + "M12-maxmmes.2": { + "operator": "gt", + "value": 12 + }, + "M13-hamming5p": { + "operator": "gte", + "value": 8 + }, + "M13-hamming5p.2": { + "operator": "gte", + "value": 10 + }, + "M14-hamming3p": { + "operator": "gte", + "value": 8 + }, + "M14-hamming3p.2": { + "operator": "gte", + "value": 10 + }, + "M19-mean_mismatches": { + "operator": "lt", + "value": 0 + 
}, + "M19-mean_mismatches.2": { + "operator": "lt", + "value": 0.5 + } + }, + "expression": "( M4-nb_rel_aln & M12-maxmmes ) | ( M4-nb_rel_aln.2 & M12-maxmmes.2 & M13-hamming5p & M14-hamming3p & M19-mean_mismatches.2 ) | ( M13-hamming5p.2 & M14-hamming3p.2 & M19-mean_mismatches )" +} diff --git a/data/selftrain_initial_pos.layer3.json b/data/selftrain_initial_pos.layer3.json new file mode 100644 index 00000000..349d1317 --- /dev/null +++ b/data/selftrain_initial_pos.layer3.json @@ -0,0 +1,77 @@ +{ + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["C"] + }, + "M1-canonical_ss.2": { + "operator": "in", + "value": ["S"] + }, + "M1-canonical_ss.3": { + "operator": "in", + "value": ["N"] + }, + "M4-nb_rel_aln": { + "operator": "gte", + "value": 5 + }, + "M4-nb_rel_aln.2": { + "operator": "gte", + "value": 1 + }, + "M12-maxmmes": { + "operator": "gte", + "value": 20 + }, + "M12-maxmmes.2": { + "operator": "gte", + "value": 10 + }, + "M11-entropy": { + "operator": "gt", + "value": 5.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 2.0 + }, + "M13-hamming5p": { + "operator": "gte", + "value": 7 + }, + "M13-hamming5p.2": { + "operator": "gte", + "value": 8 + }, + "M14-hamming3p": { + "operator": "gte", + "value": 7 + }, + "M14-hamming3p.2": { + "operator": "gte", + "value": 8 + }, + "M19-mean_mismatches": { + "operator": "eq", + "value": 0 + }, + "M19-mean_mismatches.2": { + "operator": "lt", + "value": 0.1 + }, + "M20-nb_usrs": { + "operator": "gte", + "value": 5 + }, + "M22-rel2raw": { + "operator": "gte", + "value": 0.5 + }, + "M22-rel2raw.2": { + "operator": "gte", + "value": 0.75 + } + }, + "expression": "( M1-canonical_ss ) | ( M1-canonical_ss.2 & M22-rel2raw & M13-hamming5p & M14-hamming3p ) | ( M1-canonical_ss.3 & M22-rel2raw.2 & M13-hamming5p.2 & M14-hamming3p.2 & M19-mean_mismatches & M11-entropy.2 )" +} diff --git a/deps/ranger-0.3.8/include/ranger/Data.h b/deps/ranger-0.3.8/include/ranger/Data.h index 0894fdb3..56dd3810 100644 --- 
a/deps/ranger-0.3.8/include/ranger/Data.h +++ b/deps/ranger-0.3.8/include/ranger/Data.h @@ -31,107 +31,131 @@ #include #include +#include +#include #include "globals.h" class Data { public: - Data(); - virtual ~Data(); + Data(); + virtual ~Data(); - virtual double get(size_t row, size_t col) const = 0; + virtual double get(size_t row, size_t col) const = 0; - size_t getVariableID(std::string variable_name); + size_t getVariableID(std::string variable_name); - virtual void reserveMemory() = 0; - virtual void set(size_t col, size_t row, double value, bool& error) = 0; + virtual void reserveMemory() = 0; + virtual void set(size_t col, size_t row, double value, bool& error) = 0; - void addSparseData(unsigned char* sparse_data, size_t num_cols_sparse); + void addSparseData(unsigned char* sparse_data, size_t num_cols_sparse); - bool loadFromFile(std::string filename); - bool loadFromFileWhitespace(std::ifstream& input_file, std::string header_line); - bool loadFromFileOther(std::ifstream& input_file, std::string header_line, char seperator); + bool loadFromFile(std::string filename); + bool loadFromFileWhitespace(std::ifstream& input_file, std::string header_line); + bool loadFromFileOther(std::ifstream& input_file, std::string header_line, char seperator); - void getAllValues(std::vector& all_values, std::vector& sampleIDs, size_t varID); + void getAllValues(std::vector& all_values, std::vector& sampleIDs, size_t varID); - size_t getIndex(size_t row, size_t col) const { - if (col < num_cols_no_sparse) { - return index_data[col * num_rows + row]; - } else { - // Get data out of sparse storage. -1 because of GenABEL coding. - size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; - size_t result = (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); + size_t getIndex(size_t row, size_t col) const { + if (col < num_cols_no_sparse) { + return index_data[col * num_rows + row]; + } else { + // Get data out of sparse storage. 
-1 because of GenABEL coding. + size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; + size_t result = (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); - // TODO: Better way to treat missing values? - if (result > 2) { - return 0; - } else { - return result; - } + // TODO: Better way to treat missing values? + if (result > 2) { + return 0; + } else { + return result; + } + } } - } - - double getUniqueDataValue(size_t varID, size_t index) const { - if (varID < num_cols_no_sparse) { - return unique_data_values[varID][index]; - } else { - // For GWAS data the index is the value - return (index); + + double getUniqueDataValue(size_t varID, size_t index) const { + if (varID < num_cols_no_sparse) { + return unique_data_values[varID][index]; + } else { + // For GWAS data the index is the value + return (index); + } + } + + size_t getNumUniqueDataValues(size_t varID) const { + if (varID < num_cols_no_sparse) { + return unique_data_values[varID].size(); + } else { + // For GWAS data 0,1,2 + return (3); + } + } + + void sort(); + + const std::vector& getVariableNames() const { + return variable_names; + } + + size_t getNumCols() const { + return num_cols; } - } - - size_t getNumUniqueDataValues(size_t varID) const { - if (varID < num_cols_no_sparse) { - return unique_data_values[varID].size(); - } else { - // For GWAS data 0,1,2 - return (3); + + size_t getNumRows() const { + return num_rows; + } + + size_t getMaxNumUniqueValues() const { + if (sparse_data == 0 || max_num_unique_values > 3) { + // If no sparse data or one variable with more than 3 unique values, return that value + return max_num_unique_values; + } else { + // If sparse data and no variable with more than 3 unique values, return 3 + return 3; + } + } + + void setExternalData(bool external) { + this->externalData = external; + } + + void saveToTSV(const std::string& file); + + std::string getRow(size_t row) const { + std::stringstream ss; + ss << this->get(row, 0); + for (size_t 
j = 1; j < num_cols; j++) { + ss << "\t" << this->get(row, j); + } + return ss.str(); } - } - - void sort(); - - const std::vector& getVariableNames() const { - return variable_names; - } - size_t getNumCols() const { - return num_cols; - } - size_t getNumRows() const { - return num_rows; - } - - size_t getMaxNumUniqueValues() const { - if (sparse_data == 0 || max_num_unique_values > 3) { - // If no sparse data or one variable with more than 3 unique values, return that value - return max_num_unique_values; - } else { - // If sparse data and no variable with more than 3 unique values, return 3 - return 3; + + std::string getHeader() const { + std::stringstream ss; + ss << variable_names[0]; + for (size_t i = 1; i < variable_names.size(); i++) { + ss << "\t" << variable_names[i]; + } + return ss.str(); } - } - - void setExternalData(bool external) { - this->externalData = external; - } protected: - std::vector variable_names; - size_t num_rows; - size_t num_rows_rounded; - size_t num_cols; + std::vector variable_names; + size_t num_rows; + size_t num_rows_rounded; + size_t num_cols; - unsigned char* sparse_data; - size_t num_cols_no_sparse; + unsigned char* sparse_data; + size_t num_cols_no_sparse; - bool externalData; + bool externalData; - size_t* index_data; - std::vector> unique_data_values; - size_t max_num_unique_values; + size_t* index_data; + std::vector> unique_data_values; + size_t max_num_unique_values; private: - DISALLOW_COPY_AND_ASSIGN(Data); + DISALLOW_COPY_AND_ASSIGN(Data); }; #endif /* DATA_H_ */ diff --git a/deps/ranger-0.3.8/src/Data.cpp b/deps/ranger-0.3.8/src/Data.cpp index 03b09203..712afb81 100644 --- a/deps/ranger-0.3.8/src/Data.cpp +++ b/deps/ranger-0.3.8/src/Data.cpp @@ -36,181 +36,201 @@ #include Data::Data() : - num_rows(0), num_rows_rounded(0), num_cols(0), sparse_data(0), num_cols_no_sparse(0), externalData(true), index_data( - 0), max_num_unique_values(0) { +num_rows(0), num_rows_rounded(0), num_cols(0), sparse_data(0), 
num_cols_no_sparse(0), externalData(true), index_data( +0), max_num_unique_values(0) { } Data::~Data() { - if (index_data != 0) { - delete[] index_data; - } + if (index_data != 0) { + delete[] index_data; + } } size_t Data::getVariableID(std::string variable_name) { - std::vector::iterator it = std::find(variable_names.begin(), variable_names.end(), variable_name); - if (it == variable_names.end()) { - throw std::runtime_error("Variable " + variable_name + " not found."); - } - return (std::distance(variable_names.begin(), it)); + std::vector::iterator it = std::find(variable_names.begin(), variable_names.end(), variable_name); + if (it == variable_names.end()) { + throw std::runtime_error("Variable " + variable_name + " not found."); + } + return (std::distance(variable_names.begin(), it)); } void Data::addSparseData(unsigned char* sparse_data, size_t num_cols_sparse) { - num_cols = num_cols_no_sparse + num_cols_sparse; - num_rows_rounded = roundToNextMultiple(num_rows, 4); - this->sparse_data = sparse_data; + num_cols = num_cols_no_sparse + num_cols_sparse; + num_rows_rounded = roundToNextMultiple(num_rows, 4); + this->sparse_data = sparse_data; } bool Data::loadFromFile(std::string filename) { - bool result; - - // Open input file - std::ifstream input_file; - input_file.open(filename); - if (!input_file.good()) { - throw std::runtime_error("Could not open input file."); - } - - // Count number of rows - size_t line_count = 0; - std::string line; - while (getline(input_file, line)) { - ++line_count; - } - num_rows = line_count - 1; - input_file.close(); - input_file.open(filename); - - // Check if comma, semicolon or whitespace seperated - std::string header_line; - getline(input_file, header_line); - - // Find out if comma, semicolon or whitespace seperated and call appropriate method - if (header_line.find(",") != std::string::npos) { - result = loadFromFileOther(input_file, header_line, ','); - } else if (header_line.find(";") != std::string::npos) { - result 
= loadFromFileOther(input_file, header_line, ';'); - } else { - result = loadFromFileWhitespace(input_file, header_line); - } - - externalData = false; - input_file.close(); - return result; + bool result; + + // Open input file + std::ifstream input_file; + input_file.open(filename); + if (!input_file.good()) { + throw std::runtime_error("Could not open input file."); + } + + // Count number of rows + size_t line_count = 0; + std::string line; + while (getline(input_file, line)) { + ++line_count; + } + num_rows = line_count - 1; + input_file.close(); + input_file.open(filename); + + // Check if comma, semicolon or whitespace seperated + std::string header_line; + getline(input_file, header_line); + + // Find out if comma, semicolon or whitespace seperated and call appropriate method + if (header_line.find(",") != std::string::npos) { + result = loadFromFileOther(input_file, header_line, ','); + } else if (header_line.find(";") != std::string::npos) { + result = loadFromFileOther(input_file, header_line, ';'); + } else { + result = loadFromFileWhitespace(input_file, header_line); + } + + externalData = false; + input_file.close(); + return result; } bool Data::loadFromFileWhitespace(std::ifstream& input_file, std::string header_line) { - // Read header - std::string header_token; - std::stringstream header_line_stream(header_line); - while (header_line_stream >> header_token) { - variable_names.push_back(header_token); - } - num_cols = variable_names.size(); - num_cols_no_sparse = num_cols; - - // Read body - reserveMemory(); - bool error = false; - std::string line; - size_t row = 0; - while (getline(input_file, line)) { - double token; - std::stringstream line_stream(line); - size_t column = 0; - while (line_stream >> token) { - set(column, row, token, error); - ++column; + // Read header + std::string header_token; + std::stringstream header_line_stream(header_line); + while (header_line_stream >> header_token) { + variable_names.push_back(header_token); } - if 
(column > num_cols) { - throw std::runtime_error("Could not open input file. Too many columns in a row."); - } else if (column < num_cols) { - throw std::runtime_error("Could not open input file. Too few columns in a row. Are all values numeric?"); + num_cols = variable_names.size(); + num_cols_no_sparse = num_cols; + + // Read body + reserveMemory(); + bool error = false; + std::string line; + size_t row = 0; + while (getline(input_file, line)) { + double token; + std::stringstream line_stream(line); + size_t column = 0; + while (line_stream >> token) { + set(column, row, token, error); + ++column; + } + if (column > num_cols) { + throw std::runtime_error("Could not open input file. Too many columns in a row."); + } else if (column < num_cols) { + throw std::runtime_error("Could not open input file. Too few columns in a row. Are all values numeric?"); + } + ++row; } - ++row; - } - num_rows = row; - return error; + num_rows = row; + return error; } bool Data::loadFromFileOther(std::ifstream& input_file, std::string header_line, char seperator) { - // Read header - std::string header_token; - std::stringstream header_line_stream(header_line); - while (getline(header_line_stream, header_token, seperator)) { - variable_names.push_back(header_token); - } - num_cols = variable_names.size(); - num_cols_no_sparse = num_cols; - - // Read body - reserveMemory(); - bool error = false; - std::string line; - size_t row = 0; - while (getline(input_file, line)) { - std::string token_string; - double token; - std::stringstream line_stream(line); - size_t column = 0; - while (getline(line_stream, token_string, seperator)) { - std::stringstream token_stream(token_string); - token_stream >> token; - set(column, row, token, error); - ++column; + // Read header + std::string header_token; + std::stringstream header_line_stream(header_line); + while (getline(header_line_stream, header_token, seperator)) { + variable_names.push_back(header_token); + } + num_cols = variable_names.size(); 
+ num_cols_no_sparse = num_cols; + + // Read body + reserveMemory(); + bool error = false; + std::string line; + size_t row = 0; + while (getline(input_file, line)) { + std::string token_string; + double token; + std::stringstream line_stream(line); + size_t column = 0; + while (getline(line_stream, token_string, seperator)) { + std::stringstream token_stream(token_string); + token_stream >> token; + set(column, row, token, error); + ++column; + } + ++row; } - ++row; - } - num_rows = row; - return error; + num_rows = row; + return error; } void Data::getAllValues(std::vector& all_values, std::vector& sampleIDs, size_t varID) { - // All values for varID (no duplicates) for given sampleIDs - if (varID < num_cols_no_sparse) { - - all_values.reserve(sampleIDs.size()); - for (size_t i = 0; i < sampleIDs.size(); ++i) { - all_values.push_back(get(sampleIDs[i], varID)); + // All values for varID (no duplicates) for given sampleIDs + if (varID < num_cols_no_sparse) { + + all_values.reserve(sampleIDs.size()); + for (size_t i = 0; i < sampleIDs.size(); ++i) { + all_values.push_back(get(sampleIDs[i], varID)); + } + std::sort(all_values.begin(), all_values.end()); + all_values.erase(unique(all_values.begin(), all_values.end()), all_values.end()); + } else { + // If GWA data just use 0, 1, 2 + all_values = std::vector({0, 1, 2}); } - std::sort(all_values.begin(), all_values.end()); - all_values.erase(unique(all_values.begin(), all_values.end()), all_values.end()); - } else { - // If GWA data just use 0, 1, 2 - all_values = std::vector( { 0, 1, 2 }); - } } void Data::sort() { - // Reserve memory - index_data = new size_t[num_cols_no_sparse * num_rows]; + // Reserve memory + index_data = new size_t[num_cols_no_sparse * num_rows]; + + // For all columns, get unique values and save index for each observation + for (size_t col = 0; col < num_cols_no_sparse; ++col) { + + // Get all unique values + std::vector unique_values(num_rows); + for (size_t row = 0; row < num_rows; ++row) { + 
unique_values[row] = get(row, col); + } + std::sort(unique_values.begin(), unique_values.end()); + unique_values.erase(unique(unique_values.begin(), unique_values.end()), unique_values.end()); + + // Get index of unique value + for (size_t row = 0; row < num_rows; ++row) { + size_t idx = std::lower_bound(unique_values.begin(), unique_values.end(), get(row, col)) - unique_values.begin(); + index_data[col * num_rows + row] = idx; + } + + // Save unique values + unique_data_values.push_back(unique_values); + if (unique_values.size() > max_num_unique_values) { + max_num_unique_values = unique_values.size(); + } + } +} - // For all columns, get unique values and save index for each observation - for (size_t col = 0; col < num_cols_no_sparse; ++col) { +void Data::saveToTSV(const std::string& file) { + std::ofstream fout(file.c_str(), std::ofstream::out); - // Get all unique values - std::vector unique_values(num_rows); - for (size_t row = 0; row < num_rows; ++row) { - unique_values[row] = get(row, col); + fout << variable_names[0]; + for (size_t i = 1; i < variable_names.size(); i++) { + fout << "\t" << variable_names[i]; } - std::sort(unique_values.begin(), unique_values.end()); - unique_values.erase(unique(unique_values.begin(), unique_values.end()), unique_values.end()); - - // Get index of unique value - for (size_t row = 0; row < num_rows; ++row) { - size_t idx = std::lower_bound(unique_values.begin(), unique_values.end(), get(row, col)) - unique_values.begin(); - index_data[col * num_rows + row] = idx; + fout << std::endl; + + for (size_t i = 0; i < num_rows; i++) { + fout << this->get(i, 0); + for (size_t j = 1; j < num_cols; j++) { + fout << "\t" << this->get(i, j); + } + fout << std::endl; } - // Save unique values - unique_data_values.push_back(unique_values); - if (unique_values.size() > max_num_unique_values) { - max_num_unique_values = unique_values.size(); - } - } + fout.close(); } diff --git a/doc/source/conf.py b/doc/source/conf.py index 
cec4720f..419f9401 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -53,9 +53,9 @@ # built documents. # # The short X.Y version. -version = '0.15.0' +version = '0.16.0' # The full version, including alpha/beta/rc tags. -release = '0.15.0' +release = '0.16.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/lib/Makefile.am b/lib/Makefile.am index dd10476e..643163b7 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -15,6 +15,8 @@ libportcullis_la_SOURCES = \ src/bam_writer.cc \ src/depth_parser.cc \ src/genome_mapper.cc \ + src/markov_model.cc \ + src/model_features.cc \ src/python_exe.cc \ src/intron.cc \ src/junction.cc \ @@ -30,6 +32,9 @@ library_include_HEADERS = \ $(PI)/bam/bam_writer.hpp \ $(PI)/bam/depth_parser.hpp \ $(PI)/bam/genome_mapper.hpp \ + $(PI)/kmer.hpp \ + $(PI)/markov_model.hpp \ + $(PI)/model_features.hpp \ $(PI)/python_exe.hpp \ $(PI)/intron.hpp \ $(PI)/junction.hpp \ @@ -41,6 +46,7 @@ library_include_HEADERS = \ libportcullis_la_CPPFLAGS = \ -isystem $(top_srcdir)/deps/htslib-1.3 \ + -isystem $(top_srcdir)/deps/ranger-0.3.8/include \ -isystem $(top_srcdir)/lib/include \ -DDATADIR=\"$(datadir)\" \ @AM_CPPFLAGS@ \ diff --git a/lib/include/portcullis/bam/genome_mapper.hpp b/lib/include/portcullis/bam/genome_mapper.hpp index 2bf028ed..91c55f9c 100644 --- a/lib/include/portcullis/bam/genome_mapper.hpp +++ b/lib/include/portcullis/bam/genome_mapper.hpp @@ -97,7 +97,7 @@ class GenomeMapper { /** * @abstract Fetch the sequence in a region. 
* @param reg Region in the format "chr2:20,000-30,000" - * @param len Length of the region + * @param len Length of the region returned * @return The sequence as a string; empty string if no seq found */ string fetchBases(const char* reg, int* len) const; @@ -107,7 +107,7 @@ class GenomeMapper { * @param name Region name * @param start Start location on region (zero-based, inclusive) * @param end End position (zero-based, exclusive) - * @param len Length of the region + * @param len Length of the region returned * @return The sequence as a string; empty string if no seq found */ string fetchBases(const char* name, int start, int end, int* len) const; diff --git a/lib/include/portcullis/junction.hpp b/lib/include/portcullis/junction.hpp index ed510d25..b18c3d2a 100644 --- a/lib/include/portcullis/junction.hpp +++ b/lib/include/portcullis/junction.hpp @@ -24,6 +24,7 @@ #include #include using std::ostream; +using std::cout; using std::endl; using std::min; using std::max; @@ -44,6 +45,9 @@ using namespace portcullis::bam; #include "intron.hpp" #include "seq_utils.hpp" +#include "markov_model.hpp" +using portcullis::KmerMarkovModel; +using portcullis::PosMarkovModel; using portcullis::Intron; @@ -97,14 +101,16 @@ const vector METRIC_NAMES = { "M17-primary_junc", "M18-mm_score", "M19-mean_mismatches", - "M20-nb_msrs", - "M21-nb_up_juncs", - "M22-nb_down_juncs", - "M23-up_aln", - "M24-down_aln", - "M25-dist_2_up_junc", - "M26-dist_2_down_junc", - "M27-dist_nearest_junc" + "M20-nb_usrs", + "M21-nb_msrs", + "M22-rel2raw", + "M23-nb_up_juncs", + "M24-nb_down_juncs", + "M25-up_aln", + "M26-down_aln", + "M27-dist_2_up_junc", + "M28-dist_2_down_junc", + "M29-dist_nearest_junc" }; const vector STRAND_NAMES = { @@ -174,7 +180,10 @@ static string cssToString(CanonicalSS css) { return string("No"); } - +struct SplicingScores { + double positionWeighting = 0.0; + double splicingSignal = 0.0; +}; struct AlignmentInfo { BamAlignmentPtr ba; @@ -257,15 +266,17 @@ class Junction { bool 
primaryJunction; // Metric 17 double multipleMappingScore; // Metric 18 double meanMismatches; // Metric 19 - uint32_t nbMultipleSplicedReads; // Metric 20 - uint16_t nbUpstreamJunctions; // Metric 21 - uint16_t nbDownstreamJunctions; // Metric 22 - uint32_t nbUpstreamFlankingAlignments; // Metric 23 - uint32_t nbDownstreamFlankingAlignments; // Metric 24 - int32_t distanceToNextUpstreamJunction; // Metric 25 - int32_t distanceToNextDownstreamJunction; // Metric 26 - int32_t distanceToNearestJunction; // Metric 27 - vector junctionOverhangs; // Metric 28-37 + //uint32_t nbUniquelySplicedReads; // Metric 20 (Use getter) + uint32_t nbMultipleSplicedReads; // Metric 21 + //double reliableVsRawReadRatio; // Metric 22 (Use getter) + uint16_t nbUpstreamJunctions; // Metric 23 + uint16_t nbDownstreamJunctions; // Metric 24 + uint32_t nbUpstreamFlankingAlignments; // Metric 25 + uint32_t nbDownstreamFlankingAlignments; // Metric 26 + int32_t distanceToNextUpstreamJunction; // Metric 27 + int32_t distanceToNextDownstreamJunction; // Metric 28 + int32_t distanceToNearestJunction; // Metric 29 + vector junctionOverhangs; // Metric 30-49 // **** Predictions **** @@ -689,6 +700,14 @@ class Junction { return nbMultipleSplicedReads; } + uint32_t getNbUniquelySplicedReads() const { + return nbJunctionAlignments - nbMultipleSplicedReads; + } + + double getReliable2RawRatio() const { + return (double)nbReliableAlignments / (double)nbJunctionAlignments; + } + uint16_t getNbDownstreamJunctions() const { return nbDownstreamJunctions; } @@ -813,7 +832,7 @@ class Junction { void setMeanMismatches(double meanMismatches) { this->meanMismatches = meanMismatches; } - + void setNbMultipleSplicedReads(uint32_t nbMultipleSplicedReads) { this->nbMultipleSplicedReads = nbMultipleSplicedReads; } @@ -829,7 +848,15 @@ class Junction { void setJunctionOverhangs(size_t index, uint32_t val) { junctionOverhangs[index] = val; } - + + double getJunctionOverhangLogDeviation(size_t i) const { + double Ni = 
junctionOverhangs[i]; // Actual count at this position + if (Ni == 0.0) Ni = 0.000000000001; // Ensure some value > 0 here otherwise we get -infinity later. + double Pi = 1.0 - ((double)i / (double)(this->meanQueryLength / 2.0)); // Likely scale at this position + double Ei = (double)this->getNbJunctionAlignments() * Pi; // Expected count at this position + double Xi = log2(Ni / Ei); + return Xi; + } bool isGenuine() const { return genuine; @@ -842,7 +869,25 @@ string locationAsString() const { return this->intron->toString() + strandToChar(this->consensusStrand); } - + + /** + * Calculates a score for this intron size based on how this intron size fits + * into an expected distribution specified by the length at the threshold percentile + * (threshold) provided by the user. Introns of length <= threshold have a score of 0. + * Introns with length > threshold have score: ln(size - threshold) + * @param threshold The intron length threshold + * @return A score for this intron size given the threshold value + */ + double calcIntronScore(const uint32_t threshold) const { + return this->intron->size() <= threshold ?
0.0 : log(this->intron->size() - threshold); + } + + + double calcCodingPotential(GenomeMapper& gmap, KmerMarkovModel& exon, KmerMarkovModel& intron) const; + + SplicingScores calcSplicingScores(GenomeMapper& gmap, KmerMarkovModel& donorT, KmerMarkovModel& donorF, + KmerMarkovModel& acceptorT, KmerMarkovModel& acceptorF, + PosMarkovModel& donorP, PosMarkovModel& acceptorP) const; // **** Output methods **** @@ -921,7 +966,9 @@ class Junction { << j.primaryJunction << "\t" << j.multipleMappingScore << "\t" << j.meanMismatches << "\t" + << j.getNbUniquelySplicedReads() << "\t" << j.nbMultipleSplicedReads << "\t" + << j.getReliable2RawRatio() << "\t" << j.nbDownstreamJunctions << "\t" << j.nbUpstreamJunctions << "\t" << j.nbUpstreamFlankingAlignments << "\t" diff --git a/lib/include/portcullis/junction_system.hpp b/lib/include/portcullis/junction_system.hpp index 61b7bc8e..47b91d03 100644 --- a/lib/include/portcullis/junction_system.hpp +++ b/lib/include/portcullis/junction_system.hpp @@ -152,8 +152,6 @@ class JunctionSystem { void calcJunctionStats(bool verbose); - void calcTrimmedOverhangScore(); - void sort(); void saveAll(const path& outputPrefix, const string& source); diff --git a/lib/include/portcullis/kmer.hpp b/lib/include/portcullis/kmer.hpp new file mode 100644 index 00000000..76247a22 --- /dev/null +++ b/lib/include/portcullis/kmer.hpp @@ -0,0 +1,94 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#pragma once + +#include +#include +#include +#include +using std::ostream; +using std::string; +using std::unordered_map; +using std::vector; + +#include +#include + +namespace portcullis { + +typedef boost::error_info KmerErrorInfo; +struct KmerException: virtual boost::exception, virtual std::exception { }; + +/** + * An extremely basic kmer hash class that doesn't worry about canonical kmers + * or reverse complementing the data. + */ +class KmerHash { +private: + unordered_map khash; + uint16_t k; + +public: + KmerHash(const uint16_t _k, const string& seq) { + k = _k; + + for(size_t i = k; i <= seq.size(); i++) { + khash[boost::to_upper_copy(seq.substr(i - k, k))]++; + } + } + + uint32_t getCount(const string& kmer) { + if (kmer.size() != k) { + BOOST_THROW_EXCEPTION(KmerException() << KmerErrorInfo(string( + "Given kmer size is ") + std::to_string(kmer.size()) + ". Expected kmer of size " + std::to_string(k))); + } + + string kup = boost::to_upper_copy(kmer); + return (khash.find(kup) != khash.end()) ? 
khash[kup] : 0; + } + + size_t nbDistinctKmers() const { + return khash.size(); + } + + void print(ostream& out) const { + for(auto& kmer : khash) { + out << kmer.first << "\t" << kmer.second << endl; + } + } + + void printAbundanceHistogram(ostream& out, const uint32_t hist_size) { + + vector hist(hist_size, 0); + + for(auto& kmer : khash) { + if (hist.size() > kmer.second) { + hist[kmer.second]++; + } + else { + hist[hist.size() -1]++; + } + } + + for(auto& e : hist) { + out << e << endl; + } + } +}; + +} \ No newline at end of file diff --git a/lib/include/portcullis/markov_model.hpp b/lib/include/portcullis/markov_model.hpp new file mode 100644 index 00000000..613fb177 --- /dev/null +++ b/lib/include/portcullis/markov_model.hpp @@ -0,0 +1,109 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#pragma once + +#include +#include +#include +using std::string; +using std::unordered_map; +using std::vector; + + +namespace portcullis { + +typedef boost::error_info MMErrorInfo; +struct MMException: virtual boost::exception, virtual std::exception {}; + +/** + * This datastructure consists of a map of kmers to a map of nucleotides with assoicated + * probabilities. 
+ */ +typedef unordered_map> KMMU; +typedef unordered_map> PMMU; + +/** + * Simple Markov chain implementation derived originally from Truesight. + * Constructor trains the model on a set of sequences. "getScore" returns the score + * for a given sequence based on the pre-trained model. + * Automatically converts sequences to uppercase + */ +class MarkovModel { + +protected: + uint16_t order; + +public: + + MarkovModel() : MarkovModel(1) {} + + MarkovModel(const uint32_t _order) { + order = _order; + } + + MarkovModel(const vector& input, const uint32_t _order) { + order = _order; + train(input); + } + + void train(const vector& input) { + train(input, order); + } + + virtual void train(const vector& input, const uint32_t order) = 0; + + uint16_t getOrder() const { + return order; + } + + virtual double getScore(const string& seq) = 0; +}; + +class KmerMarkovModel : public portcullis::MarkovModel { +private: + portcullis::KMMU model; +public: + KmerMarkovModel() : MarkovModel(1) {} + KmerMarkovModel(const uint32_t _order) : MarkovModel(_order) {}; + KmerMarkovModel(const vector& input, const uint32_t _order) : MarkovModel(input, _order) {} + + void train(const vector& input, const uint32_t order); + double getScore(const string& seq); + size_t size() const { + return model.size(); + } +}; + +class PosMarkovModel : public portcullis::MarkovModel { +private: + portcullis::PMMU model; +public: + PosMarkovModel() : MarkovModel(1) {} + PosMarkovModel(const uint32_t _order) : MarkovModel(_order) {}; + PosMarkovModel(const vector& input, const uint32_t _order) : MarkovModel(input, _order) {} + + void train(const vector& input, const uint32_t order); + double getScore(const string& seq); + size_t size() const { + return model.size(); + } +}; + + +} + diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/model_features.hpp new file mode 100644 index 00000000..1820cc01 --- /dev/null +++ b/lib/include/portcullis/model_features.hpp @@ -0,0 +1,134 @@ 
+// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#pragma once + +#include +using std::shared_ptr; + +#include +using boost::filesystem::path; + +#include +#include + +#include +#include +#include +using portcullis::bam::GenomeMapper; +using portcullis::MarkovModel; +using portcullis::Junction; +using portcullis::JunctionPtr; +using portcullis::JunctionList; +using portcullis::SplicingScores; + +namespace portcullis { + +typedef shared_ptr ForestPtr; + +// List of variable names +const vector VAR_NAMES = { + "Genuine", + "rna_usrs", + "rna_dist", + "rna_rel", + "rna_entropy", + "rna_rel2raw", + "rna_maxminanc", + "rna_maxmmes", + "rna_missmatch", + "rna_intron", + "dna_minhamm", + "dna_coding", + "dna_pws", + "dna_ss" +}; + +struct Feature { + string name; + bool active; +}; + +class ModelFeatures { +private: + size_t fi; +public: + uint32_t L95; + KmerMarkovModel exonModel; + KmerMarkovModel intronModel; + KmerMarkovModel donorTModel; + KmerMarkovModel donorFModel; + KmerMarkovModel acceptorTModel; + KmerMarkovModel acceptorFModel; + PosMarkovModel donorPWModel; + PosMarkovModel acceptorPWModel; + GenomeMapper gmap; + vector features; + + ModelFeatures() : L95(0) { + fi = 1; + features.clear(); + for(size_t i = 0; i < 
VAR_NAMES.size(); i++) { + Feature f; + f.name = VAR_NAMES[i]; + f.active = true; + features.push_back(f); + } + for(size_t i = 0; i < JO_NAMES.size(); i++) { + Feature f; + f.name = JO_NAMES[i]; + f.active = true; + features.push_back(f); + } + } + + bool isCodingPotentialModelEmpty() { + return exonModel.size() == 0 || intronModel.size() == 0; + } + + bool isPWModelEmpty() { + return donorPWModel.size() == 0 || acceptorPWModel.size() == 0; + } + + void initGenomeMapper(const path& genomeFile); + + uint32_t calcIntronThreshold(const JunctionList& juncs); + + void trainCodingPotentialModel(const JunctionList& in); + + void trainSplicingModels(const JunctionList& pass, const JunctionList& fail); + + Data* juncs2FeatureVectors(const JunctionList& x); + + ForestPtr trainInstance(const JunctionList& x, string outputPrefix, + uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose); + + void resetActiveFeatureIndex() { + fi = 0; + } + int16_t getNextActiveFeatureIndex() { + for(int16_t i = fi+1; i < features.size(); i++) { + if (features[i].active) { + fi = i; + return i; + } + } + return -1; + } + +}; +} \ No newline at end of file diff --git a/lib/include/portcullis/rule_parser.hpp b/lib/include/portcullis/rule_parser.hpp index 27dfd2b5..31f39ec0 100644 --- a/lib/include/portcullis/rule_parser.hpp +++ b/lib/include/portcullis/rule_parser.hpp @@ -88,11 +88,13 @@ const unordered_set numericParams = { "M17-primary_junc", "M18-mm_score", "M19-nb_mismatches", - "M20-nb_msrs", - "M21-nb_up_juncs", - "M22-nb_down_juncs", - "M23-up_aln", - "M24-down_aln" + "M20-nb_usrs", + "M21-nb_msrs", + "M22-rel2raw", + "M23-nb_up_juncs", + "M24-nb_down_juncs", + "M25-up_aln", + "M26-down_aln" }; const unordered_set stringParams = { diff --git a/lib/include/portcullis/seq_utils.hpp b/lib/include/portcullis/seq_utils.hpp index 09bcc0a8..867f8b37 100644 --- a/lib/include/portcullis/seq_utils.hpp +++ b/lib/include/portcullis/seq_utils.hpp @@ -40,12 +40,24 @@ typedef 
boost::error_info SeqUtilsErrorInfo; struct SeqUtilsException: virtual boost::exception, virtual std::exception { }; - - class SeqUtils { public: + static bool dnaNt(const char c) { + return c == 'A' || c == 'T' || c == 'G' || c == 'C'; + } + + static string makeClean(const string& s) { + string sequp = boost::to_upper_copy(s); + + for(size_t i = 0; i < sequp.size(); i++) { + sequp[i] = dnaNt(sequp[i]) ? sequp[i] : 'N'; + } + + return sequp; + } + static int16_t hammingDistance(const string& s1, const string& s2) { if (s1.size() != s2.size()) diff --git a/lib/src/junction.cc b/lib/src/junction.cc index 5e0d4c2d..d791b272 100644 --- a/lib/src/junction.cc +++ b/lib/src/junction.cc @@ -45,10 +45,6 @@ using portcullis::bam::CigarOp; using portcullis::bam::GenomeMapper; using portcullis::bam::Strand; -#include -#include -using portcullis::Intron; - #include @@ -823,10 +819,10 @@ void portcullis::Junction::calcMismatchStats() { // Assuming we have some mismatches determine if this junction has no overhangs // extending beyond first mismatch. 
If so determine if that distance is small // enough to consider the junction as suspicious - if (nbMismatches > 0) { + if (nbMismatches > 0 && firstMismatch < 20) { bool found = false; for(const auto& a : alignments) { - if (a->mmes > firstMismatch) { + if (a->minMatch > firstMismatch) { found = true; break; } @@ -941,14 +937,15 @@ void portcullis::Junction::outputDescription(std::ostream &strm, string delimite << "M17: Primary Junction: " << boolalpha << primaryJunction << delimiter << "M18: Multiple mapping score: " << multipleMappingScore << delimiter << "M19: Mean mismatches: " << meanMismatches << delimiter - << "M20: # Multiple Spliced Reads: " << nbMultipleSplicedReads << delimiter - << "M21: # Upstream Junctions: " << nbUpstreamJunctions << delimiter - << "M22: # Downstream Junctions: " << nbDownstreamJunctions << delimiter - << "M23: # Upstream Non-Spliced Alignments: " << nbUpstreamFlankingAlignments << delimiter - << "M24: # Downstream Non-Spliced Alignments: " << nbDownstreamFlankingAlignments << delimiter - << "M25: Distance to next upstream junction: " << distanceToNextUpstreamJunction << delimiter - << "M26: Distance to next downstream junction: " << distanceToNextDownstreamJunction << delimiter - << "M27: Distance to nearest junction: " << distanceToNearestJunction; + << "M20: # Uniquely Spliced Reads: " << getNbUniquelySplicedReads() << delimiter + << "M21: # Multiple Spliced Reads: " << nbMultipleSplicedReads << delimiter + << "M22: # Upstream Junctions: " << nbUpstreamJunctions << delimiter + << "M23: # Downstream Junctions: " << nbDownstreamJunctions << delimiter + << "M24: # Upstream Non-Spliced Alignments: " << nbUpstreamFlankingAlignments << delimiter + << "M25: # Downstream Non-Spliced Alignments: " << nbDownstreamFlankingAlignments << delimiter + << "M26: Distance to next upstream junction: " << distanceToNextUpstreamJunction << delimiter + << "M27: Distance to next downstream junction: " << distanceToNextDownstreamJunction << delimiter + 
<< "M28: Distance to nearest junction: " << distanceToNearestJunction; } /** @@ -979,14 +976,17 @@ void portcullis::Junction::condensedOutputDescription(std::ostream &strm, string << "M17-PrimaryJunction=" << boolalpha << primaryJunction << delimiter << "M18-MultipleMappingScore=" << multipleMappingScore << delimiter << "M19-MeanMismatches=" << meanMismatches << delimiter - << "M20-NbMultipleSplicedReads=" << nbMultipleSplicedReads << delimiter - << "M21-NbUpstreamJunctions=" << nbUpstreamJunctions << delimiter - << "M22-NbDownstreamJunctions=" << nbDownstreamJunctions << delimiter - << "M23-NbUpstreamNonSplicedAlignments=" << nbUpstreamFlankingAlignments << delimiter - << "M24-NbDownstreamNonSplicedAlignments=" << nbDownstreamFlankingAlignments << delimiter - << "M25-DistanceToNextUpstreamJunction=" << distanceToNextUpstreamJunction << delimiter - << "M26-DistanceToNextDownstreamJunction=" << distanceToNextDownstreamJunction << delimiter - << "M27-DistanceToNearestJunction=" << distanceToNearestJunction << delimiter + << "M20-NbUniquelySplicedReads=" << getNbUniquelySplicedReads() << delimiter + << "M21-NbMultipleSplicedReads=" << nbMultipleSplicedReads << delimiter + << "M22-Reliable2RawRatio=" << getReliable2RawRatio() << delimiter + << "M23-NbUpstreamJunctions=" << nbUpstreamJunctions << delimiter + << "M24-NbDownstreamJunctions=" << nbDownstreamJunctions << delimiter + << "M25-NbUpstreamNonSplicedAlignments=" << nbUpstreamFlankingAlignments << delimiter + << "M26-NbDownstreamNonSplicedAlignments=" << nbDownstreamFlankingAlignments << delimiter + << "M27-DistanceToNextUpstreamJunction=" << distanceToNextUpstreamJunction << delimiter + << "M28-DistanceToNextDownstreamJunction=" << distanceToNextDownstreamJunction << delimiter + << "M29-DistanceToNearestJunction=" << distanceToNearestJunction << delimiter + << "Suspect=" << boolalpha << suspicious << delimiter << "PFP=" << boolalpha << pfp; } @@ -1200,22 +1200,93 @@ shared_ptr portcullis::Junction::parse(const 
string& line) j->setPrimaryJunction(lexical_cast(parts[29])); j->setMultipleMappingScore(lexical_cast(parts[30])); j->setMeanMismatches(lexical_cast(parts[31])); - j->setNbMultipleSplicedReads(lexical_cast(parts[32])); - j->setNbUpstreamJunctions(lexical_cast(parts[33])); - j->setNbDownstreamJunctions(lexical_cast(parts[34])); - j->setNbUpstreamFlankingAlignments(lexical_cast(parts[35])); - j->setNbDownstreamFlankingAlignments(lexical_cast(parts[36])); - j->setDistanceToNextUpstreamJunction(lexical_cast(parts[37])); - j->setDistanceToNextDownstreamJunction(lexical_cast(parts[38])); - j->setDistanceToNearestJunction(lexical_cast(parts[39])); + //j->setNbUniquelySplicedReads(lexical_cast(parts[32])); + j->setNbMultipleSplicedReads(lexical_cast(parts[33])); + //j->setNbMultipleSplicedReads(lexical_cast(parts[33])); + j->setNbUpstreamJunctions(lexical_cast(parts[35])); + j->setNbDownstreamJunctions(lexical_cast(parts[36])); + j->setNbUpstreamFlankingAlignments(lexical_cast(parts[37])); + j->setNbDownstreamFlankingAlignments(lexical_cast(parts[38])); + j->setDistanceToNextUpstreamJunction(lexical_cast(parts[39])); + j->setDistanceToNextDownstreamJunction(lexical_cast(parts[40])); + j->setDistanceToNearestJunction(lexical_cast(parts[41])); // Read Junction overhangs - j->setMeanQueryLength(lexical_cast(parts[40])); - j->setSuspicious(lexical_cast(parts[41])); - j->setPotentialFalsePositive(lexical_cast(parts[42])); + j->setMeanQueryLength(lexical_cast(parts[42])); + j->setSuspicious(lexical_cast(parts[43])); + j->setPotentialFalsePositive(lexical_cast(parts[44])); for(size_t i = 0; i < JO_NAMES.size(); i++) { - j->setJunctionOverhangs(i, lexical_cast(parts[43 + i])); + j->setJunctionOverhangs(i, lexical_cast(parts[45 + i])); } return j; } + +double portcullis::Junction::calcCodingPotential(GenomeMapper& gmap, KmerMarkovModel& exon, KmerMarkovModel& intron) const { + + int len = 0; + + const char* ref = this->intron->ref.name.c_str(); + const bool neg = 
getConsensusStrand() == Strand::NEGATIVE; + + string left_exon = gmap.fetchBases(ref, this->intron->start - 82, this->intron->start-2, &len); + if (neg) { + left_exon = SeqUtils::reverseComplement(left_exon); + } + + string left_intron = gmap.fetchBases(ref, this->intron->start, this->intron->start+80, &len); + if (neg) { + left_intron = SeqUtils::reverseComplement(left_intron); + } + + string right_intron = gmap.fetchBases(ref, this->intron->end-80, this->intron->end, &len); + if (neg) { + right_intron = SeqUtils::reverseComplement(right_intron); + } + + string right_exon = gmap.fetchBases(ref, this->intron->end + 1, this->intron->end + 81, &len); + if (neg) { + right_exon = SeqUtils::reverseComplement(right_exon); + } + /* + cout << "Left exon : " << this->consensusStrand << " : " << left_exon << endl; + cout << "Left intron : " << this->consensusStrand << " : " << left_intron << endl; + cout << "Right intron: " << this->consensusStrand << " : " << right_intron << endl; + cout << "Right exon : " << this->consensusStrand << " : " << right_exon << endl; + */ + double score =( exon.getScore(left_exon) - intron.getScore(left_exon) ) + + ( intron.getScore(left_intron) - exon.getScore(left_intron) ) + + ( intron.getScore(right_intron) - exon.getScore(right_intron) ) + + ( exon.getScore(right_exon) - intron.getScore(right_exon) ); + + return score; +} + + +portcullis::SplicingScores portcullis::Junction::calcSplicingScores(GenomeMapper& gmap, KmerMarkovModel& donorT, KmerMarkovModel& donorF, + KmerMarkovModel& acceptorT, KmerMarkovModel& acceptorF, + PosMarkovModel& donorP, PosMarkovModel& acceptorP) const { + + int len = 0; + const char* ref = this->intron->ref.name.c_str(); + const bool neg = getConsensusStrand() == Strand::NEGATIVE; + + string left = gmap.fetchBases(ref, intron->start - 3, intron->start + 20, &len); + if (neg) { + left = SeqUtils::reverseComplement(left); + } + + string right = gmap.fetchBases(ref, intron->end - 20, intron->end + 2, &len); + if (neg) 
{ + right = SeqUtils::reverseComplement(right); + } + + string donorseq = neg ? right : left; + string acceptorseq = neg ? left : right; + + SplicingScores ss; + ss.positionWeighting = donorP.getScore(donorseq) + acceptorP.getScore(acceptorseq); + ss.splicingSignal = (donorT.getScore(donorseq) - donorF.getScore(donorseq)) + + (acceptorT.getScore(acceptorseq) - acceptorF.getScore(acceptorseq)); + return ss; +} diff --git a/lib/src/junction_system.cc b/lib/src/junction_system.cc index 7104d459..6a6c0b2a 100644 --- a/lib/src/junction_system.cc +++ b/lib/src/junction_system.cc @@ -368,19 +368,6 @@ void portcullis::JunctionSystem::calcJunctionStats(bool verbose) { } } -/** - * Used of calculating metric 27. Trimmed Overhang score. This provide a probability - * that this junction is genuine based on comparisons of expected distribution - * of coverage to observed distribution of coverage, after reads have been trimmed - * to the first mismatch up and downstream of the junction. An L1 regularized - * logistic regression is then fitted to - * - * This metric similar to this is used in FineSplice. - */ -void portcullis::JunctionSystem::calcTrimmedOverhangScore() { - - -} void portcullis::JunctionSystem::sort() { diff --git a/lib/src/markov_model.cc b/lib/src/markov_model.cc new file mode 100644 index 00000000..2e7828d7 --- /dev/null +++ b/lib/src/markov_model.cc @@ -0,0 +1,101 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#include +using std::cout; +using std::endl; + +#include +#include + +#include +using portcullis::SeqUtils; + +#include + +void portcullis::KmerMarkovModel::train(const vector& input, const uint32_t _order) { + order = _order; + KMMU temp; + for(auto& seq : input) { + string s = SeqUtils::makeClean(seq); + if (s.size() > order+1) { + for(uint16_t i = order; i < s.size(); i++) { + temp[s.substr(i-order, order)][s.substr(i, 1)]++; + } + } + } + model.clear(); + for(auto& i : temp) { + double sum = 0; + for(auto& j : i.second) { + sum += j.second; + } + for(auto& j : i.second) { + //cout << i.first << " " << j.first << " " << j.second << endl; + model[i.first][j.first] = j.second / sum; + } + } +} + + +double portcullis::KmerMarkovModel::getScore(const string& seq) { + string s = SeqUtils::makeClean(seq); + double score = 1.0; + for(uint16_t i = order; i < s.size(); i++){ + score *= model[s.substr(i-order, order)][s.substr(i, 1)]; + } + if(score == 0.0) { + return -100.0; + } + return log(score); +} + +void portcullis::PosMarkovModel::train(const vector& input, const uint32_t _order) { + order = _order; + PMMU temp; + for(auto& seq : input) { + string s = SeqUtils::makeClean(seq); + for(uint16_t i = order; i < s.size(); i++) { + temp[i][s.substr(i, 1)]++; + } + } + model.clear(); + for(auto& i : temp) { + double sum = 0; + for(auto& j : i.second) { + sum += j.second; + } + for(auto& j : i.second) { + //cout << i.first << " " << j.first << " " << j.second << endl; + model[i.first][j.first] = j.second / sum; + } + } +} + + +double portcullis::PosMarkovModel::getScore(const string& seq) { + string s = SeqUtils::makeClean(seq); + double score = 1.0; + for(uint16_t i = order; i < s.size(); i++){ + score *= model[i][s.substr(i, 
1)]; + } + if(score == 0.0) { + return -300.0; + } + return log(score); +} + diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc new file mode 100644 index 00000000..902c7813 --- /dev/null +++ b/lib/src/model_features.cc @@ -0,0 +1,303 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#include +#include +#include +using std::cout; +using std::cerr; +using std::endl; +using std::ofstream; +using std::make_shared; + +#include +#include +#include + +#include +using portcullis::Junction; + +#include + + + +void portcullis::ModelFeatures::initGenomeMapper(const path& genomeFile) { + + // Initialise + gmap.setGenomeFile(genomeFile); + + // Load the fasta index + gmap.loadFastaIndex(); +} + +uint32_t portcullis::ModelFeatures::calcIntronThreshold(const JunctionList& juncs) { + + vector intron_sizes; + for(auto& j : juncs) { + intron_sizes.push_back(j->getIntronSize()); + } + + std::sort(intron_sizes.begin(), intron_sizes.end()); + + L95 = intron_sizes[intron_sizes.size() * 0.95]; + + return L95; +} + +void portcullis::ModelFeatures::trainCodingPotentialModel(const JunctionList& in) { + + vector exons; + vector introns; + for(auto& j : in) { + + int len = 0; + + string left_exon = 
gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start - 202, j->getIntron()->start - 2, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + left_exon = SeqUtils::reverseComplement(left_exon); + } + exons.push_back(left_exon); + + /*string left_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->start+80, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + left_intron = SeqUtils::reverseComplement(left_intron); + } + introns.push_back(left_intron); + + string right_intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end-80, j->getIntron()->end, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + right_intron = SeqUtils::reverseComplement(right_intron); + } + introns.push_back(right_intron);*/ + + string intron = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start, j->getIntron()->end, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + intron = SeqUtils::reverseComplement(intron); + } + introns.push_back(intron); + + + string right_exon = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end + 1, j->getIntron()->end + 201, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + right_exon = SeqUtils::reverseComplement(right_exon); + } + exons.push_back(right_exon); + } + + exonModel.train(exons, 5); + intronModel.train(introns, 5); +} + +void portcullis::ModelFeatures::trainSplicingModels(const JunctionList& pass, const JunctionList& fail) { + + vector donors; + vector acceptors; + for(auto& j : pass) { + + int len = 0; + + string left = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start - 3, j->getIntron()->start + 20, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + left = SeqUtils::reverseComplement(left); + } + + string right = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end - 20, j->getIntron()->end + 2, &len); + if (j->getConsensusStrand() 
== Strand::NEGATIVE) { + right = SeqUtils::reverseComplement(right); + } + + if (j->getConsensusStrand() == Strand::NEGATIVE) { + donors.push_back(right); + acceptors.push_back(left); + } + else { + donors.push_back(left); + acceptors.push_back(right); + } + + } + + donorPWModel.train(donors, 1); + acceptorPWModel.train(acceptors, 1); + donorTModel.train(donors, 5); + acceptorTModel.train(acceptors, 5); + + donors.clear(); + acceptors.clear(); + + for(auto& j : fail) { + + int len = 0; + + string left = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->start - 3, j->getIntron()->start + 20, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + left = SeqUtils::reverseComplement(left); + } + + string right = gmap.fetchBases(j->getIntron()->ref.name.c_str(), j->getIntron()->end - 20, j->getIntron()->end + 2, &len); + if (j->getConsensusStrand() == Strand::NEGATIVE) { + right = SeqUtils::reverseComplement(right); + } + + if (j->getConsensusStrand() == Strand::NEGATIVE) { + donors.push_back(right); + acceptors.push_back(left); + } + else { + donors.push_back(left); + acceptors.push_back(right); + } + } + + donorFModel.train(donors, 5); + acceptorFModel.train(acceptors, 5); +} + +Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { + + vector headers; + for(auto& f : features) { + if (f.active) { + headers.push_back(f.name); + } + } + + // Convert junction list info to double* + double* d = new double[x.size() * headers.size()]; + + uint32_t row = 0; + for (const auto& j : x) { + SplicingScores ss = j->calcSplicingScores(gmap, donorTModel, donorFModel, acceptorTModel, acceptorFModel, + donorPWModel, acceptorPWModel); + + d[0 * x.size() + row] = j->isGenuine(); + + uint16_t i = 1; + + if (features[1].active) { + d[i++ * x.size() + row] = j->getNbUniquelySplicedReads(); + } + if (features[2].active) { + d[i++ * x.size() + row] = j->getNbDistinctAlignments(); + } + if (features[3].active) { + d[i++ * x.size() + row] = 
j->getNbReliableAlignments(); + } + if (features[4].active) { + d[i++ * x.size() + row] = j->getEntropy(); + } + if (features[5].active) { + d[i++ * x.size() + row] = j->getReliable2RawRatio(); + } + if (features[6].active) { + d[i++ * x.size() + row] = j->getMaxMinAnchor(); + } + if (features[7].active) { + d[i++ * x.size() + row] = j->getMaxMMES(); + } + if (features[8].active) { + d[i++ * x.size() + row] = j->getMeanMismatches(); + } + if (features[9].active) { + d[i++ * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); + } + if (features[10].active) { + d[i++ * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); + } + if (features[11].active) { + d[i++ * x.size() + row] = isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); + } + if (features[12].active) { + d[i++ * x.size() + row] = isPWModelEmpty() ? 0.0 : ss.positionWeighting; + } + if (features[13].active) { + d[i++ * x.size() + row] = isPWModelEmpty() ? 
0.0 : ss.splicingSignal; + } + + + //Junction overhang values at each position are first converted into deviation from expected distributions + for(size_t joi = 0; joi < JO_NAMES.size(); joi++) { + if (features[joi + 14].active) { + d[i++ * x.size() + row] = j->getJunctionOverhangLogDeviation(joi); + } + } + + row++; + } + + Data* data = new DataDouble(d, headers, x.size(), headers.size()); + data->setExternalData(false); // This causes 'd' to be deleted when 'data' is deleted + return data; +} + + + +portcullis::ForestPtr portcullis::ModelFeatures::trainInstance(const JunctionList& x, + string outputPrefix, uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose) { + + cout << "Creating feature vector" << endl; + Data* trainingData = juncs2FeatureVectors(x); + + path feature_file = outputPrefix + ".features"; + cout << "Saving feature vector to disk: " << feature_file << endl; + + ofstream fout(feature_file.c_str(), std::ofstream::out); + + fout << Intron::locationOutputHeader() << "\t" << trainingData->getHeader() << endl; + for(size_t i = 0; i < x.size(); i++) { + fout << *(x[i]->getIntron()) << "\t" << trainingData->getRow(i) << endl; + } + + fout.close(); + + cout << "Initialising random forest" << endl; + ForestPtr f = nullptr; + if (probabilityMode) { + f = make_shared(); + } + else { + f = make_shared(); + } + + vector catVars; + + f->init( + "Genuine", // Dependant variable name + MEM_DOUBLE, // Memory mode + trainingData, // Data object + 0, // M Try (0 == use default) + outputPrefix, // Output prefix + 250, //trees, // Number of trees + 1236456789, // Use fixed seed to avoid non-deterministic behaviour as much as possible + threads, // Number of threads + IMP_GINI, // Importance measure + probabilityMode ? 
DEFAULT_MIN_NODE_SIZE_PROBABILITY : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size + "", // Status var name + false, // Prediction mode + true, // Replace + catVars, // Unordered categorical variable names (vector) + false, // Memory saving + DEFAULT_SPLITRULE, // Split rule + false, // predall + 1.0); // Sample fraction + + cout << "Training" << endl; + f->setVerboseOut(&cerr); + f->run(verbose); + + return f; +} \ No newline at end of file diff --git a/lib/src/rule_parser.cc b/lib/src/rule_parser.cc index 0c9a0380..71f42b80 100644 --- a/lib/src/rule_parser.cc +++ b/lib/src/rule_parser.cc @@ -219,23 +219,26 @@ double portcullis::eval::getNumericFromJunc(const var& fullname) const { case 18: return junc->getMeanMismatches(); case 19: - return junc->getNbMultipleSplicedReads(); + return junc->getNbUniquelySplicedReads(); case 20: - return junc->getNbUpstreamJunctions(); + return junc->getNbMultipleSplicedReads(); case 21: - return junc->getNbDownstreamJunctions(); + return junc->getReliable2RawRatio(); case 22: - return junc->getNbUpstreamFlankingAlignments(); + return junc->getNbUpstreamJunctions(); case 23: + return junc->getNbDownstreamJunctions(); + case 24: + return junc->getNbUpstreamFlankingAlignments(); + case 25: return junc->getNbDownstreamFlankingAlignments(); - } if (boost::iequals(name, "Suspect")) { return junc->isSuspicious(); } else if (boost::iequals(name, "PFP")) { - return junc->getSpliceSiteType(); + return junc->isPotentialFalsePositive(); } @@ -257,7 +260,7 @@ string portcullis::eval::getStringFromJunc(const var& fullname) const { } BOOST_THROW_EXCEPTION(RuleParserException() << RuleParserErrorInfo(string( - "Unrecognised param: ") + name)); + "Unrecognised param: ") + name)); } bool portcullis::eval::evalNumberLeaf(Operator op, double threshold, double value) const { diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 2e3bd4f6..b3e78942 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -49,9 +49,10 @@ 
namespace po = boost::program_options; #include -#include +#include #include +#include #include #include #include @@ -63,15 +64,14 @@ using portcullis::Intron; using portcullis::IntronHasher; using portcullis::Performance; using portcullis::eval; - -#include "train.hpp" -using portcullis::Train; +using portcullis::bam::GenomeMapper; #include "junction_filter.hpp" portcullis::JunctionFilter::JunctionFilter( const path& _junctionFile, const path& _output) { junctionFile = _junctionFile; + genomeFile = ""; modelFile = ""; genuineFile = ""; output = _output; @@ -85,6 +85,7 @@ portcullis::JunctionFilter::JunctionFilter( const path& _junctionFile, filterNovel = false; source = DEFAULT_FILTER_SOURCE; verbose = false; + threshold = DEFAULT_FILTER_THRESHOLD; } @@ -196,101 +197,114 @@ void portcullis::JunctionFilter::filter() { currentJuncs.push_back(j); } + // To be overridden if we are training + ModelFeatures mf; + mf.initGenomeMapper(this->genomeFile); + + mf.features[1].active=false; // NB USRS (BAD) + mf.features[2].active=false; // NB DISTRS (BAD) + //mf.features[3].active=false; // NB RELRS (GOOD) + mf.features[4].active = false; // ENTROPY (BAD - JO LOGDEV ARE BETTER) + //mf.features[5].active = false; // REL2RAW (GOOD) + mf.features[6].active=false; // MAXMINANC (BAD - MAXMMES IS BETTER) + //mf.features[7].active=false; // MAXMMES (GOOD) + //mf.features[8].active=false; // MEAN MISMATCH (GOOD) + //mf.features[9].active=false; // INTRON (GOOD) + //mf.features[10].active=false; // MIN_HAMM (GOOD) + mf.features[11].active=false; // CODING POTENTIAL (BAD) + //mf.features[12].active=false; // POS WEIGHTS (GOOD) + //mf.features[13].active=false; // SPLICE SIGNAL (GOOD) + /*mf.features[14].active=false; // JO LOGDEV FEATURES BETTER THAN ENTROPY + mf.features[15].active=false; + mf.features[16].active=false; + mf.features[17].active=false; + mf.features[18].active=false; + mf.features[19].active=false; + mf.features[20].active=false; + mf.features[21].active=false; + 
mf.features[22].active=false; + mf.features[23].active=false; + mf.features[24].active=false; + mf.features[25].active=false; + mf.features[26].active=false; + mf.features[27].active=false; + mf.features[28].active=false; + mf.features[29].active=false;*/ + + + + if (train) { - JunctionList initialSet; - uint32_t pos = 0; - uint32_t neg = 0; + // The initial positive and negative sets + JunctionList pos, unlabelled, neg; - cout << "Self training mode activated. Collecting initial positive and negative sets from input." << endl; + cout << "Self training mode activated." << endl << endl; - JunctionList passJuncs; - JunctionList failJuncs; - JuncResultMap resultMap; - - doRuleBasedFiltering(this->getIntitalPosRulesFile(), currentJuncs, passJuncs, failJuncs, "Creating initial positive set for training", resultMap); + createPositiveSet(currentJuncs, pos, unlabelled, mf); - cout << "Saving initial positive set:" << endl; - JunctionSystem isp(passJuncs); - isp.saveAll(output.string() + ".selftrain.initialset.pos", "portcullis_isp"); - pos = passJuncs.size(); + createNegativeSet(mf.L95, unlabelled, neg); - for(auto& j : passJuncs) { - initialSet.push_back(j); - } - if (!genuineFile.empty()) { - shared_ptr p = calcPerformance(passJuncs, failJuncs); - cout << "Performance of initial positive set (High PPV is important):" << endl; - cout << Performance::longHeader() << endl; - cout << p->toLongString() << endl << endl; - - JunctionSystem invalidPos; - for(auto& j : passJuncs) { - if (!j->isGenuine()) { - invalidPos.addJunction(j); - } - } - JunctionSystem missedPos; - for(auto& j : failJuncs) { - if (j->isGenuine()) { - missedPos.addJunction(j); - } - } - - cout << "Saving invalid junctions in initial positive set to disk:" << endl; - invalidPos.saveAll(output.string() + ".selftrain.initialset.invalidpos", "portcullis_invalid_isp"); - - cout << "Saving missed positive junctions to disk:" << endl; - missedPos.saveAll(output.string() + ".selftrain.initialset.missedpos", 
"portcullis_missed_isp"); - } - + cout << "Initial training set consists of " << pos.size() << " positive and " << neg.size() << " negative junctions." << endl << endl; - passJuncs.clear(); - failJuncs.clear(); - resultMap.clear(); - - doRuleBasedFiltering(this->getIntitalNegRulesFile(), currentJuncs, passJuncs, failJuncs, "Creating initial negative set for training", resultMap); + cout << "Training markov models ..."; + cout.flush(); + mf.trainCodingPotentialModel(pos); + mf.trainSplicingModels(pos, neg); + cout << " done." << endl << endl; - cout << "Saving initial negative set:" << endl; - JunctionSystem isn(passJuncs); - isn.saveAll(output.string() + ".selftrain.initialset.neg", "portcullis_isn"); - neg = passJuncs.size(); - for(auto& j : passJuncs) { - initialSet.push_back(j); - } - if (!genuineFile.empty()) { - shared_ptr p = calcPerformance(passJuncs, failJuncs, true); - cout << "Performance of initial negative set (High NPV is important):" << endl; - cout << Performance::longHeader() << endl; - cout << p->toLongString() << endl << endl; - - JunctionSystem invalidNeg; - JunctionSystem missedNeg; - - for(auto& j : passJuncs) { - if (j->isGenuine()) { - invalidNeg.addJunction(j); + // Build the training set by combining the positive and negative sets + JunctionList training; + training.reserve(pos.size() + neg.size()); + training.insert(training.end(), pos.begin(), pos.end()); + training.insert(training.end(), neg.begin(), neg.end()); + + JunctionSystem trainingSystem(training); + trainingSystem.sort(); + + cout << "Training Random Forest" << endl + << "----------------------" << endl << endl; + bool done = false; + shared_ptr forest = nullptr; + while(!done) { + + forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); + const vector importance = forest->getVariableImportance(); + mf.resetActiveFeatureIndex(); + uint16_t min_importance_idx = 10000; + double min_importance_val = 
100000.0; + uint16_t min_feature_idx = 10000; + // This approach to feature selection (i.e. removing worst features by importance) + // doesn't appear to have much impact on final results and is computationally + // expensive. Removing for now. + /*for(size_t i = 0; i < importance.size(); i++) { + double imp = importance[i]; + int16_t j = mf.getNextActiveFeatureIndex(); + if (imp < 0.1 && imp < min_importance_val) { + min_importance_idx = i; + min_importance_val = imp; + min_feature_idx = j; } + }*/ + if (min_importance_idx != 10000) { + mf.features[min_feature_idx].active = false; + cout << "Deactivating feature: " << mf.features[min_feature_idx].name << " - " << min_importance_val << endl; } - for(auto& j : failJuncs) { - if (!j->isGenuine()) { - missedNeg.addJunction(j); - } + else { + done = true; } - - cout << "Saving genuine valid junctions in initial negative set to disk:" << endl; - invalidNeg.saveAll(output.string() + ".selftrain.initialset.invalidneg", "portcullis_invalid_isn"); - - cout << "Saving missed negative junctions to disk:" << endl; - missedNeg.saveAll(output.string() + ".selftrain.initialset.missedneg", "portcullis_missed_isn"); } - - cout << "Initial training set consists of " << pos << " positive and " << neg << " negative junctions." 
<< endl << endl; - cout << "Training a random forest model using the initial positive and negative datasets" << endl; - shared_ptr forest = Train::trainInstance(initialSet, output.string() + ".selftrain", DEFAULT_TRAIN_TREES, threads, true, true); + const vector importance = forest->getVariableImportance(); + bool foundIrrelevant = false; + mf.resetActiveFeatureIndex(); + cout << "Active features remaining:" << endl; + for(auto& i : importance) { + int16_t j = mf.getNextActiveFeatureIndex(); + cout << mf.features[j].name << " - " << i << endl; + } forest->saveToFile(); forest->writeOutput(&cout); @@ -298,22 +312,21 @@ void portcullis::JunctionFilter::filter() { modelFile = output.string() + ".selftrain.forest"; cout << endl; } - - - + // Manage a junction system of all discarded junctions JunctionSystem discardedJuncs; // Do ML based filtering if requested if(!modelFile.empty() && exists(modelFile)){ - cout << "Predicting valid junctions using random forest model" << endl << endl; + cout << "Predicting valid junctions using random forest model" << endl + << "----------------------------------------------------" << endl << endl; JunctionList passJuncs; JunctionList failJuncs; - forestPredict(currentJuncs, passJuncs, failJuncs); + forestPredict(currentJuncs, passJuncs, failJuncs, mf); - printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Random Forest filtering")); + printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Random Forest filtering results")); // Reset currentJuncs currentJuncs.clear(); @@ -387,7 +400,7 @@ void portcullis::JunctionFilter::filter() { } } - printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Post filtering (length and/or canonical)")); + printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Post filtering (length and/or canonical) results")); // Reset currentJuncs currentJuncs.clear(); @@ -396,6 +409,8 @@ void portcullis::JunctionFilter::filter() { } } + cout << endl; + JunctionSystem 
filteredJuncs; JunctionSystem refKeptJuncs; @@ -404,7 +419,7 @@ void portcullis::JunctionFilter::filter() { } else { - cout << "Recalculating junction grouping and distance stats based on new junction list that passed filters ..."; + cout << "Recalculating junction grouping and distance stats based on new junction list that passed filters ..."; cout.flush(); for(auto& j : currentJuncs) { @@ -434,7 +449,7 @@ void portcullis::JunctionFilter::filter() { discardedJuncs.getJunctions(), string("Overall results")); - cout << "Saving junctions passing filter to disk:" << endl; + cout << endl << "Saving junctions passing filter to disk:" << endl; filteredJuncs.saveAll(outputDir.string() + "/" + outputPrefix + ".pass", source + "_pass"); @@ -450,12 +465,224 @@ void portcullis::JunctionFilter::filter() { } } } + +void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, JunctionList& pos, JunctionList& unlabelled, ModelFeatures& mf) { + JuncResultMap resultMap; + + cout << "Creating initial positive set for training" << endl + << "------------------------------------------" << endl << endl + << "Applying a set of rule-based filters in " << dataDir.string() << " to create initial positive set." 
<< endl << endl; + + if (!genuineFile.empty()) { + cout << "Performance of each positive filter layer (Low FPs is important):" << endl; + } + cout << "LAYER\t"; + if (!genuineFile.empty()) { + cout << Performance::longHeader(); + } + + JunctionList p1, p2, p3; + + cout << endl << "1\t"; + RuleFilter::filter(this->getIntitalPosRulesFile(1), all, p1, unlabelled, "Creating initial positive set for training", resultMap); + if (!genuineFile.empty()) { + cout << calcPerformance(p1, unlabelled)->toLongString(); + } + cout << endl << "2\t"; + RuleFilter::filter(this->getIntitalPosRulesFile(2), p1, p2, unlabelled, "Creating initial positive set for training", resultMap); + if (!genuineFile.empty()) { + cout << calcPerformance(p2, unlabelled)->toLongString(); + } + cout << endl << "3\t"; + RuleFilter::filter(this->getIntitalPosRulesFile(3), p2, p3, unlabelled, "Creating initial positive set for training", resultMap); + if (!genuineFile.empty()) { + cout << calcPerformance(p3, unlabelled)->toLongString(); + } + cout << endl << "L95x1.5\t"; + + const uint32_t L95 = mf.calcIntronThreshold(p2); + const uint32_t pos_length_limit = L95 * 1.5; + + JunctionList passJuncs; + for(auto& j : p3) { + if (j->getIntronSize() <= pos_length_limit) { + passJuncs.push_back(j); + } + else { + unlabelled.push_back(j); + } + } + if (!genuineFile.empty()) { + cout << calcPerformance(passJuncs, unlabelled)->toLongString(); + } + + cout << endl << endl << "Found " << passJuncs.size() << " junctions for the positive set" << endl << endl; + + // Analyse positive set to get L0.05 of intron size + cout << "Length of intron at 95th percentile of positive set (L95): " << mf.calcIntronThreshold(passJuncs) << endl << endl; + + + cout << "Saving initial positive set:" << endl; + JunctionSystem isp(passJuncs); + isp.saveAll(output.string() + ".selftrain.initialset.pos", "portcullis_isp"); + + + for(auto& j : passJuncs) { + JunctionPtr copy = make_shared(*j); + copy->setGenuine(true); + pos.push_back(copy); + 
} + if (!genuineFile.empty()) { + + JunctionSystem invalidPos; + for(auto& j : passJuncs) { + if (!j->isGenuine()) { + invalidPos.addJunction(j); + } + } + JunctionSystem missedPos; + for(auto& j : unlabelled) { + if (j->isGenuine()) { + missedPos.addJunction(j); + } + } + + cout << "Saving invalid junctions in initial positive set to disk:" << endl; + invalidPos.saveAll(output.string() + ".selftrain.initialset.invalidpos", "portcullis_invalid_isp"); + + cout << "Saving missed positive junctions to disk:" << endl; + missedPos.saveAll(output.string() + ".selftrain.initialset.missedpos", "portcullis_missed_isp"); + } +} + +void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg) { + + JuncResultMap resultMap; + + cout << "Creating initial negative set for training" << endl + << "------------------------------------------" << endl << endl + << "Applying a set of rule-based filters in " << dataDir.string() << " to create initial negative set." 
<< endl << endl; + + if (!genuineFile.empty()) { + cout << "Performance of each negative filter layer (Low FNs is important):" << endl; + } + cout << "LAYER\t"; + if (!genuineFile.empty()) { + cout << Performance::longHeader(); + } + + JunctionList p1, p2, p3, p4, p5, p6, p7, p8, f1, f2, f3, f4, f5, f6, f7, f8; + cout << endl << "1\t"; + RuleFilter::filter(this->getIntitalNegRulesFile(1), all, p1, f1, "Creating initial negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << calcPerformance(p1, f1, true)->toLongString(); + } + cout << endl << "2\t"; + RuleFilter::filter(this->getIntitalNegRulesFile(2), f1, p2, f2, "Creating initial negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << calcPerformance(p2, f2, true)->toLongString(); + } + cout << endl << "3\t"; + RuleFilter::filter(this->getIntitalNegRulesFile(3), f2, p3, f3, "Creating initial negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << calcPerformance(p3, f3, true)->toLongString(); + } + cout << endl << "4\t"; + RuleFilter::filter(this->getIntitalNegRulesFile(4), f3, p4, f4, "Creating initial negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << calcPerformance(p4, f4, true)->toLongString(); + } + cout << endl << "5\t"; + RuleFilter::filter(this->getIntitalNegRulesFile(5), f4, p5, f5, "Creating initial negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << calcPerformance(p5, f5, true)->toLongString(); + } + cout << endl << "6\t"; + RuleFilter::filter(this->getIntitalNegRulesFile(6), f5, p6, f6, "Creating initial negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << calcPerformance(p6, f6, true)->toLongString(); + } + cout << endl << "7\t"; + RuleFilter::filter(this->getIntitalNegRulesFile(7), f6, p7, f7, "Creating initial negative set for training", resultMap); + if (!genuineFile.empty()) { + cout << calcPerformance(p7, f7, true)->toLongString(); + } + 
cout << endl << "L95x5\t"; + + JunctionList passJuncs, failJuncs; + const uint32_t L95x5 = L95 * 5; + for(auto& j : f7) { + if (j->getIntronSize() > L95x5 && j->getMaxMMES() < 10 ) { + p7.push_back(j); + } + else { + failJuncs.push_back(j); + } + } + if (!genuineFile.empty()) { + cout << calcPerformance(p8, failJuncs, true)->toLongString(); + } + + cout << endl << endl << "Concatenating negatives from each layer to create negative set" << endl << endl; + + passJuncs.insert(passJuncs.end(), p1.begin(), p1.end()); + passJuncs.insert(passJuncs.end(), p2.begin(), p2.end()); + passJuncs.insert(passJuncs.end(), p3.begin(), p3.end()); + passJuncs.insert(passJuncs.end(), p4.begin(), p4.end()); + passJuncs.insert(passJuncs.end(), p5.begin(), p5.end()); + passJuncs.insert(passJuncs.end(), p6.begin(), p6.end()); + passJuncs.insert(passJuncs.end(), p7.begin(), p7.end()); + passJuncs.insert(passJuncs.end(), p8.begin(), p8.end()); + + // This will remove any duplicates + JunctionSystem isn(passJuncs); + + if (!genuineFile.empty()) { + cout << "Final\t" << calcPerformance(isn.getJunctions(), failJuncs, true)->toLongString() << endl << endl; + } + + cout << "Found " << isn.getJunctions().size() << " junctions for the negative set" << endl << endl; + + cout << endl << "Saving initial negative set:" << endl; + isn.saveAll(output.string() + ".selftrain.initialset.neg", "portcullis_isn"); + + for(auto& j : isn.getJunctions()) { + JunctionPtr copy = make_shared(*j); + copy->setGenuine(false); + neg.push_back(copy); + } + + if (!genuineFile.empty()) { + + JunctionSystem invalidNeg; + JunctionSystem missedNeg; + + for(auto& j : isn.getJunctions()) { + if (j->isGenuine()) { + invalidNeg.addJunction(j); + } + } + for(auto& j : failJuncs) { + if (!j->isGenuine()) { + missedNeg.addJunction(j); + } + } + + cout << "Saving genuine valid junctions in initial negative set to disk:" << endl; + invalidNeg.saveAll(output.string() + ".selftrain.initialset.invalidneg", "portcullis_invalid_isn"); + + 
cout << "Saving missed negative junctions to disk:" << endl; + missedNeg.saveAll(output.string() + ".selftrain.initialset.missedneg", "portcullis_missed_isn"); + } +} + + void portcullis::JunctionFilter::printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix) { // Output stats size_t diff = in.size() - pass.size(); - cout << prefix << endl + cout << endl << prefix << endl << "-------------------------" << endl << "Input contained " << in.size() << " junctions." << endl << "Output contains " << pass.size() << " junctions." << endl @@ -465,7 +692,7 @@ void portcullis::JunctionFilter::printFilteringResults(const JunctionList& in, c shared_ptr p = calcPerformance(pass, fail); cout << Performance::longHeader() << endl; cout << p->toLongString() << endl << endl; - } + } } shared_ptr portcullis::JunctionFilter::calcPerformance(const JunctionList& pass, const JunctionList& fail, bool invert) { @@ -512,23 +739,17 @@ void portcullis::JunctionFilter::doRuleBasedFiltering(const path& ruleFile, cons } -void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail) { +void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, ModelFeatures& mf) { - if (verbose) { - cerr << endl << "Preparing junction metrics into matrix" << endl; - } + cout << "Creating feature vector" << endl; - Data* testingData = Train::juncs2FeatureVectors(all); + Data* testingData = mf.juncs2FeatureVectors(all); - // Load forest from disk - - if (verbose) { - cerr << "Initialising random forest" << endl; - } + cout << "Initialising random forest" << endl; shared_ptr f = nullptr; if (train) { - f = make_shared(); + f = make_shared(); } else { f = make_shared(); @@ -542,11 +763,12 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction testingData, // Data object 0, // M Try (0 == use default) "", // Output prefix - 
DEFAULT_TRAIN_TREES, // Number of trees (will be overwritten when loading the model) - DEFAULT_SEED, + 250, //DEFAULT_SELFTRAIN_TREES, // Number of trees (will be overwritten when loading the model) + 1234567890, // Seed for random generator threads, // Number of threads IMP_GINI, // Importance measure - train ? DEFAULT_MIN_NODE_SIZE_REGRESSION : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size + train ? DEFAULT_MIN_NODE_SIZE_PROBABILITY : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size + //DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, "", // Status var name true, // Prediction mode true, // Replace @@ -558,46 +780,100 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction f->setVerboseOut(&cerr); - if (verbose) { - cerr << "Loading forest model from disk" << endl; - } - // Load trees from saved model f->loadFromFile(modelFile.string()); - if (verbose) { - cerr << "Making predictions" << endl; - } + cout << "Making predictions" << endl; f->run(verbose); - if (verbose) { - cerr << "Separating original junction data into pass and fail categories" << endl; + path scorepath = output.string() + ".scores"; + ofstream scoreStream(scorepath.c_str()); + + scoreStream << "Score\t" << Intron::locationOutputHeader() << "\tStrand\tSS\t" << testingData->getHeader() << endl; + + for(size_t i = 0; i < all.size(); i++) { + + scoreStream << f->getPredictions()[i][0] << "\t" << *(all[i]->getIntron()) + << "\t" << strandToChar(all[i]->getConsensusStrand()) + << "\t" << cssToChar(all[i]->getSpliceSiteType()) + << "\t" << testingData->getRow(i) << endl; } + + scoreStream.close(); + + + if (!genuineFile.empty() && exists(genuineFile)) { + vector thresholds; + for(double i = 0.0; i <= 1.0; i += 0.01) { + thresholds.push_back(i); + } + double max_mcc = 0.0; + double max_f1 = 0.0; + double best_t_mcc = 0.0; + double best_t_f1 = 0.0; + + cout << "Threshold\t" << Performance::longHeader() << endl; + for(auto& t : thresholds) { + JunctionList pjl; + 
JunctionList fjl; + categorise(f, all, pjl, fjl, t); + shared_ptr perf = calcPerformance(pjl, fjl); + double mcc = perf->getMCC(); + double f1 = perf->getF1Score(); + cout << t << "\t" << perf->toLongString() << endl; + if (mcc > max_mcc) { + max_mcc = mcc; + best_t_mcc = t; + } + if (f1 > max_f1) { + max_f1 = f1; + best_t_f1 = t; + } + } + + cout << "The best F1 score of " << max_f1 << " is achieved with threshold set at " << best_t_f1 << endl; + cout << "The best MCC score of " << max_mcc << " is achieved with threshold set at " << best_t_mcc << endl; + cout << "Actual threshold set at " << threshold << endl; + categorise(f, all, pass, fail, best_t_f1); + } + else { + categorise(f, all, pass, fail, threshold); + } + + cout << "Saved junction scores to: " << scorepath << endl; + + + delete testingData; +} + +void portcullis::JunctionFilter::categorise(shared_ptr f, const JunctionList& all, JunctionList& pass, JunctionList& fail, double t) { + for(size_t i = 0; i < all.size(); i++) { - if (f->getPredictions()[i][0] >= 0.5) { + if (f->getPredictions()[i][0] <= t) { pass.push_back(all[i]); } else { fail.push_back(all[i]); } } - - delete testingData; } int portcullis::JunctionFilter::main(int argc, char *argv[]) { path junctionFile; + path genomeFile; path modelFile; path genuineFile; path filterFile; path referenceFile; path output; uint16_t threads; + bool no_ml; bool saveBad; int32_t max_length; string canonical; string source; + double threshold; bool verbose; bool help; @@ -618,6 +894,8 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { "If you have a list of line separated boolean values in a file, indicating whether each junction in your input is genuine or not, then we can use that information here to gauge the accuracy of the predictions.") ("reference,r", po::value(&referenceFile), "Reference annotation of junctions in BED format. 
Any junctions found by the junction analysis tool will be preserved if found in this reference file regardless of any other filtering criteria. If you need to convert a reference annotation from GTF or GFF to BED format portcullis contains scripts for this.") + ("no_ml,n", po::bool_switch(&no_ml)->default_value(false), + "Disables machine learning filtering") ("save_bad,b", po::bool_switch(&saveBad)->default_value(false), "Saves bad junctions (i.e. junctions that fail the filter), as well as good junctions (those that pass)") ("source", po::value(&source)->default_value(DEFAULT_FILTER_SOURCE), @@ -637,11 +915,15 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { // in config file, but will not be shown to the user. po::options_description hidden_options("Hidden options"); hidden_options.add_options() - ("junction-file", po::value(&junctionFile), "Path to the junction file to process.") + ("genome-file", po::value(&genomeFile), "Path to the genome file to process.") + ("junction-file", po::value(&junctionFile), "Path to the junction file to process.") + ("threshold", po::value(&threshold)->default_value(DEFAULT_FILTER_THRESHOLD), + "The threshold score at which we determine a junction to be genuine or not.") ; // Positional option for the input bam file po::positional_options_description p; + p.add("genome-file", 1); p.add("junction-file", 1); @@ -679,14 +961,18 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { // Only set the filter rules if specified. 
filter.setFilterFile(filterFile); filter.setGenuineFile(genuineFile); - if (modelFile.empty()) { + if (modelFile.empty() && !no_ml) { filter.setTrain(true); } else { filter.setTrain(false); - filter.setModelFile(modelFile); + if (!no_ml) { + filter.setModelFile(modelFile); + } } filter.setReferenceFile(referenceFile); + filter.setGenomeFile(genomeFile); + filter.setThreshold(threshold); filter.filter(); diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index 12642bcd..d21b14e2 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -46,16 +46,20 @@ using boost::filesystem::create_symlink; using boost::filesystem::create_directory; using boost::filesystem::symbolic_link_exists; +#include #include #include #include #include #include +#include +using portcullis::bam::GenomeMapper; using portcullis::PortcullisFS; using portcullis::Intron; using portcullis::IntronHasher; using portcullis::Performance; using portcullis::JuncResultMap; +using portcullis::ModelFeatures; namespace portcullis { @@ -68,9 +72,11 @@ const string DEFAULT_FILTER_OUTPUT = "portcullis_filter/portcullis"; const string DEFAULT_FILTER_SOURCE = "portcullis"; const string DEFAULT_FILTER_RULE_FILE = "default_filter.json"; const string DEFAULT_FILTER_MODEL_FILE = "default_model.forest"; -const string ST_IPOS_RULES_FILE = "selftrain_initial_pos.json"; -const string ST_INEG_RULES_FILE = "selftrain_initial_neg.json"; +const string ST_IPOS_RULES_FILE = "selftrain_initial_pos"; +const string ST_INEG_RULES_FILE = "selftrain_initial_neg"; const uint16_t DEFAULT_FILTER_THREADS = 1; +const uint16_t DEFAULT_SELFTRAIN_TREES = 100; +const double DEFAULT_FILTER_THRESHOLD = 0.625; class JunctionFilter { @@ -78,6 +84,7 @@ class JunctionFilter { private: path junctionFile; + path genomeFile; path modelFile; path filterFile; path genuineFile; @@ -91,6 +98,7 @@ class JunctionFilter { bool filterSemi; bool filterNovel; string source; + double threshold; bool verbose; @@ -124,6 +132,23 @@ class 
JunctionFilter { void setJunctionFile(path junctionFile) { this->junctionFile = junctionFile; } + + path getGenomeFile() const { + return genomeFile; + } + + void setGenomeFile(path genomeFile) { + this->genomeFile = genomeFile; + } + + double getThreshold() const { + return threshold; + } + + void setThreshold(double threshold) { + this->threshold = threshold; + } + path getOutput() const { return output; @@ -248,12 +273,12 @@ class JunctionFilter { this->maxLength = maxLength; } - path getIntitalPosRulesFile() const { - return path(dataDir.string() + "/" + ST_IPOS_RULES_FILE); + path getIntitalPosRulesFile(uint16_t index) const { + return path(dataDir.string() + "/" + ST_IPOS_RULES_FILE + ".layer" + std::to_string(index) + ".json"); } - path getIntitalNegRulesFile() const { - return path(dataDir.string() + "/" + ST_INEG_RULES_FILE); + path getIntitalNegRulesFile(uint16_t index) const { + return path(dataDir.string() + "/" + ST_INEG_RULES_FILE + ".layer" + std::to_string(index) + ".json"); } void filter(); @@ -261,7 +286,7 @@ class JunctionFilter { protected: - void forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail); + void forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, ModelFeatures& mf); shared_ptr calcPerformance(const JunctionList& pass, const JunctionList& fail) { return calcPerformance(pass, fail, false); @@ -272,6 +297,12 @@ class JunctionFilter { void doRuleBasedFiltering(const path& ruleFile, const JunctionList& all, JunctionList& pass, JunctionList& fail, const string& prefix, JuncResultMap& resultMap); + void categorise(shared_ptr f, const JunctionList& all, JunctionList& pass, JunctionList& fail, double t); + + void createPositiveSet(const JunctionList& all, JunctionList& pos, JunctionList& unlabelled, ModelFeatures& mf); + + void createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg); + public: static string helpMessage() { diff --git a/src/portcullis.cc 
b/src/portcullis.cc index 10f9b5fc..74af944f 100644 --- a/src/portcullis.cc +++ b/src/portcullis.cc @@ -260,7 +260,8 @@ int mainFull(int argc, char *argv[]) { cout << "Identifying junctions and calculating metrics" << endl << "---------------------------------------------" << endl << endl; - path juncOut = outputDir.string() + "/2-junc/portcullis_all"; + path juncDir = outputDir.string() + "/2-junc"; + path juncOut = juncDir.string() + "/portcullis_all"; // Identify junctions and calculate metrics JunctionBuilder jb(prepDir.string(), juncOut); @@ -281,7 +282,7 @@ int mainFull(int argc, char *argv[]) { << "-------------------" << endl << endl; path filtOut = outputDir.string() + "/3-filt/portcullis_filtered"; - path juncTab = juncOut.string() + "/portcullis_all.junctions.tab"; + path juncTab = juncDir.string() + "/portcullis_all.junctions.tab"; JunctionFilter filter(juncTab, filtOut); filter.setVerbose(verbose); diff --git a/src/train.cc b/src/train.cc index 82c2abc3..689fa1c5 100644 --- a/src/train.cc +++ b/src/train.cc @@ -68,95 +68,14 @@ portcullis::Train::Train(const path& _junctionFile, const path& _refFile){ verbose = false; } -Data* portcullis::Train::juncs2FeatureVectors(const JunctionList& x) { - - vector headers; - headers.reserve( VAR_NAMES.size() + JO_NAMES.size() ); - headers.insert( headers.end(), VAR_NAMES.begin(), VAR_NAMES.end() ); - headers.insert( headers.end(), JO_NAMES.begin(), JO_NAMES.end() ); - - // Convert junction list info to double* - double* d = new double[x.size() * headers.size()]; - - uint32_t row = 0; - for (const auto& j : x) { - - d[0 * x.size() + row] = j->getNbJunctionAlignments(); - d[1 * x.size() + row] = j->getNbDistinctAlignments(); - d[2 * x.size() + row] = j->getNbReliableAlignments(); - d[3 * x.size() + row] = j->getMaxMinAnchor(); - d[4 * x.size() + row] = j->getDiffAnchor(); - d[5 * x.size() + row] = j->getNbDistinctAnchors(); - d[6 * x.size() + row] = j->getEntropy(); - d[7 * x.size() + row] = j->getMaxMMES(); - d[8 * 
x.size() + row] = j->getHammingDistance5p(); - d[9 * x.size() + row] = j->getHammingDistance3p(); - d[10 * x.size() + row] = j->isGenuine(); - // Junction overhang values at each position are first converted into deviation from expected distributions - double half_read_length = (double)j->getMeanQueryLength() / 2.0; - for(size_t i = 0; i < JO_NAMES.size(); i++) { - double Ni = j->getJunctionOverhangs(i); // Actual count at this position - if (Ni == 0.0) Ni = 0.000000000001; // Ensure some value > 0 here otherwise we get -infinity later. - double Pi = 1.0 - ((double)i / half_read_length); // Likely scale at this position - double Ei = (double)j->getNbJunctionAlignments() * Pi; // Expected count at this position - double Xi = abs(log2(Ni / Ei)); // Log deviation - - d[(i + 11) * x.size() + row] = Xi; - } - - row++; - } - - Data* data = new DataDouble(d, headers, x.size(), headers.size()); - data->setExternalData(false); // This causes 'd' to be deleted when 'data' is deleted - return data; -} -shared_ptr portcullis::Train::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool regressionMode, bool verbose) { - - Data* trainingData = juncs2FeatureVectors(x); - - shared_ptr f = nullptr; - if (regressionMode) { - f = make_shared(); - } - else { - f = make_shared(); - } - - vector catVars; - - f->init( - "Genuine", // Dependant variable name - MEM_DOUBLE, // Memory mode - trainingData, // Data object - 0, // M Try (0 == use default) - outputPrefix, // Output prefix - trees, // Number of trees - DEFAULT_SEED, // Use fixed seed to avoid non-deterministic behaviour - threads, // Number of threads - IMP_GINI, // Importance measure - regressionMode ? 
DEFAULT_MIN_NODE_SIZE_REGRESSION : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size - "", // Status var name - false, // Prediction mode - true, // Replace - catVars, // Unordered categorical variable names (vector) - false, // Memory saving - DEFAULT_SPLITRULE, // Split rule - false, // predall - 1.0); // Sample fraction - - - f->setVerboseOut(&cerr); - f->run(verbose); - - return f; -} + void portcullis::Train::testInstance(shared_ptr f, const JunctionList& x) { // Convert testing set junctions into feature vector - Data* testingData = juncs2FeatureVectors(x); + ModelFeatures mf; + Data* testingData = mf.juncs2FeatureVectors(x); f->setPredictionMode(true); f->setData(testingData); @@ -237,7 +156,7 @@ void portcullis::Train::train() { cout << "Training on full dataset" << endl; - shared_ptr f = trainInstance(junctions, outputPrefix.string(), trees, threads, false, true); + ForestPtr f = ModelFeatures().trainInstance(junctions, outputPrefix.string(), trees, threads, false, true); f->saveToFile(); f->writeOutput(&cout); @@ -272,7 +191,7 @@ void portcullis::Train::train() { kf.getFold(i, back_inserter(train), back_inserter(test)); // Train on this particular set - shared_ptr f = trainInstance(train, outputPrefix.string(), trees, threads, false, false); + ForestPtr f = ModelFeatures().trainInstance(train, outputPrefix.string(), trees, threads, false, false); // Test model instance testInstance(f, test); diff --git a/src/train.hpp b/src/train.hpp index 68fefdff..40fe2b34 100644 --- a/src/train.hpp +++ b/src/train.hpp @@ -36,7 +36,10 @@ using boost::filesystem::path; #include #include - +#include +#include +using portcullis::bam::GenomeMapper; +using portcullis::ModelFeatures; namespace portcullis { @@ -50,11 +53,7 @@ const uint16_t DEFAULT_TRAIN_THREADS = 1; const double DEFAULT_TRAIN_FRACTION = 1.0; const int DEFAULT_SEED = 1234567; // To avoid non-deterministic behaviour -// List of variable names -const vector VAR_NAMES = { "M2-nb-reads", "M3-nb_dist_aln", 
"M4-nb_rel_aln", - "M8-max_min_anc", "M9-dif_anc", "M10-dist_anc", "M11-entropy", - "M12-maxmmes", "M13-hammping5p", "M14-hamming3p", - "Genuine" }; + // Derived from https://sureshamrita.wordpress.com/2011/08/24/c-implementation-of-k-fold-cross-validation/ template @@ -220,15 +219,11 @@ class Train { static int main(int argc, char *argv[]); - static Data* juncs2FeatureVectors(const JunctionList& x); - - static shared_ptr trainInstance(const JunctionList& x, string outputPrefix, - uint16_t trees, uint16_t threads, bool regressionMode, bool verbose); protected: - void testInstance(shared_ptr f, const JunctionList& y); + void testInstance(shared_ptr f, const JunctionList& y); void getRandomSubset(const JunctionList& in, JunctionList& out); diff --git a/tests/Makefile.am b/tests/Makefile.am index 9cda768c..d6d27253 100755 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -24,13 +24,15 @@ check_PROGRAMS = check_unit_tests check_unit_tests_SOURCES = bam_tests.cpp \ seq_utils_tests.cpp \ + kmer_tests.cpp \ intron_tests.cpp \ - junction_tests.cpp \ + junction_tests.cpp \ check_portcullis.cc check_unit_tests_CXXFLAGS = -O0 @AM_CXXFLAGS@ @CXXFLAGS@ check_unit_tests_CPPFLAGS = -isystem $(top_srcdir)/deps/htslib-1.3 \ + -isystem $(top_srcdir)/deps/ranger-0.3.8/include \ -I$(top_srcdir)/lib/include \ -DRESOURCESDIR=\"$(top_srcdir)/tests/resources\" \ -DDATADIR=\"$(datadir)\" \ @@ -39,12 +41,14 @@ check_unit_tests_CPPFLAGS = -isystem $(top_srcdir)/deps/htslib-1.3 \ check_unit_tests_LDFLAGS = -L../lib \ -L../deps/htslib-1.3 \ + -L../deps/ranger-0.3.8 \ @AM_LDFLAGS@ \ @LDFLAGS@ check_unit_tests_LDADD = libgtest.la \ -lportcullis \ -lphts \ + -lranger \ @AM_LIBS@ clean-local: clean-local-check diff --git a/tests/kmer_tests.cpp b/tests/kmer_tests.cpp new file mode 100644 index 00000000..d7895d2d --- /dev/null +++ b/tests/kmer_tests.cpp @@ -0,0 +1,53 @@ +// ******************************************************************** +// This file is part of Portcullis. 
+// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#include + +#include +#include +#include +using std::cout; +using std::endl; +using std::stringstream; + + +#include +#include +#include +namespace bfs = boost::filesystem; +using bfs::path; + +#include +using portcullis::KmerHash; + + +TEST(kmer, make) { + + string seq = "ATGCATGCATCGNATATATATTGAC"; + + KmerHash kh(4, seq); + + //kh.print(std::cout); + + EXPECT_EQ(1, kh.getCount("tgac")); + + EXPECT_EQ(0, kh.getCount("kjls")); + + EXPECT_EQ(16, kh.nbDistinctKmers()); + +} +