From e2cf463893dadcdeba97a7c518fbf657f54639f6 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Fri, 10 Mar 2017 17:30:13 +0000 Subject: [PATCH 01/11] Added functionality to junctools to compare one or more GTF files based on a set of provided junctions. --- scripts/junctools/__init__.py.in | 6 +- scripts/junctools/gtf.py | 173 ++++++++++++++++++++----------- 2 files changed, 114 insertions(+), 65 deletions(-) diff --git a/scripts/junctools/__init__.py.in b/scripts/junctools/__init__.py.in index 1bac3fea..8b2f8f77 100644 --- a/scripts/junctools/__init__.py.in +++ b/scripts/junctools/__init__.py.in @@ -78,10 +78,12 @@ truesight = Truesight style tab delimited format.''') help="Filter or markup GTF files based on provided junctions", description='''GTF modes: filter = Filters out transcripts from GTF file that are not supported by the provided - junction file + junction file. markup = Marks transcripts from GTF file with \'portcullis\' attribute, which indicates if transcript has a fully supported set of junctions, or if not, which ones are - not supported.''') + not supported. +compare = For each GTF provided in the input. 
compare mode creates statistics describing + how many transcripts contain introns that are supported by a junction file.''') gtf.add_options(gtf_parser) gtf_parser.set_defaults(func=gtf.gtf) diff --git a/scripts/junctools/gtf.py b/scripts/junctools/gtf.py index ca228b30..b0909579 100755 --- a/scripts/junctools/gtf.py +++ b/scripts/junctools/gtf.py @@ -3,11 +3,13 @@ from enum import Enum, unique from .junction import Junction, BedJunction +from .performance import Performance @unique class Mode(Enum): MARKUP = 1 FILTER = 2 + COMPARE = 3 def decstart(junctions): last_id = "" @@ -96,79 +98,123 @@ def loadgtf(filepath, use_strand=False): return intron_chains, junc_key_set, nb_transcripts, nb_introns +def run_compare(args, ref_juncs): + print("\t".join(["file", "j_distinct", "j_total", "j_tp", "j_fp", "j_fn", "j_recall", "j_precision", "j_f1", + "t_transcripts", "t_monoexonic", "t_multiexonic", "t_supported", "t_unsupported", "t_precision"])) + for i in args.input: + intron_chains, junc_set, nb_transcripts, nb_introns = loadgtf(i, use_strand=not args.ignore_strand) + nb_monoexonic = nb_transcripts - len(intron_chains) + nb_multiexonic = len(intron_chains) + + jr_tp = len(ref_juncs & junc_set) + jr_fn = len(ref_juncs - junc_set) + jr_fp = len(junc_set - ref_juncs) + jr_perf = Performance(tp=jr_tp, fn=jr_fn, fp=jr_fp) + + nb_in_ref = 0 + for t, introns in intron_chains.items(): + + is_valid_by_ref = True + + for index, j in enumerate(introns): + in_ref = j.key in ref_juncs + + if not in_ref: + is_valid_by_ref = False + break + + if is_valid_by_ref: + nb_in_ref += 1 + + nb_unsupported = nb_multiexonic - nb_in_ref + t_precision = (nb_in_ref / nb_multiexonic) * 100.0 + + print("\t".join(str(_) for _ in [i, len(junc_set), nb_introns, jr_tp, jr_fp, jr_fn, + "{0:.2f}".format(jr_perf.recall()), "{0:.2f}".format(jr_perf.precision()), "{0:.2f}".format(jr_perf.F1()), + nb_transcripts, nb_monoexonic, nb_multiexonic, nb_in_ref, nb_unsupported, "{0:.2f}".format(t_precision)])) + + def 
gtf(args): mode = Mode[args.mode.upper()] - print("Running junctools GTF in", mode.name, "mode") + print("# Running junctools GTF in", mode.name, "mode") - print("Loading junctions ...",end="") + if mode != Mode.COMPARE and (len(args.input) > 1 or len(args.input) == 0): + raise SyntaxError("This mode can takes a single GTF file as input") + + print("# Loading junctions ...",end="") port_juncs, port_count = Junction.createJuncSet(args.junctions, use_strand=not args.ignore_strand) print(" done. Found", port_count, "junctions.") - print("Loading transcripts ...",end="") - intron_chains, junc_set, nb_transcripts, nb_introns = loadgtf(args.input, use_strand=not args.ignore_strand) - print(" done.") - nb_monoexonic = nb_transcripts - len(intron_chains) - nb_multiexonic = len(intron_chains) - print("Found", nb_transcripts, "transcripts. ", nb_monoexonic, "are monoexonic and", nb_multiexonic, "are multiexonic.") - print("Found", len(junc_set), "distinct junctions and", nb_introns, "total junctions in GTF.") - print() - - print("Doing junction level comparison ...", end="") - juncs_inport = port_juncs & junc_set - print(" done.") - recall = (len(juncs_inport) / port_count) * 100.0 - precision = (len(juncs_inport) / len(junc_set)) * 100.0 - print(len(juncs_inport), "/", port_count, "(" + "{0:.2f}".format(recall) + ") of valid junctions.") - print(len(juncs_inport), "/", len(junc_set), "(" + "{0:.2f}".format(precision) + ") junctions supported by input.") - print() - - - print("Identifying invalid multi-exonic transcripts ... ", end="") - invalid_transcripts = collections.defaultdict(list) - for t, introns in intron_chains.items(): - for index, j in enumerate(introns): - if not j.key in port_juncs: - invalid_transcripts[t].append(str(j.start+1) + "_" + str(j.end+1)) - print(" done. Found", len(invalid_transcripts), "/", nb_multiexonic, "invalid multi-exonic transcripts.") - print() - - print("Writing output to", args.output," ... 
", end="") - o = open(args.output, mode='w') - - # Create dict of transcripts with all exons in memory - with open(args.input) as f: - for l in f: - line = l.strip() - if not line.startswith("#"): - parts = line.split('\t') - if len(parts) == 9 and (parts[2] == "exon" or parts[2] == "transcript"): - tags = parts[8].split(';') - for tag in tags: - t = tag.strip() - if t: - tag_parts = t.split() - if tag_parts[0] == "transcript_id": - tid = tag_parts[1].strip() - transcript_id = tid[1:-1] if tid[0] == '\"' else tid - if transcript_id in invalid_transcripts: - if mode != Mode.FILTER: + if mode == Mode.COMPARE: + run_compare(args, port_juncs) + + else: + print("Loading transcripts ...",end="") + intron_chains, junc_set, nb_transcripts, nb_introns = loadgtf(args.input[0], use_strand=not args.ignore_strand) + print(" done.") + nb_monoexonic = nb_transcripts - len(intron_chains) + nb_multiexonic = len(intron_chains) + print("Found", nb_transcripts, "transcripts. ", nb_monoexonic, "are monoexonic and", nb_multiexonic, "are multiexonic.") + print("Found", len(junc_set), "distinct junctions and", nb_introns, "total junctions in GTF.") + print() + + print("Doing junction level comparison ...", end="") + juncs_inport = port_juncs & junc_set + print(" done.") + recall = (len(juncs_inport) / port_count) * 100.0 + precision = (len(juncs_inport) / len(junc_set)) * 100.0 + print(len(juncs_inport), "/", port_count, "(" + "{0:.2f}".format(recall) + ") of valid junctions.") + print(len(juncs_inport), "/", len(junc_set), "(" + "{0:.2f}".format(precision) + ") junctions supported by input.") + print() + + + print("Identifying invalid multi-exonic transcripts ... ", end="") + invalid_transcripts = collections.defaultdict(list) + for t, introns in intron_chains.items(): + for index, j in enumerate(introns): + if not j.key in port_juncs: + invalid_transcripts[t].append(str(j.start+1) + "_" + str(j.end+1)) + print(" done. 
Found", len(invalid_transcripts), "/", nb_multiexonic, "invalid multi-exonic transcripts.") + print() + + + print("Writing output to", args.output," ... ", end="") + o = open(args.output, mode='w') + + # Create dict of transcripts with all exons in memory + with open(args.input[0]) as f: + for l in f: + line = l.strip() + if not line.startswith("#"): + parts = line.split('\t') + if len(parts) == 9 and (parts[2] == "exon" or parts[2] == "transcript"): + tags = parts[8].split(';') + for tag in tags: + t = tag.strip() + if t: + tag_parts = t.split() + if tag_parts[0] == "transcript_id": + tid = tag_parts[1].strip() + transcript_id = tid[1:-1] if tid[0] == '\"' else tid + if transcript_id in invalid_transcripts: + if mode != Mode.FILTER: + if mode == Mode.MARKUP and parts[2] == "transcript": + bad_juncs = ",".join(invalid_transcripts[transcript_id]) + o.write(line + " introns \"invalid(" + bad_juncs + ")\";\n") + else: + o.write(line + "\n") + else: if mode == Mode.MARKUP and parts[2] == "transcript": - bad_juncs = ",".join(invalid_transcripts[transcript_id]) - o.write(line + " introns \"invalid(" + bad_juncs + ")\";\n") + o.write(line + " introns \"valid\";\n") else: o.write(line + "\n") - else: - if mode == Mode.MARKUP and parts[2] == "transcript": - o.write(line + " introns \"valid\";\n") - else: - o.write(line + "\n") + else: + o.write(line + "\n") else: o.write(line + "\n") - else: - o.write(line + "\n") - o.close() - print(" done.") + o.close() + print(" done.") def add_options(parser): @@ -179,12 +225,13 @@ def add_options(parser): parser.add_argument("-j", "--junctions", required=True, help='''The file containing junctions that should be found in the GTF.''') - parser.add_argument("-o", "--output", required=True, default="junctools.out.gtf", + parser.add_argument("-o", "--output", default="junctools.out.gtf", help="The filtered or markedup GTF output. By default we print to stdout.") parser.add_argument("mode", help='''GTF operation to apply. 
See above for details. Available options: - filter - markup + - compare ''') - parser.add_argument("input", help="The input GTF file to convert") + parser.add_argument("input", nargs="+", help="The input GTF file to convert") From ff0bd40a4f87f62a6013d9d1f3361cfc72fb41d3 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Mon, 13 Mar 2017 12:04:54 +0000 Subject: [PATCH 02/11] Added stack trace for segfaults. Also updated init.py --- scripts/junctools/__init__.py | 6 ++++-- src/portcullis.cc | 24 ++++++++++++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/scripts/junctools/__init__.py b/scripts/junctools/__init__.py index 54f78bc2..47566c98 100644 --- a/scripts/junctools/__init__.py +++ b/scripts/junctools/__init__.py @@ -78,10 +78,12 @@ def main(): help="Filter or markup GTF files based on provided junctions", description='''GTF modes: filter = Filters out transcripts from GTF file that are not supported by the provided - junction file + junction file. markup = Marks transcripts from GTF file with \'portcullis\' attribute, which indicates if transcript has a fully supported set of junctions, or if not, which ones are - not supported.''') + not supported. +compare = For each GTF provided in the input. 
compare mode creates statistics describing + how many transcripts contain introns that are supported by a junction file.''') gtf.add_options(gtf_parser) gtf_parser.set_defaults(func=gtf.gtf) diff --git a/src/portcullis.cc b/src/portcullis.cc index 2c1abde3..70847fac 100644 --- a/src/portcullis.cc +++ b/src/portcullis.cc @@ -19,6 +19,8 @@ #include #endif +#include +#include #include #include #include @@ -357,11 +359,29 @@ int mainFull(int argc, char *argv[]) { return 0; } + +void handler(int sig) { + void *array[10]; + size_t size; + + // get void*'s for all entries on the stack + size = backtrace(array, 10); + + // print out all the frames to stderr + fprintf(stderr, "Error: signal %d:\n", sig); + fprintf(stderr, "Stack trace:\n"); + backtrace_symbols_fd(array, size, STDERR_FILENO); + exit(1); +} + + /** * Start point for portcullis. */ int main(int argc, char *argv[]) { - try { + + signal(SIGSEGV, handler); + try { // Portcullis args string modeStr; std::vector others; @@ -400,7 +420,7 @@ int main(int argc, char *argv[]) { #define PACKAGE_NAME "Portcullis" #endif #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.13.X" +#define PACKAGE_VERSION "1.X" #endif portcullis::pfs = PortcullisFS(argv[0]); portcullis::pfs.setVersion(PACKAGE_VERSION); From 3cc9628c727911d7998cb372cb28c89074958fe3 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Tue, 14 Mar 2017 10:22:48 +0000 Subject: [PATCH 03/11] Updated documentation to include GTF section in junctools. --- doc/source/junctools.rst | 48 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/doc/source/junctools.rst b/doc/source/junctools.rst index 4fe8c733..9592a4db 100644 --- a/doc/source/junctools.rst +++ b/doc/source/junctools.rst @@ -10,7 +10,7 @@ junctools can be displayed by typing ``junctools --help`` at the command line: :: usage: This script contains a number of tools for manipulating junction files. - [-h] {compare,convert,markup,set,split} ... 
+ [-h] {compare,convert,gtf,markup,set,split} ... optional arguments: -h, --help show this help message and exit @@ -19,6 +19,7 @@ junctools can be displayed by typing ``junctools --help`` at the command line: {compare,convert,markup,set,split} compare Compares junction files. convert Converts junction files between various formats. + gtf Filter or markup GTF files based on provided junctions markup Marks whether each junction in the input can be found in the reference or not. set Apply set operations to two or more junction files. split Splits portcullis pass and fail juncs into 4 sets (TP, TN, FP, FN) based on whether or not the junctions are found in the reference or not. @@ -208,6 +209,51 @@ The usage information for the conversion tool looks like this:: .. note:: The user can also use the conversion tool to deduplicate, sort and reindex junction files. + +.. _gtf: + +GTF +--- + +Provides a means of manipulating or analysing GTF files using a junctions file. +Three modes are currently supported, the first two filter and markup, will process +a GTF file and either remove, or mark, transcripts and their associated exons, if +junctions within that transcript are not supported by the provided junction file. +In compare mode, we benchmark GTF files based on whether junctions present are +found in a reference junction file. This gives junction-level accuracy statistics +as well as transcript-level stats. + +Usage information follows:: + + usage: This script contains a number of tools for manipulating junction files. gtf + [-h] [-is] -j JUNCTIONS [-o OUTPUT] mode input [input ...] + + GTF modes: + filter = Filters out transcripts from GTF file that are not supported by the provided + junction file. + markup = Marks transcripts from GTF file with 'portcullis' attribute, which indicates + if transcript has a fully supported set of junctions, or if not, which ones are + not supported. + compare = For each GTF provided in the input. 
compare mode creates statistics describing + how many transcripts contain introns that are supported by a junction file. + + positional arguments: + mode GTF operation to apply. See above for details. Available options: + - filter + - markup + - compare + input The input GTF file to convert + + optional arguments: + -h, --help show this help message and exit + -is, --ignore_strand Whether or not to ignore strand when looking for junctions + -j JUNCTIONS, --junctions JUNCTIONS + The file containing junctions that should be found in the GTF. + -o OUTPUT, --output OUTPUT + The filtered or markedup GTF output. By default we print to stdout. + + + .. _markup: Markup From d735d8211e3f1421c9ef3453535467d555279cb2 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Tue, 14 Mar 2017 10:24:20 +0000 Subject: [PATCH 04/11] Updated PyPI script to fix a typo in email address. --- scripts/setup.py.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/setup.py.in b/scripts/setup.py.in index d4b60d7c..b25e09ef 100644 --- a/scripts/setup.py.in +++ b/scripts/setup.py.in @@ -21,7 +21,7 @@ setup( file conversion, set operations and comparisons''', url="https://github.com/maplesond/portcullis", author="Daniel Mapleson", - author_email="danile.mapleson@earlham.ac.uk", + author_email="daniel.mapleson@earlham.ac.uk", license="GPLV3", zip_safe=False, keywords="rna-seq annotation genomics transcriptomics", From 21f90b0afeda1ca88c2a691a4c47b66f0125e7e0 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Wed, 15 Mar 2017 17:16:30 +0000 Subject: [PATCH 05/11] Fixed a bug when calculating the number of transcripts in junctools gtf when a GTF file contains no transcript entries (only exons). 
--- scripts/junctools/gtf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/junctools/gtf.py b/scripts/junctools/gtf.py index b0909579..4439cd22 100755 --- a/scripts/junctools/gtf.py +++ b/scripts/junctools/gtf.py @@ -50,8 +50,6 @@ def loadgtf(filepath, use_strand=False): t = tag_parts[1].strip() transcript_id = t[1:-1] if t[0] == '\"' else t transcripts[transcript_id].append([parts[0], parts[3], parts[4], parts[6]]) - elif len(parts) == 9 and parts[2] == "transcript": - nb_transcripts += 1 intron_chains = collections.defaultdict(list) junc_set = set() @@ -96,7 +94,7 @@ def loadgtf(filepath, use_strand=False): for j in junc_set: junc_key_set.add(j.key) - return intron_chains, junc_key_set, nb_transcripts, nb_introns + return intron_chains, junc_key_set, len(transcripts), nb_introns def run_compare(args, ref_juncs): print("\t".join(["file", "j_distinct", "j_total", "j_tp", "j_fp", "j_fn", "j_recall", "j_precision", "j_f1", From 17273fdc14201c5f1a8b4d3289d3ad5705659483 Mon Sep 17 00:00:00 2001 From: Dan Date: Sat, 18 Mar 2017 20:47:10 +0000 Subject: [PATCH 06/11] Ignore swp file. --- src/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/src/.gitignore b/src/.gitignore index 4694c057..2e0a375d 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -11,3 +11,4 @@ /portcullis /tags *.junction_filter.cc.swp +*.portcullis.cc.swp From ab23ea3833460ee903f5752b037edb8f46e3fdb6 Mon Sep 17 00:00:00 2001 From: Dan Date: Sat, 18 Mar 2017 23:37:48 +0000 Subject: [PATCH 07/11] Avoid potential problem if nb_multiexonic is 0, causing a divide by zero error, in junctools gtf compare. 
--- scripts/junctools/gtf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/junctools/gtf.py b/scripts/junctools/gtf.py index 4439cd22..e0d22835 100755 --- a/scripts/junctools/gtf.py +++ b/scripts/junctools/gtf.py @@ -103,6 +103,9 @@ def run_compare(args, ref_juncs): intron_chains, junc_set, nb_transcripts, nb_introns = loadgtf(i, use_strand=not args.ignore_strand) nb_monoexonic = nb_transcripts - len(intron_chains) nb_multiexonic = len(intron_chains) + if nb_multiexonic <= 0: + print("skipped...nb_multiexonic=0") + continue jr_tp = len(ref_juncs & junc_set) jr_fn = len(ref_juncs - junc_set) From a58aafd90d8d958491100a5c2a03fced864887d5 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Fri, 24 Mar 2017 14:45:12 +0000 Subject: [PATCH 08/11] Modified GTF compare code to include intron-chain level performance analysis. --- scripts/junctools/gtf.py | 84 +++++++++++++++++++++++++++++++++------- 1 file changed, 70 insertions(+), 14 deletions(-) diff --git a/scripts/junctools/gtf.py b/scripts/junctools/gtf.py index e0d22835..319291fc 100755 --- a/scripts/junctools/gtf.py +++ b/scripts/junctools/gtf.py @@ -96,9 +96,10 @@ def loadgtf(filepath, use_strand=False): return intron_chains, junc_key_set, len(transcripts), nb_introns -def run_compare(args, ref_juncs): +def run_compare(args, ref_juncs, ref_ics): print("\t".join(["file", "j_distinct", "j_total", "j_tp", "j_fp", "j_fn", "j_recall", "j_precision", "j_f1", - "t_transcripts", "t_monoexonic", "t_multiexonic", "t_supported", "t_unsupported", "t_precision"])) + "t_transcripts", "t_monoexonic", "t_multiexonic", "t_supported", "t_unsupported", "t_precision", + "ic_tp", "ic_fp", "ic_fn", "ic_recall", "ic_precision", "ic_f1"])) for i in args.input: intron_chains, junc_set, nb_transcripts, nb_introns = loadgtf(i, use_strand=not args.ignore_strand) nb_monoexonic = nb_transcripts - len(intron_chains) @@ -113,13 +114,23 @@ def run_compare(args, ref_juncs): jr_perf = Performance(tp=jr_tp, fn=jr_fn, fp=jr_fp) 
nb_in_ref = 0 + ic_tp = 0 + ic_fp = 0 + for t, introns in intron_chains.items(): is_valid_by_ref = True + # Start key to be used to represent this intron chain + key = introns[0].refseq + "_" + introns[0].strand + + # Loop through introns in this transcript for index, j in enumerate(introns): in_ref = j.key in ref_juncs + # Add intron start and stop to intron chain key + key += "_" + str(j.start) + "_" + str(j.end) + if not in_ref: is_valid_by_ref = False break @@ -127,12 +138,43 @@ def run_compare(args, ref_juncs): if is_valid_by_ref: nb_in_ref += 1 + # Check if intron chain is found + if key in ref_ics: + ic_tp += 1 + else: + ic_fp += 1 + nb_unsupported = nb_multiexonic - nb_in_ref t_precision = (nb_in_ref / nb_multiexonic) * 100.0 + ic_fn = len(ref_ics) - ic_tp + ic_perf = Performance(tp=ic_tp, fn=ic_fn, fp=ic_fp) + print("\t".join(str(_) for _ in [i, len(junc_set), nb_introns, jr_tp, jr_fp, jr_fn, "{0:.2f}".format(jr_perf.recall()), "{0:.2f}".format(jr_perf.precision()), "{0:.2f}".format(jr_perf.F1()), - nb_transcripts, nb_monoexonic, nb_multiexonic, nb_in_ref, nb_unsupported, "{0:.2f}".format(t_precision)])) + nb_transcripts, nb_monoexonic, nb_multiexonic, nb_in_ref, nb_unsupported, "{0:.2f}".format(t_precision), + ic_tp, ic_fp, ic_fn, + "{0:.2f}".format(ic_perf.recall()), "{0:.2f}".format(ic_perf.precision()), + "{0:.2f}".format(ic_perf.F1())])) + +def keyFromIC(ic): + if len(ic) > 0: + key = ic[0].refseq + "_" + ic[0].strand + for i in ic: + key += "_" + str(i.start) + "_" + str(i.end) + return key + else: + return None + +def convert_ic_map(ic): + mod = set() + for t, introns in ic.items(): + key = keyFromIC(introns) + if key: + mod.add(key) + + return mod + def gtf(args): @@ -140,14 +182,27 @@ def gtf(args): print("# Running junctools GTF in", mode.name, "mode") if mode != Mode.COMPARE and (len(args.input) > 1 or len(args.input) == 0): - raise SyntaxError("This mode can takes a single GTF file as input") + raise SyntaxError("This mode can only take a 
single GTF file as input") - print("# Loading junctions ...",end="") - port_juncs, port_count = Junction.createJuncSet(args.junctions, use_strand=not args.ignore_strand) - print(" done. Found", port_count, "junctions.") + if args.junctions == None and args.transcripts == None: + raise SyntaxError("You must either specify a file containing junctions (-j) or transcripts (-t) to use as a reference against your input files(s).") + elif args.junctions and args.transcripts: + raise SyntaxError( + "You must either specify a file containing junctions (-j) or transcripts (-t) to use as a reference against your input files(s). Not both.") + + + if args.junctions: + print("# Loading junctions from reference junctions file ...",end="") + ref_juncs, ref_junc_count = Junction.createJuncSet(args.junctions, use_strand=not args.ignore_strand) + print(" done. Found", ref_junc_count, "junctions (", len(ref_juncs), "distinct ) .") + else: + print("# Extracting junctions from reference transcript file ...", end="") + ref_intron_chains, ref_juncs, ref_transcript_count, ref_junc_count = loadgtf(args.transcripts, use_strand=not args.ignore_strand) + ref_ics = convert_ic_map(ref_intron_chains) + print(" done. 
Found", ref_junc_count, "junctions (", len(ref_juncs), "distinct ) and", ref_transcript_count, "transcripts (", len(ref_ics), "distinct intron chains).") if mode == Mode.COMPARE: - run_compare(args, port_juncs) + run_compare(args, ref_juncs, ref_ics) else: print("Loading transcripts ...",end="") @@ -160,11 +215,11 @@ def gtf(args): print() print("Doing junction level comparison ...", end="") - juncs_inport = port_juncs & junc_set + juncs_inport = ref_juncs & junc_set print(" done.") - recall = (len(juncs_inport) / port_count) * 100.0 + recall = (len(juncs_inport) / ref_junc_count) * 100.0 precision = (len(juncs_inport) / len(junc_set)) * 100.0 - print(len(juncs_inport), "/", port_count, "(" + "{0:.2f}".format(recall) + ") of valid junctions.") + print(len(juncs_inport), "/", ref_junc_count, "(" + "{0:.2f}".format(recall) + ") of valid junctions.") print(len(juncs_inport), "/", len(junc_set), "(" + "{0:.2f}".format(precision) + ") junctions supported by input.") print() @@ -173,7 +228,7 @@ def gtf(args): invalid_transcripts = collections.defaultdict(list) for t, introns in intron_chains.items(): for index, j in enumerate(introns): - if not j.key in port_juncs: + if not j.key in ref_juncs: invalid_transcripts[t].append(str(j.start+1) + "_" + str(j.end+1)) print(" done. Found", len(invalid_transcripts), "/", nb_multiexonic, "invalid multi-exonic transcripts.") print() @@ -224,7 +279,8 @@ def add_options(parser): parser.add_argument("-is", "--ignore_strand", action='store_true', default=False, help="Whether or not to ignore strand when looking for junctions") - parser.add_argument("-j", "--junctions", required=True, help='''The file containing junctions that should be found in the GTF.''') + parser.add_argument("-j", "--junctions", help='''The file containing junctions that should be found in the GTF. Either this or '-t' must be used.''') + parser.add_argument("-t", "--transcripts", help='''The file containing transcripts that should be found in the GTF. 
Either this or '-j' must be used.''') parser.add_argument("-o", "--output", default="junctools.out.gtf", help="The filtered or markedup GTF output. By default we print to stdout.") @@ -235,4 +291,4 @@ def add_options(parser): - compare ''') - parser.add_argument("input", nargs="+", help="The input GTF file to convert") + parser.add_argument("input", nargs="+", help="The input GTF file(s) to process") From 9d77e1dbf5febf34ebc7d5820ba70afbdefa8e06 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Mon, 3 Apr 2017 12:11:38 +0100 Subject: [PATCH 09/11] Flushing stdout when doing gtf comparison. Fixed a bug which left newlines on the string when converting junction files. --- scripts/junctools/convert.py | 2 +- scripts/junctools/gtf.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/junctools/convert.py b/scripts/junctools/convert.py index a95a5a9e..ce9c4afd 100755 --- a/scripts/junctools/convert.py +++ b/scripts/junctools/convert.py @@ -114,7 +114,7 @@ def convert(args): else: with open(args.input) as f: for line in f: - j = JuncFactory.create_from_enum(in_type, use_strand=not args.ignore_strand).parse_line(line) + j = JuncFactory.create_from_enum(in_type, use_strand=not args.ignore_strand).parse_line(line.strip()) if j: diff --git a/scripts/junctools/gtf.py b/scripts/junctools/gtf.py index 319291fc..ba137db2 100755 --- a/scripts/junctools/gtf.py +++ b/scripts/junctools/gtf.py @@ -1,5 +1,6 @@ import argparse import collections +import sys from enum import Enum, unique from .junction import Junction, BedJunction @@ -100,6 +101,7 @@ def run_compare(args, ref_juncs, ref_ics): print("\t".join(["file", "j_distinct", "j_total", "j_tp", "j_fp", "j_fn", "j_recall", "j_precision", "j_f1", "t_transcripts", "t_monoexonic", "t_multiexonic", "t_supported", "t_unsupported", "t_precision", "ic_tp", "ic_fp", "ic_fn", "ic_recall", "ic_precision", "ic_f1"])) + sys.stdout.flush() for i in args.input: intron_chains, junc_set, nb_transcripts, nb_introns = 
loadgtf(i, use_strand=not args.ignore_strand) nb_monoexonic = nb_transcripts - len(intron_chains) @@ -156,6 +158,7 @@ def run_compare(args, ref_juncs, ref_ics): ic_tp, ic_fp, ic_fn, "{0:.2f}".format(ic_perf.recall()), "{0:.2f}".format(ic_perf.precision()), "{0:.2f}".format(ic_perf.F1())])) + sys.stdout.flush() def keyFromIC(ic): if len(ic) > 0: From c9f3ff1eeaa1246e092134be4d5ca85294bdc1ea Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Wed, 24 May 2017 14:34:07 +0100 Subject: [PATCH 10/11] Fix for deduping when using the junctools convert tool. --- scripts/junctools/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/junctools/convert.py b/scripts/junctools/convert.py index ce9c4afd..b4504b21 100755 --- a/scripts/junctools/convert.py +++ b/scripts/junctools/convert.py @@ -120,7 +120,7 @@ def convert(args): if args.dedup: if j.key not in junction_set: - junction_set.add(j.key()) + junction_set.add(j.key) if loadall: junctions.append(j) else: From b05c5f99f80196a887ae33a2f19ea55b27dcf9d1 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Wed, 24 May 2017 14:42:38 +0100 Subject: [PATCH 11/11] Updated version to 1.0.1 --- configure.ac | 2 +- doc/source/conf.py | 4 ++-- scripts/junctools/__init__.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configure.ac b/configure.ac index 9c86b0c6..055addc3 100644 --- a/configure.ac +++ b/configure.ac @@ -4,7 +4,7 @@ # Autoconf setup AC_PREREQ([2.68]) -AC_INIT([portcullis],[1.0.0],[daniel.mapleson@earlham.ac.uk],[portcullis],[http://www.earlham.ac.uk]) +AC_INIT([portcullis],[1.0.1],[daniel.mapleson@earlham.ac.uk],[portcullis],[http://www.earlham.ac.uk]) AC_CONFIG_SRCDIR([src/portcullis.cc]) AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_MACRO_DIR([m4]) diff --git a/doc/source/conf.py b/doc/source/conf.py index a260d3e0..9bec1648 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -53,9 +53,9 @@ # built documents. # # The short X.Y version. 
-version = '1.0.0' +version = '1.0.1' # The full version, including alpha/beta/rc tags. -release = '1.0.0' +release = '1.0.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/scripts/junctools/__init__.py b/scripts/junctools/__init__.py index 47566c98..2efdfb26 100644 --- a/scripts/junctools/__init__.py +++ b/scripts/junctools/__init__.py @@ -9,7 +9,7 @@ __author__ = 'Daniel Mapleson' __license__ = 'GPLV3' __copyright__ = 'Copyright 2016 Daniel Mapleson' -__version__ = '1.0.0' +__version__ = '1.0.1' import argparse import sys