From f5cebd52e9444c052188a394129ef42d82af4474 Mon Sep 17 00:00:00 2001 From: Enrico Gaffo Date: Mon, 7 Oct 2019 15:51:12 +0200 Subject: [PATCH 1/4] added python script to convert circRNA GTF into start-stop GTF; and use it in place of shell command --- src/ccp_collect_circrnas.scons | 17 ++-------- src/split_start_end_gtf.py | 61 ++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 14 deletions(-) create mode 100755 src/split_start_end_gtf.py diff --git a/src/ccp_collect_circrnas.scons b/src/ccp_collect_circrnas.scons index 6cdf6d2..1a3a61a 100644 --- a/src/ccp_collect_circrnas.scons +++ b/src/ccp_collect_circrnas.scons @@ -136,21 +136,10 @@ unique_circ = env.Command('unique_circ_ids.gtf.gz', unique_circ_cmd) ## translate backsplice intervals into start and stop single nucleotide intervals -## mind that we are writing a newline at the end of the file that will end up -## in the head of the sorted final file. This breaks bedtools intersect with -## -sorted option. Thus, we need to get rid of the newline by using grep -v '^$' snp_unique_circ_cmd = '''zcat ${SOURCES[0]} | '''\ - '''sed -r 's/([^\\t]+)\\t([^\\t]+)\\t([^\\t]+)\\t'''\ - '''([^\\t]+)\\t([^\\t]+)\\t([^\\t]+)\\t'''\ - '''([^\\t]+)\\t([^\\t]+)\\tgene_id "([^"]+)";'''\ - '''/echo -e "\\1\\t\\2\\tstart\\t'''\ - '''\\4\\t$$((\\4))\\t\\6\\t'''\ - '''\\7\\t\\8\\tgene_id @\\9@;\\n'''\ - '''\\1\\t\\2\\tstop\\t'''\ - '''$$((\\5))\\t\\5\\t\\6\\t'''\ - '''\\7\\t\\8\\tgene_id @\\9@;"/e' | '''\ - '''sed -r 's/@/"/g' | sort -k1,1 -k4,4n -k5,5n | '''\ - '''grep -v '^$' | gzip -c > $TARGET ''' + '''split_start_end_gtf.py -t - | '''\ + '''sort -k1,1 -k4,4n -k5,5n | '''\ + '''gzip -c > $TARGET ''' snp_unique_circ = env.Command('sn_unique_circ.gtf.gz', [unique_circ], snp_unique_circ_cmd) diff --git a/src/split_start_end_gtf.py b/src/split_start_end_gtf.py new file mode 100755 index 0000000..ae0e7f3 --- /dev/null +++ b/src/split_start_end_gtf.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python + +import argparse, sys + + +if __name__ == '__main__': + + desc = '' + parser = argparse.ArgumentParser(description = desc) + parser.add_argument('infile', + default = '-', + help = 'Input GTF file. Set - for stdin stream.') + parser.add_argument('-o', '--outfile', + type = str, + default = '-', + help = 'Output filename. Default to stdout.') + parser.add_argument('-t', '--trim', + action = 'store_true', + help = 'Trim the GTF attribute field and keep only '\ + 'gene_id.') + + args = parser.parse_args() + + if args.infile == '-': + infile = sys.stdin + else: + infile = open(args.infile, 'r') + + if args.outfile == '-': + outfile = sys.stdout + else: + outfile = open(args.outfile, 'w') + + for inline in infile: + outline = inline.split('\t') + attribute = outline[8] + + if args.trim: + for field in outline[8].split(';'): + if 'gene_id' in field.split(): + attribute = field + ';\n' + + start_out = '\t'.join(outline[0:2] + + ['start'] + + [outline[3]] + + [outline[3]] + + outline[5:8] + + [attribute]) + stop_out = '\t'.join(outline[0:2] + + ['stop'] + + [outline[4]] + + [outline[4]] + + outline[5:8] + + [attribute]) + outfile.write(start_out) + outfile.write(stop_out) + + infile.close() + outfile.close() + + From 10bdd4c22cc04c1f7c9df99bdee87f622f8cd678 Mon Sep 17 00:00:00 2001 From: Enrico Gaffo Date: Wed, 11 Dec 2019 11:57:21 +0100 Subject: [PATCH 2/4] fixed index-to-build check for dcc and circrna_finder --- src/ccp_check_indexes.scons | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ccp_check_indexes.scons b/src/ccp_check_indexes.scons index 992f031..aed9481 100644 --- a/src/ccp_check_indexes.scons +++ b/src/ccp_check_indexes.scons @@ -144,7 +144,7 @@ if genome_indexes_to_build: indexes['BOWTIE2'] = [env['BOWTIE2_INDEX'] + suffix for suffix \ in bowtie2_index_suffixes] - if any([f in env['CIRCRNA_METHODS'] for f in ['circexplorer2_star']]): + if any([f in env['CIRCRNA_METHODS'] for f in ['circexplorer2_star', 'dcc', 'circrna_finder']]): if env['STAR_INDEX'] == '': #index dir env.Replace(STAR_INDEX = os.path.dirname(os.path.abspath(str(indexes['STAR'][0])))) From 1735b7a12e9a14b22c9dcb314d4a946923860c31 Mon Sep 17 00:00:00 2001 From: Enrico Gaffo Date: Wed, 11 Dec 2019 12:08:45 +0100 Subject: [PATCH 3/4] fixed error when single method is used --- src/circRNAs_analysis.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/circRNAs_analysis.Rmd b/src/circRNAs_analysis.Rmd index 381ffd8..341751c 100644 --- a/src/circRNAs_analysis.Rmd +++ b/src/circRNAs_analysis.Rmd @@ -473,7 +473,7 @@ shared.counts.table <- crossprod(x = table(circ.per.method[, .(circ_id, method)] ## order rows and columns alphabetically shared.counts.table <- shared.counts.table[sort(colnames(shared.counts.table)), - sort(colnames(shared.counts.table))] + sort(colnames(shared.counts.table)), drop = F] ## save table write.csv(x = data.frame(shared.counts.table), From 281abae8b5ea304c8795470a77e568f847de2947 Mon Sep 17 00:00:00 2001 From: Enrico Gaffo Date: Sat, 14 Mar 2020 18:00:23 +0000 Subject: [PATCH 4/4] fixed bug that caused CIRCexplorer2-segemehl to underestimate circrna read count --- src/ccp_circexplorer2.scons | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ccp_circexplorer2.scons b/src/ccp_circexplorer2.scons index 52aab47..c745f17 100644 --- a/src/ccp_circexplorer2.scons +++ b/src/ccp_circexplorer2.scons @@ -99,7 +99,7 @@ if env['ALIGNER'].lower() == 'segemehl': ## for segemehl >= v0.3.0 modify the input BED file fixed_bed_cmd = '''grep ';B\\|C;' ${SOURCES} | cut -f1,2,3,6 | sort | '''\ - '''uniq -c | sed -r 's/.*([0-9]+) ([^\\t]+)\\t([^\\t]+)'''\ + '''uniq -c | sed -r 's/ *([0-9]+) ([^\\t]+)\\t([^\\t]+)'''\ '''\\t([^\\t]+)\\t([^\\t]+).*/echo "\\2\\t$$((\\3+1))\\t\\4\\tsplits:'''\ '''\\1:\\1:\\1:C:P\\t0\\t\\5"/e' > $TARGET'''