Merge branch 'develop'

egaffo · Mar 23, 2020 · fbd367c · fbd367c
2 parents d52c3f4 + 281abae
commit fbd367c
Show file tree

Hide file tree

Showing 5 changed files with 67 additions and 17 deletions.
diff --git a/src/ccp_check_indexes.scons b/src/ccp_check_indexes.scons
@@ -144,7 +144,7 @@ if genome_indexes_to_build:
             indexes['BOWTIE2'] = [env['BOWTIE2_INDEX'] + suffix for suffix \
 							 in bowtie2_index_suffixes]
 
-    if any([f in env['CIRCRNA_METHODS'] for f in ['circexplorer2_star']]):
+    if any([f in env['CIRCRNA_METHODS'] for f in ['circexplorer2_star', 'dcc', 'circrna_finder']]):
         if env['STAR_INDEX'] == '':
             #index dir
             env.Replace(STAR_INDEX = os.path.dirname(os.path.abspath(str(indexes['STAR'][0]))))

diff --git a/src/ccp_circexplorer2.scons b/src/ccp_circexplorer2.scons
@@ -99,7 +99,7 @@ if env['ALIGNER'].lower() == 'segemehl':
 
     ## for segemehl >= v0.3.0 modify the input BED file
     fixed_bed_cmd = '''grep ';B\\|C;' ${SOURCES} | cut -f1,2,3,6 | sort | '''\
-                    '''uniq -c | sed -r 's/.*([0-9]+) ([^\\t]+)\\t([^\\t]+)'''\
+                    '''uniq -c | sed -r 's/ *([0-9]+) ([^\\t]+)\\t([^\\t]+)'''\
                     '''\\t([^\\t]+)\\t([^\\t]+).*/echo "\\2\\t$$((\\3+1))\\t\\4\\tsplits:'''\
                     '''\\1:\\1:\\1:C:P\\t0\\t\\5"/e' > $TARGET'''
 

diff --git a/src/ccp_collect_circrnas.scons b/src/ccp_collect_circrnas.scons
@@ -136,21 +136,10 @@ unique_circ = env.Command('unique_circ_ids.gtf.gz',
                           unique_circ_cmd)
 
 ## translate backsplice intervals into start and stop single nucleotide intervals
-## mind that we are writing a newline at the end of the file that will end up
-## in the head of the sorted final file. This breaks bedtools intersect with
-## -sorted option. Thus, we need to get rid of the newline by using grep -v '^$'
 snp_unique_circ_cmd = '''zcat ${SOURCES[0]} | '''\
-                      '''sed -r 's/([^\\t]+)\\t([^\\t]+)\\t([^\\t]+)\\t'''\
-                                '''([^\\t]+)\\t([^\\t]+)\\t([^\\t]+)\\t'''\
-                                '''([^\\t]+)\\t([^\\t]+)\\tgene_id "([^"]+)";'''\
-                                '''/echo -e "\\1\\t\\2\\tstart\\t'''\
-                                       '''\\4\\t$$((\\4))\\t\\6\\t'''\
-                                       '''\\7\\t\\8\\tgene_id @\\9@;\\n'''\
-                                       '''\\1\\t\\2\\tstop\\t'''\
-                                       '''$$((\\5))\\t\\5\\t\\6\\t'''\
-                                       '''\\7\\t\\8\\tgene_id @\\9@;"/e' | '''\
-                      '''sed -r 's/@/"/g' | sort -k1,1 -k4,4n -k5,5n | '''\
-                      '''grep -v '^$' | gzip -c > $TARGET '''
+                      '''split_start_end_gtf.py -t - | '''\
+                      '''sort -k1,1 -k4,4n -k5,5n | '''\
+                      '''gzip -c > $TARGET '''
 snp_unique_circ = env.Command('sn_unique_circ.gtf.gz',
                               [unique_circ], 
                               snp_unique_circ_cmd)

diff --git a/src/circRNAs_analysis.Rmd b/src/circRNAs_analysis.Rmd
@@ -473,7 +473,7 @@ shared.counts.table <- crossprod(x = table(circ.per.method[, .(circ_id, method)]
 ## order rows and columns alphabetically
 shared.counts.table <- 
     shared.counts.table[sort(colnames(shared.counts.table)), 
-                        sort(colnames(shared.counts.table))]
+                        sort(colnames(shared.counts.table)), drop = FALSE]
 
 ## save table
 write.csv(x = data.frame(shared.counts.table),

diff --git a/src/split_start_end_gtf.py b/src/split_start_end_gtf.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+"""Split each GTF record into two single-nucleotide records.
+
+For every input feature, write a 'start' record (start == end == original
+start) and a 'stop' record (start == end == original end). Columns 1-2 and
+6-8 are copied unchanged; column 3 is replaced by 'start'/'stop'.
+"""
+
+import argparse, sys
+
+
+def split_record(line, trim = False):
+    """Return the (start, stop) output lines for one GTF line.
+
+    Returns None for blank lines, '#' comments and lines with fewer than 9
+    columns. With trim, the attribute column is reduced to its gene_id."""
+    fields = line.rstrip('\n').split('\t')
+    if line.startswith('#') or len(fields) < 9:
+        return None
+    attribute = fields[8]
+    if trim:
+        ## keep only the gene_id entry, without surrounding blanks
+        for field in attribute.split(';'):
+            if 'gene_id' in field.split():
+                attribute = field.strip() + ';'
+    ## always terminate with exactly one newline, so an unterminated last
+    ## input line cannot glue the start and stop records together
+    return tuple('\t'.join(fields[0:2] + [tag, pos, pos] + fields[5:8] +
+                           [attribute]) + '\n'
+                 for tag, pos in (('start', fields[3]), ('stop', fields[4])))
+
+
+if __name__ == '__main__':
+
+    desc = ''
+    parser = argparse.ArgumentParser(description = desc)
+    ## nargs = '?' is required for the default of a positional to apply
+    parser.add_argument('infile', nargs = '?', default = '-',
+                        help = 'Input GTF file. Set - for stdin stream.')
+    parser.add_argument('-o', '--outfile', type = str, default = '-',
+                        help = 'Output filename. Default to stdout.')
+    parser.add_argument('-t', '--trim', action = 'store_true',
+                        help = 'Trim the GTF attribute field and keep only '\
+                               'gene_id.')
+    args = parser.parse_args()
+
+    infile = sys.stdin if args.infile == '-' else open(args.infile, 'r')
+    outfile = sys.stdout if args.outfile == '-' else open(args.outfile, 'w')
+    try:
+        for inline in infile:
+            records = split_record(inline, args.trim)
+            if records is not None:
+                outfile.writelines(records)
+    finally:
+        ## close only the files opened here, never the standard streams
+        if infile is not sys.stdin:
+            infile.close()
+        if outfile is not sys.stdout:
+            outfile.close()
+
+