forked from leekgroup/derfinder
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcountReads.py
155 lines (143 loc) · 6.32 KB
/
countReads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/python
def enum(collection,st):
#just like "enumerate" but you define your own starting position.
#this returns indices RELATIVE TO ORIGINAL LIST
i = st
while i < len(collection):
yield (i,collection[i])
i += 1
def getfirstindex(L,st,value,K):
for pos,t in enum(L,st):
if t[1] > value-K:
return pos #returns first read in the list that CAN contain given bp
return 0
def getlastindex(L,st,value):
for pos,t in enum(L,st):
if t[1] > value:
return pos #returns first read in the list that CANNOT contain given bp
return len(L)
# ^these 2 were inspired by code here: http://stackoverflow.com/questions/946860/using-pythons-list-index-method-on-a-list-of-tuples-or-objects (see answer labeled "10", superperformant)
# function for converting SAM "flags" to bits:
def to_bin(n):
return bin(n)[2:].zfill(11)
def countReadlets(fname, outfname, k, chromosome, stranded, rev):
import pysam
#from datetime import datetime #for debugging
samfile = pysam.Samfile(fname,"rb")
id_start_end = []
cigar = []
maxpos = 0
minpos = 3000000000
for read in samfile.fetch(chromosome):
if to_bin(read.flag)[6] == '1' and not rev:
strand = "-"
elif to_bin(read.flag)[6] == '1' and rev:
strand = "+"
elif to_bin(read.flag)[6] == '0' and not rev:
strand = "+"
else:
strand = "-"
readstart = read.pos+1
readend = read.aend
if len(read.cigar)>1:
currentloc = readstart
for a in range(len(read.cigar)):
if read.cigar[a][0] == 0: #match: count this segment as covered, obviously
id_start_end.append([read.qname, currentloc, currentloc+read.cigar[a][1]-1, strand])
currentloc = currentloc + read.cigar[a][1]
if read.cigar[a][0] == 1: #insertion - not in reference, so our location doesn't move
continue
if read.cigar[a][0] == 2: #deletion - count this segment as covered, since a read overlaps it.
id_start_end.append([read.qname, currentloc, currentloc+read.cigar[a][1]-1, strand])
currentloc = currentloc + read.cigar[a][1]
if read.cigar[a][0] == 3: #skipped region: don't count and move the location
currentloc = currentloc + read.cigar[a][1]
if read.cigar[a][0] > 3: #I don't think the rest of these flags exist in these alignments, but just make sure.
print("WARNING: you have a cigar flag in here that you haven't accounted for.")
else:
id_start_end.append([read.qname, readstart, readend, strand])
if read.aend > maxpos:
maxpos = read.aend
if read.pos+1 < minpos:
minpos = read.pos+1
# sort list by starting points, since we've now introduced extra complication...:
import operator
id_start_end.sort(key=operator.itemgetter(1))
# credit: http://stackoverflow.com/questions/5201191/sorting-a-list-of-lists-in-python
if stranded:
fplus = open(outfname+'_plus', 'w')
fminus = open(outfname+'_minus', 'w')
first = 0
last = 0
for z in xrange(1,minpos):
fplus.write("%s\t%s\n" % (z,0))
fminus.write("%s\t%s\n" % (z,0))
for i in xrange(minpos, maxpos+1):
last = getlastindex(id_start_end,first,i)
first = getfirstindex(id_start_end,first,i,k)
if first == last:
fplus.write("%s\t%s\n" % (i,0))
fminus.write("%s\t%s\n" % (i,0))
continue
overlaps = id_start_end[first:last]
readnames_plus = set()
readnames_minus = set()
for j in xrange(len(overlaps)):
if i>=overlaps[j][1] and i<=overlaps[j][2]:
if overlaps[j][3] == "+":
readnames_plus.add(overlaps[j][0])
else:
readnames_minus.add(overlaps[j][0])
numreads_plus = len(readnames_plus) #count readlets only once per read
numreads_minus = len(readnames_minus) #count readlets only once per read
fplus.write("%s\t%s\n" % (i,numreads_plus))
fminus.write("%s\t%s\n" % (i,numreads_minus))
fplus.close()
fminus.close()
else:
f = open(outfname, 'w')
#g = open('keeptrackofiterations','w') #for debugging
first = 0
last = 0
#npos = 0 #for debugging
for z in xrange(1,minpos):
f.write("%s\t%s\n" % (z,0))
for i in xrange(minpos, maxpos+1):
#for i in xrange(maxpos-2999,maxpos+1): #for debugging
#npos = npos+1 #for debugging
#if npos % 10000 == 0: g.write("Did 10000 "+str(datetime.now())+"\n") #for debugging
last = getlastindex(id_start_end,first,i)
first = getfirstindex(id_start_end,first,i,k)
if first == last:
f.write("%s\t%s\n" % (i,0))
continue
overlaps = id_start_end[first:last]
readnames = set()
for j in xrange(len(overlaps)):
if i>=overlaps[j][1] and i<=overlaps[j][2]:
readnames.add(overlaps[j][0])
numreads = len(readnames) #count readlets only once per read
f.write("%s\t%s\n" % (i,numreads))
f.close()
return None
# get arguments from command line
from optparse import OptionParser
opts = OptionParser()
opts.add_option("--file","-f",type="string",help="input file name (must be .bam)")
opts.add_option("--output","-o",type="string",help="output file name")
opts.add_option("--kmer","-k",type="int",help="kmer length")
opts.add_option("--chrom","-c",type="string",help="chromosome to parse")
opts.add_option("--stranded","-s",type="string",help="stranded or reverse-stranded protocol?",default="FALSE")
options,arguments = opts.parse_args()
if options.stranded == "TRUE":
stranded = True
rev = False
elif options.stranded == "REVERSE":
stranded = True
rev = True
elif options.stranded != "FALSE":
ValueError('stranded must either be TRUE, REVERSE, or FALSE')
else:
stranded = False
rev = False
countReadlets(options.file, options.output, options.kmer, options.chrom, stranded, rev)