-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_orfs.py
76 lines (63 loc) · 1.79 KB
/
extract_orfs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#! /usr/bin/python
__author__="jruiz"
__date__ ="$Sep 08, 2015 12:24:43 PM$"
'''Extract all ORFs in a transcript FASTA
'''
import sys
import os
from Bio import SeqIO
from Bio.Seq import Seq
fasta = sys.argv[1]
try:
threshold = sys.argv[2] #nucleotides
except:
threshold = 75
#OBJECTS
class orf_object:
def __init__(self, sequence, start, end):
self.sequence = sequence
self.start = start
self.end = end
#FUNCTIONS
def find_all(sequence, subsequence):
''' Returns a list of indexes within sequence that are the start of subsequence'''
start = 0
idxs = []
next_idx = sequence.find(subsequence, start)
while next_idx != -1:
idxs.append(next_idx)
start = next_idx + 1# Move past this on the next time around
next_idx = sequence.find(subsequence, start)
return idxs
def find_orfs(sequence, threshold):
""" Finds all valid open reading frames in the string 'sequence', and
returns them as a list"""
starts = find_all(sequence, 'ATG')
stop_amber = find_all(sequence, 'TAG')
stop_ochre = find_all(sequence, 'TAA')
stop_umber = find_all(sequence, 'TGA')
stops = stop_amber + stop_ochre + stop_umber
stops.sort()
orfs = []
for start in starts:
for stop in stops:
if start < stop \
and (start - stop) % 3 == 0: # Stop is in-frame
if len(sequence[start:stop+3]) >= int(threshold):
orf_obj = orf_object(sequence[start:stop+3], start, stop+3)
orfs.append(orf_obj)
break
orfs.sort(key=lambda x: len(x.sequence), reverse=True)
return orfs
cdna = SeqIO.index(fasta, "fasta")
for sequence in cdna:
ends = []
orfs = find_orfs(cdna[sequence].seq, threshold)
n = 1
for orf in orfs:
if orf.end in ends:
continue
ends.append(orf.end)
print(">" + sequence + "_" + str(n) + ":" + str(orf.start) + "-" + str(orf.end) + "\n" + str(orf.sequence))
n += 1
exit(0)