-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathscaffoldgap2bed.py
135 lines (116 loc) · 4.66 KB
/
scaffoldgap2bed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/bin/env python
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>
# <codecell>
"""
Generates a bed-file for the gaps or contigs for a given fasta file
Splits the sequences on gaps (default: 20 bases)
Output goes to to standard out
Requires Biopython
Lex Nederbragt, September 2013
lex.nederbragt@ibv.uio.no
"""
# <codecell>
import re
from Bio import SeqIO
import argparse
# <codecell>
# help text and argument parser
desc = '\n'.join(["Generates a bed-file for the gaps for a given fasta file.",
"Splits the sequences on gaps.",
"Output goes to standard out ('the screen').",
"Input: a fasta file one or more sequences.",
"An optional argument -m/--min_gap_length can be used to set the minimum length of gaps (default: 20 bp).",
"An optional argument -c/--contigs can output contig instead of gap coordinates (default: output gap coordinates)"
])
parser = argparse.ArgumentParser(description=desc)
parser.add_argument('-i','--input', help='Input file name',required=True)
parser.add_argument('-m', '--min_gap_length', help='Minimum length of gaps (N bases)', type=int, default=20, required = False)
parser.add_argument('-c', '--contigs', help='Outputs coordinates of contigs not gaps',action='store_true', required = False)
# <codecell>
def get_gap_def(min_gap_len):
"""
Sets up the regular expression for splitting sequences into contigs and gaps
example:
get_gap_def(20)
returns
re.compile(r'(N{20,})')
"""
return re.compile('(N{%i,})' % min_gap_len)
# <codecell>
def split_seq(seq, gap_def):
"""
Splits a DNA sequence into contigs and gaps
by splitting on min_gap_length.
Gaps consisting of fewer N's than min_gap_len are ignored
"""
for part in gap_def.split(seq.upper()):
yield part
# <codecell>
def seq_type(seq):
"""
Determines whether a sequence consists of 'N's only
(i.e., represents a gap)
"""
return 'gap' if set(seq.upper()) == {'N'} else 'bases'
# <codecell>
def get_coord(seq, gap_def):
"""
Returns, for a given sequence,
a list with on each line the coordinates of a gap or a contig
when splitting on gaps of minimum lengths min_gap_len.
"""
pos = 0
coord = []
# test for empty sequence
if len(seq) < 1:
return coord
for subseq in split_seq(seq, gap_def):
start = pos
end = pos + len(subseq)
coordline = [str(start), str(end), seq_type(subseq)]
coord.append(coordline)
pos = end
return coord
# <codecell>
# unit tests
def test_split_on_0N():
assert get_coord('ACTGTNGTCTNNAGTGNNNCGATTANNNNAGTA', re.compile(r'(N{0,})')) == \
[['0', '5', 'bases'], ['5', '6', 'gap'], ['6', '10', 'bases'], ['10', '12', 'gap'], ['12', '16', 'bases'], ['16', '19', 'gap'], ['19', '25', 'bases'], ['25', '29', 'gap'], ['29', '33', 'bases']]
def test_split_on_2N():
assert get_coord('ACTGTNGTCTNNAGTGNNNCGATTANNNNAGTA', re.compile(r'(N{2,})')) == \
[['0', '10', 'bases'], ['10', '12', 'gap'], ['12', '16', 'bases'], ['16', '19', 'gap'], ['19', '25', 'bases'], ['25', '29', 'gap'], ['29', '33', 'bases']]
def test_split_on_3N():
assert get_coord('ACTGTNGTCTNNAGTGNNNCGATTANNNNAGTA', re.compile(r'(N{3,})')) == \
[['0', '16', 'bases'], ['16', '19', 'gap'], ['19', '25', 'bases'], ['25', '29', 'gap'], ['29', '33', 'bases']]
def test_split_on_20N():
assert get_coord('ACTGTNGTCTNNAGTGNNNCGATTANNNNAGTA', re.compile(r'(N{20,})')) == \
[['0', '33', 'bases']]
def test_nogap():
assert get_coord('ACTGTGTCTAGTGCGATTAAGTA', re.compile(r'(N{0,})')) == \
[['0', '23', 'bases']]
def test_empty_seq():
assert get_coord('', re.compile(r'(N{20,})')) == \
[]
def test_lower_case():
assert get_coord('ACTGTNGTCTNNAGTGNNNCGATTANNNNAGTA'.lower(), re.compile(r'(N{3,})')) == \
[['0', '16', 'bases'], ['16', '19', 'gap'], ['19', '25', 'bases'], ['25', '29', 'gap'], ['29', '33', 'bases']]
# <codecell>
if __name__ == "__main__":
args = parser.parse_args()
infile = args.input
min_gap_len = args.min_gap_length
gap_def = get_gap_def(min_gap_len)
for seq_record in SeqIO.parse(infile, "fasta"):
seq_name = seq_record.id
for line in get_coord(str(seq_record.seq), gap_def):
if args.contigs:
if line[2] =='bases':
# add sequence identifier at the beginning
line.insert(0, seq_name)
print('\t'.join(line))
else:
if line[2] =='gap':
# add sequence identifier at the beginning
line.insert(0, seq_name)
print('\t'.join(line))