-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathassembly_scaffold_BLAST.py
397 lines (274 loc) · 14.9 KB
/
assembly_scaffold_BLAST.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
import sys
import os
import subprocess
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio import SeqIO
from random import shuffle
class FastAreader:
'''
Define objects to read FastA files.
instantiation:
thisReader = FastAreader ('testTiny.fa')
usage:
for head, seq in thisReader.readFasta():
print (head,seq)
'''
def __init__(self, fname):
'''contructor: saves attribute fname '''
self.fname = fname
def doOpen(self):
''' Handle file opens, allowing STDIN.'''
if self.fname == '':
return sys.stdin
else:
return open(self.fname)
def readFasta(self):
''' Read an entire FastA record and return the sequence header/sequence'''
header = ''
sequence = ''
with self.doOpen() as fileH:
# header = ''
# sequence = ''
# skip to first fasta header
line = fileH.readline()
while not line.startswith('>'):
line = fileH.readline()
header = line[1:].rstrip()
for line in fileH:
if line.startswith('>'):
yield header, sequence
header = line[1:].rstrip()
sequence = ''
else:
sequence += ''.join(line.rstrip().split()).upper()
yield header, sequence
class tree_maker:
"""
"""
def __init__(self):
# for assembly
self.reverse_complement_dict = {}
self.null_model_read_dict = {}
# for baits
self.baits_by_head_dict = {}
self.seq_dict_by_head = {}
self.head_dict_by_seq = {}
self.duplicate_reads_by_file_set = set()
self.all_reads_by_file_set = set()
self.duplicate_sequence_dict = {}
self.read_bait_correspondence_dict = {}
self.unique_sequence_dict = {}
self.duplicate_header_dict = {}
# for fourmer frequency values of assembled paired end reads
self.assembled_fourmer_freq_dict = {}
# for file fourmer freq sums
self.fourmer_frequency_dict = {}
self.null_model_freq_dict = {}
self.fourmer_frequency_by_file_dict = {}
self.fourmer_sum_dict = {}
self.forward_correspondence_dict = {}
self.correspondence_node_dict = {}
self.sink_dict = {}
self.start_list = []
self.end_list = []
self.assembled_paired_end_read_dict = {}
self.path_dict = {}
self.direct_path_dict = {}
self.assembly_seq_dict = {}
self.Z_score_dict_by_read = {}
self.agreement_seq_len_sum = 0
self.total_num_agreements = 0
self.max_seq_len = 0
def file_converter(self):
"""Designed to open a folder on desktop containing .fq.gz data from sample reads.
These .fq.gz file names used to develop .fa files via bash subprocesses.
Input: files in path
Output: converted .fa files for use
"""
path = "C://Users//Quin The Conquoror!//Desktop//Bobs Files fq.gz/Bobs files .fa"
dir_list = os.listdir(path)
for infile_name in dir_list:
print(infile_name)
outfile_name = infile_name.split('.')[0] + '.fa'
print(outfile_name)
# run the seqtk bash command which converts .fq.gz files to .fa for use
# the .fa files are placed in a folder on the desktop for later use.
subprocess_command = "seqtk seq -a " + infile_name + " > " + outfile_name
print(subprocess_command)
subprocess.run([subprocess_command], cwd=path, shell=True)
def file_parse(self):
"""In parsing the fasta files, the bait sequences must first be assessed.
Afterwards, the read files are unpacked sequentially.
"""
path = "C://Users//Quin The Conquoror!//Desktop//Bobs Files fq.gz/Bobs files .fa"
dir_list = os.listdir(path)
for read_file_name in dir_list:
read_file_data = FastAreader(read_file_name)
print("file name: ", read_file_name)
seq_number = 0
self.fourmer_frequency_by_file_dict[read_file_name] = {}
for head, seq in read_file_data.readFasta():
# self.fourmer_frequency(read_file_name, head, seq, seq_number)
self.seq_dict_by_head[head] = seq
self.head_dict_by_seq[seq] = head
seq_number += 1
# if seq_number == 2500:
# break
def reverse_complement(self, seq):
"""Develops the reverse complement of a parameter sequence."""
return seq.lower().replace('a', 'T').replace('t', 'A').replace('g', 'C').replace('c', 'G').replace(' ', '')[::-1]
def paired_end_assembly(self):
"""Assembles Illumina Hiseq paired end reads.
3 cases:
1) R1 ends in R2
2) R1 begins in R2
3) No overlap
"""
for read_header in self.seq_dict_by_head.keys():
if "2:N:0" in read_header:
# provides a dictionary for the R2 reads
self.reverse_complement_dict[read_header] = self.reverse_complement(self.seq_dict_by_head[read_header])
# searching for overlap between paired sequences
for read_header_r1, read_header_r2 in zip(self.seq_dict_by_head.keys(), self.reverse_complement_dict.keys()):
seq_1, seq_2 = self.seq_dict_by_head[read_header_r1], self.reverse_complement_dict[read_header_r2]
if read_header_r1[:-15] == read_header_r2[:-15] \
and read_header_r1 != read_header_r2 and seq_1 != seq_2:
for i in range(4, len(self.seq_dict_by_head[read_header_r1])-4):
if seq_1[:-i] == seq_2[i:]:
print("case 1 agreement: ", self.seq_dict_by_head[read_header_r1][:-i])
print("Read headers: ", read_header_r1, read_header_r2)
print("Original seqs: ", self.seq_dict_by_head[read_header_r1], self.reverse_complement_dict[read_header_r2])
print("Seq:", self.reverse_complement_dict[read_header_r2][:i] + self.seq_dict_by_head[read_header_r1])
case1_seq = self.reverse_complement_dict[read_header_r2][:i] + self.seq_dict_by_head[read_header_r1]
print("paired seq len: ", len(case1_seq))
self.assembled_paired_end_read_dict[read_header_r1] = case1_seq
print(" ")
if seq_1[i:] == seq_2[:-i]:
print("case 2 agreement:", self.seq_dict_by_head[read_header_r1][i:])
print("Read headers: ", read_header_r1, read_header_r2)
print("Original seqs: ", self.seq_dict_by_head[read_header_r1], self.reverse_complement_dict[read_header_r2])
case2_seq = self.seq_dict_by_head[read_header_r1][:i] + self.reverse_complement_dict[read_header_r2]
print("paired seq len: ", len(case2_seq))
print("Assembled seq: ", case2_seq)
self.assembled_paired_end_read_dict[read_header_r1] = case2_seq
print(" ")
if seq_1[:-i] != seq_2[i:] and seq_1[i:] != seq_2[:-i]:
print("case 3")
print("Read headers: ", read_header_r1, read_header_r2)
print("Original seqs: ", self.seq_dict_by_head[read_header_r1], self.reverse_complement_dict[read_header_r2])
print("Assembled seq: ", self.seq_dict_by_head[read_header_r1][:i] + self.reverse_complement_dict[read_header_r2])
case3_seq = self.seq_dict_by_head[read_header_r1] + "_" + self.reverse_complement_dict[read_header_r2]
print("Case 3 Seq:", case3_seq)
self.assembled_paired_end_read_dict[read_header_r1] = case3_seq
print(" ")
self.assembled_fourmer_freq_dict = self.fourmer_frequency_of_paired_end_reads(self.assembled_paired_end_read_dict)
def null_model(self):
"""The null model is provided as a randomness and error threshold.
Each assembled paired end read will be scrambled and its fourmer frequencies analyzed for
comparison to later data extracted from the exact read sequences.
Input: paired end sequences
Output: scrambled paired end sequences for fourmer analysis
"""
for read_header in self.assembled_paired_end_read_dict.keys():
exact_seq = self.assembled_paired_end_read_dict[read_header]
char_list = list(exact_seq)
shuffle(char_list)
randomized_seq = ''.join(char_list)
self.null_model_read_dict[read_header] = randomized_seq
self.null_model_freq_dict = self.fourmer_frequency_of_paired_end_reads(self.null_model_read_dict)
def fourmer_frequency_of_paired_end_reads(self, analysis_dict):
""" This member function is designed """
fourmer_list_list = []
for read_header in analysis_dict.keys():
seq = analysis_dict[read_header]
assembly_fourmers = [seq[i:i + 4] for i in range(0, len(seq), 4)]
fourmer_list_list.append(assembly_fourmers)
# generate all possible fourmers as key to dictionary, save scores per fourmer by fourmer as key
#print(fourmer_list_list)
unique_fourmer_list = []
all_fourmer_list = []
for fourmer_list in fourmer_list_list:
for fourmer in fourmer_list:
all_fourmer_list.append(fourmer)
if fourmer not in unique_fourmer_list and fourmer:
unique_fourmer_list.append(fourmer)
fourmer_freq_dict = {}
for fourmer in unique_fourmer_list:
fourmer_freq_dict[fourmer] = all_fourmer_list.count(fourmer)/256
return fourmer_freq_dict
def fourmer_freq_fetcher(self, seq):
"""This function is designed to fragment the paired end read sequence agreement into fourmers and then access the fourmer frequencies from the fourmer frequency dictionary.
Input: fourmer frequency dictionary
Output: A list of fourmer frequencies as correspond to the fourmers of the agreement sequence.
"""
read_agreement_fourmers = [seq[i:i + 4] for i in range(0, len(seq), 4)]
read_agreement_fourmer_frequencies = [self.assembled_fourmer_freq_dict[fourmer] for fourmer in read_agreement_fourmers if fourmer in
self.assembled_fourmer_freq_dict.keys()]
print("seq: ", seq, "read agreement fourmer frequencies", read_agreement_fourmer_frequencies)
if read_agreement_fourmer_frequencies:
return read_agreement_fourmer_frequencies
def agreement_seq_len_avg_finder(self, agreement_seq, seq_1, seq_2):
""" """
self.agreement_seq_len_sum += len(agreement_seq)
self.total_num_agreements += 1
if len(agreement_seq) > self.max_seq_len:
self.max_seq_len = len(agreement_seq)
running_agreement_seq_avg = self.agreement_seq_len_sum / self.total_num_agreements
print("running_agreement_seq_avg: ", running_agreement_seq_avg)
agreement_position_1 = seq_1.find(agreement_seq)
agreement_position_2 = seq_2.find(agreement_seq)
print("agreement positions: ", agreement_position_1, agreement_position_2)
print("agreement seq fourmer_freq", self.fourmer_freq_fetcher((agreement_seq)))
if len(agreement_seq) > running_agreement_seq_avg:
print("longer_seq~~~~~~~~~~~", agreement_seq)
print("seq_1", seq_1)
print("seq_2", seq_2)
def standard_dev_finder(self):
pass
def null_model_comparison(self):
""""""
pass
def paired_end_sequence_correspondence_graphing(self):
"""Develops a forwards correspondence graph used in acyclic graphing alignment per taxa."""
for read_head in self.assembled_paired_end_read_dict.keys():
forward_correspondence_list = []
backwards_correspondence_list = []
self.forward_correspondence_dict[read_head] = []
self.correspondence_node_dict[read_head] = []
seq_1 = self.assembled_paired_end_read_dict[read_head]
for other_read_head in self.assembled_paired_end_read_dict.keys():
seq_2 = self.assembled_paired_end_read_dict[other_read_head]
if seq_1 != seq_2 and read_head != other_read_head:
for j in range(4, len(self.seq_dict_by_head[read_head])-4):
# end of sequence one is the start of sequence 2
if seq_1[j:] == seq_2[:len(seq_1[j:])]:
forward_correspondence_list.append(other_read_head)
agreement_seq = seq_1[j:]
print("end of sequence one is the start of sequence 2, agreement_seq", agreement_seq)
self.agreement_seq_len_avg_finder(agreement_seq, seq_1, seq_2)
# a node is developed, holding a series of attributes
self.correspondence_node_dict[read_head].append((other_read_head, agreement_seq, seq_2, self.fourmer_freq_fetcher(agreement_seq)))
# start of sequence one is the end of sequence 2
if seq_1[:j] == seq_2[len(seq_1[j:]):]:
backwards_correspondence_list.append(other_read_head)
agreement_seq = seq_1[:j]
print("start of sequence one is the end of sequence 2, agreement_seq", agreement_seq)
self.correspondence_node_dict[read_head].append((other_read_head, agreement_seq, seq_2, self.fourmer_freq_fetcher(agreement_seq)))
self.agreement_seq_len_avg_finder(agreement_seq, seq_1, seq_2)
for key in self.correspondence_node_dict.keys():
if len(self.correspondence_node_dict[key]) > 0:
print(len(self.correspondence_node_dict[key]), self.correspondence_node_dict)
print(len(self.correspondence_node_dict))
def driver(self):
# self.file_converter()
self.file_parse()
self.paired_end_assembly()
self.null_model()
self.paired_end_sequence_correspondence_graphing()
def main():
"""Access class and call driver member function."""
class_access = tree_maker()
class_access.driver()
if __name__ == '__main__':
main()