-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2_1_mcflashfold_mask_generator.py
78 lines (65 loc) · 2.93 KB
/
2_1_mcflashfold_mask_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#
# This utility is part of the DecoyDB (working title) project
#
# Author: Stephen Leong Koan
#
# Copyright (C) 2013 Université de Montréal
#
"""
Command-line utility that generates an MC-FlashFold mask so that the positions
covered by the mature sequences are either unpaired or opening/closing brackets.
"""
import os
import argparse
import cPickle
def _build_mask(hairpin_seq, matures):
    """Build the per-position mask string for one hairpin.

    Every position starts as "x". Positions covered by the mature whose
    label ends with "5p" become "p"; positions covered by any other mature
    become "q". Where both overlap, "p" wins. If several matures map to
    the same arm, the last one found in *hairpin_seq* is used.

    Args:
        hairpin_seq: The precursor sequence (a string).
        matures: Iterable of dicts with a "sequence" key, a "name" key and
            an optional "alternative_name" key (preferred over "name" when
            present).

    Returns:
        A ``(mask, missing)`` tuple. ``mask`` is a string of the same
        length as *hairpin_seq*; ``missing`` lists the labels of matures
        whose sequence does not occur in *hairpin_seq*.
    """
    span_5p = None
    span_3p = None
    missing = []

    for mature in matures:
        label = mature.get("alternative_name", mature["name"])
        mature_seq = mature["sequence"]
        start = hairpin_seq.find(mature_seq)
        if start < 0:
            # str.find() reports "not found" as -1. Feeding that into a
            # range would mark an unrelated stretch of the hairpin, so the
            # mature is reported to the caller instead.
            missing.append(label)
            continue
        span = (start, start + len(mature_seq))
        if label.endswith("5p"):
            span_5p = span
        else:
            span_3p = span

    mask = ["x"] * len(hairpin_seq)
    # Paint the 3p arm first so the 5p arm takes precedence on overlap.
    for marker, span in (("q", span_3p), ("p", span_5p)):
        if span is not None:
            begin, end = span
            mask[begin:end] = [marker] * (end - begin)
    return "".join(mask), missing


def main(argv=None):
    """Print one command line per hairpin found in the digested data file.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Each command is the ``--mcfold_cmd`` template formatted with the
    hairpin's ``seq``, ``mask``, ``name`` and ``accession``. A mature whose
    sequence cannot be located in its precursor triggers a warning on
    stderr and contributes no marked positions to the mask.
    """
    import sys  # local import: only needed for the warning stream

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--digested_data',
                        action="store",
                        required=True,
                        dest="digested_data",
                        help="The pickle file generated by 1_organize_precursor_mature.py")
    parser.add_argument('--mcfold_cmd',
                        action="store",
                        required=True,
                        dest="mcfold_cmd",
                        help=("The command line template. Put '{seq}', '{mask}', '{name}', '{accession}', "
                              "where you want the scriot to plug the sequence, the mask, the name of the precursor "
                              "and/or the accession of the precursor respectively"))
    ns = parser.parse_args(argv)
    mcfold_cmd = ns.mcfold_cmd

    # NOTE(security): unpickling can execute arbitrary code. Only pass files
    # that come from a trusted source.
    with open(ns.digested_data, 'rb') as dd:
        list_digested_data = cPickle.load(dd)

    for hairpin_dict in list_digested_data:
        hairpin_name = hairpin_dict['name']
        hairpin_acc = hairpin_dict['accession']
        hairpin_seq = hairpin_dict['sequence']

        mask, missing = _build_mask(hairpin_seq, hairpin_dict['matures'])
        for label in missing:
            sys.stderr.write(
                "WARNING: mature '{0}' not found in precursor {1} ({2}); "
                "no mask positions were marked for it\n".format(
                    label, hairpin_name, hairpin_acc))

        cmd = mcfold_cmd.format(seq=hairpin_seq,
                                mask=mask,
                                accession=hairpin_acc,
                                name=hairpin_name)
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print(cmd)


if __name__ == '__main__':
    main()
# print ("python git/various-codes/DecoyDB/2_2_mcflashfold_structure_filter.py "
# "--hairpin_fasta reproduction_projet_naim/2D/hairpin/{acc} "
# "--mature_fasta reproduction_projet_naim/2D/mature/{acc} "
# "--mcfold_output reproduction_projet_naim/MC-FlashFold_part1/2d/{acc} > reproduction_projet_naim/MC-FlashFold_part1/2dclean/{acc}").format(acc=hairpin_acc)