-
Notifications
You must be signed in to change notification settings - Fork 35
/
Copy pathtcr_rearrangement.py
285 lines (238 loc) · 22.1 KB
/
tcr_rearrangement.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
#############################################################################################################
##
## NOTE:
##
## This is a legacy version of the TCR probability code which is going to be phased out.
## Right now it doesn't get imported anywhere unless basic.pipeline_params[ 'new_probs' ] == False
## (the default value is True)
##
## The updated version is called "tcr_rearrangement_new.py"
##
#############################################################################################################
from basic import *
import sys
from paths import path_to_db
## these guys are for external sharing, as well as
## get_alpha_trim_probs
## get_beta_trim_probs
##
all_trim_probs = {}
all_rep_probs = {}
all_trbd_nucseq = {}
##############################################################################################################
# >K02545|TRBD1*01|Homo sapiens|F|D-REGION|82..93|12 nt|1| | | | |12+0=12| | |
# gggacagggggc
# >X02987|TRBD2*01|Homo sapiens|F|D-REGION|140..155|16 nt|1| | | | |16+0=16| | |
# gggactagcggggggg
# >M14159|TRBD2*02|Homo sapiens|F|D-REGION|569..584|16 nt|1| | | | |16+0=16| | |
# gggactagcgggaggg
trbd_nucseq = { ## mouse
1:'gggacagggggc', ## len 12
2:'gggactgggggggc', ## len 14
}
trbd_nucseq_human = {
1:'gggacagggggc',
2:'gggactagcggggggg', ## actually 2*01
3:'gggactagcgggaggg', ## actually 2*02
# 1234567890123456
}
all_trbd_nucseq = {'mouse':trbd_nucseq, 'human':trbd_nucseq_human}
#################################################################################################################
##
alpha_trim_prob_lines_mouse = """
PROB_A_v_trim 0.421946 0.141516 0.113666 0.114740 0.063751 0.045711 0.034423 0.029273 0.013622 0.009955 0.007622 0.002139 0.000761 0.000254 0.000527 0.000095
PROB_A_j_trim 0.209729 0.139163 0.128939 0.148367 0.110268 0.110984 0.060651 0.036761 0.027030 0.011851 0.006179 0.004318 0.001836 0.002154 0.001070 0.000701
PROB_A_vj_insert 0.086069 0.135083 0.184446 0.188326 0.156083 0.109790 0.063442 0.034039 0.019199 0.009582 0.005970 0.003617 0.001766 0.001005 0.000935 0.000647"""
alpha_trim_prob_lines_human = """
PROB_A_v_trim 0.213408 0.113598 0.099251 0.130393 0.108383 0.084676 0.064786 0.055140 0.055361 0.042462 0.012561 0.007258 0.009843 0.001693 0.000594 0.000594
PROB_A_j_trim 0.103923 0.072312 0.071505 0.094306 0.117595 0.117149 0.089869 0.092958 0.094853 0.037747 0.034006 0.036983 0.013367 0.010678 0.008272 0.004477
PROB_A_vj_insert 0.014455 0.027132 0.058214 0.102510 0.141518 0.145123 0.121780 0.101139 0.079731 0.061162 0.048514 0.035934 0.025128 0.018205 0.012033 0.007423
"""
beta_trim_prob_lines_mouse = """
PROB_B_D1_d01_trim 0,0: 0.017328 0,1: 0.019039 0,2: 0.039170 0,3: 0.051505 0,4: 0.015357 0,5: 0.049524 0,6: 0.061235 0,7: 0.017442 0,8: 0.026022 0,9: 0.010205 0,10: 0.007699 0,11: 0.004773 1,0: 0.008721 1,1: 0.009709 1,2: 0.016887 1,3: 0.023271 1,4: 0.007498 1,5: 0.021168 1,6: 0.022242 1,7: 0.006524 1,8: 0.012729 2,0: 0.009940 2,1: 0.011223 2,2: 0.018346 2,3: 0.027652 2,4: 0.009241 2,5: 0.023065 2,6: 0.020207 2,7: 0.005839 2,8: 0.011155 3,0: 0.011617 3,1: 0.010636 3,2: 0.020098 3,3: 0.025772 3,4: 0.006680 3,5: 0.014703 3,6: 0.014204 3,7: 0.006141 3,8: 0.002936 4,0: 0.019056 4,1: 0.019074 4,2: 0.032236 4,3: 0.042881 4,4: 0.013550 4,5: 0.024351 4,6: 0.013685 4,7: 0.003398 5,0: 0.010419 5,1: 0.015500 5,2: 0.021848 5,3: 0.014659 5,4: 0.007406 5,5: 0.003380 6,0: 0.011892 6,1: 0.016955 6,2: 0.009479 7,0: 0.006671 8,0: 0.007351 9,0: 0.006069 10,0: 0.002637
PROB_B_D1_v_trim 0: 0.218413 1: 0.114507 2: 0.145340 3: 0.121819 4: 0.141786 5: 0.132820 6: 0.079272 7: 0.017948 8: 0.016894 9: 0.003430 10: 0.002171 11: 0.001855 12: 0.002572 13: 0.000793 14: 0.000189 15: 0.000192
PROB_B_D1_j_trim 0: 0.300538 1: 0.113929 2: 0.110029 3: 0.141487 4: 0.118952 5: 0.068820 6: 0.058966 7: 0.057623 8: 0.014698 9: 0.005060 10: 0.004946 11: 0.002340 12: 0.001186 13: 0.000596 14: 0.000518 15: 0.000311
PROB_B_D1_vd_insert 0: 0.192147 1: 0.196064 2: 0.207559 3: 0.164503 4: 0.112413 5: 0.056638 6: 0.031895 7: 0.017549 8: 0.009929 9: 0.005574 10: 0.002926 11: 0.001532 12: 0.000759 13: 0.000334 14: 0.000130 15: 0.000047
PROB_B_D1_dj_insert 0: 0.235213 1: 0.206084 2: 0.205009 3: 0.151651 4: 0.099582 5: 0.052791 6: 0.026448 7: 0.012119 8: 0.005332 9: 0.002744 10: 0.001540 11: 0.000752 12: 0.000427 13: 0.000196 14: 0.000087 15: 0.000026
PROB_B_D1_tot_d_trim 0: 0.017328 1: 0.027760 2: 0.058819 3: 0.091232 4: 0.086666 5: 0.134265 6: 0.177044 7: 0.157783 8: 0.112495 9: 0.080803 10: 0.044697 11: 0.011108
PROB_B_D2_d01_trim 0,0: 0.018237 0,1: 0.012803 0,2: 0.028500 0,3: 0.036180 0,4: 0.029843 0,5: 0.020330 0,6: 0.010563 0,7: 0.006157 0,8: 0.013797 0,9: 0.012542 0,10: 0.018955 0,11: 0.006526 0,12: 0.004751 0,13: 0.002466 1,0: 0.013659 1,1: 0.010341 1,2: 0.020492 1,3: 0.022325 1,4: 0.017114 1,5: 0.011767 1,6: 0.006256 1,7: 0.003406 1,8: 0.005255 1,9: 0.004811 1,10: 0.008818 2,0: 0.020930 2,1: 0.015092 2,2: 0.025925 2,3: 0.025617 2,4: 0.019533 2,5: 0.014255 2,6: 0.007584 2,7: 0.003095 2,8: 0.004750 2,9: 0.004239 2,10: 0.006845 3,0: 0.015810 3,1: 0.010981 3,2: 0.018570 3,3: 0.018353 3,4: 0.014230 3,5: 0.009517 3,6: 0.004914 3,7: 0.001909 3,8: 0.006631 3,9: 0.003856 3,10: 0.001519 4,0: 0.026927 4,1: 0.019562 4,2: 0.034211 4,3: 0.034410 4,4: 0.024444 4,5: 0.017820 4,6: 0.008870 4,7: 0.004499 4,8: 0.010368 4,9: 0.001740 5,0: 0.013638 5,1: 0.012254 5,2: 0.018841 5,3: 0.017269 5,4: 0.015131 5,5: 0.008742 5,6: 0.005399 5,7: 0.002368 5,8: 0.001485 6,0: 0.021192 6,1: 0.018558 6,2: 0.018255 6,3: 0.010342 6,4: 0.006931 7,0: 0.016466 8,0: 0.005957 9,0: 0.007456 10,0: 0.007138 11,0: 0.007036 12,0: 0.002646
PROB_B_D2_v_trim 0: 0.246466 1: 0.124381 2: 0.142223 3: 0.124901 4: 0.131933 5: 0.127862 6: 0.065644 7: 0.013355 8: 0.015438 9: 0.002295 10: 0.001574 11: 0.001239 12: 0.002002 13: 0.000391 14: 0.000126 15: 0.000173
PROB_B_D2_j_trim 0: 0.310095 1: 0.123791 2: 0.110055 3: 0.135452 4: 0.115906 5: 0.061610 6: 0.067288 7: 0.057558 8: 0.008041 9: 0.003436 10: 0.003244 11: 0.001142 12: 0.001215 13: 0.000472 14: 0.000467 15: 0.000228
PROB_B_D2_vd_insert 0: 0.207097 1: 0.200057 2: 0.206337 3: 0.157210 4: 0.107401 5: 0.055060 6: 0.029684 7: 0.016179 8: 0.009612 9: 0.005406 10: 0.002867 11: 0.001583 12: 0.000916 13: 0.000381 14: 0.000164 15: 0.000047
PROB_B_D2_dj_insert 0: 0.225980 1: 0.195041 2: 0.202632 3: 0.155303 4: 0.110594 5: 0.057727 6: 0.028903 7: 0.012853 8: 0.005628 9: 0.002659 10: 0.001418 11: 0.000727 12: 0.000291 13: 0.000145 14: 0.000075 15: 0.000024
PROB_B_D2_tot_d_trim 0: 0.018237 1: 0.026462 2: 0.059771 3: 0.087573 4: 0.116000 5: 0.114831 6: 0.127873 7: 0.129173 8: 0.100228 9: 0.076556 10: 0.062106 11: 0.043147 12: 0.030833 13: 0.007210
"""
beta_trim_prob_lines_human = """
PROB_B_D1_d01_trim 0,0: 0.008959 0,1: 0.010181 0,2: 0.021161 0,3: 0.035499 0,4: 0.018361 0,5: 0.034621 0,6: 0.020377 0,7: 0.029673 0,8: 0.031598 0,9: 0.019343 0,10: 0.014188 0,11: 0.004344 1,0: 0.006810 1,1: 0.007306 1,2: 0.013538 1,3: 0.024629 1,4: 0.013699 1,5: 0.021067 1,6: 0.009669 1,7: 0.017864 1,8: 0.021899 2,0: 0.007446 2,1: 0.008260 2,2: 0.015848 2,3: 0.029185 2,4: 0.015185 2,5: 0.019443 2,6: 0.009355 2,7: 0.013085 2,8: 0.014002 3,0: 0.010192 3,1: 0.009526 3,2: 0.018864 3,3: 0.030061 3,4: 0.011804 3,5: 0.013964 3,6: 0.009019 3,7: 0.009047 3,8: 0.002086 4,0: 0.015097 4,1: 0.018513 4,2: 0.031019 4,3: 0.045665 4,4: 0.023664 4,5: 0.022766 4,6: 0.007757 4,7: 0.002321 5,0: 0.009825 5,1: 0.018171 5,2: 0.024254 5,3: 0.016264 5,4: 0.009627 5,5: 0.002893 6,0: 0.022482 6,1: 0.025977 6,2: 0.015532 7,0: 0.018328 8,0: 0.020640 9,0: 0.014327 10,0: 0.003721
PROB_B_D1_v_trim 0: 0.135820 1: 0.097128 2: 0.107363 3: 0.114808 4: 0.143636 5: 0.171400 6: 0.125769 7: 0.046483 8: 0.032399 9: 0.009281 10: 0.010018 11: 0.004333 12: 0.001022 13: 0.000312 14: 0.000136 15: 0.000092
PROB_B_D1_j_trim 0: 0.129996 1: 0.095461 2: 0.114521 3: 0.108524 4: 0.144917 5: 0.112931 6: 0.091571 7: 0.084043 8: 0.058269 9: 0.024138 10: 0.019387 11: 0.008293 12: 0.003694 13: 0.002520 14: 0.001352 15: 0.000382
PROB_B_D1_vd_insert 0: 0.119887 1: 0.108518 2: 0.137850 3: 0.147452 4: 0.132459 5: 0.097949 6: 0.078475 7: 0.058560 8: 0.042659 9: 0.030417 10: 0.019072 11: 0.012051 12: 0.007379 13: 0.004017 14: 0.002163 15: 0.001092
PROB_B_D1_dj_insert 0: 0.171190 1: 0.120952 2: 0.145988 3: 0.143965 4: 0.125539 5: 0.094762 6: 0.069002 7: 0.046970 8: 0.030968 9: 0.020823 10: 0.013188 11: 0.008048 12: 0.004641 13: 0.002386 14: 0.001107 15: 0.000468
PROB_B_D1_tot_d_trim 0: 0.008959 1: 0.016991 2: 0.035913 3: 0.067489 4: 0.083461 5: 0.124706 6: 0.158362 7: 0.184812 8: 0.148882 9: 0.110065 10: 0.051609 11: 0.008751
PROB_B_D2_d01_trim 0,0: 0.003289 0,1: 0.003677 0,2: 0.005399 0,3: 0.007834 0,4: 0.004833 0,5: 0.010839 0,6: 0.007479 0,7: 0.008020 0,8: 0.011684 0,9: 0.008777 0,10: 0.020960 0,11: 0.023115 0,12: 0.010256 0,13: 0.010794 0,14: 0.007619 0,15: 0.001867 1,0: 0.003141 1,1: 0.003215 1,2: 0.004911 1,3: 0.007052 1,4: 0.003465 1,5: 0.006683 1,6: 0.003886 1,7: 0.004490 1,8: 0.005597 1,9: 0.003817 1,10: 0.010321 1,11: 0.014218 1,12: 0.012314 2,0: 0.003430 2,1: 0.003400 2,2: 0.005437 2,3: 0.006660 2,4: 0.003064 2,5: 0.005614 2,6: 0.003266 2,7: 0.003922 2,8: 0.005708 2,9: 0.003220 2,10: 0.009647 2,11: 0.010858 2,12: 0.008933 3,0: 0.003959 3,1: 0.003842 3,2: 0.005540 3,3: 0.006023 3,4: 0.002654 3,5: 0.005515 3,6: 0.003826 3,7: 0.003308 3,8: 0.005992 3,9: 0.004134 3,10: 0.011269 3,11: 0.007371 3,12: 0.001117 4,0: 0.008351 4,1: 0.008092 4,2: 0.011672 4,3: 0.013611 4,4: 0.006369 4,5: 0.011818 4,6: 0.007007 4,7: 0.006919 4,8: 0.012044 4,9: 0.011039 4,10: 0.013574 4,11: 0.001322 5,0: 0.007623 5,1: 0.006285 5,2: 0.008635 5,3: 0.010012 5,4: 0.004010 5,5: 0.007253 5,6: 0.004413 5,7: 0.005517 5,8: 0.010311 5,9: 0.004165 5,10: 0.001008 6,0: 0.010347 6,1: 0.008711 6,2: 0.011636 6,3: 0.013673 6,4: 0.006004 6,5: 0.010318 6,6: 0.007756 6,7: 0.009641 6,8: 0.001543 7,0: 0.014212 7,1: 0.013431 7,2: 0.018841 7,3: 0.024357 7,4: 0.011325 7,5: 0.015290 7,6: 0.007923 7,7: 0.002218 8,0: 0.023849 8,1: 0.027292 8,2: 0.036883 8,3: 0.048234 8,4: 0.024969 8,5: 0.017605 8,6: 0.003854 9,0: 0.032838 9,1: 0.026142 9,2: 0.011460 9,3: 0.011336
PROB_B_D2_v_trim 0: 0.144219 1: 0.102658 2: 0.107575 3: 0.113975 4: 0.144439 5: 0.167369 6: 0.117520 7: 0.044833 8: 0.032156 9: 0.009115 10: 0.009848 11: 0.004507 12: 0.001189 13: 0.000321 14: 0.000168 15: 0.000109
PROB_B_D2_j_trim 0: 0.135449 1: 0.091547 2: 0.145354 3: 0.113114 4: 0.136317 5: 0.125496 6: 0.092621 7: 0.067942 8: 0.037040 9: 0.028831 10: 0.016159 11: 0.002986 12: 0.004574 13: 0.002062 14: 0.000469 15: 0.000040
PROB_B_D2_vd_insert 0: 0.124012 1: 0.110331 2: 0.136139 3: 0.142587 4: 0.124919 5: 0.097663 6: 0.078220 7: 0.059750 8: 0.043746 9: 0.031507 10: 0.020917 11: 0.013402 12: 0.008162 13: 0.004755 14: 0.002586 15: 0.001303
PROB_B_D2_dj_insert 0: 0.206566 1: 0.119499 2: 0.140631 3: 0.135260 4: 0.118512 5: 0.089665 6: 0.063451 7: 0.045261 8: 0.030320 9: 0.020507 10: 0.013581 11: 0.008099 12: 0.004542 13: 0.002504 14: 0.001107 15: 0.000495
PROB_B_D2_tot_d_trim 0: 0.003289 1: 0.006818 2: 0.012044 3: 0.020103 4: 0.029515 5: 0.042219 6: 0.051552 7: 0.065343 8: 0.090253 9: 0.130595 10: 0.141440 11: 0.135318 12: 0.115168 13: 0.101753 14: 0.049277 15: 0.005314
PROB_B_D3_d01_trim 0,0: 0.005888 0,1: 0.005414 0,2: 0.015318 0,3: 0.009903 0,4: 0.004041 0,5: 0.009062 0,6: 0.006253 0,7: 0.006706 0,8: 0.009769 0,9: 0.007339 0,10: 0.017525 0,11: 0.019327 0,12: 0.008576 0,13: 0.009025 0,14: 0.006370 0,15: 0.001561 1,0: 0.004675 1,1: 0.004226 1,2: 0.011995 1,3: 0.007646 1,4: 0.002897 1,5: 0.005587 1,6: 0.003249 1,7: 0.003754 1,8: 0.004680 1,9: 0.003192 1,10: 0.008630 1,11: 0.011888 1,12: 0.010296 2,0: 0.004887 2,1: 0.004106 2,2: 0.011266 2,3: 0.006485 2,4: 0.002562 2,5: 0.004694 2,6: 0.002731 2,7: 0.003279 2,8: 0.004772 2,9: 0.002692 2,10: 0.008066 2,11: 0.009079 2,12: 0.007469 3,0: 0.005248 3,1: 0.004218 3,2: 0.011359 3,3: 0.005438 3,4: 0.002219 3,5: 0.004611 3,6: 0.003199 3,7: 0.002766 3,8: 0.005010 3,9: 0.003457 3,10: 0.009422 3,11: 0.006163 3,12: 0.000934 4,0: 0.011050 4,1: 0.008934 4,2: 0.023625 4,3: 0.012711 4,4: 0.005325 4,5: 0.009881 4,6: 0.005859 4,7: 0.005785 4,8: 0.010070 4,9: 0.009230 4,10: 0.011349 4,11: 0.001106 5,0: 0.010206 5,1: 0.006664 5,2: 0.017964 5,3: 0.009151 5,4: 0.003353 5,5: 0.006065 5,6: 0.003690 5,7: 0.004613 5,8: 0.008621 5,9: 0.003483 5,10: 0.000842 6,0: 0.011524 6,1: 0.007029 6,2: 0.018565 6,3: 0.010126 6,4: 0.005020 6,5: 0.008627 6,6: 0.006485 6,7: 0.008061 6,8: 0.001290 7,0: 0.013971 7,1: 0.008434 7,2: 0.019998 7,3: 0.013881 7,4: 0.009469 7,5: 0.012784 7,6: 0.006624 7,7: 0.001855 8,0: 0.011888 8,1: 0.007358 8,2: 0.019102 8,3: 0.018148 8,4: 0.020877 8,5: 0.014720 8,6: 0.003223 9,0: 0.024577 9,1: 0.017379 9,2: 0.023944 10,0: 0.021372 10,1: 0.014671 10,2: 0.020960 11,0: 0.022857 11,1: 0.015700 11,2: 0.014514 12,0: 0.009908 12,1: 0.004560
PROB_B_D3_v_trim 0: 0.146486 1: 0.104938 2: 0.109566 3: 0.114675 4: 0.142373 5: 0.164670 6: 0.115655 7: 0.044459 8: 0.032275 9: 0.009069 10: 0.009602 11: 0.004474 12: 0.001151 13: 0.000334 14: 0.000158 15: 0.000114
PROB_B_D3_j_trim 0: 0.126877 1: 0.081759 2: 0.131928 3: 0.114675 4: 0.138339 5: 0.129452 6: 0.099551 7: 0.074752 8: 0.042348 9: 0.032542 10: 0.017119 11: 0.002997 12: 0.004771 13: 0.002334 14: 0.000504 15: 0.000054
PROB_B_D3_vd_insert 0: 0.117023 1: 0.106011 2: 0.135202 3: 0.142546 4: 0.126806 5: 0.098625 6: 0.079346 7: 0.061604 8: 0.045508 9: 0.033575 10: 0.022364 11: 0.014154 12: 0.008527 13: 0.004833 14: 0.002536 15: 0.001337
PROB_B_D3_dj_insert 0: 0.196824 1: 0.119816 2: 0.135436 3: 0.137680 4: 0.124377 5: 0.095529 6: 0.065802 7: 0.045414 8: 0.029925 9: 0.020168 10: 0.013142 11: 0.007857 12: 0.004311 13: 0.002323 14: 0.000990 15: 0.000405
PROB_B_D3_tot_d_trim 0: 0.005888 1: 0.010089 2: 0.024431 3: 0.031252 4: 0.038221 5: 0.048943 6: 0.061654 7: 0.068543 8: 0.074228 9: 0.093789 10: 0.116932 11: 0.142851 12: 0.133383 13: 0.104151 14: 0.041201 15: 0.004443
"""
alpha_trim_prob_lines = { 'mouse': alpha_trim_prob_lines_mouse,
'human': alpha_trim_prob_lines_human }
beta_trim_prob_lines = { 'mouse': beta_trim_prob_lines_mouse,
'human': beta_trim_prob_lines_human }
## legacy files for old probability model
rep_freq_files = { 'mouse': { 'A':[ path_to_db+'/old/probs_files/old/tmp.read_sra_matches.alpha.log.N38469.U8750.arFalse.ubnTrue.dump_probs',
path_to_db+'/old/probs_files/old/scjobs_tcrseq_tmp3.read_sra_matches.log.N17930695.U204691.arFalse.ubnTrue.dump_probs',
],
'B': [ path_to_db+'/old/probs_files/old/tmp1.read_sra_matches.log.N24046228.U1407106.arFalse.ubnTrue.dump_probs',
path_to_db+'/old/probs_files/old/tmp.read_sra_matches.beta.log.N20117.U9339.arFalse.ubnTrue.dump_probs',
path_to_db+'/old/probs_files/old/tmp.read_sra_matches.log.N284176.U142210.arFalse.ubnTrue.dump_probs',
],
},
'human': { 'B': [ path_to_db+'/old/probs_files/old/tmp7.human_adaptive_beta.read_sra_matches.log.N5001894.U3967020.arFalse.ubnTrue.ssl25.fgdid.dump_probs' ],
'A': [ path_to_db+'/old/probs_files/old/tmp8.human_adaptive_alpha.read_sra_matches.log.N4271464.U2700969.arFalse.ubnTrue.ssl25.dump_probs' ] } }
for org in rep_freq_files:
for ab in rep_freq_files[org]:
for filename in rep_freq_files[org][ab]:
assert exists(filename)
for organism in ['mouse','human']:
trim_probs = {}
for line in alpha_trim_prob_lines[organism].split('\n'):
l = line.split()
if not l:continue
assert l[0].startswith('PROB_A')
tag = l[0][5:]
vals = map(float,l[1:])
trim_probs[tag] = dict( zip( range(len(vals)), vals ) )
for line in beta_trim_prob_lines[organism].split('\n'):
l = line.split()
if not l:continue
assert l[0].startswith('PROB_B')
tag = l[0][5:]
trim_probs[tag] = {}
assert len(l)%2==1
num_vals = (len(l)-1)/2
for i in range(num_vals):
assert l[2*i+1][-1] == ':'
key = l[2*i+1][:-1]
if ',' in key:
key = tuple(map(int,key.split(',')))
else:
key = int(key)
trim_probs[tag][key] = float(l[2*i+2])
## fake probability for total trimming of the D gene
for did,nucseq in all_trbd_nucseq[organism].iteritems():
trimtag = 'B_D{}_d01_trim'.format(did)
prob_trim_all_but_1 = 0.0
for d0_trim in range(len(nucseq)):
d1_trim = (len(nucseq)-1)-d0_trim
assert d0_trim + d1_trim == len(nucseq)-1
prob_trim_all_but_1 += trim_probs[trimtag].get((d0_trim,d1_trim),0)
prob_trim_all = 0.0
for d0_trim in range(len(nucseq)+1):
d1_trim = (len(nucseq))-d0_trim
prob_trim_all += trim_probs[trimtag].get((d0_trim,d1_trim),0)
assert prob_trim_all <1e-6
#print 'old_prob_trim_all:',prob_trim_all,'prob_trim_all_but_1:',prob_trim_all_but_1,'D',did
new_prob_trim_all = 0.75 * prob_trim_all_but_1
for d0_trim in range(len(nucseq)+1):
d1_trim = (len(nucseq))-d0_trim
if d0_trim == 0:
#print 'new_prob_trim_all:',new_prob_trim_all
trim_probs[trimtag][ (d0_trim,d1_trim) ] = new_prob_trim_all ## concentrate all here
else:
trim_probs[trimtag][ (d0_trim,d1_trim) ] = 0.0
total = sum( trim_probs[trimtag].values())
for k in trim_probs[trimtag]:
trim_probs[trimtag][k] /= total
assert abs( 1.0 - sum( trim_probs[trimtag].values()) ) < 1e-6
beta_prob_tags_single = ['v_trim','j_trim','vd_insert','dj_insert']
for tag in beta_prob_tags_single:
tags = [ 'B_D{}_{}'.format(x,tag) for x in all_trbd_nucseq[organism] ]
#tag1 = 'B_D1_{}'.format(tag)
#tag2 = 'B_D2_{}'.format(tag)
avgtag = 'B_{}'.format(tag)
trim_probs[avgtag] = {}
for k in trim_probs[tags[0]].keys():
trim_probs[avgtag][k] = sum( trim_probs[x].get(k,0) for x in tags ) / float(len(tags))
#print organism,avgtag,trim_probs[avgtag]
## now load frequencies of alpha and beta reps:
rep_probs = {}
for ab in rep_freq_files[organism]:
files = rep_freq_files[organism][ab]
for vj in 'VJ':
probs ={}
for file in files:
assert exists(file)
for line in popen('grep "^{}{}_REP_FREQ" {}'.format(ab,vj,file)):
l = line.split()
assert len(l) == 4
nonuniq_freq = float( l[1] ) / 100.0 ## now from 0 to 1
rep = l[3]
assert rep[2:4] == ab+vj
if rep not in probs:probs[rep] = []
probs[rep].append( nonuniq_freq )
avg_probs = {}
for rep in probs:
vals = probs[rep] + [0.0]*(len(files) - len(probs[rep]) )
if len(vals) == 2:
avg_probs[rep] = sum( vals )/2.0
elif len(vals) == 1: ## adding for human
avg_probs[rep] = vals[0]
else:
assert len(vals) == 3 ## hack
avg_probs[rep] = get_median( vals)
## do we really want to normalize here??
total = min( 1.0, sum( avg_probs.values() ) ) ##only increase probabilities...
for rep in probs:
rep_probs[rep] = avg_probs[rep] / total
if __name__ == '__main__' and len(sys.argv) == 1:
print 'rep_probs: %12.6f %s %s'%(100.0*rep_probs[rep],organism,rep)
## normalize trim_probs
for tag,probs in trim_probs.iteritems():
if type(probs) == type({}):
total = sum( probs.values())
assert abs(1.0-total)<1e-2
#print 'normalize trim_probs:',tag,total
for k in probs:
probs[k] = probs[k] / total
else:
assert False
assert type(probs) == type([])
total = sum( probs )
assert abs(1.0-total)<1e-2
#print 'normalize trim_probs:',tag,total
for i in range(len(probs)):
probs[i] = probs[i]/total
all_trim_probs[organism] = trim_probs
all_rep_probs[organism] = rep_probs
def get_alpha_trim_probs( organism, v_trim, j_trim, vj_insert ):
total_prob = 1.0
for ( val, tag ) in zip( [v_trim, j_trim, vj_insert], ['A_v_trim','A_j_trim','A_vj_insert'] ):
probs = all_trim_probs[organism][tag]
if val >= len(probs):
return 0.0
total_prob *= probs[val]
return total_prob
def get_beta_trim_probs( organism, d_id, v_trim, d0_trim, d1_trim, j_trim, vd_insert, dj_insert ): ## work in progress
assert d_id in all_trbd_nucseq[organism]
dd = (d0_trim,d1_trim)
d_trim_tag = 'B_D{}_d01_trim'.format(d_id)
total_prob = all_trim_probs[organism][d_trim_tag].get(dd,0)
#total_prob = trim_probs[d_trim_tag][dd] ## what about full trims?? will get an error
for ( val, tag ) in zip( [v_trim, j_trim, vd_insert, dj_insert], beta_prob_tags_single ):
probs = all_trim_probs[organism]['B_'+tag] ## a dictionary for beta (a list for alpha)
if val not in probs:
return 0.0
total_prob *= probs[val]
return total_prob
def probs_data_exist( organism, chain ):
return True #legacy hack
if __name__ == '__main__':
pass