tcr_rearrangement.py

#############################################################################################################
##
## NOTE:
##
## This is a legacy version of the TCR probability code which is going to be phased out.
## Right now it doesn't get imported anywhere unless basic.pipeline_params[ 'new_probs' ] == False
## (the default value is True)
##
## The updated version is called "tcr_rearrangement_new.py"
##
#############################################################################################################

from basic import *
import sys
from paths import path_to_db

## these guys are for external sharing, as well as
## get_alpha_trim_probs
## get_beta_trim_probs
##
all_trim_probs = {}
all_rep_probs = {}
all_trbd_nucseq = {}


##############################################################################################################

# >K02545|TRBD1*01|Homo sapiens|F|D-REGION|82..93|12 nt|1| | | | |12+0=12| | |
# gggacagggggc
# >X02987|TRBD2*01|Homo sapiens|F|D-REGION|140..155|16 nt|1| | | | |16+0=16| | |
# gggactagcggggggg
# >M14159|TRBD2*02|Homo sapiens|F|D-REGION|569..584|16 nt|1| | | | |16+0=16| | |
# gggactagcgggaggg

trbd_nucseq = { ## mouse
    1:'gggacagggggc',   ## len 12
    2:'gggactgggggggc', ## len 14
}

trbd_nucseq_human = {
    1:'gggacagggggc',
    2:'gggactagcggggggg', ## actually 2*01
    3:'gggactagcgggaggg', ## actually 2*02
    #  1234567890123456

}

all_trbd_nucseq = {'mouse':trbd_nucseq, 'human':trbd_nucseq_human}


#################################################################################################################
##

alpha_trim_prob_lines_mouse = """
PROB_A_v_trim  0.421946  0.141516  0.113666  0.114740  0.063751  0.045711  0.034423  0.029273  0.013622  0.009955  0.007622  0.002139  0.000761  0.000254  0.000527  0.000095
PROB_A_j_trim  0.209729  0.139163  0.128939  0.148367  0.110268  0.110984  0.060651  0.036761  0.027030  0.011851  0.006179  0.004318  0.001836  0.002154  0.001070  0.000701
PROB_A_vj_insert  0.086069  0.135083  0.184446  0.188326  0.156083  0.109790  0.063442  0.034039  0.019199  0.009582  0.005970  0.003617  0.001766  0.001005  0.000935  0.000647"""

alpha_trim_prob_lines_human = """
PROB_A_v_trim  0.213408  0.113598  0.099251  0.130393  0.108383  0.084676  0.064786  0.055140  0.055361  0.042462  0.012561  0.007258  0.009843  0.001693  0.000594  0.000594
PROB_A_j_trim  0.103923  0.072312  0.071505  0.094306  0.117595  0.117149  0.089869  0.092958  0.094853  0.037747  0.034006  0.036983  0.013367  0.010678  0.008272  0.004477
PROB_A_vj_insert  0.014455  0.027132  0.058214  0.102510  0.141518  0.145123  0.121780  0.101139  0.079731  0.061162  0.048514  0.035934  0.025128  0.018205  0.012033  0.007423
"""

beta_trim_prob_lines_mouse = """
PROB_B_D1_d01_trim 0,0:  0.017328 0,1:  0.019039 0,2:  0.039170 0,3:  0.051505 0,4:  0.015357 0,5:  0.049524 0,6:  0.061235 0,7:  0.017442 0,8:  0.026022 0,9:  0.010205 0,10:  0.007699 0,11:  0.004773 1,0:  0.008721 1,1:  0.009709 1,2:  0.016887 1,3:  0.023271 1,4:  0.007498 1,5:  0.021168 1,6:  0.022242 1,7:  0.006524 1,8:  0.012729 2,0:  0.009940 2,1:  0.011223 2,2:  0.018346 2,3:  0.027652 2,4:  0.009241 2,5:  0.023065 2,6:  0.020207 2,7:  0.005839 2,8:  0.011155 3,0:  0.011617 3,1:  0.010636 3,2:  0.020098 3,3:  0.025772 3,4:  0.006680 3,5:  0.014703 3,6:  0.014204 3,7:  0.006141 3,8:  0.002936 4,0:  0.019056 4,1:  0.019074 4,2:  0.032236 4,3:  0.042881 4,4:  0.013550 4,5:  0.024351 4,6:  0.013685 4,7:  0.003398 5,0:  0.010419 5,1:  0.015500 5,2:  0.021848 5,3:  0.014659 5,4:  0.007406 5,5:  0.003380 6,0:  0.011892 6,1:  0.016955 6,2:  0.009479 7,0:  0.006671 8,0:  0.007351 9,0:  0.006069 10,0:  0.002637
PROB_B_D1_v_trim 0:  0.218413 1:  0.114507 2:  0.145340 3:  0.121819 4:  0.141786 5:  0.132820 6:  0.079272 7:  0.017948 8:  0.016894 9:  0.003430 10:  0.002171 11:  0.001855 12:  0.002572 13:  0.000793 14:  0.000189 15:  0.000192
PROB_B_D1_j_trim 0:  0.300538 1:  0.113929 2:  0.110029 3:  0.141487 4:  0.118952 5:  0.068820 6:  0.058966 7:  0.057623 8:  0.014698 9:  0.005060 10:  0.004946 11:  0.002340 12:  0.001186 13:  0.000596 14:  0.000518 15:  0.000311
PROB_B_D1_vd_insert 0:  0.192147 1:  0.196064 2:  0.207559 3:  0.164503 4:  0.112413 5:  0.056638 6:  0.031895 7:  0.017549 8:  0.009929 9:  0.005574 10:  0.002926 11:  0.001532 12:  0.000759 13:  0.000334 14:  0.000130 15:  0.000047
PROB_B_D1_dj_insert 0:  0.235213 1:  0.206084 2:  0.205009 3:  0.151651 4:  0.099582 5:  0.052791 6:  0.026448 7:  0.012119 8:  0.005332 9:  0.002744 10:  0.001540 11:  0.000752 12:  0.000427 13:  0.000196 14:  0.000087 15:  0.000026
PROB_B_D1_tot_d_trim 0:  0.017328 1:  0.027760 2:  0.058819 3:  0.091232 4:  0.086666 5:  0.134265 6:  0.177044 7:  0.157783 8:  0.112495 9:  0.080803 10:  0.044697 11:  0.011108
PROB_B_D2_d01_trim 0,0:  0.018237 0,1:  0.012803 0,2:  0.028500 0,3:  0.036180 0,4:  0.029843 0,5:  0.020330 0,6:  0.010563 0,7:  0.006157 0,8:  0.013797 0,9:  0.012542 0,10:  0.018955 0,11:  0.006526 0,12:  0.004751 0,13:  0.002466 1,0:  0.013659 1,1:  0.010341 1,2:  0.020492 1,3:  0.022325 1,4:  0.017114 1,5:  0.011767 1,6:  0.006256 1,7:  0.003406 1,8:  0.005255 1,9:  0.004811 1,10:  0.008818 2,0:  0.020930 2,1:  0.015092 2,2:  0.025925 2,3:  0.025617 2,4:  0.019533 2,5:  0.014255 2,6:  0.007584 2,7:  0.003095 2,8:  0.004750 2,9:  0.004239 2,10:  0.006845 3,0:  0.015810 3,1:  0.010981 3,2:  0.018570 3,3:  0.018353 3,4:  0.014230 3,5:  0.009517 3,6:  0.004914 3,7:  0.001909 3,8:  0.006631 3,9:  0.003856 3,10:  0.001519 4,0:  0.026927 4,1:  0.019562 4,2:  0.034211 4,3:  0.034410 4,4:  0.024444 4,5:  0.017820 4,6:  0.008870 4,7:  0.004499 4,8:  0.010368 4,9:  0.001740 5,0:  0.013638 5,1:  0.012254 5,2:  0.018841 5,3:  0.017269 5,4:  0.015131 5,5:  0.008742 5,6:  0.005399 5,7:  0.002368 5,8:  0.001485 6,0:  0.021192 6,1:  0.018558 6,2:  0.018255 6,3:  0.010342 6,4:  0.006931 7,0:  0.016466 8,0:  0.005957 9,0:  0.007456 10,0:  0.007138 11,0:  0.007036 12,0:  0.002646
PROB_B_D2_v_trim 0:  0.246466 1:  0.124381 2:  0.142223 3:  0.124901 4:  0.131933 5:  0.127862 6:  0.065644 7:  0.013355 8:  0.015438 9:  0.002295 10:  0.001574 11:  0.001239 12:  0.002002 13:  0.000391 14:  0.000126 15:  0.000173
PROB_B_D2_j_trim 0:  0.310095 1:  0.123791 2:  0.110055 3:  0.135452 4:  0.115906 5:  0.061610 6:  0.067288 7:  0.057558 8:  0.008041 9:  0.003436 10:  0.003244 11:  0.001142 12:  0.001215 13:  0.000472 14:  0.000467 15:  0.000228
PROB_B_D2_vd_insert 0:  0.207097 1:  0.200057 2:  0.206337 3:  0.157210 4:  0.107401 5:  0.055060 6:  0.029684 7:  0.016179 8:  0.009612 9:  0.005406 10:  0.002867 11:  0.001583 12:  0.000916 13:  0.000381 14:  0.000164 15:  0.000047
PROB_B_D2_dj_insert 0:  0.225980 1:  0.195041 2:  0.202632 3:  0.155303 4:  0.110594 5:  0.057727 6:  0.028903 7:  0.012853 8:  0.005628 9:  0.002659 10:  0.001418 11:  0.000727 12:  0.000291 13:  0.000145 14:  0.000075 15:  0.000024
PROB_B_D2_tot_d_trim 0:  0.018237 1:  0.026462 2:  0.059771 3:  0.087573 4:  0.116000 5:  0.114831 6:  0.127873 7:  0.129173 8:  0.100228 9:  0.076556 10:  0.062106 11:  0.043147 12:  0.030833 13:  0.007210
"""


beta_trim_prob_lines_human = """
PROB_B_D1_d01_trim 0,0:  0.008959 0,1:  0.010181 0,2:  0.021161 0,3:  0.035499 0,4:  0.018361 0,5:  0.034621 0,6:  0.020377 0,7:  0.029673 0,8:  0.031598 0,9:  0.019343 0,10:  0.014188 0,11:  0.004344 1,0:  0.006810 1,1:  0.007306 1,2:  0.013538 1,3:  0.024629 1,4:  0.013699 1,5:  0.021067 1,6:  0.009669 1,7:  0.017864 1,8:  0.021899 2,0:  0.007446 2,1:  0.008260 2,2:  0.015848 2,3:  0.029185 2,4:  0.015185 2,5:  0.019443 2,6:  0.009355 2,7:  0.013085 2,8:  0.014002 3,0:  0.010192 3,1:  0.009526 3,2:  0.018864 3,3:  0.030061 3,4:  0.011804 3,5:  0.013964 3,6:  0.009019 3,7:  0.009047 3,8:  0.002086 4,0:  0.015097 4,1:  0.018513 4,2:  0.031019 4,3:  0.045665 4,4:  0.023664 4,5:  0.022766 4,6:  0.007757 4,7:  0.002321 5,0:  0.009825 5,1:  0.018171 5,2:  0.024254 5,3:  0.016264 5,4:  0.009627 5,5:  0.002893 6,0:  0.022482 6,1:  0.025977 6,2:  0.015532 7,0:  0.018328 8,0:  0.020640 9,0:  0.014327 10,0:  0.003721
PROB_B_D1_v_trim 0:  0.135820 1:  0.097128 2:  0.107363 3:  0.114808 4:  0.143636 5:  0.171400 6:  0.125769 7:  0.046483 8:  0.032399 9:  0.009281 10:  0.010018 11:  0.004333 12:  0.001022 13:  0.000312 14:  0.000136 15:  0.000092
PROB_B_D1_j_trim 0:  0.129996 1:  0.095461 2:  0.114521 3:  0.108524 4:  0.144917 5:  0.112931 6:  0.091571 7:  0.084043 8:  0.058269 9:  0.024138 10:  0.019387 11:  0.008293 12:  0.003694 13:  0.002520 14:  0.001352 15:  0.000382
PROB_B_D1_vd_insert 0:  0.119887 1:  0.108518 2:  0.137850 3:  0.147452 4:  0.132459 5:  0.097949 6:  0.078475 7:  0.058560 8:  0.042659 9:  0.030417 10:  0.019072 11:  0.012051 12:  0.007379 13:  0.004017 14:  0.002163 15:  0.001092
PROB_B_D1_dj_insert 0:  0.171190 1:  0.120952 2:  0.145988 3:  0.143965 4:  0.125539 5:  0.094762 6:  0.069002 7:  0.046970 8:  0.030968 9:  0.020823 10:  0.013188 11:  0.008048 12:  0.004641 13:  0.002386 14:  0.001107 15:  0.000468
PROB_B_D1_tot_d_trim 0:  0.008959 1:  0.016991 2:  0.035913 3:  0.067489 4:  0.083461 5:  0.124706 6:  0.158362 7:  0.184812 8:  0.148882 9:  0.110065 10:  0.051609 11:  0.008751
PROB_B_D2_d01_trim 0,0:  0.003289 0,1:  0.003677 0,2:  0.005399 0,3:  0.007834 0,4:  0.004833 0,5:  0.010839 0,6:  0.007479 0,7:  0.008020 0,8:  0.011684 0,9:  0.008777 0,10:  0.020960 0,11:  0.023115 0,12:  0.010256 0,13:  0.010794 0,14:  0.007619 0,15:  0.001867 1,0:  0.003141 1,1:  0.003215 1,2:  0.004911 1,3:  0.007052 1,4:  0.003465 1,5:  0.006683 1,6:  0.003886 1,7:  0.004490 1,8:  0.005597 1,9:  0.003817 1,10:  0.010321 1,11:  0.014218 1,12:  0.012314 2,0:  0.003430 2,1:  0.003400 2,2:  0.005437 2,3:  0.006660 2,4:  0.003064 2,5:  0.005614 2,6:  0.003266 2,7:  0.003922 2,8:  0.005708 2,9:  0.003220 2,10:  0.009647 2,11:  0.010858 2,12:  0.008933 3,0:  0.003959 3,1:  0.003842 3,2:  0.005540 3,3:  0.006023 3,4:  0.002654 3,5:  0.005515 3,6:  0.003826 3,7:  0.003308 3,8:  0.005992 3,9:  0.004134 3,10:  0.011269 3,11:  0.007371 3,12:  0.001117 4,0:  0.008351 4,1:  0.008092 4,2:  0.011672 4,3:  0.013611 4,4:  0.006369 4,5:  0.011818 4,6:  0.007007 4,7:  0.006919 4,8:  0.012044 4,9:  0.011039 4,10:  0.013574 4,11:  0.001322 5,0:  0.007623 5,1:  0.006285 5,2:  0.008635 5,3:  0.010012 5,4:  0.004010 5,5:  0.007253 5,6:  0.004413 5,7:  0.005517 5,8:  0.010311 5,9:  0.004165 5,10:  0.001008 6,0:  0.010347 6,1:  0.008711 6,2:  0.011636 6,3:  0.013673 6,4:  0.006004 6,5:  0.010318 6,6:  0.007756 6,7:  0.009641 6,8:  0.001543 7,0:  0.014212 7,1:  0.013431 7,2:  0.018841 7,3:  0.024357 7,4:  0.011325 7,5:  0.015290 7,6:  0.007923 7,7:  0.002218 8,0:  0.023849 8,1:  0.027292 8,2:  0.036883 8,3:  0.048234 8,4:  0.024969 8,5:  0.017605 8,6:  0.003854 9,0:  0.032838 9,1:  0.026142 9,2:  0.011460 9,3:  0.011336
PROB_B_D2_v_trim 0:  0.144219 1:  0.102658 2:  0.107575 3:  0.113975 4:  0.144439 5:  0.167369 6:  0.117520 7:  0.044833 8:  0.032156 9:  0.009115 10:  0.009848 11:  0.004507 12:  0.001189 13:  0.000321 14:  0.000168 15:  0.000109
PROB_B_D2_j_trim 0:  0.135449 1:  0.091547 2:  0.145354 3:  0.113114 4:  0.136317 5:  0.125496 6:  0.092621 7:  0.067942 8:  0.037040 9:  0.028831 10:  0.016159 11:  0.002986 12:  0.004574 13:  0.002062 14:  0.000469 15:  0.000040
PROB_B_D2_vd_insert 0:  0.124012 1:  0.110331 2:  0.136139 3:  0.142587 4:  0.124919 5:  0.097663 6:  0.078220 7:  0.059750 8:  0.043746 9:  0.031507 10:  0.020917 11:  0.013402 12:  0.008162 13:  0.004755 14:  0.002586 15:  0.001303
PROB_B_D2_dj_insert 0:  0.206566 1:  0.119499 2:  0.140631 3:  0.135260 4:  0.118512 5:  0.089665 6:  0.063451 7:  0.045261 8:  0.030320 9:  0.020507 10:  0.013581 11:  0.008099 12:  0.004542 13:  0.002504 14:  0.001107 15:  0.000495
PROB_B_D2_tot_d_trim 0:  0.003289 1:  0.006818 2:  0.012044 3:  0.020103 4:  0.029515 5:  0.042219 6:  0.051552 7:  0.065343 8:  0.090253 9:  0.130595 10:  0.141440 11:  0.135318 12:  0.115168 13:  0.101753 14:  0.049277 15:  0.005314
PROB_B_D3_d01_trim 0,0:  0.005888 0,1:  0.005414 0,2:  0.015318 0,3:  0.009903 0,4:  0.004041 0,5:  0.009062 0,6:  0.006253 0,7:  0.006706 0,8:  0.009769 0,9:  0.007339 0,10:  0.017525 0,11:  0.019327 0,12:  0.008576 0,13:  0.009025 0,14:  0.006370 0,15:  0.001561 1,0:  0.004675 1,1:  0.004226 1,2:  0.011995 1,3:  0.007646 1,4:  0.002897 1,5:  0.005587 1,6:  0.003249 1,7:  0.003754 1,8:  0.004680 1,9:  0.003192 1,10:  0.008630 1,11:  0.011888 1,12:  0.010296 2,0:  0.004887 2,1:  0.004106 2,2:  0.011266 2,3:  0.006485 2,4:  0.002562 2,5:  0.004694 2,6:  0.002731 2,7:  0.003279 2,8:  0.004772 2,9:  0.002692 2,10:  0.008066 2,11:  0.009079 2,12:  0.007469 3,0:  0.005248 3,1:  0.004218 3,2:  0.011359 3,3:  0.005438 3,4:  0.002219 3,5:  0.004611 3,6:  0.003199 3,7:  0.002766 3,8:  0.005010 3,9:  0.003457 3,10:  0.009422 3,11:  0.006163 3,12:  0.000934 4,0:  0.011050 4,1:  0.008934 4,2:  0.023625 4,3:  0.012711 4,4:  0.005325 4,5:  0.009881 4,6:  0.005859 4,7:  0.005785 4,8:  0.010070 4,9:  0.009230 4,10:  0.011349 4,11:  0.001106 5,0:  0.010206 5,1:  0.006664 5,2:  0.017964 5,3:  0.009151 5,4:  0.003353 5,5:  0.006065 5,6:  0.003690 5,7:  0.004613 5,8:  0.008621 5,9:  0.003483 5,10:  0.000842 6,0:  0.011524 6,1:  0.007029 6,2:  0.018565 6,3:  0.010126 6,4:  0.005020 6,5:  0.008627 6,6:  0.006485 6,7:  0.008061 6,8:  0.001290 7,0:  0.013971 7,1:  0.008434 7,2:  0.019998 7,3:  0.013881 7,4:  0.009469 7,5:  0.012784 7,6:  0.006624 7,7:  0.001855 8,0:  0.011888 8,1:  0.007358 8,2:  0.019102 8,3:  0.018148 8,4:  0.020877 8,5:  0.014720 8,6:  0.003223 9,0:  0.024577 9,1:  0.017379 9,2:  0.023944 10,0:  0.021372 10,1:  0.014671 10,2:  0.020960 11,0:  0.022857 11,1:  0.015700 11,2:  0.014514 12,0:  0.009908 12,1:  0.004560
PROB_B_D3_v_trim 0:  0.146486 1:  0.104938 2:  0.109566 3:  0.114675 4:  0.142373 5:  0.164670 6:  0.115655 7:  0.044459 8:  0.032275 9:  0.009069 10:  0.009602 11:  0.004474 12:  0.001151 13:  0.000334 14:  0.000158 15:  0.000114
PROB_B_D3_j_trim 0:  0.126877 1:  0.081759 2:  0.131928 3:  0.114675 4:  0.138339 5:  0.129452 6:  0.099551 7:  0.074752 8:  0.042348 9:  0.032542 10:  0.017119 11:  0.002997 12:  0.004771 13:  0.002334 14:  0.000504 15:  0.000054
PROB_B_D3_vd_insert 0:  0.117023 1:  0.106011 2:  0.135202 3:  0.142546 4:  0.126806 5:  0.098625 6:  0.079346 7:  0.061604 8:  0.045508 9:  0.033575 10:  0.022364 11:  0.014154 12:  0.008527 13:  0.004833 14:  0.002536 15:  0.001337
PROB_B_D3_dj_insert 0:  0.196824 1:  0.119816 2:  0.135436 3:  0.137680 4:  0.124377 5:  0.095529 6:  0.065802 7:  0.045414 8:  0.029925 9:  0.020168 10:  0.013142 11:  0.007857 12:  0.004311 13:  0.002323 14:  0.000990 15:  0.000405
PROB_B_D3_tot_d_trim 0:  0.005888 1:  0.010089 2:  0.024431 3:  0.031252 4:  0.038221 5:  0.048943 6:  0.061654 7:  0.068543 8:  0.074228 9:  0.093789 10:  0.116932 11:  0.142851 12:  0.133383 13:  0.104151 14:  0.041201 15:  0.004443
"""


alpha_trim_prob_lines = { 'mouse': alpha_trim_prob_lines_mouse,
                          'human': alpha_trim_prob_lines_human }

beta_trim_prob_lines = { 'mouse': beta_trim_prob_lines_mouse,
                         'human': beta_trim_prob_lines_human }


## legacy files for old probability model
rep_freq_files = { 'mouse': { 'A':[ path_to_db+'/old/probs_files/old/tmp.read_sra_matches.alpha.log.N38469.U8750.arFalse.ubnTrue.dump_probs',
                                    path_to_db+'/old/probs_files/old/scjobs_tcrseq_tmp3.read_sra_matches.log.N17930695.U204691.arFalse.ubnTrue.dump_probs',
                                ],
                              'B': [ path_to_db+'/old/probs_files/old/tmp1.read_sra_matches.log.N24046228.U1407106.arFalse.ubnTrue.dump_probs',
                                     path_to_db+'/old/probs_files/old/tmp.read_sra_matches.beta.log.N20117.U9339.arFalse.ubnTrue.dump_probs',
                                     path_to_db+'/old/probs_files/old/tmp.read_sra_matches.log.N284176.U142210.arFalse.ubnTrue.dump_probs',
                                 ],
                          },
                   'human': { 'B': [ path_to_db+'/old/probs_files/old/tmp7.human_adaptive_beta.read_sra_matches.log.N5001894.U3967020.arFalse.ubnTrue.ssl25.fgdid.dump_probs' ],
                              'A': [ path_to_db+'/old/probs_files/old/tmp8.human_adaptive_alpha.read_sra_matches.log.N4271464.U2700969.arFalse.ubnTrue.ssl25.dump_probs' ] } }

for org in rep_freq_files:
    for ab in rep_freq_files[org]:
        for filename in rep_freq_files[org][ab]:
            assert exists(filename)


for organism in ['mouse','human']:

    trim_probs = {}
    for line in alpha_trim_prob_lines[organism].split('\n'):
        l = line.split()
        if not l:continue
        assert l[0].startswith('PROB_A')
        tag = l[0][5:]
        vals = map(float,l[1:])
        trim_probs[tag] = dict( zip( range(len(vals)), vals ) )


    for line in beta_trim_prob_lines[organism].split('\n'):
        l = line.split()
        if not l:continue
        assert l[0].startswith('PROB_B')
        tag = l[0][5:]
        trim_probs[tag] = {}
        assert len(l)%2==1
        num_vals = (len(l)-1)/2
        for i in range(num_vals):
            assert l[2*i+1][-1] == ':'
            key = l[2*i+1][:-1]
            if ',' in key:
                key = tuple(map(int,key.split(',')))
            else:
                key = int(key)
            trim_probs[tag][key] = float(l[2*i+2])


    ## fake probability for total trimming of the D gene
    for did,nucseq in all_trbd_nucseq[organism].iteritems():
        trimtag = 'B_D{}_d01_trim'.format(did)
        prob_trim_all_but_1 = 0.0
        for d0_trim in range(len(nucseq)):
            d1_trim = (len(nucseq)-1)-d0_trim
            assert d0_trim + d1_trim == len(nucseq)-1
            prob_trim_all_but_1 += trim_probs[trimtag].get((d0_trim,d1_trim),0)
        prob_trim_all = 0.0
        for d0_trim in range(len(nucseq)+1):
            d1_trim = (len(nucseq))-d0_trim
            prob_trim_all += trim_probs[trimtag].get((d0_trim,d1_trim),0)
        assert prob_trim_all <1e-6
        #print 'old_prob_trim_all:',prob_trim_all,'prob_trim_all_but_1:',prob_trim_all_but_1,'D',did
        new_prob_trim_all = 0.75 * prob_trim_all_but_1
        for d0_trim in range(len(nucseq)+1):
            d1_trim = (len(nucseq))-d0_trim
            if d0_trim == 0:
                #print 'new_prob_trim_all:',new_prob_trim_all
                trim_probs[trimtag][ (d0_trim,d1_trim) ] = new_prob_trim_all ## concentrate all here
            else:
                trim_probs[trimtag][ (d0_trim,d1_trim) ] = 0.0
        total = sum( trim_probs[trimtag].values())
        for k in trim_probs[trimtag]:
            trim_probs[trimtag][k] /= total
        assert abs( 1.0 - sum( trim_probs[trimtag].values()) ) < 1e-6


    beta_prob_tags_single = ['v_trim','j_trim','vd_insert','dj_insert']
    for tag in beta_prob_tags_single:
        tags = [ 'B_D{}_{}'.format(x,tag) for x in all_trbd_nucseq[organism] ]
        #tag1 = 'B_D1_{}'.format(tag)
        #tag2 = 'B_D2_{}'.format(tag)
        avgtag = 'B_{}'.format(tag)
        trim_probs[avgtag] = {}
        for k in trim_probs[tags[0]].keys():
            trim_probs[avgtag][k] = sum( trim_probs[x].get(k,0) for x in tags ) / float(len(tags))
        #print organism,avgtag,trim_probs[avgtag]

    ## now load frequencies of alpha and beta reps:

    rep_probs = {}
    for ab in rep_freq_files[organism]:
        files = rep_freq_files[organism][ab]
        for vj in 'VJ':
            probs ={}
            for file in files:
                assert exists(file)
                for line in popen('grep "^{}{}_REP_FREQ" {}'.format(ab,vj,file)):
                    l = line.split()
                    assert len(l) == 4
                    nonuniq_freq = float( l[1] ) / 100.0 ## now from 0 to 1
                    rep = l[3]
                    assert rep[2:4] == ab+vj
                    if rep not in probs:probs[rep] = []
                    probs[rep].append( nonuniq_freq )

            avg_probs = {}
            for rep in probs:
                vals = probs[rep] + [0.0]*(len(files) - len(probs[rep]) )
                if len(vals) == 2:
                    avg_probs[rep] = sum( vals )/2.0
                elif len(vals) == 1: ## adding for human
                    avg_probs[rep] = vals[0]
                else:
                    assert len(vals) == 3 ## hack
                    avg_probs[rep] = get_median( vals)

            ## do we really want to normalize here??
            total = min( 1.0, sum( avg_probs.values() ) ) ##only increase probabilities...
            for rep in probs:
                rep_probs[rep] = avg_probs[rep] / total
                if __name__ == '__main__' and len(sys.argv) == 1:
                    print 'rep_probs: %12.6f %s %s'%(100.0*rep_probs[rep],organism,rep)

    ## normalize trim_probs
    for tag,probs in trim_probs.iteritems():
        if type(probs) == type({}):
            total = sum( probs.values())
            assert abs(1.0-total)<1e-2
            #print 'normalize trim_probs:',tag,total
            for k in probs:
                probs[k] = probs[k] / total
        else:
            assert False
            assert type(probs) == type([])
            total = sum( probs )
            assert abs(1.0-total)<1e-2
            #print 'normalize trim_probs:',tag,total
            for i in range(len(probs)):
                probs[i] = probs[i]/total


    all_trim_probs[organism] = trim_probs
    all_rep_probs[organism] = rep_probs


def get_alpha_trim_probs( organism, v_trim, j_trim, vj_insert ):
    total_prob = 1.0
    for ( val, tag ) in zip( [v_trim, j_trim, vj_insert], ['A_v_trim','A_j_trim','A_vj_insert'] ):
        probs = all_trim_probs[organism][tag]
        if val >= len(probs):
            return 0.0
        total_prob *= probs[val]
    return total_prob

def get_beta_trim_probs( organism, d_id, v_trim, d0_trim, d1_trim, j_trim, vd_insert, dj_insert ): ## work in progress
    assert d_id in all_trbd_nucseq[organism]
    dd = (d0_trim,d1_trim)
    d_trim_tag = 'B_D{}_d01_trim'.format(d_id)
    total_prob = all_trim_probs[organism][d_trim_tag].get(dd,0)
    #total_prob = trim_probs[d_trim_tag][dd] ## what about full trims?? will get an error
    for ( val, tag ) in zip( [v_trim, j_trim, vd_insert, dj_insert], beta_prob_tags_single ):
        probs = all_trim_probs[organism]['B_'+tag] ## a dictionary for beta (a list for alpha)
        if val not in probs:
            return 0.0
        total_prob *= probs[val]
    return total_prob


def probs_data_exist( organism, chain ):
    return True #legacy hack


if __name__ == '__main__':
    pass