-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNewBornReport_vip_v5.py
1607 lines (1493 loc) · 84.3 KB
/
NewBornReport_vip_v5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# import imp
# import sys
# fn_, path, desc = imp.find_module('mymodule', ['/data/module/'])
# print fn_,path,desc
# mod = imp.load_module(''mymodule'', fn_, path, desc)
# print dir(mod)
#这样就会把/data/module/mymodule.py模块导入进来,load_modul方法的第一个参数可以任意写,例如mym,
#作用相当于 import mymodule as mym
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# 代码有问题别找我!虽然是我写的,并且我觉得它是没问题的,如果不是你的操作原因,或许是它自己长歪了!
"""
A pipeline uses for writing NewBorn v5 vip report for each sample which base anno by BGIANNO !!!
"""
from __future__ import division
import sys, os, logging, imp, zipfile, json, re, sqlite3, shutil, errno
from optparse import OptionParser, OptionGroup
import xlsxwriter, time, fnmatch, gzip, copy, xlrd
from xml.sax.handler import ContentHandler
from xml.sax import make_parser
from collections import defaultdict
import pysam, tempfile
import coloredlogs
import haps_check,orcl_conn
coloredlogs.install()
#import multiprocessing
if sys.getdefaultencoding() != 'utf-8':
reload(sys)
sys.setdefaultencoding('utf-8')
logger = logging.getLogger(__name__)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("Running: python %s" % ' '.join(sys.argv))
#def cpu_count():
# return multiprocessing.cpu_count()
drug_and_dis=['OTC','MT-RNR1','ARG1','ASS1','NAGS','CPS1','ASL','G6PD'] #药物和疾病共有基因8个
drug_not_dis=['VKORC1','HLA-B','POLG','CYP2D6','NAT2','CYP4F2','TPMT','CYP2C9','CYP2C19'] #药物非疾病基因9个
def get_alt(strings): #获取alt变异
if re.compile('alt:(.*)\s\)').search(strings):
m = re.compile('alt:(.*)\s\)').search(strings)
return m.group(1)
else:
return strings
def extract(file_name, dirname):
try:
if not os.path.exists(dirname): os.makedirs(dirname)
filein = zipfile.ZipFile(os.path.realpath(file_name))
for f in filein.namelist():
message = filein.extract(f, dirname)##拷贝文件,返回那个文件
filein.close()
return True
except:
return False
def parsing_conf(config_file): ##加载json
try:
with open(config_file) as config_files:
config = json.load(config_files, encoding="utf-8")
return config
except:
return False
def parsing_demo(module):
try:
mod_path, mod_name = os.path.split(module)
mod_name, mod_ext = os.path.splitext(mod_name)
mod_file, mod_filename, mod_data = imp.find_module(mod_name, [mod_path])
if not mod_name in sys.modules: ##sys.modules 记录加载进来的模块,imp用来加载模块
base = imp.load_module(mod_name, mod_file, mod_filename, mod_data)
else:
logger.info("[ERRO]: %s module have existed in your python, please rename your base_config !!" % mod_name)
sys.exit(1)
return base
except:
return False
def load_demo_config(demo, config, base_xml, outdir):
demo_return = extract(demo, outdir)##拷贝demo
config_return = parsing_conf(config)##解析
base_xml_return = parsing_demo(base_xml)##加载必要的模块
assert (demo_return and config_return and base_xml_return), "Erro" ##断言一定为真,否则报错
return demo_return, config_return, base_xml_return
class NormXmlFormat(object): ##特殊字符串转化
def __init__(self, types=True):
adict = {
r'&': r'&',
r'>': r'>',
r'<': r'<',
r'"': r'"',
r"'": r"'"
}
if types is False:
adict = dict([(v, k) for k, v in adict.iteritems()])
self.adict = adict
self.rx = self.make_rx()
def make_rx(self):
return re.compile('|'.join(map(re.escape, self.adict))) ##如果字符串很长且包含很多特殊字符,而你又不想输入一大堆反斜线,可以使用这个函数re.escape
def one_xlat(self, match): ##match 是必要的参数吧
return self.adict[match.group(0)]
def __call__(self, text):
return self.rx.sub(self.one_xlat, re.sub("(\s+)?\t(\s+)?", "\t", text)) ##为啥self.one_xlat没有参数
class ParsingDatabase(object):
def __init__(self, data, model, token_pathogenic=set()):
self.dbmodel = model
self.dbdir = data
self.mutdb = defaultdict(list)
self.genes = defaultdict(set)
self.diseasedb = defaultdict(list)
self.gene_diseasedb = defaultdict(list)
self.drug = defaultdict(dict)
self.entertainment = defaultdict(list)
self.flank=defaultdict(dict)
self.token_pathogenic = token_pathogenic
self.dbrawtitle = defaultdict(lambda: 'unkown')
self.database_title = defaultdict(int)
self.definition_config=defaultdict(list)
##self.dbdir===/ifs4/HST_5B/USERS/zhaocece/Production/newborn_datasets
mut_db = os.path.join(self.dbdir, 'nm_cds_db.txt') ##过去突变位点的详细注释
flank_db = os.path.join(self.dbdir,'flank_db.txt')##位点侧翼序列
disease_db = os.path.join(self.dbdir, 'disease_all_' + self.dbmodel) ##疾病的中英文,描述NUM name 中文名称 中文描述 参考文献 disease information reference
drug_db = os.path.join(self.dbdir, 'drug')
##drug_rang是RS号位点,区间,扩展区域
##drug_gene是基因列表,纯基因名字,有不同版本
disease_gene = os.path.join(self.dbdir, 'disease_gene_' + self.dbmodel) #基因名列表
drug_gene = os.path.join(self.dbdir, 'drug_gene_' + self.dbmodel)
#drug_gene_v1是17个基因,disease_gene_v1是88个基因
#drug_gene_v5是17个基因,disease_gene_v5是167个基因,安馨可本身药物基因是17个,与疾病166基因, overlap 8个
gene_disease = os.path.join(self.dbdir, 'gene_disease')
#BCKDHA 枫糖尿病 AR,基因,疾病以及遗传模式
db_definition = os.path.join(self.dbdir, 'db_definition') #Pathogenic missense $gene基因的$muntation$突变是$disease$的$type$突变。该突变为错义突变,已有该位点致病性的相关文献报道[$num$]。 $gene基因的$muntation$突变是$disease$的$type$突变。该突变为错义突变,暂未发现该位点致病性的相关文献报道。 #不同的突变类型,有文献与无文献的陈述形式
entertainment = os.path.join(self.dbdir, 'entertainment/entertainment_last.db_' + self.dbmodel)
self.connet = sqlite3.connect(entertainment) #连接数据库或者创建数据库
self.con = self.connet.cursor()
#该例程创建一个 cursor,将在 Python 数据库编程中用到。
#数据库怎么用的还不清楚,数据库里的内容导出看看是什么,以及整合的时候用了什么字段
norm_xml_format = NormXmlFormat(True)
try:
for lines in file(db_definition):
line=lines.strip().split('\t')
self.definition_config[(line[0].lower(),line[1].lower())] = line ##创建列表字典,根据第一列致病性字符串以及第二列突变类型
except Exception as err:
logger.info("[ERRO]: definition_config database format erro :%s !!" % err)
try:
with open (gene_disease) as f:
for lines in f:
line=lines.strip().split('\t')
self.gene_diseasedb[line[0]]=line
##BCKDHA 枫糖尿病 AR
## 基因,疾病以及遗传模式
except Exception as err:
logger.info("[ERRO]: gene_disease database format erro :%s !!" % err)
try:
for lines in file(flank_db):
line=lines.strip().split('\t')
self.flank[line[0].lower()][line[1]]=line[2] ##染色体 位置 侧翼序列
except Exception as err:
logger.info("[ERRO]: flank database format erro :%s !!" % err)
try:
for lines in file(disease_db):
line = norm_xml_format(lines).strip().split('\t') ##疾病中英文描述及文献,格式化输出?格式化不知道怎么实现的
self.diseasedb[line[1].lower()] = line
except Exception as err:
logger.info("[ERRO]: disease database format erro :%s !!" % err)
raise
try:
with open(disease_gene) as d:
for g in d:
g = g.strip()
g = re.sub('^\s+|\s+$', '', g)
self.genes['disease'].add(g)
self.genes['all'].add(g)
with open(drug_gene) as d:
for g in d:
g = g.strip()
g = re.sub('^\s+|\s+$', '', g)
self.genes['drug'].add(g)
self.genes['all'].add(g)
except Exception as err:
logger.info("[ERRO]: genes database format erro :%s !!" % err)
raise
try:
haplotye = defaultdict(dict) ##yaowu haplotype[gene+hom][*1] or haplotype[chr start end ref call][*1]=sore转换的值
haplotye_score = defaultdict(dict)##yaowu haplotype_score[gene][*1]
default_gene_hap = defaultdict(lambda: ".") ##bushi yaowu de
drug_tag = defaultdict(dict) ##yaowu drug_tag[gene][*1]=B
genotype = defaultdict(dict)##yaowu genotype[gene][AB]=[IM,kuaidaixiexing]
interpretation = defaultdict(dict)##yaowu interpretation[gene][BM]=[gene,ami,IM,jianshao,]
backgroud = defaultdict(list) ##yaowu backgroud[kamaxiping]=[]
#读取数据库,存储起来
for line in file(os.path.join(drug_db, 'db_haplotype')): #/ifs4/HST_5B/USERS/zhaocece/Production/newborn_datasets/drug/db_haplotype
##Gene Chr Start End Ref Mut NM Haplotype Cds Code Score ##Haplotype是啥意思 *2怎么来的
# CYP2C19 . . . . Ref . *1 . A 0
# CYP2C19 chr10 96541615 96541616 G A NM_000769.1 *2 c.681G>A B 1
# CYP2C19 chr10 96540409 96540410 G A NM_000769.1 *3 c.636G>A B 1
if line.startswith("#"):
continue
lines = re.sub("(\s+)?\t(\s+)?", "\t", line).strip().split('\t')
gene, chrom, start, end, ref, mut, trans, haplo, chgvs, tag, score = lines
if mut == "Ref":
default_gene_hap[gene] = haplo
haplotye_score[gene][haplo] = float(score) ##CYP2C19 *3 =1
drug_tag[gene][haplo] = tag ##CYP2C19 *3 =B
if chrom is not "." and start is not "." and end is not ".":
tmp_mut = ":".join([chrom, start, end, ref, mut])
haplotye[tmp_mut][haplo] = 1.0 / haplotye_score[gene][haplo] if haplotye_score[gene][haplo] > 0.0 \
else haplotye_score[gene][haplo]
elif haplo == "Hom":
tmp_mut = ":".join([gene, "纯合"])
haplotye[tmp_mut][haplo] = 1.0 / haplotye_score[gene][haplo] if haplotye_score[gene][haplo] > 0.0 \
else haplotye_score[gene][haplo]
for line in file(os.path.join(drug_db, 'db_genotype')):
# Gene Code Genotype Description
# CYP2C19 AA EM 快代谢型
# CYP2C19 AB IM 中间代谢型
# CYP2C19 BA IM 中间代谢型
# CYP2C19 AD Unknown 临床意义未明
# CYP2C19 DA Unknown 临床意义未明
# CYP2C19 BB PM 弱代谢型
if line.startswith("#"):
continue
lines = re.sub("(\s+)?\t(\s+)?", "\t", line).strip().split('\t')
gene, code, genotypes, description = lines
genotype[gene][code] = [genotypes, description]
for line in file(os.path.join(drug_db, 'db_interpretation')):
##Gene Drug Genotype Short recommendation ch Therapeutic (dose) recommendation Reference Short recommendation Therapeutic (dose) recommendation Reference
#CYP2D6 阿米替林 IM 减少剂量 CPIC(Clinical Pharmacogenetics Implementation Consortium)剂量指导CYP2D6中间代谢型使用阿米替林时,建议降低25%的起始剂量,并使用治疗药物指导剂量调整。与CYP2D6抑制剂药物联合使用时,需要适当降低阿米替林的剂量[1,2]。 [1]Hicks JK, Swen JJ, Thorn CF et al. Clinical Pharmacogenetics Implementation Consortium guideline for CYP2D6 and CYP2C19 genotypes and dosing of tricyclic antidepressants. Clin Pharmacol Ther 2013; 93: 402-408.[2]Par Pharmaceutical, Inc. Chlordiazepoxide and Amitriptyline drug label.2011. Reduce dose The CPIC Dosing Guideline for amitriptyline a 25% dose reduction and monitor plasma concentration or select alternative drug (e.g., citalopram, sertraline) for CYP2D6 intermediate metabolizers[1,2]. [1]Hicks JK, Swen JJ, Thorn CF et al. Clinical Pharmacogenetics Implementation Consortium guideline for CYP2D6 and CYP2C19 genotypes and dosing of tricyclic antidepressants. Clin Pharmacol Ther 2013; 93: 402-408. [2]Par Pharmaceutical, Inc. Chlordiazepoxide and Amitriptyline drug label.2011.
if line.startswith("#"):
continue
rows = norm_xml_format(line).strip().split('\t')
gene, drug_ch, drug_gen, short_ch, long_ch, reference_ch, short, long, reference = rows
if drug_gen not in interpretation[gene]:
interpretation[gene][drug_gen] = [rows]
#将基因和基因型(不是变异的那个基因型,而是跟用药有关的基因型)为键,值为所有的解读信息的list
else:
interpretation[gene][drug_gen].append(rows) ##如果已经存在则加上这个list
for line in file(os.path.join(drug_db, 'db_background')):
#药物信息
##序号 药物 Drug 背景 参考文献 Drug info Reference
#1 卡马西平 Carbamazepine 卡马西平属于抗惊厥和抗癫痫药物,适用于部分性和全身性癫痫发作,同时也可作为治疗缓解三叉神经痛和舌咽神经痛的特效药物,另外用于预防或治疗躁狂-抑郁症、中枢性部分尿崩症等症状。由卡马西平引起的常见不良反应包括头晕、共济失调、嗜睡和疲劳等症状。另外有极少部分病人会产生严重、甚至致死的变态反应,包括史蒂文斯-约翰逊综合征(SJS)和中毒性表皮坏死溶解症(TEN)。2004年首次在汉族中发现由卡马西平引起的SJS/TEN与HLA-B*1502基因型紧密关联[1],其后同一个研究小组后续的研究以及在香港汉族人中研究进一步验证了这个结论[2,3]。2011年在台湾招募4877名未服用卡马西平的候选受试者,经过HLA*1502检测分型后,建议阳性携带者不服用卡马西平而是服用备选的另外一种药物,而阴性受试者继续服用卡马西平,结果显示没有出现一例SJS/TEN患者,而由卡马西平导致的SJS/TEN历史发病率为0.23%[4]。这项研究表明了对于汉族人使用卡马西平前进行HLA*1502基因型分型的必要性。
if line.startswith("#"):
continue
rows = norm_xml_format(line).strip().split('\t')
backgroud[rows[1]] = rows
self.drug['haplotye'] = haplotye
self.drug['haplotye_score'] = haplotye_score
self.drug['drug_tag'] = drug_tag
self.drug['genotype'] = genotype
self.drug['interpretation'] = interpretation
self.drug['backgroud'] = backgroud
self.drug['default_gene_hap'] = default_gene_hap
except Exception as err:
logger.info("[ERRO]: Drug database format erro :%s !!" % err)
raise
try:
enter_sql = "SELECT * FROM commits"
self.con.execute(enter_sql)
enter_meassage = self.con.fetchall()
for rows in enter_meassage:
#('TERMID_1', 'chr17', '42328620', '42328621', 'G', 'A', 'Hom', 'SLC4A1', 'rs2285644', 'AA', '2', '0.0')
num, chrom, start, end, ref, mut, zygosity, gene, rs, g_type, result, fre = rows
key = (chrom, start, end, ref, mut, zygosity)
self.entertainment[key].append(rows) ##这个数据库不知道从哪来的,个体特征?存储突变的位置refalt突变以及纯杂合,值为整行最后两列不太确定,尤其倒第二列
except Exception as err:
logger.info("[ERRO]: entertainment database select erro :%s !!" % err)
raise
try:
rawtitle = defaultdict(str)
database_title = defaultdict(int)
tmp_mut_db = [line for line in file(mut_db)] ##过去突变位点的详细注释
mut_db_title = norm_xml_format(tmp_mut_db.pop(0)).strip().split('\t') #biaotou格式化
for number_of_terms in xrange(len(mut_db_title)):
t_name = mut_db_title[number_of_terms].lower()
rawtitle[t_name] = mut_db_title[number_of_terms]
database_title[t_name] = number_of_terms
self.database_title = database_title ##过去突变位点的详细注释的表头的对应哪个列的记录
self.dbrawtitle = rawtitle ##现在的小写后的表头对应的过去突变位点的详细注释的表头
for lines in tmp_mut_db: ##一下是每一列的格式规整,提取对应字段
line = norm_xml_format(lines).strip().split('\t')
chrom = 'chr' + re.sub('^chr', '', line[database_title['chrom']])
start = line[database_title['start']]
end = line[database_title['stop']]
trans = re.sub('\.[0-9]+$', '', line[database_title['nbtrans']])
phgvs = re.sub('\.[0-9]+$', '', line[database_title['transcript']])
ref = line[database_title['ref']]
call = line[database_title['call']]
definition = line[database_title['definition']] ##致病性
gene_sym = line[database_title['#gene']]
if len(token_pathogenic) and definition.lower() not in self.token_pathogenic: ##不知道self.token_pathogenic怎么获取到的,致病性的种类
continue
if gene_sym not in self.genes['all']: #过滤掉非目的基因的
continue
term_keys = (chrom, start, end, ref, call)
if (term_keys not in self.mutdb) or (trans == phgvs): ##如果条目不在已有的数据库或者两列的转录本一致,存储成数据库mutdb,键chrom, start, end, ref, call
self.mutdb[term_keys] = line
except Exception as err:
logger.info("[ERRO]: Mut database format erro :%s !!" % err)
raise
def __del__(self):
self.connet.close() ##关闭数据库的关联
def description(self, num, factor):
sql = "SELECT * FROM descriptions WHERE Num = \"" + num + "\" AND Factor = \"" + factor + "\""
try:
self.con.execute(sql)
results = self.con.fetchone()
return "\t".join(results)
except Exception as err:
logger.info("[ERRO]: Commits select erro : %s" % err)
return None
def gender_message(sample): #获取性别
gender_dic={}
for rows in file('final_result'):
#Order Sample Ave_depth rmdup_depth Cover Cover_30X Gender Index Mass_result
# 1 18S0150055R2 852.39 202.07 99.71% 98.41% F 18L0150055R2-113 NA
# 2 18S0415555 761.26 194.55 99.25% 96.29% F 18L0415555-108 NA
# 3 18S0409209 613.46 201.79 99.55% 97.31% F 18L0409209-118 NA
if rows.startswith('Order'):continue
row = rows.strip().split('\t')
gender_dic[row[1]] = row[6][-1]
return gender_dic[sample]
def sample_messages(sample_list, message_file=None, title=None): #ganmade
if not title:
title = dict()
norm_xml_format = NormXmlFormat(True)
back_xml_format = NormXmlFormat(False)
message = defaultdict(dict)
messages = ['born_time','guardian_name','name']
connect = orcl_conn.Oracle() ##实例
with open(sample_list) as samples:##读取list时候,抽提客户的记录信息
for sample in samples:
rows = sample.strip().split('\t')
sample_name = rows[0]
sql = "select * FROM DX_SIMS.VIEW_EAR_REPORT_ANALYSIS where MAIN_SAMPLE_NUM = \'"+sample_name+"\'"
vip_info = connect.vip_result(sql) ##没有测试出来
gender = rows[1]
#name=rows[5].decode('gb2312')
message[sample_name] = defaultdict(lambda: "-")
message[sample_name]['gender'] = "女" if re.compile('f', re.I).match(gender) else '男'
# message[sample_name]['guardian_name'] = "缺省" if re.compile('null', re.I).match(guardian_name) else guardian_name
for i in range (len(messages)):
message[sample_name][messages[i]] = vip_info[i] ##orcl数据库里有客人的出生日期,名字以及guardian_name
#message[sample_name]['name'] = "-" if re.compile('null', re.I).match(name) else name
if message_file: ##这是个什么文件
with open(message_file) as messages:
for line in messages:
rows = norm_xml_format(line).strip().split('\t')
sample = rows[title['sample_name']] if 'sample_name' in title else rows[0]
sample = back_xml_format(sample)
for k, v in title.iteritems():
message[sample][k] = rows[v]
return message
def xlsx2list(filein, sheet_name=set()): #读取sheet_name指定的sheet,展开成列表
def check_value_from_xlrd(values):
if type(values) is float and values == int(values):
values = int(values)
return str(values)
tables = defaultdict(dict)
if os.path.isfile(filein + '.xls'):
filein = filein + '.xls'
elif os.path.isfile(filein + '.xlsx'):
filein = filein + '.xlsx'
if not filein.endswith('xlsx') and not filein.endswith('xls'):
return tables
data = xlrd.open_workbook(filein)
for sheet in data.sheets():
if len(sheet_name) and sheet.name not in sheet_name:
continue
nrows = sheet.nrows
ncols = sheet.ncols
if nrows < 1 or ncols < 1:
continue
tables[sheet.name] = [[check_value_from_xlrd(sheet.cell(i, j).value).replace('\r', '').replace('\n', '')
for j in range(ncols)] for i in range(nrows)] ##行列循环,修正值,并且展开成一个列表
return tables
def sample_gene_id(geneid=None): ##读取id.xls的内容,记录成每个样本在每个基因(或者基因组合)的基因型?比如g-T-A g-C-G,并返回
gene_id_dict = defaultdict(dict)
if not geneid:
return gene_id_dict
norm_xml_format = NormXmlFormat(True)
sample_gene_id_messages = xlsx2list(geneid, {"haps"})##xlsx2list读取sheet_name指定的sheet,展开成列表
sample_gene_id_message = sample_gene_id_messages["haps"]
if len(sample_gene_id_message):
gene_id_title = sample_gene_id_message.pop(0)
if gene_id_title.pop(0) == "title":
for rows in sample_gene_id_message:
sample_name = rows.pop(0)
if len(rows) != len(gene_id_title):
continue
for num in xrange(len(rows)):
word = '-'.join(gene_id_title[num].split()).upper()
gene_id_dict[sample_name][word] = norm_xml_format(rows[num].upper()).strip()
else:
logger.info("[ERRO]: Gene ID file format erro !!")
else:
logger.info("[ERRO]: Gene ID file format erro !!")
return gene_id_dict
def findfile(pathdir, sample, name, type, filters=None):
for root, dirs, files in os.walk(os.path.join(pathdir, sample)):
for filename in fnmatch.filter(files, str(name) + '*.' + type):
if os.path.isdir(filename):
continue
if filters and re.compile(filters + '.' + type + "$", re.I).search(filename):
continue
return os.path.join(root, filename)
return None
def indexfile(variation, bamdir, sample_name, indexdir):
vcffile, depthfile = findfile(variation, sample_name, str(sample_name) + '.', 'gz', 'vcf'), \
findfile(bamdir, sample_name, str(sample_name) + '.', 'bam')
if vcffile is None or depthfile is None:
return vcffile, depthfile
if not os.path.isfile(depthfile + '.bai') and not os.path.isfile(re.sub("bam$", 'bai', depthfile)):
filedir, input_file = os.path.split(depthfile)
if not os.access(filedir, os.W_OK):#os.W_OK 包含在access()的mode参数中 , 测试path是否可写。
os.symlink(depthfile, os.path.join(indexdir, input_file))
depthfile = os.path.join(indexdir, input_file)
retval = pysam.index(depthfile)##不确定这个depthfile是啥,就是bam文件
return vcffile, depthfile
def copy_file(olddir, new_dir):
tmp_time = time.strftime('%Y.%m.%d.%H.%M', time.localtime(time.time()))
try:
if os.path.exists(new_dir):
if os.path.exists("_".join([new_dir, 'bak', tmp_time])):
shutil.rmtree("_".join([new_dir, 'bak', tmp_time]), ignore_errors=True)
os.rename(new_dir, "_".join([new_dir, 'bak', tmp_time]))
shutil.copytree(olddir, new_dir)
except OSError as err:
if err.errno == errno.ENOTDIR:
shutil.copy(olddir, new_dir)
else:
raise
def isnum(v):
try:
x = float(v)
except TypeError:
return False
except ValueError:
return False
except Exception, e:
return False
else:
return True
def parsing_mut_indb(batch_num, sample, indb, check_gene, gender, newborndb, haplotype, , f_out, all_indb_out):
##batch_num 批次或者项目编号
##sample 样品名称
##indb是个临时汇总记录的字典,如果不在外部数据库或者转录本是目的转录本,则记录
####check_gene在已知库中的位点,然后基因与疾病组合键的包含的位点list集合
##message['gender'] 性别信息
##newborndb是流程数据库路径下的各种信息的组合
##haplotype 项目分析的result路径下有id.xls 以及每个样本的好像是基因型但是就两行
##titles bedanno的表头,直接看起来就是,应该没有其他改动
##tmp_out
##all_indb_out在已有的数据库中的记录
fout = open(f_out, 'w')
annotations = defaultdict(set)
drug_gene_hap = defaultdict(lambda: ".")
drug_haplo_score_a = defaultdict(float)
drug_haplo_score_b = defaultdict(float)##存?
other_drug_gene = set()
drug_usage = defaultdict(set)
default_hap = newborndb.drug['default_gene_hap'] ##如果没突变使用默认的状态
check_nat2 = True
back_xml_format = NormXmlFormat(False)
pathogenic_trans = {
"pathogenic": "已知致病",
"dm": "已知致病",
"likely pathogenic": "疑似致病",
"dp": "疑似致病",
"drug": "药物"
}
database_title = sorted(newborndb.database_title.keys(), key=lambda s: int(newborndb.database_title[s]))##过去的记录,已经解读过的位点的详细信息的表头,已有数据库表头是标杆
dis_genes = newborndb.genes['disease']##166gene
drug_genes = newborndb.genes['drug'] ##drug基因,drug_gene_v5 17个
# fout.write(back_xml_format("\t".join([newborndb.dbrawtitle[i] for i in database_title])) + '\n')
fout.write("Project" + '\t' + "Sample" + '\t' + back_xml_format("\t".join([newborndb.dbrawtitle[i] for i in database_title if newborndb.database_title[i] <= newborndb.database_title["phgvs"]]))) ##转换成原来的格式
fout.write('\t'+"ForPrimerDesign" + '\t' + back_xml_format("\t".join([newborndb.dbrawtitle[i] for i in database_title if newborndb.database_title[i] >newborndb.database_title["phgvs"]])) + '\n')
for annotation_key, annotation_value in indb.iteritems():##indb是个临时汇总记录的字典,如果突变位点在外部数据库或者再加上转录本是目的转录本,则记录,键是chr start end ref alt
database_line = newborndb.mutdb[annotation_key]##仍然是和indb感觉一样呢,##如果条目不在已有的数据库或者两列的转录本一致,存储成数据库mutdb,键chrom, start, end, ref, call,indb是目前这个样本在数据库中的位点,mutdb是过去数据库中已经解读的位点
#循环变异条目,假如目前解读的位点不在过去的数据库里怎么处理?
for db_item_value, db_item_number in newborndb.database_title.iteritems(): ##过去的记录,已经解读过的位点的详细信息的表头,用过去的表头循环现在的信息,上面最外层是现在样本的突变信息
if db_item_value in annotation_titles and not db_item_value.lower() == "flank":
database_line[db_item_number] = annotation_value[annotation_titles[db_item_value]] ##annotation_value是个list,我目前觉得多此一举的感觉,可能某个转换没意识到
database_line[newborndb.database_title["mutfunction"]] = database_line[newborndb.database_title["nbtrans"]]
database_line[newborndb.database_title["nbchgvs"]] = annotation_value[annotation_titles['chgvs']]
database_line[newborndb.database_title["nbtrans"]] = annotation_value[annotation_titles['transcript']]
chgvs = database_line[newborndb.database_title["nbchgvs"]]
trans = database_line[newborndb.database_title["nbtrans"]]
term_phgvs = "."
try:
phgvs = database_line[newborndb.database_title["phgvs"]]
for pv in re.findall('p.[^\s+|:]+', phgvs, re.I): #p.Y240C | p.Tyr240Cys
if len(pv) >= len(term_phgvs): term_phgvs = pv ##最终取非缩写的p点?看着是,看嘛去
phgvs = term_phgvs
except:
phgvs = term_phgvs
gene_name = database_line[newborndb.database_title["#gene"]]
inheritance = database_line[newborndb.database_title["tag"]]
if gene_name in newborndb.gene_diseasedb : inheritance = newborndb.gene_diseasedb[gene_name][2]##遗传模式,AR AD
gtype = database_line[newborndb.database_title["zygosity"]]
zygosity = database_line[newborndb.database_title["zygosity"]]
functions = database_line[newborndb.database_title["functionname"]]
funcregion = database_line[newborndb.database_title["funcregion"]]
flank = database_line[newborndb.database_title["flank"]] ##过去数据库中的情况
if re.compile('hom', re.I).match(gtype):
gtype = "纯合"
elif re.compile('het', re.I).match(gtype):
gtype = "杂合"
# else:
# continue
if database_line[newborndb.database_title["definition"]].lower() not in pathogenic_trans:
continue ##pathogenic dm likely pathogenic dp drug
pathogenic = pathogenic_trans[database_line[newborndb.database_title["definition"]].lower()]
mut_definiton = database_line[newborndb.database_title["chlongdescription"]]
references = database_line[newborndb.database_title["reference"]]
forPrimerDesign = (gene_name,trans,chgvs,phgvs,funcregion,zygosity,flank)
if gene_name in dis_genes:
disease_en = database_line[newborndb.database_title["disease"]]
if disease_en.lower() in newborndb.diseasedb:
disease_raw = newborndb.diseasedb[disease_en.lower()]
else:
disease_raw = ["."] * 7
disease_ch = disease_raw[2] if disease_raw[2] is not "." else disease_en
disease_descri_ch = disease_raw[3]
disease_refer_ch = disease_raw[4]
fout.write(batch_num + '\t' + sample + '\t')
all_indb_out.write(batch_num + '\t' + sample + '\t')
for i in range(len(database_line)):
if(i == newborndb.database_title["phgvs"] ):
fout.write(back_xml_format(database_line[i]) + '\t' + back_xml_format("; ".join(forPrimerDesign)) + '\t')
all_indb_out.write(back_xml_format(database_line[i]) + '\t' + back_xml_format("; ".join(forPrimerDesign)) + '\t')
else:
fout.write(back_xml_format(database_line[i]) + '\t')
all_indb_out.write(back_xml_format(database_line[i]) + '\t')
fout.write('\n')
all_indb_out.write('\n')
#### fout.write(back_xml_format("\t".join(database_line)) + '\n')
annotations["mut_definiton"].add((disease_ch, disease_en, mut_definiton, references))
annotations["disease_message"].add((disease_ch, disease_en, disease_descri_ch, disease_refer_ch))
check_gene_num = 0
for ann_key in check_gene[(gene_name, disease_en.lower())]:
if newborndb.mutdb[ann_key][newborndb.database_title["definition"]].lower() in pathogenic_trans:
check_gene_num += 1
if ( check_gene_num > 1) or \
(not re.compile('[ax]r', re.I).search(inheritance)) or \
(re.compile('x', re.I).search(inheritance) and gender == "男") or \
(gtype == "纯合" or gtype == "Hemi"):
annotations["pathgenic"].add(
(disease_ch, disease_en, gene_name, inheritance.upper(), chgvs, phgvs, functions, gtype, pathogenic,trans,references))
else:
annotations["carrier"].add(
(disease_ch, disease_en, gene_name, inheritance.upper(), chgvs, phgvs, functions, gtype, pathogenic,trans,references))
if gene_name in drug_genes:
if (not gene_name in newborndb.drug['haplotye']) and (
not ":".join(annotation_key) in newborndb.drug['haplotye']):
continue
fout.write(batch_num + '\t' + sample + '\t')
all_indb_out.write(batch_num + '\t' + sample + '\t')
for i in range(len(database_line)):
if(i == newborndb.database_title["phgvs"]):
fout.write(back_xml_format(database_line[i]) + '\t' + back_xml_format("; ".join(forPrimerDesign)) + '\t')
all_indb_out.write(back_xml_format(database_line[i]) + '\t' + back_xml_format("; ".join(forPrimerDesign)) + '\t')
else:
fout.write(back_xml_format(database_line[i]) + '\t')
all_indb_out.write(back_xml_format(database_line[i]) + '\t')
fout.write('\n')
all_indb_out.write('\n')
# fout.write(back_xml_format("\t".join(database_line)) + '\n')
mut_key = ":".join(annotation_key)
if gene_name == "NAT2":
check_nat2 = False
if mut_key in newborndb.drug['haplotye']:
for haplo, score in newborndb.drug['haplotye'][mut_key].iteritems():
drug_haplo_score_a[(gene_name, haplo)] += float(score)
if gtype == "纯合" or gtype == "Hemi":
drug_haplo_score_b[(gene_name, haplo)] += float(score)
elif ":".join([gene_name, gtype]) in newborndb.drug['haplotye']:
for haplo, score in newborndb.drug['haplotye'][":".join([gene_name, gtype])].iteritems():
drug_haplo_score_a[(gene_name, haplo)] += float(score)
drug_haplo_score_b[(gene_name, haplo)] += float(score)
else:
other_drug_gene.add(gene_name)
fout.close()
drug_lane1 = defaultdict(set)
drug_lane2 = defaultdict(set)
for gene, hap in [key for key, value in drug_haplo_score_a.iteritems() if round(value, 3) == 1.0]:
drug_lane1[gene].add(hap)
if round(drug_haplo_score_b[(gene, hap)], 3) == 1.0:
drug_lane2[gene].add(hap)
for gene in drug_lane1:
haps = sorted(drug_lane1[gene], key=lambda s: newborndb.drug['haplotye_score'][gene][s], reverse=True)
if len(haps) > 1:
drug_gene_hap[gene] = haps[:2]
elif len(haps) and len(drug_lane2[gene]):
drug_gene_hap[gene] = [haps[0], drug_lane2[gene].pop()]
elif len(haps):
drug_gene_hap[gene] = [default_hap[gene], haps[0]]
else:
drug_gene_hap[gene] = [default_hap[gene]] * 2
for gene in other_drug_gene:
if gene not in drug_gene_hap :
drug_gene_hap[gene] = [default_hap[gene]] * 2
if haplotype:
sample_haplotype_messages = xlsx2list(haplotype, {"hla-abo"})
sample_haplotype_message = sample_haplotype_messages["hla-abo"]
if len(sample_haplotype_message):
for rows in sample_haplotype_message:
type1, type2, score = rows[:3]
if not isnum(score):
continue
t_1 = ":".join(type1.split(":")[:2]).upper()
t_2 = ":".join(type2.split(":")[:2]).upper()
if t_1 in newborndb.drug['interpretation']['HLA-B']:
test_hla = t_1
elif t_2 in newborndb.drug['interpretation']['HLA-B'] and float(score) < 80:
test_hla = t_2
else:
test_hla = t_1 if float(score) >= 50 else t_2
if not len(test_hla):
test_hla = "NOVEL"
if test_hla in newborndb.drug['interpretation']['HLA-B']:
for rows in newborndb.drug['interpretation']['HLA-B'][test_hla]:
drug_gene, drug_ch, drug_gen, drug_short_ch, drug_long_ch, drug_reference_ch, \
drug_short, drug_long, drug_reference = rows
drug_en = newborndb.drug['backgroud'][drug_ch][2]
drug_back_ch = newborndb.drug['backgroud'][drug_ch][3]
drug_back_ref_ch = newborndb.drug['backgroud'][drug_ch][4]
annotations["drug"].add((drug_ch, drug_en, drug_gene,'HLA-'+test_hla, '-', drug_short_ch))
annotations["drug_definiton"].add((drug_ch, drug_en, drug_long_ch, drug_reference_ch))
annotations["drug_message"].add((drug_ch, drug_en, drug_back_ch, drug_back_ref_ch))
drug_usage[drug_en].add(drug_short_ch)
for gene, hap in drug_gene_hap.iteritems():
tag = "".join([newborndb.drug['drug_tag'][gene][t] for t in sorted(hap)])
if hap[0] == hap[1]:
if hap[0].startswith("*"):
haps = "".join(hap)
else:
haps = hap[0]
else:
if hap[0].startswith("*"):
haps = "".join(hap)
else:
haps = sorted(hap)[1]
try:
drug_genotype, drug_description = newborndb.drug['genotype'][gene][tag]
except:
continue
for rows in newborndb.drug['interpretation'][gene][drug_genotype]:
drug_gene, drug_ch, drug_gen, drug_short_ch, drug_long_ch, drug_reference_ch, \
drug_short, drug_long, drug_reference = rows
if re.compile('standard dose',re.I).search(drug_short): continue
drug_en = newborndb.drug['backgroud'][drug_ch][2]
drug_back_ch = newborndb.drug['backgroud'][drug_ch][3]
drug_back_ref_ch = newborndb.drug['backgroud'][drug_ch][4]
annotations["drug"].add((drug_ch, drug_en, drug_gene, haps, drug_description, drug_short_ch))
annotations["drug_definiton"].add((drug_ch, drug_en, drug_long_ch, drug_reference_ch))
annotations["drug_message"].add((drug_ch, drug_en, drug_back_ch, drug_back_ref_ch))
drug_usage[drug_en].add(drug_short_ch)
if check_nat2 :
drug_ch = "异烟肼"
drug_en = "Isoniazid"
drug_gene = "NAT2"
haps = "*4*4"
drug_description = "快代谢"
drug_short_ch = "增加剂量"
drug_long_ch = "快乙酰化代谢型携带者其异烟肼清除率明显高于慢乙酰化携带者,同时对异烟肼的敏感性降低,更容易导致治疗失败;" \
"而慢乙酰化在常规剂量治疗中则更容易导致肝脏或神经系统毒性[1,2]。研究显示,对于NAT2快代谢型需要增加50%" \
"左右的异烟肼剂量,以达到预期的治疗效果[3]。"
drug_reference_ch = "[1]Chen B, et al. The influence of NAT2 genotypes on the plasma concentration of isoniazid " \
"and acetylisoniazid in Chinese pulmonary tuberculosis patients. Clin Chim Acta. 2006 Mar; 365" \
"(1-2):104-8. [2]Kim SH, et al. Genetic polymorphisms of drug-metabolizing enzymes and anti-TB " \
"drug-induced hepatitis. Pharmacogenomics. 2009 Nov; 10(11):1767-79. [3]Kinzig-Schippers M, et " \
"al. Should we use N-acetyltransferase type 2 genotyping to personalize isoniazid doses? Antimic" \
"rob Agents Chemother. 2005 May; 49(5):1733-8."
drug_back_ch = "异烟肼是各器官系统、各类型结核病和结核病预防治疗的首选药物,适用于初治和复治的各型肺结核、结核性胸膜炎、腹膜炎" \
"、心包炎、及消化道、泌尿生殖器结核,骨关节结核、淋巴结结核等。是结核性脑膜炎的必选药物。本品单用易产生耐药故需" \
"联合其他抗结核病药物使用。常联合利福平、吡嗪酰胺和链霉素(或乙胺丁醇)治疗初治病例,亦可联合除上述药物以外的其" \
"他抗结核药物用于复治病例的治疗。异烟肼是治疗结核病的一线药物,最为常见的不良反应是非常容易导致神经系统毒性肝毒" \
"性。研究显示导致的肝毒性的主要因素与异烟肼的主要代谢酶—乙酰转移酶Ⅱ(NAT2)的基因多态性有关。根据NAT2代谢能力" \
"的不同分为快乙酰化和慢乙酰化代谢型,携带一个(中间代谢型)或两个(快代谢)高活性NAT2等位基因称为快乙酰化代谢型," \
"携带两个低活性NAT2等位基因的称为慢乙酰化代谢型(慢代谢)[1,2]。快乙酰化代谢型携带者其异烟肼清除率明显高于慢乙" \
"酰化携带者,同时对异烟肼的敏感性降低,更容易导致治疗失败;而慢乙酰化在常规剂量治疗中则更容易导致肝脏或神经系统" \
"毒性[3,4]。"
drug_back_ref_ch = "[1]Sanofi-aventis.Rifampin, isoniazid and pyrazinamide drug label.2010. [2]Kinzig-S" \
"chippers M, et al. Should we use N-acetyltransferase type 2 genotyping to personalize isoniazid " \
"doses? Antimicrob Agents Chemother. 2005 May; 49(5):1733-8. [3]Chen B, et al. The influence of NA" \
"T2 genotypes on the plasma concentration of isoniazid and acetylisoniazid in Chinese pulmonary tub" \
"erculosis patients. Clin Chim Acta. 2006 Mar; 365(1-2):104-8. [4]Kim SH, et al. Genetic polymorph" \
"isms of drug-metabolizing enzymes and anti-TB drug-induced hepatitis. Pharmacogenomics. 2009 Nov;" \
" 10(11):1767-79."
annotations["drug"].add((drug_ch, drug_en, drug_gene, haps, drug_description, drug_short_ch))
annotations["drug_definiton"].add((drug_ch, drug_en, drug_long_ch, drug_reference_ch))
annotations["drug_message"].add((drug_ch, drug_en, drug_back_ch, drug_back_ref_ch))
drug_usage[drug_en].add(drug_short_ch)
return annotations, drug_usage
def float_to_per(rate):
return "%.1f%%" %(float(rate)*100)
def parsefile(file):
parser = make_parser()
parser.setContentHandler(ContentHandler( ))
parser.parse(file)
def compile_dir(dir, out):
file_out = zipfile.ZipFile(out, 'w')
for root, dirs, files in os.walk(dir):
for f in files:
abs_path = os.path.join(root, f)
rel_path = os.path.relpath(abs_path,dir)
file_out.write(abs_path, rel_path, zipfile.ZIP_STORED)
shutil.rmtree(dir, ignore_errors=True)
file_out.close()
def test2xlsx(filname, *args):
excel_name = xlsxwriter.Workbook(str(filname) + '.xlsx')
sheet = 0
format=excel_name.add_format({"font":'Arial', "size":11, "bold":1, "align":'left', "valign":'top',"text_wrap":0})
format.set_border(1)
format.set_bg_color('#FFCC99')
format_black=excel_name.add_format({"font":'Arial', "size":11, "bold":1, "align":'left', "valign":'top',"text_wrap":0})
format_black.set_border(1)
format_black.set_bg_color('#CCFFFF')
for files, formact_dict in args:
path, name = os.path.split(files)
names = name.split('_')
name = "_".join(names[1:])
if len(name) > 20:
sheet += 1
name = 'sheet' + str(sheet)
worksheet = excel_name.add_worksheet(name)
row = 0
for line in file(files):
rows = line.strip().split('\t')
row += 1
row_name = 'A' + str(row)
if row % 2:
worksheet.write_row(row_name,rows,format_black)
else:
worksheet.write_row(row_name,rows,format)
if formact_dict is not None:
for colnumber, (weight, otherdict) in formact_dict.iteritems():
worksheet.set_column(colnumber, colnumber, weight, None, otherdict)
os.remove(files)
excel_name.close()
def auto_report(batch_num, sample, variation, bamdir, tmpdir, outdir, newborndb, haplotype, message, gene_id, config, base, attachment, report_type, varsheet_dict, varsheet_title, all_indb_out, all_Notindb_out):
logger.info("|_ _ _ _ Start to writing reports of %s " % sample)
copy_file(tmpdir, os.path.join(outdir, sample)) ##不知道tmpdir有什么内容
rep = open(os.path.join(outdir, sample, 'word/document.xml'), 'w')
attach = copy.copy(attachment)
try:
norm_xml_format = NormXmlFormat(True)
back_xml_format = NormXmlFormat(False)
report_times = time.strftime('%Y年%m月%d日', time.localtime(time.time()))
annofile, depthfile = indexfile(variation, bamdir, sample, tmpdir)
raw_result_xlsx = findfile(variation, sample, str(sample) + '_', 'xlsm') ##bedanno注释后的文件吧
if raw_result_xlsx is None:
raw_result_xlsx = findfile(variation, sample, str(sample) + '_', 'xlsx')
if raw_result_xlsx is not None:
attach.add(raw_result_xlsx)
try:
depth_from_bam = pysam.AlignmentFile(depthfile, 'rb')
except Exception, e:
logger.info("|_ _ _ _ Bam file error : %s - %s " % (sample, e))
depth_from_bam = None
annotations = defaultdict(list)
partlast = config["disease_drug_table"]
last_table = defaultdict(bool)
all_path_dis = []
title_change = config['title_change'] if 'title_change' in config else {"chrom": "#chr"}
config_change = dict([(v.lower(), k.lower()) for k, v in title_change.iteritems()]) ##改变的位于顺序的第几个
not_inter_function = ["utr-3","utr-5","no-change","promoter"]
try:
tgp_filter = float(config['autoInterp_freq_threshold']['TGP_AF'])
except:
tgp_filter = 0.01
try:
pvfd_filter = config['autoInterp_freq_threshold']['PVFD_AF']
except:
pvfd_filter = 0.01
try:
inter_function = config['inter_function']
except:
inter_function = ["cds-del", "cds-ins", "cds-indel", "cds-loss", "init-loss",
"splice", "splice-3", "splice-5", "stop-gain", "stop-loss",
"altstart", "frameshift", "knockout", "missense", "nonsense", "span"]
try:
filter_genes = config["filter_genes"]
except:
filter_genes = list()
panel_genes = newborndb.genes['all']
dis_genes = newborndb.genes['disease'] ##166个疾病基因 /ifs4/HST_5B/USERS/zhaocece/Production/newborn_datasets/disease_gene_v5
drug_genes = newborndb.genes['drug']##药物基因drug_gene_v5
enter_all = copy.copy(newborndb.entertainment)
titles = dict()
indb = dict()
check_gene = defaultdict(set) #check_gene,基因与疾病组合键的包含的位点集合,那有没有可能是用药相关的位点都在里面
for line in gzip.open(annofile):
if line.startswith("##"):
continue
tmp_rows = line.strip().lower().split('\t')
if not len(titles) and (title_change["chrom"] not in tmp_rows):
continue
##还没读取表头前的都舍弃
elif title_change["chrom"] in tmp_rows: ##表头行
for num in xrange(len(tmp_rows)):
if tmp_rows[num] in config_change:
titles[config_change[tmp_rows[num]]] = num
else:
titles[tmp_rows[num]] = num
else:
line = norm_xml_format(line.strip())##正常突变行,信息拆分记录
rows = line.split('\t')
try:
gene_sym = rows[titles["genesymbol"]]
if gene_sym == "NADKD1" : gene_sym = "NADK2"
if gene_sym not in panel_genes and gene_sym != ".":
continue
if re.compile('h', re.I).match(rows[titles["zygosity"]]) and re.compile('X').search(rows[titles["chrom"]]) and gender_message(sample)=="M":
rows[titles["zygosity"]] = "Hemi"
gtype = rows[titles["zygosity"]]
if re.compile('hom', re.I).match(gtype):
gtype = "Hom"
elif re.compile('het', re.I).match(gtype):
gtype = "Het"
# else:
# continue
chrom = 'chr' + re.sub('^chr', '', rows[titles["chrom"]])
start = rows[titles["start"]]
stop = rows[titles["stop"]]
ref = rows[titles["ref"]]
call = rows[titles["call"]]
mut_function = rows[titles["functionname"]]
rows[titles["flank"]] = newborndb.flank[chrom.lower()][stop]
try:
if "tgdaf" in titles:
tgf = float(rows[titles["tgdaf"]])
else:
tgf = float(rows[titles["pvfdaf"]])
except:
tgf = 0.0
try:
if "pvfdaf" in titles:
pvfd = float(rows[titles["pvfdaf"]])
else:
pvfd = tgf
except:
##点float不成功,则设置为0
pvfd = 0.0
hgmd = rows[titles['hgmdpred']].lower() if 'hgmdpred' in titles else "dm"
try:
trans = rows[titles["transcript"]]
except:
trans = "."
try:
chgvs = rows[titles["chgvs"]]
except:
chgvs = "."
annotation_key = (chrom, start, stop, ref, call)
enter_k1 = (chrom, start, stop, ref, call, "Ref")
enter_k2 = (chrom, start, stop, ref, call, "Het")
enter_k3 = (chrom, start, stop, ref, call, "Hom")
enter_key = (chrom, start, stop, ref, call, gtype) ##gtype不就是Het或者Hom吗
if mut_function in not_inter_function and gene_sym not in drug_not_dis: continue ##什么逻辑?在drug_and_dis里的不考虑的功能则会过滤掉,是的就是这个
if chgvs and mut_function == 'intron' : ##intron保留10bp以内的
chgvs_m = re.compile(r'c\.(\d+)[-\+](\d+)[A-Z]>[A-Z]').search(chgvs)
if chgvs_m:
if int(chgvs_m.group(2)) > 10 and gene_sym not in drug_not_dis: continue
if annotation_key in newborndb.mutdb:##如果突变是在已知数据库里的,则用已知数据库里的各种信息
dbtrans = newborndb.mutdb[annotation_key][newborndb.database_title["nbtrans"]]
gene_name = newborndb.mutdb[annotation_key][newborndb.database_title["#gene"]]
disease = newborndb.mutdb[annotation_key][newborndb.database_title["disease"]].lower()
if annotation_key not in indb or trans == dbtrans: ##indb是个临时汇总记录该样本在已有数据库的位点字典,如果没有记录过或者转录本是目的转录本,则记录,键是chr start end ref alt,感觉trans判定有点重复判定的意思,如果在数据库中并且也是这个转录本的不是已经记录过的
indb[annotation_key] = rows
if gtype == "Het":##为什么只有杂合才记录基因和疾病涉及的位点?
tmp_key = (gene_name, disease)
check_gene[tmp_key].add(annotation_key) ##check_gene基因与疾病组合键的包含的位点集合,这个变量的作用是啥?
elif ((re.compile('dm', re.I).search(hgmd) or ((tgf <= tgp_filter) and (pvfd <= pvfd_filter)
and (mut_function in inter_function)))
and (gene_sym not in filter_genes) and (gene_sym in dis_genes)):
##如果不在已知的数据库里,根据hgmd预测为有害或者频率很低,并且基因是疾病基因,不是需要过滤的基因,则往下走
not_tmp = [gene_sym, trans, chgvs, chrom]
for i, j in sorted(newborndb.database_title.iteritems(), key=lambda s: int(s[1]))[4:]:
if i in titles:
not_tmp.append(rows[titles[i]])
else:
not_tmp.append(".")
annotations["not_in_db"].append(back_xml_format("\t".join(not_tmp)) + "\n") ##按照预先设置的格式存到不在数据库中的信息里
if enter_key in newborndb.entertainment: #enter_key = (chrom, start, stop, ref, call, gtype)
annotations["entertainment"].extend(enter_all[enter_key]) ### iwant zhidao 这个是用来干嘛的,春娜说是个体特征
if enter_k1 in enter_all: del enter_all[enter_k1]
if enter_k2 in enter_all: del enter_all[enter_k2]
if enter_k3 in enter_all: del enter_all[enter_k3]
except Exception, e:
logger.info("%s is not well-formed! %s" % (sample, e))
raise
tmp_out = os.path.join(outdir, sample + "_mut_indb")
annotations["in_db"], drug_usage = parsing_mut_indb(batch_num, sample, indb, check_gene, message['gender'], newborndb, haplotype,
titles, tmp_out, all_indb_out)
####here key important
##batch_num 批次或者项目编号
##sample 样品名称
##indb是个临时汇总记录的字典,如果不在外部数据库或者转录本是目的转录本,则记录
####check_gene基因与疾病组合键的包含的位点集合
##message['gender'] 性别信息
##newborndb是我也记不起来的数据库内容
##haplotype 项目分析的result路径下有id.xls 以及每个样本的好像是基因型但是就两行
##titles bedanno的表头,直接看起来就是,应该没有其他改动
##tmp_out
##all_indb_out在已有的数据库中的记录
fout = open(os.path.join(outdir, sample + "_mut_Notindb"), 'w')##不在数据库中的位点输出到文件,但实际运行中最后又删除了,annotations["not_in_db"]是中间记录的变量,annotation记录在库不在库以及各种库的位点情况
database_title = sorted(newborndb.database_title.keys(), key=lambda s: int(newborndb.database_title[s])) ###database_title过去突变位点的详细注释的表头的对应哪个列的记录
contral_varsheet_dict = dict()
for i, j in varsheet_title.iteritems():
# for i, j in newborndb.database_title.iteritems():
if i.lower() in varsheet_dict:
tmp_weight = varsheet_dict[i.lower()]["width"] or None
tmp_otherdict = {n:m for n, m in varsheet_dict[i.lower()].iteritems() if n != "width" and m != 0}
if tmp_weight or len(tmp_otherdict):
contral_varsheet_dict[int(j)] = (tmp_weight, tmp_otherdict)
fout.write("Project" + '\t' + "Sample" + '\t' + back_xml_format("\t".join([newborndb.dbrawtitle[i] for i in database_title if newborndb.database_title[i] <= newborndb.database_title["phgvs"]])))
fout.write('\t'+"ForPrimerDesign" + '\t' + back_xml_format("\t".join([newborndb.dbrawtitle[i] for i in database_title if newborndb.database_title[i] >newborndb.database_title["phgvs"]])) + '\n')
# fout.write(back_xml_format("\t".join([newborndb.dbrawtitle[i] for i in database_title])) + '\n')
for line in annotations["not_in_db"]:
rows = line.strip().split('\t')
gene_sym = rows[newborndb.database_title["#gene"]]
trans = rows[newborndb.database_title["nbtrans"]]
chgvs = rows[newborndb.database_title["nbchgvs"]]
phgvs = rows[newborndb.database_title["phgvs"]]
funcregion = rows[newborndb.database_title["funcregion"]]
zygosity = rows[newborndb.database_title["zygosity"]]
chrom=line[5].lower() if re.search('chr',line[5],re.I) else 'chr'+str(line[5]) #不知道为啥是line[5]
flank = rows[newborndb.database_title["flank"]]
forPrimerDesign = (gene_sym,trans,chgvs,phgvs,funcregion,zygosity,flank)