-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathVPOT_2_Gene.py
132 lines (130 loc) · 4.94 KB
/
VPOT_2_Gene.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
###########################################################################################################
# #
###########################################################################################################
#
import sys, re, glob, os, subprocess, time #
import numpy as np #
import VPOT_conf #
from shutil import copyfile #
#
#
###########################################################################################################
# Define global variables
##########################################################################################################
suffix=str(int(time.time())) # get a unique timestamp for suffix
#
supplied_args=0 #
#
tab='\t' #
nl='\n' #
#
GENE_loc=-1 #
#
info_msg1_1="VPOT genef : Invalid number of inputs, must have three :"
info_msg1_2="VPOT genef : 1) output destination directory + prefix" #
info_msg1_3="VPOT genef : 2) Input file - output from VPOT prioritisation process"
info_msg1_4="VPOT genef : 3) Gene list" #
#
############################################################################################################
#
###########################################################################################################
#
###########################################################################################################
def initial_setup():
#
global supplied_args
#
print ("initial_setup():") #
print (suffix) #
# print sys.argv #
supplied_args=len(sys.argv) #
# print supplied_args #
#
if (supplied_args != 5 ): # arg [0] is the python program
print (info_msg1_1+nl+info_msg1_2+nl+info_msg1_3+nl+info_msg1_4) #
return 1 #
else :
VPOT_conf.output_dir=sys.argv[2] #
VPOT_conf.input_file=sys.argv[3] #
VPOT_conf.gene_list=sys.argv[4] #
VPOT_conf.final_output_file=VPOT_conf.output_dir+"gene_filtered_output_file_"+suffix+".txt" #
print ("output : ",VPOT_conf.final_output_file) #
return 0 #
#
#
###########################################################################################################
#
###########################################################################################################
def filter_the_variants(): #
#
# input file for filtering is the output from the VPOT process. This means the variant gene name is in a column named GENE_NAME.
global GENE_loc #
#
# print ("filter_the_variants(): ") #
#
# print (VPOT_conf.input_file,VPOT_conf.final_output_file) #
with open(VPOT_conf.input_file,'r',encoding="utf-8") as variants_file, open(VPOT_conf.final_output_file,'w',encoding="utf-8") as filtered_file : #
for line1 in variants_file: # work each line of new sample vcf file
write_it=False # initialise score
line_parts=re.split('\t|\n|\r',line1) # split the variant up
# print "line part 0 : ",line_parts[0] #
if ("#CHROM" != line_parts[2]): #
# print src_line1 #
#
write_it=filter_variants_by_GN(line_parts) # check get priority score
#
else : # save the header line
write_it=True # initialise score
for i, content in enumerate(line_parts): # return the value and index number of the GENE_NAME item in the line array
# print "content-",content,"/",i #
if (content == VPOT_conf.GENE_NAME) : #
GENE_loc=i #save sample location
# print "INFO_loc: ",INFO_loc #
#
if (write_it): #
filtered_file.write(line1) # write the line to final output file
#
###########################################################################################################
#
###########################################################################################################
def filter_variants_by_GN(INFO_details): #
#
# print "filter_variants_by_GN(INFO_details): " #
#
val=False #
with open(VPOT_conf.gene_list,'r',encoding="utf-8") as gene_file : #
for gene_id in gene_file: # work each line of new sample vcf file
gene_id1=gene_id.rstrip() #
# print "Gene ID",gene_id1,"in",INFO1[i+1] # move to pred_array slot
if ( gene_id1 in INFO_details[GENE_loc] ) : # is this the gene we are looking for
gene_detail=re.split(',',INFO_details[GENE_loc]) # split into annotations
#print INFO_details[GENE_loc],"/",gene_detail,"/",gene_id1 #
for k in range(len(gene_detail)): #
# print gene_detail[k],"/",gene_detail,"/",gene_id1 #
if ( gene_id1 == gene_detail[k] ) : # is this the gene we are looking for
# print "match: ",gene_detail[k],"/",gene_detail,"/",gene_id1 #
val=True #
break #
# print "genes go around-",gene_id1 #
if ( val ) : # we have found the gene
break # then get out and move to next variant
#
return val #
#
##
###########################################################################################################
#
###########################################################################################################
def main(): #
##
VPOT_conf.init() #
#
print ("Gene Filter - Main") #
#
if (initial_setup() != 0): #
# print "no good" #
return #
#
# Now filter the input file by gene list
filter_the_variants() #
#