-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathremove_coverage.py
104 lines (76 loc) · 3.5 KB
/
remove_coverage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""
Remove the variants that are bellow 10 coverage or >110
"""
import os, sys, gzip, datetime
def main(chrom):
ch = chrom[3:]
print "chrom is: " + ch
#V1 for the wellderly filter
#input_filename="/gpfs/group/stsi/data/projects/wellderly/GenomeComb/vcf_filtered_VQHIGH_whiteOnly_clustered_repeats_homopoly_etc_missing/wellderly_inova.VQHIGH.0.95white.nocluster.repeats.etc.missing."+str(chrom)+".vcf.gz"
#output_filename="/gpfs/group/stsi/data/projects/wellderly/GenomeComb/vcf_filtered_VQHIGH_whiteOnly_clustered_repeats_homopoly_etc_missing_cov/v1_wellderly_inova.VQHIGH.0.95white.nocluster.repeats.etc.missing.cov."+str(chrom)+".vcf.gz"
#counter_file="/gpfs/group/stsi/data/projects/wellderly/GenomeComb/vcf_filtered_VQHIGH_whiteOnly_clustered_repeats_homopoly_etc_missing_cov/v1_counter_wellderly_inova.VQHIGH.0.95white.nocluster.repeats.etc.missing.cov.txt"
#coverage_file="/gpfs/group/torkamani/bhuvan/wellderly/coverage/CoverageInfo/MediansCompiled/medians_chrm_"+str(ch)+".csv"
#V2 for the inova filter
input_filename= "/gpfs/group/stsi/data/projects/wellderly/GenomeComb/vcf_allFilters_36kmer_snpsOnly_AF0.01/vcf_nokmer_snps_AF0.01.noRelated." + chrom + ".vcf.gz"
output_filename="/gpfs/group/stsi/data/projects/wellderly/GenomeComb/vcf_allFilters_36kmer_snpsOnly_AF0.01/final_vcf_nokmer_snps_AF0.01.noRelated." + chrom + ".vcf.gz"
counter_file="/gpfs/group/stsi/data/projects/wellderly/GenomeComb/vcf_allFilters_36kmer_snpsOnly_AF0.01/counter.txt"
coverage_file="/gpfs/group/stsi/data/projects/wellderly/GenomeComb/inova_median_coverage/MediansCompiled/medians_chrm_"+str(ch)+".csv"
o = gzip.open(output_filename, 'w')
#Create a dictionary of the covereage
cov_file = open(coverage_file)
covereage = {}
print "Start creating dictionary"
for i in cov_file:
ln = i.split(",")
covereage[ln[0]] = ln[1]
print "Dictionary created"
c = open(counter_file, "a")
print 'Calculating...'
counter = 0
good_lines = 0
block = ""
f = gzip.open(input_filename)
for line in f:
if line[:1] == "#":
print "header"
o.write(line)
else:
counter += 1
if counter%100 == 0:
o.write(block)
block = ""
o.flush()
if counter%10000 == 0:
print datetime.datetime.now().time()
print "total lines " + str(counter)
print "Good lines " + str(good_lines)
sys.stdout.flush()
tp_line = line.strip().split()
pos = tp_line[1] + "_" + str(len(tp_line[3]))
#Extract the coverage from the dictionary
cov = "0.0"
try:
cov = covereage[pos]
except:
print "coverage not found"
print tp_line[:5]
if float(cov) < 10.0 or float(cov) > 110.0:
continue
else:
block = block+line.strip()+"\n"
good_lines += 1
o.write(block)
print "Total lines " + str(counter)
print "Total good lines " + str(good_lines)
c.write(chrom + "\t" + str(counter) + "\t" + str(good_lines)+ "\n")
c.close()
f.close()
o.close()
if __name__ == '__main__':
print "Python Version: " + sys.version
import time
start = time.time()
main(sys.argv[1])
#main(sys.argv[1], sys.argv[2], sys.argv[3])
end = time.time()
#print 'This took {0} seconds'.format(end - start)