-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsnakefile_post-processing_svim-svs
132 lines (116 loc) · 6.68 KB
/
snakefile_post-processing_svim-svs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
aligners, = glob_wildcards("/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim.vcf")
print(aligners)
rule all:
input:
expand("/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_non-STR_INS_pad100.bed", aligner = aligners)
rule remove_SVs_less_50bp:
input:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim.vcf"
output:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp.vcf"
shell:
"bcftools view -i 'INFO/SVLEN<-50 || INFO/SVLEN>50' {input} > {output}"
rule remove_SVs_more_than_10mb:
input:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp.vcf"
output:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb.vcf"
shell:
"bcftools view -i 'INFO/SVLEN>=-10000000 && INFO/SVLEN<=10000000' {input} > {output}"
rule sort_vcf:
input:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb.vcf"
output:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort.vcf"
shell:
"bcftools sort -O v {input} > {output}"
rule zip_vcf:
input:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort.vcf"
output:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort.vcf.gz"
shell:
"bgzip {input}"
rule index_zipped_vcf:
input:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort.vcf.gz"
output:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort.vcf.gz.tbi"
shell:
"tabix {input}"
rule retain_chr21_svs_only:
input:
zipped="/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort.vcf.gz",
to_ensure_rule_order="/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort.vcf.gz.tbi"
output:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21.vcf"
shell:
"bcftools view -r chr21 {input.zipped} > {output}"
rule subset_vcf:
input:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21.vcf"
output:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21.bed"
shell:
"bcftools query -f '[%CHROM]\t[%POS]\t[%INFO/END]\t[%INFO/SVLEN]\t[%INFO/SVTYPE]\t[%SUPPORT]\t[%GT]\n' {input} > {output}"
rule retain_STR_regions:
input:
data="/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21.bed",
STR="/home/croy/teams/CSE284_SP21_A00/team15/data/table-browser_simple-repeats-full.bed"
output:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_STR.bed"
shell:
"bedtools intersect -f 0.5 -wa -wb -a {input.data} -b {input.STR} | cut -f 1-7 | uniq -c | cut -c 9- > {output}"
rule obtain_STR_DELs:
input:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_STR.bed"
output:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_STR_DEL.bed"
shell:
"""awk '$5 == "DEL"' {input} > {output}"""
rule obtain_STR_INSs_DUPs:
input:
data="/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_STR.bed",
to_ensure_rule_order="/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_STR_DEL.bed"
output:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_STR_INS.bed"
shell:
"""awk '$5 == "INS" || $5 == "DUP"' {input.data} > {output}"""
rule pad_STR_INS_DUP_coordinates:
input:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_STR_INS.bed"
output:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_STR_INS_pad100.bed"
shell:
"""awk 'BEGIN {{OFS="\\t"}}; {{if($3-$2<100) printf "%s\\t%.0f\\t%.0f\\t%s\\t%s\\t%s\\t%s\\n", $1, $2-(100-($3-$2))/2, $3+(100-($3-$2))/2, $4, $5, $6, $7; else print $0}}' {input} > {output}"""
rule obtain_non_STR_SVs:
input:
data="/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21.bed",
STR="/home/croy/teams/CSE284_SP21_A00/team15/data/short-tandem-repeats_simple-repeats_repeat-masker-simple-repeats_cat_sort_merge.bed",
to_ensure_rule_order="/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_STR_INS_pad100.bed"
output:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_non-STR.bed"
shell:
"bedtools intersect -v -f 0.5 -wa -wb -a {input.data} -b {input.STR} | cut -f 1-7 | uniq -c | cut -c 9- > {output}"
rule obtain_non_STR_DELs:
input:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_non-STR.bed"
output:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_non-STR_DEL.bed"
shell:
"""awk '$5 == "DEL"' {input} > {output}"""
rule obtain_non_STR_INSs_DUPs:
input:
data="/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_non-STR.bed",
to_ensure_rule_order="/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_non-STR_DEL.bed"
output:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_non-STR_INS.bed"
shell:
"""awk '$5 == "INS" || $5 == "DUP"' {input.data} > {output}"""
rule pad_non_STR_INS_DUP_coordinates:
input:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_non-STR_INS.bed"
output:
"/home/croy/teams/CSE284_SP21_A00/team15/data/hg002_ont_chr21_{aligner}_sort_svim_rm-50bp_rm-10mb_sort_index_chr21_non-STR_INS_pad100.bed"
shell:
"""awk 'BEGIN {{OFS="\\t"}}; {{if($3-$2<100) printf "%s\\t%.0f\\t%.0f\\t%s\\t%s\\t%s\\t%s\\n", $1, $2-(100-($3-$2))/2, $3+(100-($3-$2))/2, $4, $5, $6, $7; else print $0}}' {input} > {output}"""