-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathExtract_homopolymer.py
59 lines (46 loc) · 1.3 KB
/
Extract_homopolymer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
"""
Extract Variants in/adjacent to homopolmyer repeat minimum length of 6bp.
date: June 25th, 2015
author: gerikson
"""
import os, sys, gzip, datetime
def open_chrom(j):
print "New chrom " + str(j)
infilename="/gpfs/group/stsi/genomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/chr"+str(j)+".fa"
infile=open(infilename, 'r')
outfilename="/gpfs/home/gerikson/wellderly/filter_data/Homopolymer/byChrom/homopolymer.chr"+str(j)+".txt"
outfile=open(outfilename, 'w')
counter=0
start_counter=0
amino_acid=""
repeat_lenght=0
for line in infile:
if line[0]!='>':
for i in line.strip():
#print i
counter=counter+1
if i.lower()==amino_acid:
#If this is part of a bigger repeat
if repeat_lenght>0:
repeat_lenght=repeat_lenght+1
#print str(repeat_lenght)
#if this is a newrepeat
else:
repeat_lenght=2
#That means we already had 2 aa minus the adjusent
start_counter=counter-3
else:
amino_acid=i.lower()
if repeat_lenght>6:
homopolmyer="chr"+str(j)+"\t"+str(start_counter)+"\t"+str(counter)+"\n"
outfile.write(homopolmyer)
repeat_lenght=0
#print "Polymer found! " + homopolmyer
else:
repeat_lenght=0
infile.close()
outfile.close()
for j in range(1,22):
open_chrom(j)
open_chrom("X")
open_chrom("Y")