-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalleles2genepop.py
128 lines (111 loc) · 4.14 KB
/
alleles2genepop.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python
from collections import defaultdict as dd
import argparse
def parse_pop_file(fh):
'''
read in a headerless file specifying individual -> population mapping
return a dict thats keys are pop names, values are lists of individuals
'''
pops = dd(lambda : [])
indivs = []
for l in fh:
(indiv, pop) = l.split()
pops[pop].append(indiv)
indivs.append(indiv)
return(pops, indivs)
class Locus(object):
'''
container and helper functions for parsing loci in an alleles file
'''
def __init__(self):
self.inds = dd(lambda : [])
#current number of haplotypes at this locus
self.num_haplotypes = 0
#haplotype -> hapnum mapping
self.haps = {}
def add_ind(self, l):
(tmp_ind, hap) = l.split()
if hap not in self.haps:
self.num_haplotypes += 1
self.haps[hap] = self.num_haplotypes
ind = tmp_ind[1:-2]
self.inds[ind].append(self.haps[hap])
class Alleles_File(object):
'''
collections of alleles, phased, for individuals
-sample names are appended with r"_[01]" to specify haplotype 0 or 1
-spaces separate sample IDs and sequence
-loci are separeated by a line at the bottom that specifies the positions
of SNPs with either a * or -
allele lines look like:
>sample1_0 TCATACGTGACTGACTGACxxxxTCATACGTGACTGACTGAC
>sample1_1 TCATAAGTGACTGACTGACxxxxTCATACGGGACTGACTGAC
// - * |
'''
def __init__(self, fh):
'''
initialize starting variables
'''
self.file_handle = fh
self.curr_loc = Locus()
self.next_loc = self.get_next_locus()
def get_next_locus(self):
'''
'''
nl = Locus()
try:
currline = self.file_handle.next()
while not currline.startswith("//"):
nl.add_ind(currline)
currline = self.file_handle.next()
finally:
if nl.num_haplotypes >0:
return nl
else:
return None
def __iter__(self):
return self
# Python 3 compatibility
def __next__(self):
return self.next()
def next(self):
'''
'''
self.curr_loc = self.next_loc
if self.curr_loc != None:
self.next_loc = self.get_next_locus()
return self.curr_loc
else:
raise StopIteration()
def get_genepop_matrix(alleles_file, individuals):
gpop_field = '{:03d}' * 2
loci_names = []
hapmap = dd(lambda : [])
for i, locus in enumerate(Alleles_File(alleles_file)):
if locus.num_haplotypes > 1:
loci_names.append(str(i+1))
for ind in individuals:
if ind in locus.inds:
hapmap[ind].append(gpop_field.format(locus.inds[ind][0],locus.inds[ind][1]))
else:
hapmap[ind].append(gpop_field.format(0,0))
return(loci_names, hapmap)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Given a pyRAD alleles file and individual -> populations assignment, generate a genepop file.")
parser.add_argument("-a","--alleles", help="Your alleles file.",
type=argparse.FileType('r'), required=True)
parser.add_argument("-p","--pops", help="Populations assignment file. Must be white-space delimited, 2 columns. column 1: individual column 2: population",
type=argparse.FileType('r'), required=True)
parser.add_argument("-o","--out", help="The name of the genepop file.",
type=argparse.FileType('w'), required=True)
args = parser.parse_args()
out = args.out
populations, individuals = parse_pop_file(args.pops)
loci_names, haplotypes = get_genepop_matrix(args.alleles, individuals)
out.write("haplotypes\n")
out.write('\n'.join(loci_names) + '\n')
for pop in sorted(populations):
out.write("POP\n")
for ind in populations[pop]:
out.write(ind + ' , ')
out.write(' '.join(haplotypes[ind]) + '\n')