-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcogdb.py
137 lines (123 loc) · 4.64 KB
/
cogdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python3
"""
Generate database for prokka from high identity blast matches
See protein.sh
"""
import sys
import csv
from Bio import SeqIO
_, blast_output_path, cog20_index_path, protein_database_cogs, output_path = sys.argv
# Columns:
# 1. Gene ID (GenBank or ad hoc)
# 2. NCBI Assembly ID
# 3. Protein ID (GenBank if conforms to [A-Za-z0-9_]+\.[0-9]+ regex; ad hoc otherwise)
# 4. Protein length
# 5. COG footprint coordinates on the protein. "201-400" means "from position 201 to position 400"; "1-100=201-300" indicates a segmented footprint, 1-100 AND 201-300
# 6. Length of the COG footprint on the proteins
# 7. COG ID
# 8. reserved
# 9. COG membership class (0: footprint covers most of the protein and most of the COG profile; 1: footprint covers most of the COG profile and part of the protein; 2: footprint covers most of the protein and part of the COG profile; 3: partial match on both protein and COG profile)
# 10. PSI-BLAST bit score for the match between the protein and COG profile
# 11. PSI-BLAST e-value for the match between the protein and COG profile
# 12. COG profile length
# 13. Protein footprint coordinates on the COG profile
with open(cog20_index_path) as f:
cog20_index = {pid.replace('.', '_'): (cogid, gene) for gene, _, pid, _, _, _, cogid, *_ in csv.reader(f)}
# Find anything that aligns to a cog20 protein and assign it the same COG id
cog_hits = {}
non_terminal_hits = {}
no_cog_hit = set()
cog_hit = set()
with open(blast_output_path) as f:
for dbid, qid, bitscore in csv.reader(f, 'excel-tab'):
qid = qid.replace('.', '_')
dbid = dbid.replace('.', '_')
bitscore = float(bitscore)
if dbid == qid:
no_cog_hit.add(qid)
continue
cog_hit.add(qid)
hit = cog_hits.get(qid)
if not hit or hit[0] < bitscore:
cog = cog20_index.get(dbid)
if cog:
cog_hits[qid] = (bitscore, cog[0])
else:
hit = non_terminal_hits.get(qid)
if not hit or hit[0] < bitscore:
non_terminal_hits[qid] = (bitscore, dbid)
# Cluster anything that did not point directly to a COG protein
clusters = []
for qid, (_, dbid) in non_terminal_hits.items():
for c in clusters:
if qid in c or dbid in c:
c.add(qid)
c.add(dbid)
break
else:
clusters.append({qid, dbid})
print("Collapsing clusters")
i = 0
while i < len(clusters):
a = i + 1
while a < len(clusters):
if clusters[a] and clusters[i] & clusters[a]:
clusters[i] |= clusters[a]
del clusters[a]
else:
a += 1
i += 1
print("Clustered")
# Use any second order alignments to a COG protein to assign the entire cluster to the same COG
non_terminal_hits.update({v: (b, k) for k, (b, v) in non_terminal_hits.items()})
keys = set(cog_hits.keys())
todel = []
for i, c in enumerate(clusters):
cluster_hits = c.intersection(keys)
if len(cluster_hits) == 0: continue
chit = next(h for h in cluster_hits)
if len(cluster_hits) > 1:
hit = non_terminal_hits[chit]
for ch in cluster_hits:
newhit = non_terminal_hits[ch]
if hit[0] < newhit[0]:
hit = newhit
chit = ch
for p in c:
cog_hits[p] = cog_hits[chit]
todel.append(i)
print("Clusters associated")
for d in reversed(todel):
del clusters[d]
for i, c in enumerate(clusters):
for p in c:
cog_hits[p] = (None, f"pCOG{i}")
cog_hits = {k: v[1] for k, v in cog_hits.items()}
genes = {k: v for (k, v) in cog20_index.values()}
no_cog_hit = no_cog_hit - cog_hit
with open(protein_database_cogs) as f, open(output_path, 'w') as o:
for r in SeqIO.parse(f, 'fasta'):
r.description = r.description.lstrip(r.id + " ")
r.id = r.id.replace('.', '_')
if r.id not in cog_hits and r.id not in no_cog_hit: continue
gene = ''
desc = ''
if '~~~' not in r.description:
desc = r.description.rsplit('[', 1)[0].strip(' ')
cogid = ''
cog = cog20_index.get(r.id)
if cog:
cogid, gene = cog
else:
cogid = cog_hits.get(r.id, '')
if not gene:
gene = genes.get(cogid, '')
if '~~~' in r.description:
descs = r.description.split('~~~')
if not gene and descs[1] != 'None':
gene = descs[1]
if not desc and descs[2] != 'None':
desc = descs[2]
#r.description = f"~~~{gene}~~~{desc}~~~{cogid}" # for prokka db
r.description = cogid or f"{r.id} {r.id}" # SeqIO will trim first copy
SeqIO.write(r, o, 'fasta')