-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmoleculeSemanticRelatedness.py
310 lines (262 loc) · 13.3 KB
/
moleculeSemanticRelatedness.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
# -*- coding: utf-8 -*-
"""
TODO: use per year word2vec models
TODO: adjust for P53 and synonyms in word embeddings calculations
TODO: read in a real word2vec model
TODO: improve the intact score calculation for known interactions (base it on methods and number of supporting evidence)
Created on Tue Jul 05 16:49:41 2016
Code to use semantic relatedness of small molecules to predict new
molecule to molecule relationships.
Given a molecule $m$, this defines a semantic context $sc(m)$ as a $n_sc$ sized vector
that maps the molecule to a semantic space. This is built using an external program to
construct the models. This program assumes the model is built and stored as a python
Gensim word2vec model
Given a molecule $mi$, this outputs the value Ni and Ns and Ni/Ns. Ns is the number
of other molecules $mj$ in the set $Smi$ which is the set of molecules
for which $s(mi,mj)$ is greater than a threshold. Ni is the number of molecules
in $Smi$ that have a known interaction with a given molecules (in our case, TP53).
input file:
dict1 = {'GeneA': genea, 'GeneB': geneb, 'PubMedId': pubmedid,
"Negative" : negflag, "IdMethodA": geneamethod, "IdMethodB": genebmethod,
'UTCDate' : pubmeddate}
output file:
{'Gene': curgene, 'Ni': ni, 'Ns': ns, 'Ni/Ns' : (ni/ns), 'Hit' : true/false}
where hit is true if no existing known relationship exists between the gene
and the target gene for a given evaluation date
@author: cpschmitt
"""
import pandas as pd
from sets import Set
import numpy as np
import random
import gensim.models
import time
import datetime
import string
class testmodel:
simmap={} # map from hash of genes to similarity score
vocab=[]
def __init__(self, infile):
dfmodel=pd.read_csv(infile)
self.vocab=dfmodel.GeneA.unique()
for row in dfmodel.iterrows():
genei = row[1]['GeneA']
genej = row[1]['GeneB']
score = row[1]['Score']
self.simmap[self.gethash(genei,genej)]=score
def similarity(self, genei, genej):
hkey=self.gethash(genei,genej)
if self.simmap.has_key(hkey):
return self.simmap[hkey]
else:
print "!!!"
return 0
def gethash(self,genei,genej):
if genei<genej:
return str(genei)+":"+str(genej)
else:
return str(genej)+":"+str(genei)
# return the value of gene that is in wemodel, otherwise return None
def isinmodel(gene,wemodel):
if gene in wemodel.vocab:
return gene
elif gene.lower() in wemodel.vocab:
return gene.lower()
elif gene.upper() in wemodel.vocab:
return gene.upper()
else:
return None
def checkgenecoverage(genelist,wemodel):
# checks the number of genes are in the model
numgenes=len(genelist)
numingenes=0.0
for i in range(0,numgenes):
genei = isinmodel(genelist[i],wemodel)
if (genei is not None):
numingenes=numingenes+1
print "num genes "+str(numgenes)+", num found genes "+str(numingenes) + ", percent of found genes "+str(100.0*(numingenes/numgenes))
def computesemanticrelatedness(intact,genelist,wemodel):
# sets intact[i,j,INTACT_SR_INDEX] to the semantic relatedness of genes, range [0,1]
numgenes=len(genelist)
for i in range(0,numgenes):
for j in range(i,numgenes):
if wemodel==None:
intact[i,j,INTACT_SR_INDEX]=0 #random.uniform(0, 1)
else:
genei = isinmodel(genelist[i],wemodel)
if (genei is None):
#print "Gene not found in embedded model "+str(genelist[i])
random.uniform(0, 1)
else:
genej = isinmodel(genelist[j],wemodel)
if (genej is None):
#print "Gene not found in embedded model "+str(genelist[j])
random.uniform(0, 1)
else:
#print "Gene pair found in embedded model "+str(genelist[i]) +","+str(genelist[j])+", sim "+str(wemodel.similarity(genei, genej))
intact[i,j,INTACT_SR_INDEX]=wemodel.similarity(genei, genej)
intact[j,i,INTACT_SR_INDEX]=intact[i,j,INTACT_SR_INDEX]
def computeknownrelations(intgraph,geneindexmap,dfpairlist,evaldate):
# sets intact[i,j,INTACT_KR_INDEX] to the relatedness of genes based on known interactions that occur before evaldate
# range [-1,1]
# the interaction score will be negative if the relations are known negative, positive if known positive
# the magnitude of the score will increase with the evidence
#
intgraph[:,:,INTACT_KR_INDEX]=0
for row in dfpairlist.iterrows():
genea = row[1]['GeneA']
geneb = row[1]['GeneB']
intdate = row[1]['UTCDate']
if intdate>evaldate:
continue
geneaindex=geneindexmap[genea]
genebindex=geneindexmap[geneb]
if row[1]['Negative'] is True:
intgraph[geneaindex,genebindex,INTACT_KR_INDEX]=intgraph[geneaindex,genebindex,INTACT_KR_INDEX]-0.5
intgraph[geneaindex,genebindex,INTACT_KR_INDEX]=max(-1.0,intgraph[geneaindex,genebindex,INTACT_KR_INDEX])
else:
intgraph[geneaindex,genebindex,INTACT_KR_INDEX]=intgraph[geneaindex,genebindex,INTACT_KR_INDEX]+0.5
intgraph[geneaindex,genebindex,INTACT_KR_INDEX]=min(1.0,intgraph[geneaindex,genebindex,INTACT_KR_INDEX])
def computeknownfirstdate(intgraph,geneindexmap,dfpairlist):
# sets intact[i,j,INTACT_FD_INDEX] to the first date of evidence for the known relationship, range utc date or -1 if none
intgraph[:,:,INTACT_FD_INDEX]=-1
for row in dfpairlist.iterrows():
genea = row[1]['GeneA']
geneb = row[1]['GeneB']
geneaindex=geneindexmap[genea]
genebindex=geneindexmap[geneb]
intdate = row[1]['UTCDate']
if intgraph[geneaindex,genebindex,INTACT_FD_INDEX]==-1 or intdate<intgraph[geneaindex,genebindex,INTACT_FD_INDEX]:
intgraph[geneaindex,genebindex,INTACT_FD_INDEX]=intdate
def computepredrelations_old(intgraph,genelist,geneindexmap,targetgene):
# compute predictions of all genes with the target gene
# make predictions as to whether genes in genelist interact with the target
# gene based on the semantic relation of genes in intgraph
# Approach
# for each gene in genelist
# look at number of neighbors Ns with strong semantic relatedness
# identify the number of genes Ni in Ns that have interactions with the gene-gene pair of interest
# record the values Ni, Ni/Ns
# returns a data frame with columns gene,ns,ni,ni/ns
numgenes=len(genelist)
Tns=0.2 # threshold for determining Ns
Tng=0.2 # threshold for determining Ni
targetgeneindex=geneindexmap[targetgene]
rows_list = []
for row in dfpairlist.iterrows():
intdate = row[1]['UTCDate']
if intgraph[geneaindex,genebindex,INTACT_FD_INDEX]==-1 or intdate<intgraph[geneaindex,genebindex,INTACT_FD_INDEX]:
intgraph[geneaindex,genebindex,INTACT_FD_INDEX]=intdate
for curgene in genelist:
curgeneindex=geneindexmap[curgene]
ns=0.0
ni=0.0
for neighborgeneindex in range(0,numgenes):
if intgraph[curgeneindex,neighborgeneindex,INTACT_SR_INDEX]>Tns:
ns=ns+1
if intgraph[neighborgeneindex,targetgeneindex,INTACT_KR_INDEX]>Tng:
ni=ni+1
if ns==0:
nidivns=-1
else:
nidivns=(ni/ns)
dict1 = {'Gene': curgene, 'Ni': ni, 'Ns': ns, 'Ni/Ns' : nidivns, 'Hit' : False, 'INTACT_FD_INDEX' : str(intdate) + str(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(intdate)))}
rows_list.append(dict1)
return pd.DataFrame(rows_list)
def computepredrelations(intgraph,genelist,geneindexmap,targetgene):
# compute predictions of all genes with the target gene
# for a given gene, check if the gene has strong semantic relations with genes known to interact with the target gene
numgenes=len(genelist)
Tns=0.7 # threshold for determining Ns
Tng=0.4 # threshold for determining Ni
targetgeneindex=geneindexmap[targetgene]
rows_list = []
for curgene in genelist:
curgeneindex=geneindexmap[curgene]
ns=0.0
ni=0.0
for neighborgeneindex in range(0,numgenes):
if intgraph[curgeneindex,neighborgeneindex,INTACT_SR_INDEX]>Tns:
ns=ns+1
if intgraph[neighborgeneindex,targetgeneindex,INTACT_KR_INDEX]>Tng:
ni=ni+1
if ns==0:
nidivns=-1
else:
nidivns=(ni/ns)
dict1 = {'Gene': curgene, 'Ni': ni, 'Ns': ns, 'Score' : nidivns}
rows_list.append(dict1)
return pd.DataFrame(rows_list)
def evaluatepredinteractions(intgraph,dfresults,evaldate,targetgene):
# given a prediction in dfresults, check and see if the relation already exists
# before the evaldate in which case mark it as a non hit, otherwise its a hit
results=[]
for row in dfresults.iterrows():
genea = row[1]['Gene']
score = row[1]['Score']
geneaindex=geneindexmap[genea]
genebindex=geneindexmap[targetgene]
firstdate=intgraph[geneaindex,genebindex,INTACT_FD_INDEX]
if score>0 and (firstdate==-1 or firstdate>=evaldate):
results.append(True)
else:
results.append(False)
dfresults['Hit']=results
##################################
##################################
debug=True
inFile = "/projects/mipseq/chemotext2/IntActKinaseP53Pairs.csv"
# FOR DEBUGGING inFile = "C:\Users\cpschmitt\Desktop\development\kinases\TestSet1_IntActPairs.csv"
# inWEFile = "/projects/chemotext/kinase_analysis/wordembedding_models/examplemodel"
inWEFile = "/projects/stars/var/chemotext/w2v/gensim/cumulative/pmc-2016.w2v"
outPredResultsFile = "/projects/mipseq/chemotext2/KinaseP53Resultstest_Comulative2016.csv"
intgraph=None # main array, [i][j]=ith gene by jth gene
INTACT_SR_INDEX=0 # [i][j][0]=semantic similarity of gene i and gene j (0,1)
INTACT_KR_INDEX=1 # [i][j][1]=interaction score of gene i and gene j from known sources (-1 to 1)
INTACT_FD_INDEX=2 # [i][j][2]=first date of gene i and gene j from known sources (utc, -1 if unknown)
INTACT_NUM_DIMENSIONS=3 # number of dimensions of
# get evaluation date (date of word embedding generation)
evaldatestr = "01/01/2016" # jan 1 2012 0:0:0
#evaldate=1356998400 # jan 1 2013 0:0:0
evaldate = time.mktime(datetime.datetime.strptime(evaldatestr, "%d/%m/%Y").timetuple())
# target gene
targetgene="TP53"
# step 1 read in gene interaction data
dfpairlist=pd.read_csv(inFile)
# step 2 get list of genes and a map from gene to index within the genelist, the map will be used
# to go from gene name to index within genelist and intgraph
geneset=Set(dfpairlist['GeneA'].tolist())
geneset=geneset.union(dfpairlist['GeneB'].tolist())
genelist=sorted(geneset)
geneindexmap = {x:i for i,x in enumerate(genelist)}
# construct intact
numgenes=len(genelist)
intact=np.zeros((numgenes,numgenes,INTACT_NUM_DIMENSIONS))
# step 3 read in the word embeddings model
wemodel = gensim.models.Word2Vec.load(inWEFile)
checkgenecoverage(genelist,wemodel)
# FOR DEBUGGING wemodel= testmodel("C:\Users\cpschmitt\Desktop\development\kinases\TestSet1_IntActSemRep.csv")
# step 4 update intact with semantic relatedness between genes
computesemanticrelatedness(intact,genelist,wemodel)
#print intact[geneindexmap['K1'],geneindexmap['P1'],INTACT_SR_INDEX]
#print intact[geneindexmap['P2'],geneindexmap['K1'],INTACT_SR_INDEX]
#print intact[geneindexmap['P1'],geneindexmap['K1'],INTACT_SR_INDEX]
# compute known gene-gene interaction data
computeknownrelations(intact,geneindexmap,dfpairlist,evaldate)
#print intact[geneindexmap['P1'],geneindexmap['TP53'],INTACT_KR_INDEX]
#print intact[geneindexmap['P2'],geneindexmap['TP53'],INTACT_KR_INDEX]
#print intact[geneindexmap['P10'],geneindexmap['TP53'],INTACT_KR_INDEX]
#print intact[geneindexmap['K1'],geneindexmap['TP53'],INTACT_KR_INDEX]
#print intact[geneindexmap['K1'],geneindexmap['Tmp1'],INTACT_KR_INDEX]
#print intact[geneindexmap['K1'],geneindexmap['K2'],INTACT_KR_INDEX]
# get date of first known relations
computeknownfirstdate(intact,geneindexmap,dfpairlist)
#print intact[geneindexmap['P1'],geneindexmap['TP53'],INTACT_FD_INDEX]
#print intact[geneindexmap['P5'],geneindexmap['TP53'],INTACT_FD_INDEX]
#print intact[geneindexmap['K1'],geneindexmap['TP53'],INTACT_FD_INDEX]
#print intact[geneindexmap['K2'],geneindexmap['TP53'],INTACT_FD_INDEX]
# predict novel gene-gene interactions based upon semantic relatedness graph
dfresults=computepredrelations(intact,genelist,geneindexmap,targetgene)
# evaluate predictions
evaluatepredinteractions(intact,dfresults,evaldate,targetgene)
dfresults.to_csv(outPredResultsFile)