assign_nodes.py
import pandas as pd
import re
import os

# Display all dataframe rows so search results are shown in full
pd.set_option('display.max_rows', None)

# Read in the user example file and return it as a pandas dataframe
def read_user_input(user_example_file):
    examples = pd.read_csv(user_example_file, sep="|")
    return(examples)
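# Example of the expected pipe-delimited input file (hypothetical contents shown
# for illustration only; the real file supplies the user's own source/target pairs):
#   source|target
#   glucose|insulin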
# Get the unique nodes across the source and target columns
# Inputs: examples - pandas dataframe of user input examples
# Outputs: nodes - list of unique node values
def unique_nodes(examples):
    # melt the dataframe so all node values share one column, then deduplicate
    nodes = list(set(pd.melt(examples)["value"].values))
    return(nodes)
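# For example (hypothetical data), a dataframe built from
#   pd.DataFrame({"source": ["A", "B"], "target": ["B", "C"]})
# yields the unique values ["A", "B", "C"] (in no guaranteed order).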
# Search the knowledge graph labels for nodes matching an input term
# Inputs: node - search string from a user input example
#         kg - knowledge graph of class Knowledge Graph
#         ontology - specific ontology to restrict the search of nodes (currently unused)
# Outputs: results - dataframe of matching labels and entity URIs
def find_node(node, kg, ontology=""):
    nodes = kg.labels_all
    # case-insensitive substring match against all node labels
    results = nodes[nodes["label"].str.contains(node, flags=re.IGNORECASE, na=False)][["label", "entity_uri"]]
    return(results)
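# For example (hypothetical search term), find_node('diabetes', kg) would return
# every row of kg.labels_all whose label contains 'diabetes', ignoring case.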
# Interactively map each user-provided node to a label in the knowledge graph
def search_nodes(nodes, kg, examples):
    print('searching nodes: ', nodes)
    examples["source_label"] = ""
    examples["target_label"] = ""
    for node in nodes:
        orig_node = node
        # Keep prompting until the search term returns at least one match
        search_loop = True
        while(search_loop):
            print("User Search Node: ", node)
            found_nodes = find_node(node, kg)
            nrow = found_nodes.shape[0]
            if nrow == 0:
                print("No search terms returned")
                node = input("Please try another input term: ")
            else:
                search_loop = False
        print("Found", nrow, "features in KG")
        user_input = ""
        bad_input = True
        if nrow < 20:
            # Few enough matches to display all at once
            while(bad_input):
                print(found_nodes.iloc[0:nrow, ].to_string())
                user_input = input("Input node 'label': ")
                if node_in_search(found_nodes, user_input):
                    # Handle duplicate label names by asking for the entity_uri instead
                    if len(found_nodes[found_nodes['label'] == user_input][['label', 'entity_uri']]) > 1:
                        user_input = input("Input node 'entity_uri': ")
                        if node_id_in_search(found_nodes, user_input):
                            node_label = kg.labels_all.loc[kg.labels_all['entity_uri'] == user_input, 'label'].values[0]
                            bad_input = False
                    else:
                        node_label = user_input
                        bad_input = False
                elif node_in_labels(kg, user_input):
                    node_label = user_input
                    bad_input = False
                else:
                    print("Input not in search results.... try again")
        else:
            # Page through the matches 20 at a time
            i = 0
            while(bad_input):
                high = min(nrow, (i+1)*20)
                print(found_nodes.iloc[i*20:high, ].to_string())
                user_input = input("Input node 'label' or 'f' for the next 20 features or 'b' for the previous 20: ")
                if user_input == 'f':
                    i += 1
                elif user_input == 'b':
                    i -= 1
                else:
                    i += 1
                    if node_in_search(found_nodes, user_input):
                        node_label = user_input
                        bad_input = False
                    else:
                        print("Input not in search results.... try again")
        # A valid label was chosen; record it for this node
        examples.loc[examples["source"] == orig_node, "source_label"] = node_label
        ## Skip this line for comparing kg-covid19/monarch graph to pkl only
        examples.loc[examples["target"] == orig_node, "target_label"] = node_label
    examples.drop_duplicates(keep='first', inplace=True)
    print('examples complete: ', examples)
    return(examples)
# Non-interactive version of search_nodes: assigns labels without user input when
# the values given in the file are exact entity URIs already present in the KG
def extract_label(nodes, kg, examples):
    examples["source_label"] = ""
    examples["target_label"] = ""
    for node in nodes:
        node_label = kg.labels_all.loc[kg.labels_all['entity_uri'] == node, 'label'].values[0]
        examples.loc[examples["source"] == node, "source_label"] = node_label
        examples.loc[examples["target"] == node, "target_label"] = node_label
    return(examples)
# Check if the search input is one of the labels in the search results
def node_in_search(found_nodes, user_input):
    if user_input in found_nodes[["label"]].values:
        return(True)
    else:
        return(False)
# Check if the search input is one of the entity URIs in the search results
def node_id_in_search(found_nodes, user_input):
    if user_input in found_nodes[["entity_uri"]].values:
        return(True)
    else:
        return(False)
# Check if the search input is a label of any node in the knowledge graph
def node_in_labels(kg, user_input):
    labels = kg.labels_all
    if user_input in labels[["label"]].values:
        return(True)
    else:
        return(False)
# Write the mapped examples dataframe (source/target headers, '|' delimited) to the input nodes file
def create_input_file(examples, output_dir, kg_type, experiment_paths):
    input_file = output_dir + "/" + kg_type + '_' + experiment_paths + "_Input_Nodes_.csv"
    #examples = examples[["source_label","target_label"]]
    #examples.columns = ["source", "target"]
    examples.to_csv(input_file, sep="|", index=False)
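# For example (hypothetical arguments), create_input_file(s, 'output', 'pkl', 'shortest_path')
# writes the file 'output/pkl_shortest_path_Input_Nodes_.csv'.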
# Check if the input nodes file already exists in the output directory
def check_input_existence(output_dir, kg_type, experiment_paths):
    print('checking for: ', output_dir + '/' + kg_type + '_' + experiment_paths + "_Input_Nodes_")
    exists = 'false'
    mapped_file = ''
    for fname in os.listdir(output_dir):
        if bool(re.match(kg_type + '_' + experiment_paths + "_Input_Nodes_", fname)):
            print(fname)
            exists = 'true'
            mapped_file = fname
    return exists, mapped_file
# Wrapper function: map user input nodes to KG labels, either interactively from a
# user file or automatically from a dataframe of exact entity URIs
def interactive_search_wrapper(g, user_input_file, output_dir, kg_type, experiment_paths, input_type, input_df):
    if input_type == 'file':
        exists = check_input_existence(output_dir, kg_type, experiment_paths)
        if(exists[0] == 'false'):
            print('Interactive Node Search')
            # Interactively assign nodes
            u = read_user_input(user_input_file)
            n = unique_nodes(u)
            s = search_nodes(n, g, u)
            create_input_file(s, output_dir, kg_type, experiment_paths)
        else:
            print('Node mapping file exists... moving to embedding creation')
            mapped_file = output_dir + '/' + exists[1]
            s = pd.read_csv(mapped_file, sep="|")
    else:
        n = unique_nodes(input_df)
        s = extract_label(n, g, input_df)
        #create_input_file(s,output_dir,kg_type,experiment_paths)
    return(s)
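# Example usage (a minimal sketch; 'g' is assumed to be an already-loaded knowledge
# graph object exposing a 'labels_all' dataframe, and the file paths and argument
# values below are hypothetical):
#   s = interactive_search_wrapper(g, 'user_examples.csv', 'output', 'pkl',
#                                  'shortest_path', 'file', None)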
'''
#To REMOVE
# Wrapper function without input file that is user defining the full mechanism
def interactive_search_wrapper_without_file(g, u, output_dir, kg_type, experiment_paths):
    exists = check_input_existence(output_dir, kg_type, experiment_paths)
    if(exists[0] == 'false'):
        print('Interactive Node Search')
        #Interactively assign node
        n = unique_nodes(u)
        #Added removing duplicates here
        s = search_nodes(n, g, u)
        create_input_file(s, output_dir, kg_type, experiment_paths)
    else:
        print('Node mapping file exists... moving to embedding creation')
        mapped_file = output_dir + '/' + exists[1]
        s = pd.read_csv(mapped_file, sep="|")
    return(s)
'''