# Improving_ML_with_Graph_Algorithm.py
#
# Trains a random-forest classifier that predicts `is_alive` from character
# attributes, then adds PageRank and closeness-centrality scores computed
# through nxneo4j as extra features and retrains to compare the metrics.
import csv
import os
import warnings

warnings.filterwarnings('ignore')

import IPython.display
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import nxneo4j as netneo
import pandas as pd
from neo4j import GraphDatabase
from py2neo import Graph, Node, Relationship
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split
# Neo4j connection settings.
# Each value can be overridden through an environment variable so that real
# credentials do not have to live in the source file; the fallbacks are the
# values this script has always used.
password = os.environ.get('NEO4J_PASSWORD', '1234')
user = os.environ.get('NEO4J_USER', 'neo4j')
uri = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
driver = GraphDatabase.driver(uri=uri, auth=(user, password))

# Location of the character CSV. Set GOT_CSV_PATH to run the script on a
# machine where the default (author-specific) path does not exist.
my_got = os.environ.get(
    'GOT_CSV_PATH', "C:\\Users\\Esrat Maria\\Desktop\\my_got.csv")
dataset = pd.read_csv(my_got)
# print(dataset.head())

# Baseline model: trained on the CSV attributes only.
# Any row with a missing value is dropped before modelling.
data = dataset.dropna()

# Select the predictor columns and pass them through pd.get_dummies in one
# step; the target is the `is_alive` column.
X = pd.get_dummies(data[[
    'allegiances', 'nobility', 'has_dead_rels', 'culture', 'house', 'gender',
    'has_Allegiance', 'mother', 'father', 'spouse', 'heir', 'data_poor',
]])
y = data['is_alive']

# Hold out 33% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33)

# fit() returns the estimator itself, so construction and training can chain.
classifier = RandomForestClassifier(
    criterion='entropy',
    max_depth=8,
    n_estimators=50,
    random_state=42,
).fit(X_train, y_train)

y_pred = classifier.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
# Trying to make a better model by using graph algorithms.
#
# Reference only (disabled): a Cypher import that would create Person and
# House nodes from the CSV and link them with a belongs_to relationship.
# query = """
# LOAD CSV WITH HEADERS FROM 'file:///my_got.csv' AS line
# WITH line
# WHERE line.house IS NOT NULL
# MERGE(person:Person {Name: line.name})
# MERGE(house:House {Name: line.house})
# MERGE(person) - [:belongs_to] -> (house)
# """
# _to_neo4jDB = graph.run(query)
print("-------------------------------------------")

# Build the nxneo4j graph wrapper on top of the Neo4j driver, clear what it
# holds, and load the library's GoT data set (presumably its bundled Game of
# Thrones sample graph -- see the nxneo4j docs).
G = netneo.Graph(driver)
G.delete_all()
G.load_got()

# Point the wrapper at the loaded data: nodes are labelled `Character`, are
# identified by `name`, and every relationship type is considered.
for _attr, _value in (
        ('identifier_property', 'name'),
        ('relationship_type', '*'),
        ('node_label', 'Character'),
):
    setattr(G, _attr, _value)
# Graph Algorithm 1: PageRank -- the most influential characters.
#
# Reference only (disabled): the equivalent Graph Data Science Cypher call.
# query = '''
# CALL gds.pageRank.stream('prGraph')
# YIELD nodeId, score
# RETURN gds.util.asNode(nodeId).Name AS name, score as pageRank
# ORDER BY pageRank DESC
# limit 1962
# '''
# _to_neo4jDB = graph.run(query).to_data_frame()
pagerank = netneo.pagerank(G)

# Show the five highest-scoring characters.
sorted_pagerank = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)
for character, score in sorted_pagerank[:5]:
    print(character, score)
print("--------------------------------------------------------------")

# One row per character so the scores can be joined onto the CSV by name.
_to_df = pd.DataFrame(list(pagerank.items()), columns=['name', 'pagerank'])
print(_to_df.head())
print("--------------------------------------------------------------")

# Left join keeps every CSV row; a character with no PageRank entry gets NaN
# in the `pagerank` column.
merge_pagerank_data_with_csv = data.merge(_to_df, on='name', how='left')
print(merge_pagerank_data_with_csv.head())
print("--------------------------------------------------------------")
# Graph Algorithm 2: closeness centrality.
closeness_centrality = netneo.closeness_centrality(G)

# Turn the {name: score} mapping into a two-column frame.
_cc_rows = list(closeness_centrality.items())
_cc_to_df = pd.DataFrame(_cc_rows, columns=['name', 'closeness centrality'])
print(_cc_to_df.head())
print("--------------------------------------------------------------")

# Append the centrality column to the frame that already carries PageRank,
# again keeping every row of the left-hand side.
merge_cc_data_with_csv = pd.merge(
    merge_pagerank_data_with_csv,
    _cc_to_df,
    how='left',
    on='name',
)
print(merge_cc_data_with_csv.head())
print("--------------------------------------------------------------")
# Re-run the model with the two graph features added, to see whether the
# metrics improve over the baseline.

# After the left joins, characters without a graph score have NaN in the new
# columns; replace those with the column mean. numeric_only=True restricts
# the mean to numeric columns -- the frame also carries non-numeric columns
# (e.g. `name`), on which DataFrame.mean() raises TypeError in pandas >= 2.0.
merge_cc_data_with_csv = merge_cc_data_with_csv.fillna(
    merge_cc_data_with_csv.mean(numeric_only=True))

X = merge_cc_data_with_csv[[
    'allegiances', 'nobility', 'has_dead_rels', 'culture', 'house', 'gender',
    'has_Allegiance', 'mother', 'father', 'spouse', 'heir', 'data_poor',
    'pagerank', 'closeness centrality',
]]
X = pd.get_dummies(X)
# Take the target from the same merged frame as X so features and labels are
# row-aligned by construction (merge() gives the result a fresh index that no
# longer matches `data`).
y = merge_cc_data_with_csv['is_alive']

# Same split settings as the baseline (test_size=0.33, random_state=42) so
# both models are scored on the same held-out rows and the metrics are
# directly comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)
classifier = RandomForestClassifier(
    n_estimators=50, criterion='entropy', random_state=42, max_depth=8)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("--------------------------------------------------------------")