Skip to content

Commit

Permalink
Merge pull request #87 from GiulioRossetti/markov-clustering
Browse files Browse the repository at this point in the history
⬆️ markov clustering update
  • Loading branch information
GiulioRossetti authored Jan 29, 2020
2 parents fcb9e5b + 0f5b076 commit 6358e44
Show file tree
Hide file tree
Showing 9 changed files with 61 additions and 63 deletions.
45 changes: 44 additions & 1 deletion cdlib/algorithms/crisp_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,15 @@
from cdlib.algorithms.internal.GDMP2_nx import GDMP2
from cdlib.algorithms.internal.AGDL import Agdl
from cdlib.algorithms.internal.FuzzyCom import fuzzy_comm
from cdlib.algorithms.internal.Markov import markov
import networkx as nx

from cdlib.utils import convert_graph_formats, __from_nx_to_graph_tool, affiliations2nodesets, nx_node_integer_mapping

__all__ = ["louvain", "leiden", "rb_pots", "rber_pots", "cpm", "significance_communities", "surprise_communities",
"greedy_modularity", "der", "label_propagation", "async_fluid", "infomap", "walktrap", "girvan_newman", "em",
"scan", "gdmp2", "spinglass", "eigenvector", "agdl", "frc_fgsn", "sbm_dl", "sbm_dl_nested"]
"scan", "gdmp2", "spinglass", "eigenvector", "agdl", "frc_fgsn", "sbm_dl", "sbm_dl_nested",
"markov_clustering"]


def girvan_newman(g, level):
Expand Down Expand Up @@ -965,3 +967,44 @@ def sbm_dl_nested(g, B_min=None,B_max=None, deg_corr=True, **kwargs):
coms = affiliations2nodesets(affiliations)
coms = [list(v) for k,v in coms.items()]
return NodeClustering(coms, g, "SBM_nested", method_parameters={"B_min": B_min, "B_max": B_max, "deg_corr": deg_corr})


def markov_clustering(g, max_loop=1000):
    """
    The Markov clustering algorithm (MCL) is based on simulation of (stochastic) flow in graphs.
    The MCL algorithm finds cluster structure in graphs by a mathematical bootstrapping procedure. The process deterministically computes (the probabilities of) random walks through the graph, and uses two operators transforming one set of probabilities into another. It does so using the language of stochastic matrices (also called Markov matrices) which capture the mathematical concept of random walks on a graph.
    The MCL algorithm simulates random walks within a graph by alternation of two operators called expansion and inflation.

    :param g: a networkx/igraph object
    :param max_loop: maximum number of iterations, default 1000
    :return: NodeClustering object

    :Example:

    >>> from cdlib import algorithms
    >>> import networkx as nx
    >>> G = nx.karate_club_graph()
    >>> coms = algorithms.markov_clustering(G, max_loop=1000)

    :References:

    Enright, Anton J., Stijn Van Dongen, and Christos A. Ouzounis. `An efficient algorithm for large-scale detection of protein families. <https://www.ncbi.nlm.nih.gov/pubmed/11917018/>`_ Nucleic acids research 30.7 (2002): 1575-1584.

    .. note:: Reference implementation: https://github.com/HarshHarwani/markov-clustering-for-graphs
    """

    g = convert_graph_formats(g, nx.Graph)
    # Remap nodes to consecutive integers; ``maps`` translates them back
    # (None when the graph was already integer-labeled).
    g, maps = nx_node_integer_mapping(g)

    # Keep the raw result in its own variable: rebinding ``communities``
    # before iterating it would silently discard every community.
    coms = markov(g, max_loop)

    if maps is not None:
        # Translate each integer node id back to its original label.
        # Note: ``maps[n]`` is a single label; do NOT wrap it in tuple(),
        # which would split a string label into characters.
        communities = []
        for c in coms:
            communities.append([maps[n] for n in c])

        # Restore the original labels on the graph in place (copy=False).
        nx.relabel_nodes(g, maps, False)
    else:
        communities = [list(c) for c in coms]

    return NodeClustering(communities, g, "Markov Clustering", method_parameters={"max_loop": max_loop})
49 changes: 2 additions & 47 deletions cdlib/algorithms/edge_clustering.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from cdlib import EdgeClustering
from collections import defaultdict
import networkx as nx
from cdlib.algorithms.internal.Markov import markov
from cdlib.utils import convert_graph_formats, nx_node_integer_mapping
from cdlib.utils import convert_graph_formats
from cdlib.algorithms.internal.HLC import HLC, HLC_read_edge_list_unweighted

__all__ = ["hierarchical_link_community", "markov_clustering"]
__all__ = ["hierarchical_link_community"]


def hierarchical_link_community(g):
Expand Down Expand Up @@ -42,47 +41,3 @@ def hierarchical_link_community(g):

coms = [list(c) for c in coms.values()]
return EdgeClustering(coms, g, "HLC", method_parameters={})


def markov_clustering(g, max_loop=1000):
    """
    The Markov clustering algorithm (MCL) is based on simulation of (stochastic) flow in graphs.
    The MCL algorithm finds cluster structure in graphs by a mathematical bootstrapping procedure. The process deterministically computes (the probabilities of) random walks through the graph, and uses two operators transforming one set of probabilities into another. It does so using the language of stochastic matrices (also called Markov matrices) which capture the mathematical concept of random walks on a graph.
    The MCL algorithm simulates random walks within a graph by alternation of two operators called expansion and inflation.

    :param g: a networkx/igraph object
    :param max_loop: maximum number of iterations, default 1000
    :return: EdgeClustering object

    :Example:

    >>> from cdlib import algorithms
    >>> import networkx as nx
    >>> G = nx.karate_club_graph()
    >>> coms = algorithms.markov_clustering(G, max_loop=1000)

    :References:

    Enright, Anton J., Stijn Van Dongen, and Christos A. Ouzounis. `An efficient algorithm for large-scale detection of protein families. <https://www.ncbi.nlm.nih.gov/pubmed/11917018/>`_ Nucleic acids research 30.7 (2002): 1575-1584.

    .. note:: Reference implementation: https://github.com/HarshHarwani/markov-clustering-for-graphs
    """

    g = convert_graph_formats(g, nx.Graph)
    # Nodes are remapped to consecutive integers; ``maps`` (possibly None)
    # translates the integer ids back to the original labels.
    g, maps = nx_node_integer_mapping(g)

    raw_clusters = markov(g, max_loop)

    if maps is None:
        communities = [list(cluster) for cluster in raw_clusters]
    else:
        # Each cluster is a collection of edges; translate every endpoint
        # back to its original label and keep edges as tuples.
        communities = [
            [tuple(maps[endpoint] for endpoint in edge) for edge in cluster]
            for cluster in raw_clusters
        ]
        # Put the original labels back on the graph in place (copy=False).
        nx.relabel_nodes(g, maps, False)

    return EdgeClustering(communities, g, "Markov Clustering", method_parameters={"max_loop": max_loop})
2 changes: 1 addition & 1 deletion cdlib/algorithms/internal/CONGA.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ def create_clique(G, v, pb):

# Can use ints instead: (dtype=int). Only works if we use matrix_min
# instead of mat_min.
clique = np.matrix(np.zeros((n, n)))
clique = np.zeros((n, n))

for uw, score in pb.items():

Expand Down
16 changes: 8 additions & 8 deletions cdlib/algorithms/internal/CONGO.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ def congo(OG, h=2):
the length of the longest shortest path that Congo is to consider.
"""

logging.basicConfig(filename='congo.log',level=logging.DEBUG)
# logging.basicConfig(filename='congo.log',level=logging.DEBUG)
G = OG.copy()

# Just in case the original graph is disconnected
Expand All @@ -286,7 +286,7 @@ def congo(OG, h=2):
allCovers = {nClusters : ig.VertexCover(OG)}
while G.es:

logging.info("%d edges remaining", len(G.es))
# logging.info("%d edges remaining", len(G.es))
# get the edge with the max edge betweenness, and its betweenness.
maxEdge, maxEb = max(enumerate(G.es['eb']), key=operator.itemgetter(1))
G.vs['vb'] = G.betweenness(cutoff=h)
Expand All @@ -300,7 +300,7 @@ def congo(OG, h=2):
# TODO check if I need to multiply by 2
vInteresting = [i for i, b in enumerate(G.vs['vb']) if 2 * b > maxEb]

logging.info("Vertices to examine: %s", vInteresting)
# logging.info("Vertices to examine: %s", vInteresting)
splitInstr = max_split_betweenness(G, vInteresting)

# split if max split betweenness > max edge betweenness
Expand Down Expand Up @@ -330,7 +330,7 @@ def delete_edge(G, edge, h):

tup = G.es[edge].tuple

logging.info("Deleted: %s", tup)
# logging.info("Deleted: %s", tup)

neighborhood = get_neighborhood_edge(G, tup, h)
# subtracts local betweennesses in the region, as discussed
Expand Down Expand Up @@ -401,7 +401,7 @@ def split_vertex(G, vToSplit, instr, h):
G.delete_edges(toDelete)
neighborhood.append(new_index)
fix_betweennesses(G)
logging.info("split: %d, %s", vToSplit, instr)
# logging.info("split: %d, %s", vToSplit, instr)
do_local_betweenness(G, neighborhood, h, operator.pos)
# check if the two new vertices are disconnected.
return check_for_split(G, (vToSplit, new_index))
Expand Down Expand Up @@ -455,7 +455,7 @@ def do_initial_betweenness(G, h):
# Counter for normalizing scores
pathCounts = Counter()
for ver in G.vs:
logging.info("initializing betweennesses for %d", ver.index)
# logging.info("initializing betweennesses for %d", ver.index)
neighborhood = get_neighborhood_vertex(G, ver, h)
neighborhood.remove(ver.index)
#for i, v in enumerate(neighborhood):
Expand All @@ -467,7 +467,7 @@ def do_initial_betweenness(G, h):
for path in all_pairs_shortest_paths:
pathCounts[(path[0], path[-1])] += 1

logging.info("updating all betweenness attributes...")
# logging.info("updating all betweenness attributes...")
for path in all_pairs_shortest_paths:
if len(path) <= h + 1:
update_betweenness(G, path, pathCounts[(path[0], path[-1])], operator.pos)
Expand Down Expand Up @@ -584,7 +584,7 @@ def create_clique(G, v, pb):

# Can use ints instead: (dtype=int). Only works if we use matrix_min
# instead of mat_min.
clique = np.matrix(np.zeros((n, n)))
clique = np.zeros((n, n))
for uw, score in pb.items():
clique[mapping[uw[0]], mapping[uw[1]]] = score
clique[mapping[uw[1]], mapping[uw[0]]] = score
Expand Down
2 changes: 1 addition & 1 deletion cdlib/algorithms/internal/LEMON.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def __cal_conductance(G, cluster):
subgraph = temp[:, cluster]
cutsize = temp.sum() - subgraph.sum()
denominator = min(temp.sum(), G.sum() - temp.sum())
conductance = cutsize / denominator
conductance = cutsize / denominator if denominator > 0 else 1

return conductance

Expand Down
4 changes: 2 additions & 2 deletions cdlib/algorithms/internal/Markov.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def markov(graph, max_loop=1000):
for part in cls.values():
com = []
for eid in part:
com.append(tuple(map(int, edges[eid])))
communities.append(com)
com.extend(list(map(int, edges[eid])))
communities.append(list(set(com)))

return communities
4 changes: 2 additions & 2 deletions cdlib/test/test_community_discovery_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ def test_markov_clustering(self):
if len(communities.communities) > 0:
self.assertEqual(type(communities.communities[0]), list)
if len(communities.communities[0]) > 0:
self.assertEqual(type(communities.communities[0][0]), tuple)
self.assertEqual(type(communities.communities[0][0]), str)

g = nx.karate_club_graph()

Expand All @@ -290,7 +290,7 @@ def test_markov_clustering(self):
if len(communities.communities) > 0:
self.assertEqual(type(communities.communities[0]), list)
if len(communities.communities[0]) > 0:
self.assertEqual(type(communities.communities[0][0]), tuple)
self.assertEqual(type(communities.communities[0][0]), int)

def test_bigClam(self):
g = get_string_graph()
Expand Down
1 change: 0 additions & 1 deletion docs/reference/cd_algorithms/edge_clustering.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,3 @@ They return as result a ``EdgeClustering`` object instance.
:toctree: algs/

hierarchical_link_community
markov_clustering
1 change: 1 addition & 0 deletions docs/reference/cd_algorithms/node_clustering.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ Methods in this subclass returns as result a ``NodeClustering`` object instance.
label_propagation
leiden
louvain
markov_clustering
rber_pots
rb_pots
scan
Expand Down

0 comments on commit 6358e44

Please sign in to comment.