Skip to content

Commit

Permalink
Merge pull request #87 from GiulioRossetti/markov-clustering
Browse files Browse the repository at this point in the history
⬆️ markov clustering update
  • Loading branch information
GiulioRossetti authored Jan 29, 2020
2 parents fcb9e5b + 0f5b076 commit 6358e44
Show file tree
Hide file tree
Showing 9 changed files with 61 additions and 63 deletions.
45 changes: 44 additions & 1 deletion cdlib/algorithms/crisp_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,15 @@
from cdlib.algorithms.internal.GDMP2_nx import GDMP2
from cdlib.algorithms.internal.AGDL import Agdl
from cdlib.algorithms.internal.FuzzyCom import fuzzy_comm
from cdlib.algorithms.internal.Markov import markov
import networkx as nx

from cdlib.utils import convert_graph_formats, __from_nx_to_graph_tool, affiliations2nodesets, nx_node_integer_mapping

__all__ = ["louvain", "leiden", "rb_pots", "rber_pots", "cpm", "significance_communities", "surprise_communities",
"greedy_modularity", "der", "label_propagation", "async_fluid", "infomap", "walktrap", "girvan_newman", "em",
"scan", "gdmp2", "spinglass", "eigenvector", "agdl", "frc_fgsn", "sbm_dl", "sbm_dl_nested"]
"scan", "gdmp2", "spinglass", "eigenvector", "agdl", "frc_fgsn", "sbm_dl", "sbm_dl_nested",
"markov_clustering"]


def girvan_newman(g, level):
Expand Down Expand Up @@ -965,3 +967,44 @@ def sbm_dl_nested(g, B_min=None,B_max=None, deg_corr=True, **kwargs):
coms = affiliations2nodesets(affiliations)
coms = [list(v) for k,v in coms.items()]
return NodeClustering(coms, g, "SBM_nested", method_parameters={"B_min": B_min, "B_max": B_max, "deg_corr": deg_corr})


def markov_clustering(g, max_loop=1000):
    """
    The Markov clustering algorithm (MCL) is based on simulation of (stochastic) flow in graphs.
    The MCL algorithm finds cluster structure in graphs by a mathematical bootstrapping procedure. The process deterministically computes (the probabilities of) random walks through the graph, and uses two operators transforming one set of probabilities into another. It does so using the language of stochastic matrices (also called Markov matrices) which capture the mathematical concept of random walks on a graph.
    The MCL algorithm simulates random walks within a graph by alternation of two operators called expansion and inflation.

    :param g: a networkx/igraph object
    :param max_loop: maximum number of iterations, default 1000
    :return: NodeClustering object

    :Example:

    >>> from cdlib import algorithms
    >>> import networkx as nx
    >>> G = nx.karate_club_graph()
    >>> coms = algorithms.markov_clustering(G, max_loop=1000)

    :References:

    Enright, Anton J., Stijn Van Dongen, and Christos A. Ouzounis. `An efficient algorithm for large-scale detection of protein families. <https://www.ncbi.nlm.nih.gov/pubmed/11917018/>`_ Nucleic acids research 30.7 (2002): 1575-1584.

    .. note:: Reference implementation: https://github.com/HarshHarwani/markov-clustering-for-graphs
    """

    g = convert_graph_formats(g, nx.Graph)
    # Remap nodes to consecutive integers; ``maps`` translates them back
    # (None when the graph was already integer-labeled).
    g, maps = nx_node_integer_mapping(g)

    # Keep the raw result in its own variable: rebinding ``communities``
    # before iterating it would silently discard every community.
    coms = markov(g, max_loop)

    if maps is not None:
        # Translate each integer node id back to its original label.
        # Note: ``maps[n]`` is a single label; do NOT wrap it in tuple(),
        # which would split a string label into characters.
        communities = []
        for c in coms:
            communities.append([maps[n] for n in c])

        # Restore the original labels on the graph in place (copy=False).
        nx.relabel_nodes(g, maps, False)
    else:
        communities = [list(c) for c in coms]

    return NodeClustering(communities, g, "Markov Clustering", method_parameters={"max_loop": max_loop})
49 changes: 2 additions & 47 deletions cdlib/algorithms/edge_clustering.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from cdlib import EdgeClustering
from collections import defaultdict
import networkx as nx
from cdlib.algorithms.internal.Markov import markov
from cdlib.utils import convert_graph_formats, nx_node_integer_mapping
from cdlib.utils import convert_graph_formats
from cdlib.algorithms.internal.HLC import HLC, HLC_read_edge_list_unweighted

__all__ = ["hierarchical_link_community", "markov_clustering"]
__all__ = ["hierarchical_link_community"]


def hierarchical_link_community(g):
Expand Down Expand Up @@ -42,47 +41,3 @@ def hierarchical_link_community(g):

coms = [list(c) for c in coms.values()]
return EdgeClustering(coms, g, "HLC", method_parameters={})


def markov_clustering(g, max_loop=1000):
    """
    The Markov clustering algorithm (MCL) is based on simulation of (stochastic) flow in graphs.
    The MCL algorithm finds cluster structure in graphs by a mathematical bootstrapping procedure. The process deterministically computes (the probabilities of) random walks through the graph, and uses two operators transforming one set of probabilities into another. It does so using the language of stochastic matrices (also called Markov matrices) which capture the mathematical concept of random walks on a graph.
    The MCL algorithm simulates random walks within a graph by alternation of two operators called expansion and inflation.

    :param g: a networkx/igraph object
    :param max_loop: maximum number of iterations, default 1000
    :return: EdgeClustering object

    :Example:

    >>> from cdlib import algorithms
    >>> import networkx as nx
    >>> G = nx.karate_club_graph()
    >>> coms = algorithms.markov_clustering(G, max_loop=1000)

    :References:

    Enright, Anton J., Stijn Van Dongen, and Christos A. Ouzounis. `An efficient algorithm for large-scale detection of protein families. <https://www.ncbi.nlm.nih.gov/pubmed/11917018/>`_ Nucleic acids research 30.7 (2002): 1575-1584.

    .. note:: Reference implementation: https://github.com/HarshHarwani/markov-clustering-for-graphs
    """

    g = convert_graph_formats(g, nx.Graph)
    # Nodes are remapped to consecutive integers; ``maps`` (possibly None)
    # translates the integer ids back to the original labels.
    g, maps = nx_node_integer_mapping(g)

    raw_clusters = markov(g, max_loop)

    if maps is None:
        communities = [list(cluster) for cluster in raw_clusters]
    else:
        # Each cluster is a collection of edges; translate every endpoint
        # back to its original label and keep edges as tuples.
        communities = [
            [tuple(maps[endpoint] for endpoint in edge) for edge in cluster]
            for cluster in raw_clusters
        ]
        # Put the original labels back on the graph in place (copy=False).
        nx.relabel_nodes(g, maps, False)

    return EdgeClustering(communities, g, "Markov Clustering", method_parameters={"max_loop": max_loop})
2 changes: 1 addition & 1 deletion cdlib/algorithms/internal/CONGA.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ def create_clique(G, v, pb):

# Can use ints instead: (dtype=int). Only works if we use matrix_min
# instead of mat_min.
clique = np.matrix(np.zeros((n, n)))
clique = np.zeros((n, n))

for uw, score in pb.items():

Expand Down
16 changes: 8 additions & 8 deletions cdlib/algorithms/internal/CONGO.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ def congo(OG, h=2):
the length of the longest shortest path that Congo is to consider.
"""

logging.basicConfig(filename='congo.log',level=logging.DEBUG)
# logging.basicConfig(filename='congo.log',level=logging.DEBUG)
G = OG.copy()

# Just in case the original graph is disconnected
Expand All @@ -286,7 +286,7 @@ def congo(OG, h=2):
allCovers = {nClusters : ig.VertexCover(OG)}
while G.es:

logging.info("%d edges remaining", len(G.es))
# logging.info("%d edges remaining", len(G.es))
# get the edge with the max edge betweenness, and its betweenness.
maxEdge, maxEb = max(enumerate(G.es['eb']), key=operator.itemgetter(1))
G.vs['vb'] = G.betweenness(cutoff=h)
Expand All @@ -300,7 +300,7 @@ def congo(OG, h=2):
# TODO check if I need to multiply by 2
vInteresting = [i for i, b in enumerate(G.vs['vb']) if 2 * b > maxEb]

logging.info("Vertices to examine: %s", vInteresting)
# logging.info("Vertices to examine: %s", vInteresting)
splitInstr = max_split_betweenness(G, vInteresting)

# split if max split betweenness > max edge betweenness
Expand Down Expand Up @@ -330,7 +330,7 @@ def delete_edge(G, edge, h):

tup = G.es[edge].tuple

logging.info("Deleted: %s", tup)
# logging.info("Deleted: %s", tup)

neighborhood = get_neighborhood_edge(G, tup, h)
# subtracts local betweennesses in the region, as discussed
Expand Down Expand Up @@ -401,7 +401,7 @@ def split_vertex(G, vToSplit, instr, h):
G.delete_edges(toDelete)
neighborhood.append(new_index)
fix_betweennesses(G)
logging.info("split: %d, %s", vToSplit, instr)
# logging.info("split: %d, %s", vToSplit, instr)
do_local_betweenness(G, neighborhood, h, operator.pos)
# check if the two new vertices are disconnected.
return check_for_split(G, (vToSplit, new_index))
Expand Down Expand Up @@ -455,7 +455,7 @@ def do_initial_betweenness(G, h):
# Counter for normalizing scores
pathCounts = Counter()
for ver in G.vs:
logging.info("initializing betweennesses for %d", ver.index)
# logging.info("initializing betweennesses for %d", ver.index)
neighborhood = get_neighborhood_vertex(G, ver, h)
neighborhood.remove(ver.index)
#for i, v in enumerate(neighborhood):
Expand All @@ -467,7 +467,7 @@ def do_initial_betweenness(G, h):
for path in all_pairs_shortest_paths:
pathCounts[(path[0], path[-1])] += 1

logging.info("updating all betweenness attributes...")
# logging.info("updating all betweenness attributes...")
for path in all_pairs_shortest_paths:
if len(path) <= h + 1:
update_betweenness(G, path, pathCounts[(path[0], path[-1])], operator.pos)
Expand Down Expand Up @@ -584,7 +584,7 @@ def create_clique(G, v, pb):

# Can use ints instead: (dtype=int). Only works if we use matrix_min
# instead of mat_min.
clique = np.matrix(np.zeros((n, n)))
clique = np.zeros((n, n))
for uw, score in pb.items():
clique[mapping[uw[0]], mapping[uw[1]]] = score
clique[mapping[uw[1]], mapping[uw[0]]] = score
Expand Down
2 changes: 1 addition & 1 deletion cdlib/algorithms/internal/LEMON.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def __cal_conductance(G, cluster):
subgraph = temp[:, cluster]
cutsize = temp.sum() - subgraph.sum()
denominator = min(temp.sum(), G.sum() - temp.sum())
conductance = cutsize / denominator
conductance = cutsize / denominator if denominator > 0 else 1

return conductance

Expand Down
4 changes: 2 additions & 2 deletions cdlib/algorithms/internal/Markov.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def markov(graph, max_loop=1000):
for part in cls.values():
com = []
for eid in part:
com.append(tuple(map(int, edges[eid])))
communities.append(com)
com.extend(list(map(int, edges[eid])))
communities.append(list(set(com)))

return communities
4 changes: 2 additions & 2 deletions cdlib/test/test_community_discovery_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ def test_markov_clustering(self):
if len(communities.communities) > 0:
self.assertEqual(type(communities.communities[0]), list)
if len(communities.communities[0]) > 0:
self.assertEqual(type(communities.communities[0][0]), tuple)
self.assertEqual(type(communities.communities[0][0]), str)

g = nx.karate_club_graph()

Expand All @@ -290,7 +290,7 @@ def test_markov_clustering(self):
if len(communities.communities) > 0:
self.assertEqual(type(communities.communities[0]), list)
if len(communities.communities[0]) > 0:
self.assertEqual(type(communities.communities[0][0]), tuple)
self.assertEqual(type(communities.communities[0][0]), int)

def test_bigClam(self):
g = get_string_graph()
Expand Down
1 change: 0 additions & 1 deletion docs/reference/cd_algorithms/edge_clustering.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,3 @@ They return as result a ``EdgeClustering`` object instance.
:toctree: algs/

hierarchical_link_community
markov_clustering
1 change: 1 addition & 0 deletions docs/reference/cd_algorithms/node_clustering.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ Methods in this subclass returns as result a ``NodeClustering`` object instance.
label_propagation
leiden
louvain
markov_clustering
rber_pots
rb_pots
scan
Expand Down

0 comments on commit 6358e44

Please sign in to comment.