Merge pull request #88 from GiulioRossetti/karate_club_integration

Methods integration
GiulioRossetti · Jan 30, 2020 · 1a54ecd · 1a54ecd
2 parents 6358e44 + 472ba90
commit 1a54ecd
Show file tree

Hide file tree

Showing 18 changed files with 409 additions and 37 deletions.
diff --git a/README.md b/README.md
@@ -6,14 +6,15 @@
 [![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/GiulioRossetti/nclib.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/GiulioRossetti/nclib/context:python)
 [![pyversions](https://img.shields.io/pypi/pyversions/cdlib.svg)](https://badge.fury.io/py/cdlib)
 [![PyPI version](https://badge.fury.io/py/cdlib.svg)](https://badge.fury.io/py/cdlib)
+[![PyPI download month](https://img.shields.io/pypi/dm/cdlib.svg?color=blue&style=plastic)](https://pypi.python.org/pypi/cdlib/)
 [![DOI](https://zenodo.org/badge/159944561.svg)](https://zenodo.org/badge/latestdoi/159944561)
 [![FOSSA Status](https://app.fossa.io/api/projects/git%2Bgithub.com%2FGiulioRossetti%2Fcdlib.svg?type=shield)](https://app.fossa.io/projects/git%2Bgithub.com%2FGiulioRossetti%2Fcdlib?ref=badge_shield)
 
 
-``CDlib`` provides implementations of several community discovery algorithms.
-Moreover, it implements a wide set of partition evaluation measures as well as predefined visualization facilities.
+``CDlib`` is a meta-library for community discovery in complex networks: it implements algorithms and clustering fitness functions.
+Moreover, it provides predefined visualization facilities.
 
-``CDlib`` is designed around the ``networkx`` python library: however, when needed, it takes care to authomatically convert (from and to) ``igraph`` object so to provide an abstraction on specific algorithm implementations to the final user.
+``CDlib`` is designed around the ``networkx`` python library: however, when needed, it takes care to automatically convert (from and to) ``igraph`` object so to provide an abstraction on specific algorithm implementations to the final user.
 
 ``CDlib`` provides standardized input/output facilities for several Community Discovery algorithms: whenever possible, to guarantee literature-coherent results, implementations of CD algorithms are inherited from their original projects (see for instance [Infomap](https://pypi.org/project/infomap/),
 [Louvain](https://github.com/taynaud/python-louvain), [Leiden](https://github.com/vtraag/leidenalg)).

diff --git a/cdlib/__init__.py b/cdlib/__init__.py
@@ -2,3 +2,4 @@
 from cdlib.classes.edge_clustering import EdgeClustering
 from cdlib.classes.fuzzy_node_clustering import FuzzyNodeClustering
 from cdlib.classes.attr_node_clustering import AttrNodeClustering
+from cdlib.classes.bipartite_node_clustering import BiNodeClustering
diff --git a/cdlib/algorithms/__init__.py b/cdlib/algorithms/__init__.py
@@ -2,3 +2,4 @@
 from .crisp_partition import *
 from .overlapping_partition import *
 from .attribute_clustering import *
+from .biparitte_clustering import *
diff --git a/cdlib/algorithms/biparitte_clustering.py b/cdlib/algorithms/biparitte_clustering.py
@@ -0,0 +1,41 @@
+from BiMLPA import BiMLPA_SqrtDeg, relabeling, output_community
+from cdlib import BiNodeClustering
+
+import networkx as nx
+from cdlib.utils import convert_graph_formats
+
+
+__all__ = ['bimlpa']
+
+
+def bimlpa(g, theta=0.3, lambd=7):
+    """
+    BiMLPA is designed to detect the many-to-many correspondence community in bipartite networks using multi-label propagation algorithm.
+
+    .. note:: The input is presumably expected to be a bipartite graph whose nodes carry the ``bipartite`` attribute set by the networkx bipartite generators -- confirm against the BiMLPA reference implementation.
+
+    :param g: a networkx/igraph object
+    :param theta: Label weights threshold. Default 0.3.
+    :param lambd: The max number of labels. Default 7.
+    :return: BiNodeClustering object
+
+
+    :Example:
+
+    >>> from cdlib import algorithms
+    >>> import networkx as nx
+    >>> G = nx.algorithms.bipartite.random_graph(100, 20, 0.1)
+    >>> coms = algorithms.bimlpa(G)
+
+    :References:
+
+    Taguchi, Hibiki, Tsuyoshi Murata, and Xin Liu. "BiMLPA: Community Detection in Bipartite Networks by Multi-Label Propagation." International Conference on Network Science. Springer, Cham, 2020.
+
+    .. note:: Reference implementation: https://github.com/hbkt/BiMLPA
+    """
+    g = convert_graph_formats(g, nx.Graph)
+
+    # Named "model" so the local does not shadow this function's own name.
+    model = BiMLPA_SqrtDeg(g, theta, lambd)
+    model.start()
+
+    # Both helpers receive only the graph object that was handed to the model.
+    relabeling(g)
+    top_coms, bottom_coms = output_community(g)
+
+    return BiNodeClustering(top_coms, bottom_coms, g, "BiMLPA", method_parameters={"theta": theta, "lambd": lambd})
diff --git a/cdlib/algorithms/crisp_partition.py b/cdlib/algorithms/crisp_partition.py
@@ -32,14 +32,15 @@
 from cdlib.algorithms.internal.AGDL import Agdl
 from cdlib.algorithms.internal.FuzzyCom import fuzzy_comm
 from cdlib.algorithms.internal.Markov import markov
+from karateclub import EdMot
 import networkx as nx
 
 from cdlib.utils import convert_graph_formats, __from_nx_to_graph_tool, affiliations2nodesets, nx_node_integer_mapping
 
 __all__ = ["louvain", "leiden", "rb_pots", "rber_pots", "cpm", "significance_communities", "surprise_communities",
            "greedy_modularity", "der", "label_propagation", "async_fluid", "infomap", "walktrap", "girvan_newman", "em",
            "scan", "gdmp2", "spinglass", "eigenvector", "agdl", "frc_fgsn", "sbm_dl", "sbm_dl_nested",
-           "markov_clustering"]
+           "markov_clustering", "edmot"]
 
 
 def girvan_newman(g, level):
@@ -977,7 +978,7 @@ def markov_clustering(g,  max_loop=1000):
 
     :param g: a networkx/igraph object
     :param max_loop: maximum number of iterations, default 1000
-    :return: EdgeClustering object
+    :return: NodeClustering object
 
     :Example:
 
@@ -1008,3 +1009,42 @@ def markov_clustering(g,  max_loop=1000):
         communities = [list(c) for c in communities]
 
     return NodeClustering(communities, g, "Markov Clustering", method_parameters={"max_loop": max_loop})
+
+
+def edmot(g, component_count=2, cutoff=10):
+    """
+    The algorithm first creates the graph of higher order motifs. This graph is clustered by the Louvain method.
+
+    :param g: a networkx/igraph object
+    :param component_count: Number of extracted motif hypergraph components. Default is 2.
+    :param cutoff: Motif edge cut-off value. Default is 10.
+    :return: NodeClustering object
+
+    :Example:
+
+    >>> from cdlib import algorithms
+    >>> import networkx as nx
+    >>> G = nx.karate_club_graph()
+    >>> coms = algorithms.edmot(G, component_count=2, cutoff=10)
+
+    :References:
+
+    Li, Pei-Zhen, et al. "EdMot: An Edge Enhancement Approach for Motif-aware Community Detection." Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 2019.
+
+    .. note:: Reference implementation: https://karateclub.readthedocs.io/
+    """
+
+    g = convert_graph_formats(g, nx.Graph)
+    # Forward the caller's settings: they are the same values reported in method_parameters below.
+    model = EdMot(component_count=component_count, cutoff=cutoff)
+
+    model.fit(g)
+    members = model.get_memberships()
+
+    # Reshaping the results: group the nodes by the community id assigned to them.
+    # A plain dict with setdefault needs no module-level import.
+    coms_to_node = {}
+    for n, c in members.items():
+        coms_to_node.setdefault(c, []).append(n)
+
+    coms = [list(c) for c in coms_to_node.values()]
+
+    return NodeClustering(coms, g, "EdMot", method_parameters={"component_count": component_count, "cutoff": cutoff})
diff --git a/cdlib/algorithms/internal/CONGO.py b/cdlib/algorithms/internal/CONGO.py
@@ -6,7 +6,7 @@
     ig = None
 import numpy as np
 import operator
-import logging
+# import logging
 
 
 

diff --git a/cdlib/algorithms/overlapping_partition.py b/cdlib/algorithms/overlapping_partition.py
@@ -11,6 +11,7 @@
 from cdlib.algorithms.internal import OSSE
 import networkx as nx
 import numpy as np
+from collections import defaultdict
 from cdlib import NodeClustering
 from cdlib.utils import suppress_stdout, convert_graph_formats, nx_node_integer_mapping
 from cdlib.algorithms.internal.CONGO import Congo_
@@ -20,11 +21,12 @@
 from cdlib.algorithms.internal import LEMON
 from cdlib.algorithms.internal.SLPA_nx import slpa_nx
 from cdlib.algorithms.internal.multicom import MultiCom
-from cdlib.algorithms.internal import BIGCLAM
+from karateclub import DANMF, EgoNetSplitter, NNSED, MNMF, BigClam
 
 
 __all__ = ["ego_networks", "demon", "angel", "node_perception", "overlapping_seed_set_expansion", "kclique", "lfm",
-           "lais2", "congo", "conga", "lemon", "slpa", "multicom", "big_clam"]
+           "lais2", "congo", "conga", "lemon", "slpa", "multicom", "big_clam", "danmf", "egonet_splitter", "nnsed",
+           "nmnf"]
 
 
 def ego_networks(g, level=1):
@@ -547,16 +549,15 @@ def multicom(g, seed_node):
     return NodeClustering(communities, g, "Multicom", method_parameters={"seeds": seed_node}, overlap=True)
 
 
-def big_clam(g, number_communities=5):
+def big_clam(g, dimensions=8, iterations=50, learning_rate=0.005):
     """
     BigClam is an overlapping community detection method that scales to large networks.
-    The model has three main ingredients:
-    1)The node community memberships are represented with a bipartite affiliation network that links nodes of the social network to communities that they belong to.
-    2)People tend to be involved in communities to various degrees. Therefore,  each affiliation edge in the bipartite affiliation network has a nonnegative weight. The higher the node’s weight of the affiliation to the community the more likely is the node to be connected to other members in the community.
-    3)When people share multiple community affiliations, the links between them stem for one dominant reason. This means that for each community a pair of nodes shares we get an independent chance of connecting the nodes. Thus, naturally, the more communities a pair of nodes shares, the higher the probability of being connected.
+    The procedure uses gradient ascent to create an embedding which is used for deciding the node-cluster affiliations.
 
     :param g: a networkx/igraph object
-    :param number_communities: number communities desired, default 5
+    :param dimensions: Number of embedding dimensions. Default 8.
+    :param iterations: Number of training iterations. Default 50.
+    :param learning_rate: Gradient ascent learning rate. Default is 0.005.
     :return: NodeClustering object
 
 
@@ -565,17 +566,197 @@ def big_clam(g, number_communities=5):
     >>> from cdlib import algorithms
     >>> import networkx as nx
     >>> G = nx.karate_club_graph()
-    >>> coms = algorithms.big_clam(G, 2)
+    >>> coms = algorithms.big_clam(G)
 
     :References:
 
-    Yang, J., & Leskovec, J. (2013, February). `Overlapping community detection at scale: a nonnegative matrix factorization approach. <https://dl.acm.org/citation.cfm?id=2433471/>`_ In Proceedings of the sixth ACM international conference on Web search and data mining (pp. 587-596). ACM.
+    Yang, Jaewon, and Jure Leskovec. "Overlapping community detection at scale: a nonnegative matrix factorization approach." Proceedings of the sixth ACM international conference on Web search and data mining. 2013.
 
-    .. note:: Reference implementation: https://github.com/RobRomijnders/bigclam
+    .. note:: Reference implementation: https://karateclub.readthedocs.io/
     """
 
     g = convert_graph_formats(g, nx.Graph)
 
-    communities = BIGCLAM.big_Clam(g, number_communities)
+    model = BigClam(dimensions=dimensions, iterations=iterations, learning_rate=learning_rate)
+    model.fit(g)
+    members = model.get_memberships()
+
+    # Reshaping the results
+    coms_to_node = defaultdict(list)
+    for n, c in members.items():
+        coms_to_node[c].append(n)
+
+    coms = [list(c) for c in coms_to_node.values()]
+
+    return NodeClustering(coms, g, "BigClam", method_parameters={"dimensions": dimensions, "iterations": iterations,
+                                                                 "learning_rate": learning_rate}, overlap=True)
+
+
+def danmf(g, layers=(32, 8), pre_iterations=100, iterations=100, seed=42, lamb=0.01):
+    """
+    The procedure uses telescopic non-negative matrix factorization in order to learn a cluster membership distribution over nodes. The method can be used in an overlapping and non-overlapping way.
+
+    :param g: a networkx/igraph object
+    :param layers: Autoencoder layer sizes in a sequence of integers. Default (32, 8).
+    :param pre_iterations: Number of pre-training epochs. Default 100.
+    :param iterations: Number of training epochs. Default 100.
+    :param seed: Random seed for weight initializations. Default 42.
+    :param lamb: Regularization parameter. Default 0.01.
+    :return: NodeClustering object
+
+
+    :Example:
+
+    >>> from cdlib import algorithms
+    >>> import networkx as nx
+    >>> G = nx.karate_club_graph()
+    >>> coms = algorithms.danmf(G)
+
+    :References:
+
+    Ye, Fanghua, Chuan Chen, and Zibin Zheng. "Deep autoencoder-like nonnegative matrix factorization for community detection." Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.
+
+    .. note:: Reference implementation: https://karateclub.readthedocs.io/
+    """
+    g = convert_graph_formats(g, nx.Graph)
+    # Keyword arguments keep the call correct even if the upstream constructor reorders its parameters.
+    model = DANMF(layers=layers, pre_iterations=pre_iterations, iterations=iterations, seed=seed, lamb=lamb)
+    model.fit(g)
+    members = model.get_memberships()
+
+    # Reshaping the results: group the nodes by the community id assigned to them.
+    coms_to_node = defaultdict(list)
+    for n, c in members.items():
+        coms_to_node[c].append(n)
+
+    coms = [list(c) for c in coms_to_node.values()]
+
+    # The reported keys mirror this function's parameter names.
+    return NodeClustering(coms, g, "DANMF", method_parameters={"layers": layers, "pre_iterations": pre_iterations,
+                                                               "iterations": iterations, "seed": seed, "lamb": lamb},
+                          overlap=True)
+
+
+def egonet_splitter(g, resolution=1.0):
+    """
+    Ego-splitting community discovery: the egonets of the nodes are built first, then a persona-graph is created and clustered by the Louvain method.
+
+    :param g: a networkx/igraph object
+    :param resolution: Resolution parameter of Python Louvain. Default 1.0.
+    :return: NodeClustering object
+
+
+    :Example:
+
+    >>> from cdlib import algorithms
+    >>> import networkx as nx
+    >>> G = nx.karate_club_graph()
+    >>> coms = algorithms.egonet_splitter(G)
+
+    :References:
+
+    Epasto, Alessandro, Silvio Lattanzi, and Renato Paes Leme. "Ego-splitting framework: From non-overlapping to overlapping clusters." Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. 2017.
+
+    .. note:: Reference implementation: https://karateclub.readthedocs.io/
+    """
+    graph = convert_graph_formats(g, nx.Graph)
+    splitter = EgoNetSplitter(resolution=resolution)
+    splitter.fit(graph)
+
+    # Invert the node -> community ids mapping into community id -> member nodes.
+    node_lists = defaultdict(list)
+    for node, com_ids in splitter.get_memberships().items():
+        for com_id in com_ids:
+            node_lists[com_id].append(node)
+
+    communities = list(map(list, node_lists.values()))
+    params = {"resolution": resolution}
+
+    return NodeClustering(communities, graph, "EgoNetSplitter", method_parameters=params, overlap=True)
+
+
+def nnsed(g, dimensions=32, iterations=10, seed=42):
+    """
+    The procedure uses non-negative matrix factorization in order to learn an unnormalized cluster membership distribution over nodes. The method can be used in an overlapping and non-overlapping way.
+
+    :param g: a networkx/igraph object
+    :param dimensions: Embedding layer size. Default is 32.
+    :param iterations: Number of training epochs. Default 10.
+    :param seed: Random seed for weight initializations. Default 42.
+    :return: NodeClustering object
+
+
+    :Example:
+
+    >>> from cdlib import algorithms
+    >>> import networkx as nx
+    >>> G = nx.karate_club_graph()
+    >>> coms = algorithms.nnsed(G)
+
+    :References:
+
+    Sun, Bing-Jie, et al. "A non-negative symmetric encoder-decoder approach for community detection." Proceedings of the 2017 ACM on Conference on Information and Knowledge Management. 2017.
+
+    .. note:: Reference implementation: https://karateclub.readthedocs.io/
+    """
+    g = convert_graph_formats(g, nx.Graph)
+    model = NNSED(dimensions=dimensions, iterations=iterations, seed=seed)
+    model.fit(g)
+    members = model.get_memberships()
+
+    # Reshaping the results: group the nodes by the community id assigned to them.
+    coms_to_node = defaultdict(list)
+    for n, c in members.items():
+        coms_to_node[c].append(n)
+
+    coms = [list(c) for c in coms_to_node.values()]
+
+    # The reported keys mirror this function's parameter names.
+    return NodeClustering(coms, g, "NNSED", method_parameters={"dimensions": dimensions, "iterations": iterations,
+                                                               "seed": seed}, overlap=True)
+
+
+def nmnf(g, dimensions=128, clusters=10, lambd=0.2, alpha=0.05, beta=0.05, iterations=200, lower_control=1e-15, eta=5.0):
+    """
+    The procedure uses joint non-negative matrix factorization with modularity based regularization in order to learn a cluster membership distribution over nodes. The method can be used in an overlapping and non-overlapping way.
+
+    :param g: a networkx/igraph object
+    :param dimensions: Number of dimensions. Default is 128.
+    :param clusters: Number of clusters. Default is 10.
+    :param lambd: KKT penalty. Default is 0.2
+    :param alpha: Clustering penalty. Default is 0.05.
+    :param beta: Modularity regularization penalty. Default is 0.05.
+    :param iterations: Number of power iterations. Default is 200.
+    :param lower_control: Floating point overflow control. Default is 10**-15.
+    :param eta: Similarity mixing parameter. Default is 5.0.
+    :return: NodeClustering object
+
+
+    :Example:
+
+    >>> from cdlib import algorithms
+    >>> import networkx as nx
+    >>> G = nx.karate_club_graph()
+    >>> coms = algorithms.nmnf(G)
+
+    :References:
+
+    Wang, Xiao, et al. "Community preserving network embedding." Thirty-first AAAI conference on artificial intelligence. 2017.
+
+    .. note:: Reference implementation: https://karateclub.readthedocs.io/
+    """
+    g = convert_graph_formats(g, nx.Graph)
+    model = MNMF(dimensions=dimensions, clusters=clusters, lambd=lambd, alpha=alpha, beta=beta, iterations=iterations,
+                 lower_control=lower_control, eta=eta)
+    model.fit(g)
+    members = model.get_memberships()
+
+    # Reshaping the results: group the nodes by the community id assigned to them.
+    coms_to_node = defaultdict(list)
+    for n, c in members.items():
+        coms_to_node[c].append(n)
+
+    coms = [list(c) for c in coms_to_node.values()]
+
+    # The reported keys mirror this function's parameter names.
+    return NodeClustering(coms, g, "MNMF", method_parameters={"dimensions": dimensions, "clusters": clusters,
+                                                              "lambd": lambd, "alpha": alpha, "beta": beta,
+                                                              "iterations": iterations, "lower_control": lower_control,
+                                                              "eta": eta}, overlap=True)
 
-    return NodeClustering(communities, g, "BigClam", method_parameters={"number_communities": number_communities}, overlap=True)