diff --git a/cdlib/algorithms/attribute_clustering.py b/cdlib/algorithms/attribute_clustering.py index d45bdca3..01e1f736 100644 --- a/cdlib/algorithms/attribute_clustering.py +++ b/cdlib/algorithms/attribute_clustering.py @@ -93,7 +93,6 @@ def ilouvain(g_original, labels, id): 1. Combe D., Largeron C., Géry M., Egyed-Zsigmond E. "I-Louvain: An Attributed Graph Clustering Method". In: Fromont E., De Bie T., van Leeuwen M. (eds) Advances in Intelligent Data Analysis XIV. IDA (2015). Lecture Notes in Computer Science, vol 9385. Springer, Cham """ - g = convert_graph_formats(g_original, nx.Graph) nx.set_node_attributes(g, labels) id = dict() diff --git a/cdlib/algorithms/crisp_partition.py b/cdlib/algorithms/crisp_partition.py index 9c3c9531..f48d2bf9 100644 --- a/cdlib/algorithms/crisp_partition.py +++ b/cdlib/algorithms/crisp_partition.py @@ -1,17 +1,17 @@ try: import infomap as imp except ModuleNotFoundError: - imp = None - + imp = None + try: from wurlitzer import pipes except ModuleNotFoundError: - pipes = None + pipes = None try: import igraph as ig except ModuleNotFoundError: - ig = None + ig = None try: import leidenalg @@ -23,10 +23,9 @@ except ModuleNotFoundError: gt = None - from cdlib.algorithms.internal import DER import community as louvain_modularity - +import warnings from collections import defaultdict from cdlib import NodeClustering, FuzzyNodeClustering from cdlib.algorithms.internal.em import EM_nx @@ -35,6 +34,7 @@ from cdlib.algorithms.internal.AGDL import Agdl from cdlib.algorithms.internal.FuzzyCom import fuzzy_comm from cdlib.algorithms.internal.Markov import markov +from cdlib.algorithms.internal.SiblinarityAntichain import matrix_node_recursive_antichain_partition from karateclub import EdMot import markov_clustering as mc from chinese_whispers import chinese_whispers as cw @@ -47,7 +47,7 @@ __all__ = ["louvain", "leiden", "rb_pots", "rber_pots", "cpm", "significance_communities", "surprise_communities", "greedy_modularity", "der", "label_propagation", "async_fluid", "infomap", "walktrap", "girvan_newman", "em", "scan", "gdmp2", "spinglass", "eigenvector", "agdl", "frc_fgsn", "sbm_dl", "sbm_dl_nested", - "markov_clustering", "edmot", "chinesewhispers"] + "markov_clustering", "edmot", "chinesewhispers", "siblinarity_antichain"] def girvan_newman(g_original, level): @@ -153,7 +153,7 @@ def scan(g_original, epsilon, mu): algorithm = SCAN_nx(g, epsilon, mu) coms = algorithm.execute() return NodeClustering(coms, g_original, "SCAN", method_parameters={"epsilon": epsilon, - "mu": mu}) + "mu": mu}) def gdmp2(g_original, min_threshold=0.75): @@ -256,7 +256,7 @@ def eigenvector(g_original): communities = [g.vs[x]['name'] for x in coms] - return NodeClustering(communities, g_original, "Eigenvector", method_parameters={"":""}) + return NodeClustering(communities, g_original, "Eigenvector", method_parameters={"": ""}) def agdl(g_original, number_communities, number_neighbors, kc, a): @@ -294,8 +294,8 @@ def agdl(g_original, number_communities, number_neighbors, kc, a): coms.append([nodes[n] for n in com]) return NodeClustering(coms, g_original, "AGDL", method_parameters={"number_communities": number_communities, - "number_neighbors": number_neighbors, - "kc": kc, "a": a}) + "number_neighbors": number_neighbors, + "kc": kc, "a": a}) def louvain(g_original, weight='weight', resolution=1., randomize=False): @@ -339,8 +339,9 @@ def louvain(g_original, weight='weight', resolution=1., randomize=False): coms_to_node[c].append(n) coms_louvain = [list(c) for c in coms_to_node.values()] - 
return NodeClustering(coms_louvain, g_original, "Louvain", method_parameters={"weight": weight, "resolution": resolution, - "randomize": randomize}) + return NodeClustering(coms_louvain, g_original, "Louvain", + method_parameters={"weight": weight, "resolution": resolution, + "randomize": randomize}) def leiden(g_original, initial_membership=None, weights=None): @@ -381,7 +382,7 @@ def leiden(g_original, initial_membership=None, weights=None): ) coms = [g.vs[x]['name'] for x in part] return NodeClustering(coms, g_original, "Leiden", method_parameters={"initial_membership": initial_membership, - "weights": weights}) + "weights": weights}) def rb_pots(g_original, initial_membership=None, weights=None, resolution_parameter=1): @@ -430,8 +431,8 @@ def rb_pots(g_original, initial_membership=None, weights=None, resolution_parame initial_membership=initial_membership, weights=weights) coms = [g.vs[x]['name'] for x in part] return NodeClustering(coms, g_original, "RB Pots", method_parameters={"initial_membership": initial_membership, - "weights": weights, - "resolution_parameter": resolution_parameter}) + "weights": weights, + "resolution_parameter": resolution_parameter}) def rber_pots(g_original, initial_membership=None, weights=None, node_sizes=None, resolution_parameter=1): @@ -478,8 +479,9 @@ def rber_pots(g_original, initial_membership=None, weights=None, node_sizes=None ) coms = [g.vs[x]['name'] for x in part] return NodeClustering(coms, g_original, "RBER Pots", method_parameters={"initial_membership": initial_membership, - "weights": weights, "node_sizes": node_sizes, - "resolution_parameter": resolution_parameter}) + "weights": weights, + "node_sizes": node_sizes, + "resolution_parameter": resolution_parameter}) def cpm(g_original, initial_membership=None, weights=None, node_sizes=None, resolution_parameter=1): @@ -534,8 +536,8 @@ def cpm(g_original, initial_membership=None, weights=None, node_sizes=None, reso weights=weights, node_sizes=node_sizes, ) coms = [g.vs[x]['name'] for x in part] return NodeClustering(coms, g_original, "CPM", method_parameters={"initial_membership": initial_membership, - "weights": weights, "node_sizes": node_sizes, - "resolution_parameter": resolution_parameter}) + "weights": weights, "node_sizes": node_sizes, + "resolution_parameter": resolution_parameter}) def significance_communities(g_original, initial_membership=None, node_sizes=None): @@ -579,7 +581,7 @@ def significance_communities(g_original, initial_membership=None, node_sizes=Non node_sizes=node_sizes) coms = [g.vs[x]['name'] for x in part] return NodeClustering(coms, g_original, "Significance", method_parameters={"initial_membership": initial_membership, - "node_sizes": node_sizes}) + "node_sizes": node_sizes}) def surprise_communities(g_original, initial_membership=None, weights=None, node_sizes=None): @@ -626,7 +628,8 @@ def surprise_communities(g_original, initial_membership=None, weights=None, node weights=weights, node_sizes=node_sizes) coms = [g.vs[x]['name'] for x in part] return NodeClustering(coms, g_original, "Surprise", method_parameters={"initial_membership": initial_membership, - "weights": weights, "node_sizes": node_sizes}) + "weights": weights, + "node_sizes": node_sizes}) def greedy_modularity(g_original, weight=None): @@ -741,7 +744,7 @@ def walktrap(g_original): for c in coms: communities.append([g.vs[x]['name'] for x in c]) - return NodeClustering(communities, g_original, "Walktrap", method_parameters={"":""}) + return NodeClustering(communities, g_original, "Walktrap", 
method_parameters={"": ""}) def label_propagation(g_original): @@ -774,7 +777,7 @@ def label_propagation(g_original): coms = list(nx.algorithms.community.label_propagation_communities(g)) coms = [list(x) for x in coms] - return NodeClustering(coms, g_original, "Label Propagation", method_parameters={"":""}) + return NodeClustering(coms, g_original, "Label Propagation", method_parameters={"": ""}) def async_fluid(g_original, k): @@ -846,7 +849,7 @@ def der(g_original, walk_len=3, threshold=.00001, iter_bound=50): coms.append([maps[n] for n in c]) return NodeClustering(coms, g_original, "DER", method_parameters={"walk_len": walk_len, "threshold": threshold, - "iter_bound": iter_bound}) + "iter_bound": iter_bound}) def frc_fgsn(g_original, theta, eps, r): @@ -894,10 +897,10 @@ def frc_fgsn(g_original, theta, eps, r): coms = [list(c) for c in communities] return FuzzyNodeClustering(coms, fuzz_assoc, g_original, "FuzzyComm", method_parameters={"theta": theta, - "eps": eps, "r": r}) + "eps": eps, "r": r}) -def sbm_dl(g_original, B_min=None,B_max=None, deg_corr=True, **kwargs): +def sbm_dl(g_original, B_min=None, B_max=None, deg_corr=True, **kwargs): """Efficient Monte Carlo and greedy heuristic for the inference of stochastic block models. Fit a non-overlapping stochastic block model (SBM) by minimizing its description length using an agglomerative heuristic. @@ -936,11 +939,12 @@ def sbm_dl(g_original, B_min=None,B_max=None, deg_corr=True, **kwargs): affiliations = state.get_blocks().get_array() affiliations = {label_map[i]: affiliations[i] for i in range(len(affiliations))} coms = affiliations2nodesets(affiliations) - coms = [list(v) for k,v in coms.items()] - return NodeClustering(coms, g_original, "SBM", method_parameters={"B_min": B_min, "B_max": B_max, "deg_corr": deg_corr}) + coms = [list(v) for k, v in coms.items()] + return NodeClustering(coms, g_original, "SBM", + method_parameters={"B_min": B_min, "B_max": B_max, "deg_corr": deg_corr}) -def sbm_dl_nested(g_original, B_min=None,B_max=None, deg_corr=True, **kwargs): +def sbm_dl_nested(g_original, B_min=None, B_max=None, deg_corr=True, **kwargs): """Efficient Monte Carlo and greedy heuristic for the inference of stochastic block models. (nested) Fit a nested non-overlapping stochastic block model (SBM) by minimizing its description length using an agglomerative heuristic. 
@@ -981,8 +985,9 @@ def sbm_dl_nested(g_original, B_min=None,B_max=None, deg_corr=True, **kwargs):
     affiliations = level0.get_blocks().get_array()
     affiliations = {label_map[i]: affiliations[i] for i in range(len(affiliations))}
     coms = affiliations2nodesets(affiliations)
-    coms = [list(v) for k,v in coms.items()]
-    return NodeClustering(coms, g_original, "SBM_nested", method_parameters={"B_min": B_min, "B_max": B_max, "deg_corr": deg_corr})
+    coms = [list(v) for k, v in coms.items()]
+    return NodeClustering(coms, g_original, "SBM_nested",
+                          method_parameters={"B_min": B_min, "B_max": B_max, "deg_corr": deg_corr})
 
 
 def markov_clustering(g_original, expansion=2, inflation=2, loop_value=1, iterations=100, pruning_threshold=0.001,
@@ -1129,4 +1134,68 @@ def edmot(g_original, component_count=2, cutoff=10):
 
     coms = [list(c) for c in coms_to_node.values()]
 
-    return NodeClustering(coms, g_original, "EdMot", method_parameters={"component_count": component_count, "cutoff": cutoff})
+    return NodeClustering(coms, g_original, "EdMot",
+                          method_parameters={"component_count": component_count, "cutoff": cutoff})
+
+
+def siblinarity_antichain(g_original, forwards_backwards_on=True, backwards_forwards_on=False,
+                          Lambda=1, with_replacement=False, space_label=None, time_label=None):
+    """
+    The algorithm extracts communities from a DAG that (i) respect its intrinsic order and (ii) are composed of similar nodes.
+    The approach takes inspiration from classic similarity measures of bibliometrics, used to assess how similar two publications are based on their relative citation patterns.
+
+    :param g_original: a networkx/igraph object representing a DAG (directed acyclic graph)
+    :param forwards_backwards_on: checks successors' similarity. Boolean, default True
+    :param backwards_forwards_on: checks predecessors' similarity. Boolean, default False
+    :param Lambda: desired resolution of the partition. Default 1
+    :param with_replacement: if True, the similarity of a node to itself equals the number of its neighbours on which the similarity is defined. Boolean, default False
+    :param space_label: node attribute key holding a numerical space coordinate, if available. Default None
+    :param time_label: node attribute key holding a numerical time coordinate, if available. Default None
+    :return: NodeClustering object
+
+    :Example:
+
+    >>> from cdlib import algorithms
+    >>> import networkx as nx
+    >>> G = nx.gnp_random_graph(100, 0.1, directed=True)
+    >>> DAG = nx.DiGraph([(u, v) for (u, v) in G.edges() if u < v])
+    >>> coms = algorithms.siblinarity_antichain(DAG, Lambda=1)
+
+    :References:
+
+    Vasiliauskaite, V., Evans, T.S. Making communities show respect for order. Appl Netw Sci 5, 15 (2020). https://doi.org/10.1007/s41109-020-00255-5
+
+    .. 
note:: Reference implementation: https://github.com/vv2246/siblinarity_antichains + """ + + g = convert_graph_formats(g_original, nx.Graph) + + if not nx.is_directed_acyclic_graph(g): + raise Exception("The Siblinarity Antichain algorithm require as input a Directed Acyclic Graph (DAG).") + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + result_list = matrix_node_recursive_antichain_partition(g, forwards_backwards_on=forwards_backwards_on, + backwards_forwards_on=backwards_forwards_on, + Q_check_on=True, + Lambda=Lambda, with_replacement=with_replacement, + space_label=None, time_label=None) + + node_partition = {} + for n in g.nodes(): + p_at_level = result_list[0]["n_to_p"][n] + for i in range(1, len(result_list) - 1): + p_at_level = result_list[i]["n_to_p"][p_at_level] + node_partition[n] = p_at_level + + partition = defaultdict(list) + for key, val in node_partition.items(): + partition[val].append(key) + + coms = [list(c) for c in partition.values()] + + return NodeClustering(coms, g_original, "Siblinarity Antichain", + method_parameters={"forwards_backwards_on": forwards_backwards_on, + "backwards_forwards_on": backwards_forwards_on, + + "Lambda": Lambda, + "with_replacement": with_replacement, + "space_label": space_label, + "time_label": time_label}) diff --git a/cdlib/algorithms/internal/ILouvain.py b/cdlib/algorithms/internal/ILouvain.py index 2e23b6b4..fd13b059 100644 --- a/cdlib/algorithms/internal/ILouvain.py +++ b/cdlib/algorithms/internal/ILouvain.py @@ -1,20 +1,11 @@ -#!/usr/bin/python - from __future__ import division - -#from pprint import pprint - -#import argparse import numpy as np -#from scipy.spatial.distance import pdist, squareform import os.path - -# -*- coding: utf-8 -*- """ This module implements community detection. """ -__all__ = ["partition_at_level", "modularity", "best_partition", "generate_dendogram", "induced_graph"] -__author__ = """Thomas Aynaud (thomas.aynaud@lip6.fr)""" + +__author__ = ["Thomas Aynaud (thomas.aynaud@lip6.fr)"] # Copyright (C) 2009 by # Thomas Aynaud # All rights reserved. @@ -23,7 +14,7 @@ import networkx as nx -class ML2: +class ML2(object): __MIN = 0.000001 __PASS_MAX = -1 LOGOPERATIONS = False @@ -49,9 +40,6 @@ def __init__(self, graph, attributes, authorIndex): self.status_list = list() def critereCombinaison(self): - # if(args.verbose): - # print("Mod1: " + str(self.__modularity(self.statusTab[0]))) - # print("Mod2: " + str(self.__modularity(self.statusTab[1]))) return (self.__modularity(self.statusTab[0]) + self.__modularity(self.statusTab[1])) / 2. 
def findPartition(self): @@ -75,11 +63,6 @@ def findPartition(self): self.__one_level(giniMatrix=giniMatrix) new_mod = self.critereCombinaison() if new_mod - mod < self.__MIN: - # if(args.verbose): - # print("modularities") - # print(self.__modularity(self.statusTab[0])) - # print(self.__modularity(self.statusTab[1])) - # print("Modularity Final: " + str(self.__modularity(self.statusTab[1]) + self.__modularity(self.statusTab[0]))) break partition, bijection = self.__renumber() @@ -103,17 +86,6 @@ def findPartition(self): partition[node] = dendogram[index][community] return partition - # for elem, part in sorted(partition.iteritems()) : - # if(args.verbose): - # print(str(self.authorIndex[elem]) + " " + str(part) + " " + str(self.attributes[self.authorIndex[elem]])) - # else: - # out = str(self.authorIndex[elem]) + " " + str(part) - # if(args.multipleDataset != None): - # f = open(args.dataset + "_" + str(curDatasetIdx) + ".2ModLouvain",'a') - # else: - # f = open(args.dataset + ".2ModLouvain",'a') - # f.write(out + "\n") - # f.close() def dist(self, v1, v2): attrV1 = self.attributes[v1] @@ -317,7 +289,7 @@ def __modularity(self, status): return result -class Status: +class Status(object): """ To handle several data in one struct. Could be replaced by named tuple, but don't want to depend on python 2.6 @@ -351,7 +323,6 @@ def initAttribStatus(self, graph, authorIndex, attributes): variance = {} for node in sorted(graph.nodes()): - distanceToCenterOfGravity = 0. for attrId, attrValue in meanVector.items(): variance[attrId] = variance.get(attrId, 0.) + ( (attrValue - attributes[authorIndex[node]].get(attrId, 0.)) ** 2) @@ -359,8 +330,6 @@ def initAttribStatus(self, graph, authorIndex, attributes): for v in variance.values(): inertieTot += (v / N) - # if(args.verbose): - # print("# Total inertia:", inertieTot) self.total_weight = (0.0 - inertieTot) for node in sorted(graph.nodes()): @@ -426,7 +395,7 @@ def loadDataset(path): graph = nx.Graph() # Read the graph - if (not os.path.isfile(path + ".edgeList")): + if not os.path.isfile(path + ".edgeList"): print("Error: file '" + path + ".edgeList' not found") exit(-1) with open(path + ".edgeList") as f: @@ -442,7 +411,7 @@ def loadDataset(path): for n in graph: attributes[n] = {} - if (not os.path.isfile(path + ".attributes")): + if not os.path.isfile(path + ".attributes"): print("Error: file '" + path + ".attributes' not found") exit(-1) @@ -468,37 +437,3 @@ def loadDataset(path): os.remove(path + ".2ModLouvain") return graph, attributes, authorIndex - - -def readToyGraph(): - graph = nx.Graph() - graph.add_node("a") - graph.add_node("b") - graph.add_node("c") - graph.add_node("d") - graph.add_node("e") - graph.add_edge("a", "b") - graph.add_edge("b", "c") - graph.add_edge("c", "d") - graph.add_edge("d", "e") - graph.add_edge("a", "e") - graph.add_edge("b", "e") - graph.add_edge("c", "e") - graph.add_edge("b", "d") - graph.add_edge("a", "c") - graph.add_edge("a", "d") - - authorIndex = {} - authorIndex["a"] = 0 - authorIndex["b"] = 1 - authorIndex["c"] = 2 - authorIndex["d"] = 3 - authorIndex["e"] = 4 - - attributes = { - 0: {0: 2., 1: 4}, - 1: {0: 8., 1: 1}, - 2: {0: 7., 1: 5}, - 3: {0: 12., 1: 6}, - 4: {0: 1., 1: 4}} - return graph, attributes, authorIndex \ No newline at end of file diff --git a/cdlib/algorithms/internal/SiblinarityAntichain.py b/cdlib/algorithms/internal/SiblinarityAntichain.py new file mode 100644 index 00000000..da9a5f02 --- /dev/null +++ b/cdlib/algorithms/internal/SiblinarityAntichain.py @@ -0,0 +1,683 @@ +# -*- coding: 
utf-8 -*- +""" +Created on Wed Jun 20 12:52:55 2018 +@author: Vaiva & Tim +""" + +from collections import defaultdict +import itertools +import numpy as np +import random +import itertools +import math +import networkx as nx +import scipy.sparse as sparse + +__authors__ = ['Vaiva Vasiliauskaite', 'T.S. Evans'] +__all__ = ['matrix_node_recursive_antichain_partition'] + + +def is_weakly_connected(graph, source_nodes, target_nodes): + """ + Tests whether a list of source nodes in a graph have a path in either direction between a list of target nodes. + + Parameters + ---------- + graph = networkx graph + source_nodes = list of source nodes for paths + target_nodes = list of target nodes for paths + + Returns + ------- + Bool: + True if there is a path from at least one source node to at least one target node or the other way round + False otherwise + """ + for s, t in itertools.product(source_nodes, target_nodes): + if nx.has_path(graph, s, t) or nx.has_path(graph, t, s): + return True + return False + + +class Quality_matrix: + ''' + Quality measures for use in antichains. Similarity matrix implementation. + ''' + + # class variables + def __init__(self, node_id_dict, similarity_matrix, Lambda, with_replacement): + # Initial Quality Measures + self.similarity_matrix = similarity_matrix + self.node_id_dict = node_id_dict + self.strength = similarity_matrix.sum(axis=0) + self.strength = {n: self.strength[0, node_id_dict[n]] for n in node_id_dict.keys()} # sum over rows? + self.total_weight = similarity_matrix.sum() # /2 + self.Lambda = Lambda + self.with_replacement = with_replacement + + def delta_strength_quality_unnormalised(self, partition1, partition2): + """ + Using in-strength null model calculate the change in unnormalised quality if two partitions are combined. + + Definition is that used for weighted graph. + + Q = \sum_{u \in partition1} \sum_{v \in partition2} + ( S_ij + - k_i*k_j/W ) + where W = total strength of edges in the graph ((sum_{i,j}S_ij)/2), + S_{ij} - i,j^th entry in the similarity matrix. For instance, A.A^T is successors-based similarity; + A^T.A is predecessors-based similarity. + + Note this is not normalised. + + Note no test for connectedness of nodes in partitions. + + Note both partitions must be non-empty otherwise TypeError raised. + + Note no test to see if partitions are sets or if they share common elements. + + Input + partition1 - iterable list or set of the nodes in first partition + partition2 - iterable list or set of the nodes in second partition + + Return + Contribution of the quality Q from the all pairs of nodes with one from partition1, second from partition2 + """ + + return sum([self.similarity_matrix[self.node_id_dict[node1], self.node_id_dict[node2]] + - self.Lambda * self.strength[node1] * self.strength[node2] / self.total_weight + for node1, node2 in itertools.product(partition1, partition2)]) + + def quality_one_partition(self, partition): + return sum([self.similarity_matrix[self.node_id_dict[node1], self.node_id_dict[node2]] + - self.Lambda * self.strength[node1] * self.strength[node2] / self.total_weight + for node1, node2 in itertools.combinations(partition, 2)]) + + def total_strength_quality_unnormalised(self, partitions): + """ + Calculate the total unnormalised quality using strength null model + + Definition is that used for weighted graph. 
+ + Q = \sum_{u \in partition1} \sum_{v \in partition2} + ( S_ij + - k_i*k_j/W ) + where W = total strength of edges in the graph ((sum_{i,j}S_ij)/2), + S_{ij} - i,j^th entry in the similarity matrix. For instance, A.A^T is successors-based similarity; + A^T.A is predecessors-based similarity. + + Note this is not normalised. + + Note no test for connectedness of nodes in partitions. + + Input + ----- + partition - iterable list of sets of nodes in partitions so that + partition[p] is an set (or any iterable list) of the nodes in partition p + + Return + ------ + Total value of the quality Q from the all pairs of nodes + """ + return sum([ + sum([self.similarity_matrix[self.node_id_dict[node1], self.node_id_dict[node2]] + - self.Lambda * self.strength[node1] * self.strength[node2] / self.total_weight + for node1, node2 in itertools.combinations(p, 2)]) + for p in partitions]) + + +def get_edge_weight(G, node1, node2, weight_attribute='weight'): + """ + Get Edge Weight + + Returns edge weight for edge in G from node1 to node2 + If edge exists and has weight_attribute, this value is returned. + If edge exists but has not weight_attribute, 1 is returned. + Otherwise 0 is returned + + Input + ----- + G - networkx graph + node1 - source node + node2 - target node + weight_attribute='weight' - attribute of edge containing weight value + + Return + ------ + edge weight, 1 if edge exists but no weight attribute exists, 0 otherwise. + + """ + edge_data = G.get_edge_data(node1, node2) + if edge_data is None: + return 0 + elif weight_attribute in edge_data: + return edge_data[weight_attribute] + else: + return 1 + + +def get_node_attribute_value(G, node1, node_attribute=None): + """ + Get Node Attribute Value + + Returns node attribute as a float. + Otherwise 0.0 is returned + + Input + ----- + G - networkx graph + node1 - node + node_attribute=None - attribute of node required + + Return + ------ + node attribute as a float + + """ + try: + node_data = G.node[node1][node_attribute] + return float(node_data) + except: + pass + return 0.0 + + +def is_weakly_connected(graph, source_nodes, target_nodes): + """ + Tests whether a list of source nodes in a graph have a path in either direction between a list of target nodes. + + Parameters + ---------- + graph = networkx graph + source_nodes = list of source nodes for paths + target_nodes = list of target nodes for paths + + Returns + ------- + Bool: + True if there is a path from at least one source node to at least one target node or the other way round + False otherwise + """ + for s, t in itertools.product(source_nodes, target_nodes): + if nx.has_path(graph, s, t) or nx.has_path(graph, t, s): + return True + return False + + +def coarse_grain(G, node_to_partition_label, partition_label_to_nodes, weight_attribute='weight', + time_label='t', space_label='x'): + """ + Coarse Grain + + The new graph H has the partitions of G as the nodes in H. + An edges from partition1 to partition2 in H is present if + there is an edge from a node in G in partition1 of G to + a node in G in partition2. The total weight of the edge + from partition1 to partition2 in H will be the sum of all + the weights of all such edges in G + from nodes in partition1 to nodes in partition2. + If unweighted, weights are assumed to be 1. + If time_label or space_label are set, these are assumed to be numerical + values (e.g. coordinates) and nodes in the new graph get the average value from + the partition of nodes they represent in the old graph. 
+ + + Input + ---- + G - networkx graph + node_to_partition_label - dictionary from G node key to its partition label + partition_label_to_nodes - dictionary from partition label to the set of nodes in G in that partition + weight_attribute='weight' - attribute on edge containing edge weight data + time_label='t': Node key for time coordinate (used as y/vertical coordinate) + space_label='x': Node key for space coordinate (used as x/horizontal coordinate) + + Return + ------ + H - coarse grained graph, nodes are the partitions labels, weights are under eight_attribute of edges + + """ + H = nx.DiGraph() + H.add_nodes_from(list(partition_label_to_nodes.keys())) + for partition in partition_label_to_nodes.keys(): + # nodes_in_partition = partition_label_to_nodes[partition] + number_in_partition = len(partition_label_to_nodes[partition]) + if time_label is not None: + average_time = sum([get_node_attribute_value(G, n, node_attribute=time_label) for n in + partition_label_to_nodes[partition]]) / number_in_partition + H.nodes[partition][time_label] = average_time + if space_label is not None: + average_space = sum([get_node_attribute_value(G, n, node_attribute=space_label) for n in + partition_label_to_nodes[partition]]) / number_in_partition + H.nodes[partition][space_label] = average_space + + for partition1, partition2 in itertools.combinations(partition_label_to_nodes.keys(), 2): + w = sum([get_edge_weight(G, node1, node2, weight_attribute) for node1, node2 in + itertools.product(partition_label_to_nodes[partition1], partition_label_to_nodes[partition2])]) + if w > 0: + H.add_edge(partition1, partition2, weight_attribute=w) + w = sum([get_edge_weight(G, node2, node1, weight_attribute) for node1, node2 in + itertools.product(partition_label_to_nodes[partition1], partition_label_to_nodes[partition2])]) + if w > 0: + H.add_edge(partition2, partition1, weight_attribute=w) + return H + + +def similarity_matrix(DAG, similarity="intersection", neighbours="successors"): + """ + Function to produce a similarity matrix based on neighbourhoods of nodes in DAG. + + Input + ----- + DAG - networkx directed acyclic graph + similarity - type of similarity of sets. Currently only implemented for the size of intersection + neighbours - type of neighbours to consider in the similarity. Can be either successors or predecessors + + Return + ----- + A - symmetric similarity matrix where entry A[i,j] represents similarity between nodes of indices i, j + nodedict - dictionary of node names and their indices in the similarity matrix + """ + + nodes = list(DAG.nodes()) + nodedict = {} + for i in range(len(nodes)): + nodedict[nodes[i]] = i + nodelist = list(nodedict.keys()) + + A = (nx.adjacency_matrix(DAG, nodelist)).todense() + + if similarity == "intersection" and neighbours == "successors": + A = A.dot(A.transpose()) + np.fill_diagonal(A, 0) + return A, nodedict + + elif similarity == "intersection" and neighbours == "predecessors": + A = (A.transpose()).dot(A) + np.fill_diagonal(A, 0) + return A, nodedict + + +def similarity_matrix_sparse(DAG, similarity="intersection", neighbours="successors", with_replacement=False): + """ + Function to produce a sparse similarity matrix based on neighbourhoods of nodes in DAG. + + Input + ----- + DAG - networkx directed acyclic graph + similarity - type of similarity of sets. Currently only implemented for the size of intersection + neighbours - type of neighbours to consider in the similarity. 
Can be either successors or predecessors or both + + Return + ----- + A - scipy sparse symmetric similarity matrix where entry A[i,j] represents similarity between nodes of indices i, j + nodedict - dictionary of node names and their indices in the similarity matrix + """ + + nodes = list(DAG.nodes()) + nodedict = {} + for i in range(len(nodes)): + nodedict[nodes[i]] = i + nodelist = list(nodedict.keys()) + + A = (nx.adjacency_matrix(DAG, nodelist)) + + if similarity == "intersection" and neighbours == "successors": + A = A.dot(A.transpose()) + if not with_replacement: + A.setdiag(0) + return A, nodedict + + elif similarity == "intersection" and neighbours == "predecessors": + A = A.transpose().dot(A) + if not with_replacement: + A.setdiag(0) + return A, nodedict + elif similarity == "intersection" and neighbours == "both": + A = A.transpose().dot(A) + A.dot(A.transpose()) + if not with_replacement: + A.setdiag(0) + return A, nodedict + + +def has_path_matrix(DAG, nodedict, cutoff=350): + nodes = list(nodedict.keys()) + A = (nx.adjacency_matrix(DAG, nodes)).todense() + A_sum = np.copy(A) + if nx.is_directed_acyclic_graph(DAG): + L_max = len(nx.dag_longest_path(DAG)) + else: + L_max = cutoff + current_length = 1 + while current_length <= L_max: + A_sum = np.dot(A_sum, A) + current_length += 1 + + return (A_sum > 0).astype(np.int8) + + +def find_paths_sparse(A, length_max=10): + """ + Tim's numpy path implementation updated by Vaiva to sparse matrices + Scipy sparse matrix implementation to find all paths + Given adjacency matrix A will find all the paths between all vertices + + Input + A: numpy square adjacency matrix,can be weighted + Return + #tuple current_length,path_length,path_bool where + #current_length = one more than the maximum length found. + If equals length_max then may have terminated because reachd maximum requested length + #non_zero_entries = number of non-zero entries in P=(A)^current_length + #path_length = matrix of longest paths lengths + path_length[target,source]= longest path from source to target + path_bool = matrix of booleans indicating if path exists. Paths from vertex to slef (length zero) gives True on diagonal + path_bool[target,source]= True (False) if path from source to target + """ + + # Assume vertices start connected to selves only path path of length zero. + m, n = np.shape(A) + path_bool = sparse.eye(m, n, dtype=bool) + path_bool = path_bool.tolil() + path_length = sparse.lil_matrix((m, n), dtype="int32") + current_length = 1 + P = A.copy() + non_zero_entries = P.count_nonzero() + + while non_zero_entries > 0 and current_length < length_max: + non_zero_entries = P.nonzero() + path_bool[non_zero_entries[0], non_zero_entries[1]] = True + path_length[non_zero_entries] = current_length + P = P.dot(A) + current_length += 1 + non_zero_entries = P.count_nonzero() + return path_bool + + +def is_weakly_connected_matrix(path_matrix, nodedict, source_nodes, target_nodes): + """ + Function to check whether nodes in the source_nodes are not weakly connected to + nodes in the target_nodes. 
+
+    Input
+    -----
+    path_matrix - 1/0 matrix where entry P[i,j] = 1 if nodes with indices i,j are weakly connected
+    nodedict - dictionary where keys are node names and values are their corresponding indices in the path matrix
+    source_nodes - list of nodes
+    target_nodes - list of nodes
+
+    Return
+    ------
+    True - if nodes in source_nodes and target_nodes form a weakly connected subgraph
+    False - if not
+    """
+    source_nodes_id, target_nodes_id = [nodedict[s] for s in source_nodes], [nodedict[t] for t in target_nodes]
+
+    for s, t in itertools.product(source_nodes_id, target_nodes_id):
+        if path_matrix[s, t] == 1 or path_matrix[t, s] == 1:
+            return True
+    return False
+
+
+def node_matrix_greedy_antichain_partition(G, level, Lambda, with_replacement,
+                                           random_on=False, seed=None, max_number_sweeps=None,
+                                           backwards_forwards_on=True, forwards_backwards_on=False,
+                                           Q_check_on=True, weight_attribute="weight"):
+    """
+    In this implementation we iterate over the nodes in the graph, moving individual nodes until no changes occur.
+
+    We start with each node in its own partition.
+    In one sweep we look at each partition ac in turn.
+    We find all the backwards-forwards neighbours of the nodes in partition ac
+    and collect all their partition labels, excluding the current partition ac.
+    For each of these we check whether we can increase the quality function by merging the
+    current partition ac with one of its bf-neighbouring partitions. If we can, we
+    do the merge, removing the current partition ac.
+    We continue the sweep looking at the remaining partitions and trying to merge them. Note that
+    later partitions will see some previous partitions already merged. The option to randomise the order
+    in which we visit the partitions will lead to different results.
+
+    After each sweep, if at least one partition was merged then we sweep through again.
+    We only stop when no more merges are found in one sweep, or if the number of sweeps exceeds
+    the maximum requested.
+
+    Note that we do NOT create an explicit hybrid graph. We only use a weakly connected check
+    on the original graph.
+
+    Input
+    -----
+    G - networkx graph.
+    random_on=False - if True, will shuffle the order in which partitions are examined
+    seed=None - used as seed if shuffle is on. If None then time is used as seed
+    max_number_sweeps=None - the maximum number of sweeps to consider. If less than 1 or None, the number of nodes is used.
+    backwards_forwards_on=True - find possible new partitions by making a backwards step then a forwards step from the node being considered for a move
+    forwards_backwards_on=False - find possible new partitions by making a forwards step then a backwards step from the node being considered for a move
+    Q_check_on=True - check that the change in Q is correct by recomputing the total quality after each move
+    weight_attribute - edge attribute of weight. If None, unweighted quality functions are used. 
Note, weight must be integer + + Return + ------ + tuple node_to_partition_label, partition_label_to_nodes + where + node_to_partition_label is a dictionary from node key to its partition label + partition_label_to_nodes is a dictionary from partition label to the set of nodes in that partition + + """ + if not (forwards_backwards_on or backwards_forwards_on): + raise ValueError("At least one of forwards_backwards_on or backwards_forwards_on parameters must be True") + + if backwards_forwards_on == True and forwards_backwards_on == True: + adj_matrix, nodedict = similarity_matrix_sparse(G, similarity="intersection", neighbours="both", + with_replacement=with_replacement) + + elif backwards_forwards_on == True and forwards_backwards_on == False: + # use in-degree quality with backwards-forwards step (predecessors) + adj_matrix, nodedict = similarity_matrix_sparse(G, similarity="intersection", neighbours="predecessors", + with_replacement=with_replacement) + elif forwards_backwards_on == True and backwards_forwards_on == False: + # use out-degree quality with forwards-backwards step (successors) + + adj_matrix, nodedict = similarity_matrix_sparse(G, similarity="intersection", neighbours="successors", + with_replacement=with_replacement) + Q = Quality_matrix(nodedict, adj_matrix, Lambda, with_replacement) + Q_method = Q.delta_strength_quality_unnormalised + Q_total = Q.total_strength_quality_unnormalised + + number_of_nodes = G.number_of_nodes() + + if max_number_sweeps is None or max_number_sweeps < 1: + max_number_sweeps = number_of_nodes + if random_on: + random.seed(seed) + # set up partition node dictionaries + # These play the role of induced graphs + # first place each node into its own partition + node_to_partition_label = {} + partition_label_to_nodes = {} + next_partition_label = 0 + + for n in G: + node_to_partition_label[n] = next_partition_label + partition_label_to_nodes[next_partition_label] = {n} + next_partition_label += 1 + moved = True + number_sweeps = 0 + + if Q_check_on: + Q_total_current = Q_total(partition_label_to_nodes.values()) + count = 0 + + while moved == True and number_sweeps < max_number_sweeps: + # Start of one sweep through all current partitions. + # Check every partition ac in turn, doing the best merge you can for each partition ac. + # Note the partition ac under study will be removed in the merge. + # That means the list of partition labels will be altered and therefore + # this list can not be used as a list of partition labels in the ac loop. 
+ # For that reason we need a deep copy of the current list of partition labels + # which we get from the keys of the partition label to node set dictionary + # Conveniently we can shuffle the list used in teh ac loop if we want to randomise the greedy + # algorithm + number_sweeps += 1 + number_moves = 0 + node_list = list(G.nodes()) + if random_on: + random.shuffle(node_list) + for n in node_list: + count += 1 + # check to see if node n should be moved + moved = False + # ac is the partition containing node n + ac = node_to_partition_label[n] + # now find the contribution from Q that comes if we move n into its own partition + partition_ac_no_n = set(partition_label_to_nodes[ac]) + partition_ac_no_n.discard(n) + if len(partition_ac_no_n) > 0: + delta_Q_remove_n = Q_method([n], partition_ac_no_n) + else: + delta_Q_remove_n = 0 + + # now find the neighbouring partitions via backwards-forward step + bf_nearest_neighbours_all = set() + if backwards_forwards_on: + for p in G.predecessors(n): + bf_nearest_neighbours_all.update(G.successors(p)) + if forwards_backwards_on: + for p in G.successors(n): + bf_nearest_neighbours_all.update(G.predecessors(p)) + bf_nearest_neighbour_partitions = set(node_to_partition_label[bf_nn] for bf_nn in bf_nearest_neighbours_all) + # remove current partition ac from neighbours + try: + bf_nearest_neighbour_partitions.remove(ac) + except KeyError: # ac is not in set, must have no neighbours or no b-f nearest neighbours + pass + + # dictionary from partition label to delta quality value, + # so delta_Q_nn[ac_nn] is change in quality if node n was to join partition ac_nn + delta_Q_nn = {} + # Loop round bf nearest neighbour partitions ac_nn. + # Check each ac_nn partition to make sure it is not weakly connected to partition ac + # then calculate the modularity change if partitions ac and ac_nn are merged + for ac_nn in bf_nearest_neighbour_partitions: + if not is_weakly_connected(G, [n], partition_label_to_nodes[ac_nn]): + delta_Q_nn[ac_nn] = Q_method([n], partition_label_to_nodes[ac_nn]) + if len(delta_Q_nn) > 0: + # Note nice use of operator.itemgetter to get key with largest value + # https://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary + ac_max = max(delta_Q_nn, key=delta_Q_nn.get) + if delta_Q_nn[ac_max] > delta_Q_remove_n and ac_max != ac: + # now merge partition ac into partition ac_max + number_moves += 1 + node_to_partition_label[n] = ac_max + partition_label_to_nodes[ac_max].add(n) + partition_label_to_nodes[ac].remove(n) + if len(partition_label_to_nodes[ac]) == 0: # no more elements in this partition + partition_label_to_nodes.pop(ac, None) # remove ac from this dictionary + + if Q_check_on: + dQ = delta_Q_nn[ac_max] - delta_Q_remove_n + Q_total_old = Q_total_current + Q_total_current = Q_total(partition_label_to_nodes.values()) + + _error = Q_total_current - Q_total_old - dQ + moved = True + + elif delta_Q_remove_n < 0: + number_moves += 1 + node_to_partition_label[n] = next_partition_label + partition_label_to_nodes[next_partition_label] = {n} + next_partition_label += 1 + + # keeping looping through all partitions until can not merge any more + # keep doing new sweeps as long as something changed + return node_to_partition_label, partition_label_to_nodes + + +def matrix_node_recursive_antichain_partition(G, time_label='t', space_label='x', + random_on=False, seed=None, max_number_sweeps=None, + backwards_forwards_on=True, forwards_backwards_on=False, + Q_check_on=True, + plot_on=False, + filenameroot=None, 
extlist=['pdf'], + ScreenOn=False, Lambda=1, with_replacement=False): + """ + + Use , **kwargs in func defn and call with kawargs the dictionary for named arguments + used for the partition + + """ + + result_list = list() + + _matrix_node_recursive_antichain_partition_step(G, + time_label=time_label, space_label=space_label, + level=0, + result_list=result_list, + random_on=random_on, + seed=seed, + max_number_sweeps=max_number_sweeps, + backwards_forwards_on=backwards_forwards_on, + forwards_backwards_on=forwards_backwards_on, + Q_check_on=Q_check_on, + plot_on=plot_on, + filenameroot=filenameroot, extlist=extlist, + ScreenOn=ScreenOn, Lambda=Lambda, with_replacement=with_replacement) + + return result_list + + +def _matrix_node_recursive_antichain_partition_step(G, Lambda, with_replacement, time_label='t', space_label='x', + level=0, result_list=None, + random_on=False, + seed=None, max_number_sweeps=None, + backwards_forwards_on=True, forwards_backwards_on=False, + Q_check_on=True, + plot_on=False, + filenameroot=None, extlist=['pdf'], + ScreenOn=False): + # Internal routine to perform recursive version of node greedy + result_list.append(None) + node_to_partition_label, partition_label_to_nodes = node_matrix_greedy_antichain_partition(G, + random_on=random_on, + seed=seed, + max_number_sweeps=max_number_sweeps, + backwards_forwards_on=backwards_forwards_on, + forwards_backwards_on=forwards_backwards_on, + Q_check_on=Q_check_on, + level=level, + Lambda=Lambda, + with_replacement=with_replacement) + if len(partition_label_to_nodes.keys()) == G.number_of_nodes(): + return + # optional plot + if plot_on: + # ScreenOnValue=True + node_labels_on_value = True + cluster_labels_on_value = True + plot_dag(G, time_label=time_label, space_label=space_label, + filenameroot=filenameroot + '_l{0:d}'.format(level), + extlist=extlist, + messageString='Plot', + ScreenOn=ScreenOn, + node_labels_on=node_labels_on_value, + cluster_dict=node_to_partition_label, + cluster_labels_on=cluster_labels_on_value, + cluster_to_nodes=partition_label_to_nodes) + + new_G = coarse_grain(G, node_to_partition_label, partition_label_to_nodes) + _matrix_node_recursive_antichain_partition_step(new_G, + time_label=time_label, space_label=space_label, + level=level + 1, + result_list=result_list, + random_on=random_on, + seed=seed, + max_number_sweeps=max_number_sweeps, + backwards_forwards_on=backwards_forwards_on, + forwards_backwards_on=forwards_backwards_on, + Q_check_on=Q_check_on, + plot_on=plot_on, + filenameroot=filenameroot, + extlist=extlist, + ScreenOn=ScreenOn, + Lambda=Lambda, + with_replacement=with_replacement) + result_list[level] = {'level': level, 'n_to_p': node_to_partition_label, 'p_to_n': partition_label_to_nodes} + return diff --git a/cdlib/test/test_community_discovery_models.py b/cdlib/test/test_community_discovery_models.py index d0d64547..668809ac 100644 --- a/cdlib/test/test_community_discovery_models.py +++ b/cdlib/test/test_community_discovery_models.py @@ -1,6 +1,8 @@ import unittest from cdlib import algorithms import networkx as nx +import itertools +import random import os @@ -35,6 +37,20 @@ def get_string_graph(): return g +def random_dag(N, P): + nodes = [n for n in range(1, N + 1)] + G = nx.DiGraph() + G.add_nodes_from(nodes) + for n1, n2 in itertools.combinations(nodes, 2): + p = random.random() + if p <= P: + if n1 > n2: + G.add_edge(n2, n1) + else: + G.add_edge(n1, n2) + return G + + class CommunityDiscoveryTests(unittest.TestCase): def test_ego(self): @@ -526,6 +542,16 @@ def 
test_wCommunities(self): communities = algorithms.wCommunity(g, min_bel_degree=0.6, threshold_bel_degree=0.6) self.assertEqual(type(communities.communities), list) + if len(communities.communities) > 0: + self.assertEqual(type(communities.communities[0]), list) + if len(communities.communities[0]) > 0: + self.assertEqual(type(communities.communities[0][0]), int) + + def test_siblinarity_antichain(self): + + g = random_dag(100, 0.1) + communities = algorithms.siblinarity_antichain(g, Lambda=1) + self.assertEqual(type(communities.communities), list) if len(communities.communities) > 0: self.assertEqual(type(communities.communities[0]), list) if len(communities.communities[0]) > 0: diff --git a/cdlib/viz/networks.py b/cdlib/viz/networks.py index e3ede7e2..698d2c45 100644 --- a/cdlib/viz/networks.py +++ b/cdlib/viz/networks.py @@ -67,7 +67,7 @@ def plot_network_clusters(graph, partition, position=None, figsize=(8, 8), node_ partition = __filter(partition.communities, top_k, min_size) graph = convert_graph_formats(graph, nx.Graph) - if position==None: + if position is None: position=nx.spring_layout(graph) n_communities = len(partition) diff --git a/docs/conf.py b/docs/conf.py index a8232431..1400f173 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -26,7 +26,8 @@ def __getattr__(cls, name): MOCK_MODULES = ['ASLPAw_package', 'ipaddress', 'ASLPAw', 'graph-tool', 'leidenalg', 'numpy', 'scipy', 'networkx', 'karateclub', 'bimlpa', 'sklearn', 'pquality', 'functools', 'nf1', 'ipython', 'pygtk', 'gtk', 'gobject', 'argparse', 'matplotlib', 'matplotlib.pyplot', 'scikit-learn', - 'python-igraph', 'wurlitzer', 'pulp', 'seaborn', 'pandas', 'infomap', 'angel-cd', 'omega_index_py3', 'markov_clustering', 'chinese_whispers'] + 'python-igraph', 'wurlitzer', 'pulp', 'seaborn', 'pandas', 'infomap', 'angel-cd', 'omega_index_py3', 'markov_clustering', 'chinese_whispers', + 'scipy.sparse'] sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) html_theme = "sphinx_rtd_theme" diff --git a/docs/reference/cd_algorithms/node_clustering.rst b/docs/reference/cd_algorithms/node_clustering.rst index 33101683..c3e3d2c3 100644 --- a/docs/reference/cd_algorithms/node_clustering.rst +++ b/docs/reference/cd_algorithms/node_clustering.rst @@ -2,7 +2,7 @@ Node Clustering =============== -Algorithms falling in this category generates communities composed by nodes. +Algorithms falling in this category generate communities composed by nodes. The communities can represent neat, *crisp*, partition as well as *overlapping* or even *fuzzy* ones. @@ -13,7 +13,7 @@ Crisp Communities ^^^^^^^^^^^^^^^^^ A clustering is said to be a *partition* if each node belongs to one and only one community. -Methods in this subclass returns as result a ``NodeClustering`` object instance. +Methods in this subclass return as result a ``NodeClustering`` object instance. .. autosummary:: :toctree: algs/ @@ -51,7 +51,7 @@ Overlapping Communities ^^^^^^^^^^^^^^^^^^^^^^^ A clustering is said to be *overlapping* if any generic node can be assigned to more than one community. -Methods in this subclass returns as result a ``NodeClustering`` object instance. +Methods in this subclass return as result a ``NodeClustering`` object instance. .. autosummary:: :toctree: algs/ @@ -83,7 +83,7 @@ Fuzzy Communities ^^^^^^^^^^^^^^^^^ A clustering is said to be a *fuzzy* if each node can belongs (with a different degree of likelihood) to more than one community. -Methods in this subclass returns as result a ``FuzzyNodeClustering`` object instance. 
+Methods in this subclass return as result a ``FuzzyNodeClustering`` object instance. .. autosummary:: :toctree: algs/ @@ -95,7 +95,7 @@ Methods in this subclass returns as result a ``FuzzyNodeClustering`` object inst Node Attribute ^^^^^^^^^^^^^^ -Methods in this subclass returns as result a ``AttrNodeClustering`` object instance. +Methods in this subclass return as result a ``AttrNodeClustering`` object instance. .. autosummary:: :toctree: algs/ @@ -104,13 +104,25 @@ Methods in this subclass returns as result a ``AttrNodeClustering`` object insta ilouvain -^^^^^^^^^^^^^^ -Bipartite Node -^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Bipartite Graph Communities +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Methods in this subclass return as result a ``BiNodeClustering`` object instance. + +.. autosummary:: + :toctree: algs/ + + bimlpa + + +^^^^^^^^^^^^^^^^^^^^^ +Antichain Communities +^^^^^^^^^^^^^^^^^^^^^ -Methods in this subclass returns as result a ``BiNodeClustering`` object instance. +Methods in this subclass are designed to extract communities from Directed Acyclic Graphs (DAG) and return as result a ``NodeClustering`` object instance. .. autosummary:: :toctree: algs/ - bimlpa \ No newline at end of file + siblinarity_antichain \ No newline at end of file
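For reviewers who want to exercise the new entry point locally, the sketch below mirrors the added test: it builds a small random DAG (every sampled edge is oriented from the lower to the higher node id, so acyclicity is guaranteed) and runs algorithms.siblinarity_antichain on it. The helper name build_random_dag and its parameter values are local to this example, not part of the patch.

import itertools
import random

import networkx as nx
from cdlib import algorithms


def build_random_dag(n_nodes=100, p=0.1, seed=42):
    """Random DAG: orient each sampled edge from the smaller to the larger node id."""
    random.seed(seed)
    g = nx.DiGraph()
    g.add_nodes_from(range(1, n_nodes + 1))
    for n1, n2 in itertools.combinations(range(1, n_nodes + 1), 2):
        if random.random() <= p:
            g.add_edge(min(n1, n2), max(n1, n2))
    return g


dag = build_random_dag()
coms = algorithms.siblinarity_antichain(dag, Lambda=1)
print(len(coms.communities), "antichain communities found")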
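The heart of the new internal module is the node-similarity matrix built by similarity_matrix_sparse: with neighbours="successors" it is simply A.dot(A.T) of the adjacency matrix, with the diagonal zeroed unless with_replacement is set. A minimal, self-contained sketch of that computation on a toy DAG (plain networkx/scipy, not calling the new module) follows; the toy edges are illustrative only.

import networkx as nx

# Toy DAG: "a" and "b" point to the same two nodes, so they share two successors.
dag = nx.DiGraph([("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("e", "d")])

nodes = list(dag.nodes())
A = nx.adjacency_matrix(dag, nodelist=nodes)  # sparse adjacency matrix
S = A.dot(A.transpose())                      # successor ("forwards-backwards") similarity
S.setdiag(0)                                  # drop self-similarity (with_replacement=False)

idx = {n: i for i, n in enumerate(nodes)}
print(S[idx["a"], idx["b"]])  # 2 shared successors -> similarity 2
print(S[idx["a"], idx["e"]])  # 1 shared successor  -> similarity 1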