diff --git a/cdlib/algorithms/crisp_partition.py b/cdlib/algorithms/crisp_partition.py index fa24a811..d87fde73 100644 --- a/cdlib/algorithms/crisp_partition.py +++ b/cdlib/algorithms/crisp_partition.py @@ -35,6 +35,7 @@ from cdlib.algorithms.internal.MCODE import m_code from cdlib.algorithms.internal.RSC import rsc_evaluate_graph from cdlib.prompt_utils import report_missing_packages, prompt_import_failure +import cdlib.algorithms.internal.markov_clustering as mc import warnings @@ -90,12 +91,6 @@ by = None -try: - import markov_clustering as mc -except ModuleNotFoundError: - missing_packages.add("markov_clustering") - by = None - # try: # import karateclub # except ModuleNotFoundError: @@ -1674,13 +1669,6 @@ def markov_clustering( .. note:: Reference implementation: https://github.com/GuyAllard/markov_clustering """ - if mc is None: - raise Exception( - "===================================================== \n" - "The markov clustering algorithm seems not to be installed (or incorrectly installed). \n" - "Please resolve with: pip install markov_clustering" - ) - g = convert_graph_formats(g_original, nx.Graph) g, maps = nx_node_integer_mapping(g) diff --git a/cdlib/algorithms/internal/RSC.py b/cdlib/algorithms/internal/RSC.py index 8b428509..943296b7 100644 --- a/cdlib/algorithms/internal/RSC.py +++ b/cdlib/algorithms/internal/RSC.py @@ -35,7 +35,7 @@ def __regularized_laplacian_matrix(adj_matrix, tau): # diags will be zero at points where there is no edge and/or the node you are at # ignore the error and make it zero later - #with scipy.errstate(divide="ignore"): + # with scipy.errstate(divide="ignore"): diags_sqrt = 1.0 / np.sqrt(diags) diags_sqrt[np.isinf(diags_sqrt)] = 0 D = sp.spdiags(diags_sqrt, [0], m, n, format="csr") diff --git a/cdlib/algorithms/internal/markov_clustering.py b/cdlib/algorithms/internal/markov_clustering.py new file mode 100644 index 00000000..3a8a0908 --- /dev/null +++ b/cdlib/algorithms/internal/markov_clustering.py @@ -0,0 +1,238 @@ +import numpy as np +from scipy.sparse import isspmatrix, dok_matrix, csc_matrix +import sklearn.preprocessing + + +def sparse_allclose(a, b, rtol=1e-5, atol=1e-8): + """ + Version of np.allclose for use with sparse matrices + """ + c = np.abs(a - b) - rtol * np.abs(b) + # noinspection PyUnresolvedReferences + return c.max() <= atol + + +def normalize(matrix): + """ + Normalize the columns of the given matrix + + :param matrix: The matrix to be normalized + :returns: The normalized matrix + """ + return sklearn.preprocessing.normalize(matrix, norm="l1", axis=0) + + +def inflate(matrix, power): + """ + Apply cluster inflation to the given matrix by raising + each element to the given power. + + :param matrix: The matrix to be inflated + :param power: Cluster inflation parameter + :returns: The inflated matrix + """ + if isspmatrix(matrix): + return normalize(matrix.power(power)) + + return normalize(np.power(matrix, power)) + + +def expand(matrix, power): + """ + Apply cluster expansion to the given matrix by raising + the matrix to the given power. 
+
+    :param matrix: The matrix to be expanded
+    :param power: Cluster expansion parameter
+    :returns: The expanded matrix
+    """
+    if isspmatrix(matrix):
+        return matrix**power
+
+    return np.linalg.matrix_power(matrix, power)
+
+
+def add_self_loops(matrix, loop_value):
+    """
+    Add self-loops to the matrix by setting the diagonal
+    to loop_value
+
+    :param matrix: The matrix to add loops to
+    :param loop_value: Value to use for self-loops
+    :returns: The matrix with self-loops
+    """
+    shape = matrix.shape
+    assert shape[0] == shape[1], "Error, matrix is not square"
+
+    if isspmatrix(matrix):
+        new_matrix = matrix.todok()
+    else:
+        new_matrix = matrix.copy()
+
+    for i in range(shape[0]):
+        new_matrix[i, i] = loop_value
+
+    if isspmatrix(matrix):
+        return new_matrix.tocsc()
+
+    return new_matrix
+
+
+def prune(matrix, threshold):
+    """
+    Prune the matrix so that very small edges are removed.
+    The maximum value in each column is never pruned.
+
+    :param matrix: The matrix to be pruned
+    :param threshold: The value below which edges will be removed
+    :returns: The pruned matrix
+    """
+    if isspmatrix(matrix):
+        pruned = dok_matrix(matrix.shape)
+        pruned[matrix >= threshold] = matrix[matrix >= threshold]
+        pruned = pruned.tocsc()
+    else:
+        pruned = matrix.copy()
+        pruned[pruned < threshold] = 0
+
+    # keep max value in each column. same behaviour for dense/sparse
+    num_cols = matrix.shape[1]
+    row_indices = matrix.argmax(axis=0).reshape((num_cols,))
+    col_indices = np.arange(num_cols)
+    pruned[row_indices, col_indices] = matrix[row_indices, col_indices]
+
+    return pruned
+
+
+def converged(matrix1, matrix2):
+    """
+    Check for convergence by determining if
+    matrix1 and matrix2 are approximately equal.
+
+    :param matrix1: The matrix to compare with matrix2
+    :param matrix2: The matrix to compare with matrix1
+    :returns: True if matrix1 and matrix2 approximately equal
+    """
+    if isspmatrix(matrix1) or isspmatrix(matrix2):
+        return sparse_allclose(matrix1, matrix2)
+
+    return np.allclose(matrix1, matrix2)
+
+
+def iterate(matrix, expansion, inflation):
+    """
+    Run a single iteration (expansion + inflation) of the mcl algorithm
+
+    :param matrix: The matrix to perform the iteration on
+    :param expansion: Cluster expansion factor
+    :param inflation: Cluster inflation factor
+    """
+
+    # Expansion
+    matrix = expand(matrix, expansion)
+
+    # Inflation
+    matrix = inflate(matrix, inflation)
+
+    return matrix
+
+
+def get_clusters(matrix):
+    """
+    Retrieve the clusters from the matrix
+
+    :param matrix: The matrix produced by the MCL algorithm
+    :returns: A list of tuples where each tuple represents a cluster and
+        contains the indices of the nodes belonging to the cluster
+    """
+    if not isspmatrix(matrix):
+        # cast to sparse so that we don't need to handle different
+        # matrix types
+        matrix = csc_matrix(matrix)
+
+    # get the attractors - non-zero elements of the matrix diagonal
+    attractors = matrix.diagonal().nonzero()[0]
+
+    # somewhere to put the clusters
+    clusters = set()
+
+    # the nodes in the same row as each attractor form a cluster
+    for attractor in attractors:
+        cluster = tuple(matrix.getrow(attractor).nonzero()[1].tolist())
+        clusters.add(cluster)
+
+    return sorted(list(clusters))
+
+
+def run_mcl(
+    matrix,
+    expansion=2,
+    inflation=2,
+    loop_value=1,
+    iterations=100,
+    pruning_threshold=0.001,
+    pruning_frequency=1,
+    convergence_check_frequency=1,
+    verbose=False,
+):
+    """
+    Perform MCL on the given similarity matrix
+
+    :param matrix: The similarity matrix to cluster
+    :param expansion: The cluster expansion factor
+    :param inflation: The cluster inflation factor
+    :param loop_value: Initialization value for self-loops
+    :param iterations: Maximum number of iterations
+        (actual number of iterations will be less if convergence is reached)
+    :param pruning_threshold: Threshold below which matrix elements will be set
+        to 0
+    :param pruning_frequency: Perform pruning every 'pruning_frequency'
+        iterations.
+    :param convergence_check_frequency: Perform the check for convergence
+        every convergence_check_frequency iterations
+    :param verbose: Print extra information to the console
+    :returns: The final matrix
+    """
+    assert expansion > 1, "Invalid expansion parameter"
+    assert inflation > 1, "Invalid inflation parameter"
+    assert loop_value >= 0, "Invalid loop_value"
+    assert iterations > 0, "Invalid number of iterations"
+    assert pruning_threshold >= 0, "Invalid pruning_threshold"
+    assert pruning_frequency > 0, "Invalid pruning_frequency"
+    assert convergence_check_frequency > 0, "Invalid convergence_check_frequency"
+
+    # Initialize self-loops
+    if loop_value > 0:
+        matrix = add_self_loops(matrix, loop_value)
+
+    # Normalize
+    matrix = normalize(matrix)
+
+    # iterations
+    for i in range(iterations):
+
+        # store current matrix for convergence checking
+        last_mat = matrix.copy()
+
+        # perform MCL expansion and inflation
+        matrix = iterate(matrix, expansion, inflation)
+
+        # prune
+        if pruning_threshold > 0 and i % pruning_frequency == pruning_frequency - 1:
+            matrix = prune(matrix, pruning_threshold)
+
+        # Check for convergence
+        if i % convergence_check_frequency == convergence_check_frequency - 1:
+            if converged(matrix, last_mat):
+                break
+
+    return matrix
diff --git a/cdlib/test/test_community_discovery_models.py b/cdlib/test/test_community_discovery_models.py
index 956aaf71..781fb78f 100644
--- a/cdlib/test/test_community_discovery_models.py
+++ b/cdlib/test/test_community_discovery_models.py
@@ -54,10 +54,6 @@
 except ModuleNotFoundError:
     by = None
 
-try:
-    import markov_clustering as mc
-except ModuleNotFoundError:
-    mc = None
 
 try:
     from cdlib.algorithms.internal.LPAM import LPAM
@@ -360,24 +356,14 @@ def test_osse(self):
 
     def test_markov_clustering(self):
-        if mc is not None:
-            g = get_string_graph()
-
-            communities = algorithms.markov_clustering(g)
-            self.assertEqual(type(communities.communities), list)
-            if len(communities.communities) > 0:
-                self.assertEqual(type(communities.communities[0]), list)
-                if len(communities.communities[0]) > 0:
-                    self.assertEqual(type(communities.communities[0][0]), str)
-
-            g = nx.karate_club_graph()
+        g = nx.karate_club_graph()
 
-            communities = algorithms.markov_clustering(g)
-            self.assertEqual(type(communities.communities), list)
-            if len(communities.communities) > 0:
-                self.assertEqual(type(communities.communities[0]), list)
-                if len(communities.communities[0]) > 0:
-                    self.assertEqual(type(communities.communities[0][0]), int)
+        communities = algorithms.markov_clustering(g)
+        self.assertEqual(type(communities.communities), list)
+        if len(communities.communities) > 0:
+            self.assertEqual(type(communities.communities[0]), list)
+            if len(communities.communities[0]) > 0:
+                self.assertEqual(type(communities.communities[0][0]), int)
 
     # def test_bigClam(self):
     #     if karateclub is None:
diff --git a/requirements.txt b/requirements.txt
index 521e61c7..92223f73 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,18 +1,18 @@
 numpy
 scikit-learn
 tqdm
 networkx>=3.0
 demon
 python-louvain>=0.16
 scipy>=1.10
 pulp
 seaborn
 pandas
 eva_lcd
 bimlpa
 python-igraph>=0.10
 angelcommunity
 pooch
 dynetx
 thresholdclustering
-python-Levenshtein
\ No newline at end of file
+python-Levenshtein
diff --git a/requirements_optional.txt b/requirements_optional.txt
index 3a8309f3..895e0ef7 100644
--- a/requirements_optional.txt
+++ b/requirements_optional.txt
@@ -3,5 +3,4 @@ networkit
pycombo leidenalg infomap>=1.3.0 -wurlitzer>=1.0.2 -markov_clustering \ No newline at end of file +wurlitzer>=1.0.2 \ No newline at end of file diff --git a/setup.py b/setup.py index 6b8aaeb8..5dbb8328 100644 --- a/setup.py +++ b/setup.py @@ -54,12 +54,9 @@ "GraphRicciCurvature", "networkit", "pycombo", - "leidenalg" - ], - "pypi": [ - "bayanpy", - "pyclustering" + "leidenalg", ], + "pypi": ["bayanpy", "pyclustering"], "all": [ "infomap>=1.3.0", "wurlitzer>=1.0.2", @@ -68,8 +65,8 @@ "pycombo", "leidenalg", "bayanpy", - "pyclustering" - ] + "pyclustering", + ], }, packages=find_packages( exclude=["*.test", "*.test.*", "test.*", "test", "cdlib.test", "cdlib.test.*"]
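
Usage note (illustration only, not part of the patch): with the external markov_clustering dependency removed, MCL is reached either through the public `algorithms.markov_clustering` wrapper, as in the updated test, or directly through the vendored `run_mcl`/`get_clusters` helpers added in `cdlib/algorithms/internal/markov_clustering.py`. The sketch below assumes networkx, scipy, scikit-learn and cdlib are installed; wrapping the adjacency in a `csr_matrix` is an illustrative choice, not something the wrapper is shown doing in this diff.

```python
# Minimal sketch of the two entry points touched by this diff.
# Assumption: the csr_matrix wrapping below is illustrative and not taken
# from the cdlib wrapper itself.
import networkx as nx
from scipy.sparse import csr_matrix

from cdlib import algorithms
from cdlib.algorithms.internal import markov_clustering as mc

g = nx.karate_club_graph()

# 1) Public API, mirroring the updated test: the result's .communities
#    attribute is a list of lists of node ids.
communities = algorithms.markov_clustering(g)
print(type(communities.communities), len(communities.communities))

# 2) Vendored helpers used directly on a sparse adjacency matrix.
adjacency = csr_matrix(nx.to_scipy_sparse_array(g))
result = mc.run_mcl(adjacency, expansion=2, inflation=2)
clusters = mc.get_clusters(result)  # list of tuples of node indices
print(clusters[:3])
```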