Stable clustering alternative #34

Open
mbruhns opened this issue Nov 30, 2023 · 1 comment
Labels: enhancement (New feature or request)

Comments


mbruhns commented Nov 30, 2023

There are several published methods that claim to find more stable clusters by evaluating different parameter combinations (a.k.a. consensus clustering). One example is SC3S. We might want to use this or something similar instead of devising metrics for choosing n_neighbors and resolution.
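For intuition, the core object behind consensus clustering is a co-assignment matrix: the fraction of runs in which each pair of samples landed in the same cluster. A minimal NumPy sketch (the labelings are illustrative values, not output of any real clustering):

import numpy as np

# Three hypothetical labelings of the same 5 samples, e.g. obtained from
# different parameter combinations or random subsets.
labelings = np.array([
    [0, 0, 1, 1, 2],
    [0, 0, 1, 2, 2],
    [1, 1, 0, 0, 0],
])

# Consensus matrix: for each pair (i, j), the fraction of runs in which
# samples i and j were assigned to the same cluster.
consensus = np.mean(
    [labels[:, None] == labels[None, :] for labels in labelings], axis=0
)

print(consensus)

Pairs that are always co-clustered get 1.0, pairs never co-clustered get 0.0, and 1 - consensus can be used as a precomputed distance matrix, which is what the draft below does.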

mbruhns added the enhancement (New feature or request) label on Nov 30, 2023

mbruhns commented Dec 1, 2023

Another published approach would be this. It should be fairly easy to reimplement in Python; here's a draft:

import numpy as np
import scanpy as sc
from sklearn.metrics import silhouette_samples
from scipy.stats import bootstrap

# Load and preprocess data (replace with your data file)
# adata = sc.read('your_data_file.h5ad')
# For illustration, let's use a sample dataset
adata = sc.datasets.pbmc68k_reduced()

# Normalize and log-transform data
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata)

# Repeatedly cluster random subsets and build a consensus matrix
def repeated_clustering(adata, n_repeats=100, subset_fraction=0.8, resolution=1.0, n_neighbors=15, seed=42):
    rng = np.random.default_rng(seed)  # for reproducibility
    n_obs = adata.n_obs
    co_cluster = np.zeros((n_obs, n_obs))  # times a pair landed in the same cluster
    co_sampled = np.zeros((n_obs, n_obs))  # times a pair was sampled together

    for _ in range(n_repeats):
        subset_indices = rng.choice(n_obs, int(n_obs * subset_fraction), replace=False)
        subset_adata = adata[subset_indices].copy()  # copy: neighbors/leiden write to the object
        sc.pp.neighbors(subset_adata, n_neighbors=n_neighbors)
        sc.tl.leiden(subset_adata, resolution=resolution)

        # Compare labels positionally within the subset, then scatter the
        # results back into the full-size matrices via the original indices
        labels = subset_adata.obs['leiden'].to_numpy()
        same_cluster = labels[:, None] == labels[None, :]
        block = np.ix_(subset_indices, subset_indices)
        co_cluster[block] += same_cluster
        co_sampled[block] += 1

    # Normalize by how often each pair was actually sampled together,
    # not by n_repeats, since subsampling drops pairs unevenly
    consensus = np.divide(co_cluster, co_sampled, out=np.zeros_like(co_cluster), where=co_sampled > 0)
    return consensus

# Per-cluster mean silhouette scores from the consensus-derived distance matrix
def calculate_silhouette_scores(adata, distance_matrix):
    labels = adata.obs['leiden']
    scores = silhouette_samples(distance_matrix, labels, metric='precomputed')
    return {cluster: np.mean(scores[(labels == cluster).to_numpy()]) for cluster in labels.unique()}

# Explore parameter values, keeping the per-cluster scores for each combination
def explore_parameter_values(adata, resolution_range, neighbors_range, n_repeats=100, subset_fraction=0.8):
    results = {}
    for resolution in resolution_range:
        for n_neighbors in neighbors_range:
            sc.pp.neighbors(adata, n_neighbors=n_neighbors)
            sc.tl.leiden(adata, resolution=resolution)
            consensus = repeated_clustering(adata, n_repeats, subset_fraction, resolution=resolution, n_neighbors=n_neighbors)
            distance_matrix = 1 - consensus
            np.fill_diagonal(distance_matrix, 0.0)  # precomputed distances need a zero diagonal
            cluster_scores = calculate_silhouette_scores(adata, distance_matrix)
            results[(resolution, n_neighbors)] = np.array(list(cluster_scores.values()))
    return results

# Bootstrap confidence interval for the median per-cluster score
def calculate_confidence_interval(scores):
    # 'percentile' is more robust than 'bca' for the small number of clusters we bootstrap over
    res = bootstrap((scores,), np.median, confidence_level=0.95, n_resamples=25000, method='percentile')
    return res.confidence_interval.low, res.confidence_interval.high

# Near-optimal parameter combination: among all combinations whose CI upper
# bound reaches the best lower bound, pick the one with the highest median score
def determine_optimal_parameters(results, ci_results):
    decision_threshold = max(ci_lower for ci_lower, _ in ci_results.values())
    candidates = [params for params, (_, ci_upper) in ci_results.items() if ci_upper >= decision_threshold]
    return max(candidates, key=lambda params: np.median(results[params]))

# Main execution
resolution_range = np.linspace(0.1, 2.0, 10)  # Example range of resolution parameters
neighbors_range = np.arange(10, 50, 10)  # Example range of n_neighbors parameters

exploration_results = explore_parameter_values(adata, resolution_range, neighbors_range)

# Confidence intervals over the per-cluster scores of each parameter combination
ci_results = {params: calculate_confidence_interval(scores) for params, scores in exploration_results.items()}

# Determine the near-optimal parameter combination
optimal_params = determine_optimal_parameters(exploration_results, ci_results)

print(f"Optimal clustering parameters: Resolution - {optimal_params[0]}, n_neighbors - {optimal_params[1]}")
