# Source code for egc.utils.clustering

"""Clustering Methods.
"""
from typing import Tuple

import numpy as np
import torch
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering

from .metrics import get_soft_assignment_matrix


def sk_clustering(
    X: torch.Tensor,
    n_clusters: int,
    name: str = "kmeans",
) -> np.ndarray:
    """Cluster with a scikit-learn estimator.

    Args:
        X (torch.Tensor): input handed to the estimator's ``fit``. For
            ``'kmeans'`` these are the data embeddings. For ``'spectral'``
            the estimator is built with ``affinity="precomputed"``, so ``X``
            is consumed as an affinity matrix rather than as raw features.
        n_clusters (int): num of clusters.
        name (str, optional): clustering method, ``'kmeans'`` or
            ``'spectral'``. Defaults to 'kmeans'.

    Raises:
        NotImplementedError: clustering method not implemented.

    Returns:
        np.ndarray: cluster assignments (the fitted estimator's ``labels_``).
    """
    # Resolve the estimator first so an unsupported name fails fast, before
    # any tensor conversion work is done.
    if name == "kmeans":
        model = KMeans(n_clusters=n_clusters)
    elif name == "spectral":
        model = SpectralClustering(n_clusters=n_clusters,
                                   affinity="precomputed")
    else:
        raise NotImplementedError(
            f"clustering method '{name}' is not implemented; "
            "expected 'kmeans' or 'spectral'")

    # scikit-learn works on NumPy arrays. A tensor that requires grad or lives
    # on a non-CPU device cannot be converted implicitly, so detach it and
    # move it to host memory explicitly. Non-tensor inputs pass through as-is.
    if isinstance(X, torch.Tensor):
        X = X.detach().cpu().numpy()

    return model.fit(X).labels_


######################################################################################
# START: This section of code is adapted from https://github.com/bwilder0/clusternet #
######################################################################################


def soft_kmeans_clustering(
    data: torch.Tensor,
    miu: torch.Tensor,
    num_iter: int = 1,
    cluster_temp: float = 5,
    dist_type: str = "cosine_similarity",
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """pytorch (differentiable) implementation of soft k-means clustering.

    The rows of ``data`` are L2-normalised, then the cluster centers are
    refined ``num_iter`` times: each step computes a soft assignment of every
    row to every center and replaces each center with the assignment-weighted
    mean of the rows.

    Args:
        data (torch.Tensor): data embeddings, one row per sample.
        miu (torch.Tensor): initial cluster centers, one row per cluster.
        num_iter (int, optional): num of iterations. Defaults to 1.
        cluster_temp (float, optional): softmax temperature. Defaults to 5.
        dist_type (str, optional): distance type forwarded to
            ``get_soft_assignment_matrix`` during the update iterations.
            Defaults to 'cosine_similarity'.

    Returns:
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:\
            [cluster_centers, soft_assignment_matrix, distance]. The returned
            assignment matrix and distance are always computed from the dot
            product between the normalised data and the final centers,
            independently of ``dist_type``.
    """
    # Row-wise L2 normalisation by broadcasting; this avoids materialising an
    # N x N diagonal matrix. ``normalize`` clamps the norm away from zero, so
    # an all-zero row stays zero instead of turning into NaN and contaminating
    # every cluster center in the update below.
    data = torch.nn.functional.normalize(data, p=2, dim=1)

    for _ in range(num_iter):
        r = get_soft_assignment_matrix(
            data=data,
            miu=miu,
            cluster_temp=cluster_temp,
            dist_type=dist_type,
        )
        # Total soft mass assigned to each cluster.
        cluster_r = r.sum(dim=0)
        # Assignment-weighted sum of the rows for each cluster.
        cluster_mean = r.t() @ data
        # Weighted mean: divide each cluster's sum by its soft mass.
        miu = cluster_mean / cluster_r.unsqueeze(1)

    dist = data @ miu.t()
    r = torch.softmax(cluster_temp * dist, 1)
    return miu, r, dist


######################################################################################
# END:   This section of code is adapted from https://github.com/bwilder0/clusternet #
######################################################################################