Source code for egc.module.pretrain.ComE.community_embeddings_ComE

"""
Used for creating the community embedding.
"""
import logging as log

import numpy as np
import torch
from sklearn import mixture

from ....utils.ComE_utils import chunkize_serial

log.basicConfig(
    format="%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s",
    level=log.DEBUG,
)


class Community2Vec:
    """
    Class that trains the community embedding.
    """

    def __init__(self, lr):
        self.lr = lr
        self.g_mixture = None
    def fit(self, model, reg_covar=0, n_init=10):
        """
        Fit a Gaussian mixture model (GMM) to the current node embedding and
        write the resulting mixture parameters back onto the model.

        :param model: model whose ``node_embedding`` is clustered; its
            ``centroid``, ``covariance_mat``, ``inv_covariance_mat`` and
            ``pi`` attributes are overwritten with the fitted parameters
        :param reg_covar: non-negative regularization added to the diagonal
            of the covariance matrices
        :param n_init: number of GMM initializations to run
        """
        self.g_mixture = mixture.GaussianMixture(
            n_components=model.n_clusters,
            reg_covar=reg_covar,
            covariance_type="full",
            n_init=n_init,
        )
        print(f"Fitting: {model.n_clusters} communities")
        self.g_mixture.fit(model.node_embedding)

        model.centroid = self.g_mixture.means_.astype(np.float32)
        model.covariance_mat = self.g_mixture.covariances_.astype(np.float32)
        model.inv_covariance_mat = self.g_mixture.precisions_.astype(
            np.float32)
        model.pi = self.g_mixture.predict_proba(model.node_embedding).astype(
            np.float32)
    def train(self, nodes, model, beta, chunksize=150, epochs=1):
        """
        Update the node embedding by gradient descent on the community
        (GMM log-likelihood) term; ``fit`` must have been called first.
        """
        for _ in range(epochs):
            # Accumulates the gradient for the whole embedding matrix.
            # (This torch version replaces an earlier NumPy implementation.)
            grad_input = torch.zeros(model.node_embedding.shape,
                                     dtype=torch.float32)
            if torch.cuda.is_available():
                grad_input = grad_input.cuda()

            # Down-sample the nodes, map them to vocabulary indices, and
            # process the surviving indices in chunks of at most `chunksize`.
            for node_index in chunkize_serial(
                    map(
                        lambda node: model.vocab[node].index,
                        filter(
                            lambda node: node in model.vocab and
                            (model.vocab[node].sample_probability >= 1.0
                             or model.vocab[node].sample_probability >=
                             np.random.random_sample()),
                            nodes,
                        ),
                    ),
                    chunksize,
            ):
                input_tensor = torch.FloatTensor(
                    model.node_embedding[node_index])
                batch_grad_input_tensor = torch.zeros(input_tensor.shape,
                                                      dtype=torch.float32)

                for com in range(model.n_clusters):
                    # Per-community term: pi_ik * Sigma_k^{-1} (phi_i - psi_k).
                    centroid_tensor = torch.FloatTensor(model.centroid[com])
                    diff_tensor = torch.unsqueeze(
                        input_tensor - centroid_tensor, -1)
                    pi_tensor = torch.FloatTensor(
                        model.pi[node_index, com]).reshape(
                            len(node_index), 1, 1)
                    inv_covariance_mat_tensor = torch.FloatTensor(
                        model.inv_covariance_mat[com])
                    m_tensor = pi_tensor * inv_covariance_mat_tensor
                    if torch.cuda.is_available():
                        m_tensor = m_tensor.cuda()
                        diff_tensor = diff_tensor.cuda()
                        batch_grad_input_tensor = batch_grad_input_tensor.cuda(
                        )
                    batch_grad_input_tensor += torch.squeeze(
                        torch.matmul(m_tensor, diff_tensor), -1)

                grad_input[node_index] += batch_grad_input_tensor

            grad_input *= beta / model.n_clusters
            # Clip the gradient on a CPU copy and apply the learning rate.
            model.node_embedding -= (np.array(grad_input.cpu()).clip(
                min=-0.25, max=0.25)) * self.lr
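Reading the inner loop above against the standard ComE formulation (this is an interpretation of the code, not a statement made by the module itself), the gradient accumulated for each node embedding phi_i is

    \nabla_{\phi_i} \mathcal{O} = \frac{\beta}{K} \sum_{k=1}^{K} \pi_{ik} \, \Sigma_k^{-1} \left( \phi_i - \psi_k \right)

where \psi_k and \Sigma_k^{-1} are the centroid and precision matrix of community k produced by ``fit``, \pi_{ik} is the soft membership from ``predict_proba``, and K is ``model.n_clusters``. The accumulated gradient is then clipped to [-0.25, 0.25] and scaled by the learning rate before being subtracted from the embedding.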
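``chunkize_serial`` is imported from ``egc.utils.ComE_utils`` and is not shown on this page. The sketch below describes only the behaviour ``train`` relies on (lazily grouping an iterable into lists of at most ``chunksize`` items, in the style of gensim's ``utils.grouper``); it is an assumed equivalent, not the actual implementation:

def chunkize_serial(iterable, chunksize):
    # Assumed behaviour: yield successive lists of up to `chunksize`
    # elements drawn from `iterable`, including a final partial chunk.
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == chunksize:
            yield chunk
            chunk = []
    if chunk:
        yield chunk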
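A minimal usage sketch, assuming a ``model`` object that exposes the attributes the class reads: ``n_clusters``, ``node_embedding``, and a ``vocab`` of entries carrying ``index`` and ``sample_probability``. The stub classes are hypothetical stand-ins for whatever the surrounding ComE pipeline provides; note that ``fit`` must run before ``train`` so the mixture parameters exist:

import numpy as np

from egc.module.pretrain.ComE.community_embeddings_ComE import Community2Vec


class _StubVocabEntry:
    # Hypothetical vocabulary entry with the two fields `train` reads.
    def __init__(self, index):
        self.index = index
        self.sample_probability = 1.0  # always keep the node


class _StubModel:
    # Hypothetical model exposing only what Community2Vec needs.
    def __init__(self, n_nodes=100, dim=16, n_clusters=4):
        self.n_clusters = n_clusters
        self.node_embedding = np.random.randn(n_nodes, dim).astype(np.float32)
        self.vocab = {str(i): _StubVocabEntry(i) for i in range(n_nodes)}


model = _StubModel()
com2vec = Community2Vec(lr=0.025)
com2vec.fit(model)  # fills centroid / covariance / inv_covariance / pi
com2vec.train(list(model.vocab), model, beta=0.1, chunksize=32, epochs=1)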