Source code for egc.model.graph_clustering.disjoint.ComE

"""Model of ComE"""
import logging as log
import os
import random
import timeit
from math import floor
from pathlib import Path

import numpy as np

from ....module.pretrain.ComE.community_embeddings_ComE import Community2Vec
from ....module.pretrain.ComE.context_embeddings_ComE import Context2Vec
from ....module.pretrain.ComE.node_embeddings_ComE import Node2Vec
from ....utils.ComE_utils import combine_files_iter
from ....utils.ComE_utils import count_textfiles
from ....utils.ComE_utils import Vocab
from ....utils.ComE_utils import WriteWalksToDisk
from ....utils.ComE_utils import xavier_normal

log.basicConfig(
    format="%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s",
    level=log.DEBUG,
)


class ComE:
    """Class that keeps track of all the parameters used while learning the embedding.

    :param graph: input graph (a DGL graph whose nodes carry a ``feat`` feature matrix)
    :param n_clusters: number of communities to detect
    :param size: dimensionality of the projection (embedding) space
    :param down_sampling: threshold used to down-sample common nodes
    :param table_size: size of the negative-sampling table to generate
    :param labels: ground-truth label for each node
    :param batch_size: number of samples per training chunk
    :param num_workers: number of worker processes
    :param negative: number of negative samples
    :param lr: learning rate
    :param window_size: context window size for the context embedding
    :param num_walks: number of random walks sampled per node
    :param walk_length: length of each random walk
    :param num_iter: number of outer training iterations
    :param output_file: base name for the files holding the sampled walks
    :param alpha: weight of the context embedding term
    :param beta: weight of the community embedding term
    :param reg_covar: regularization added to the covariance matrices
    :return:
    """

    def __init__(
        self,
        graph,
        n_clusters=7,
        size=2,
        down_sampling=0,
        table_size=100000000,
        labels=None,
        batch_size=50,
        num_workers=10,
        negative=5,
        lr=0.025,
        window_size=10,
        num_walks=10,
        walk_length=80,
        num_iter=1,
        output_file="Cora",
        alpha=0.1,
        beta=0.1,
        reg_covar=0.00001,
    ) -> None:
        self.predict = None
        self.down_sampling = down_sampling
        self.table_size = table_size
        if size % 4 != 0:
            log.warning(
                "consider setting layer size to a multiple of 4 for greater performance"
            )
        self.layer1_size = int(size)
        self.G = graph.to_networkx().to_undirected()
        self.num_walks = num_walks
        self.walk_length = walk_length
        self.num_workers = num_workers
        self.batch_size = batch_size
        self.negative = negative
        self.lr = lr
        self.window_size = window_size
        self.num_iter = num_iter
        self.output_file = output_file
        self.alpha = alpha
        self.beta = beta
        self.reg_covar = reg_covar
        self.n_clusters = n_clusters

        self.centroid = None
        self.covariance_mat = None
        self.inv_covariance_mat = None
        self.pi = None

        # binarize the input node features and use them as the initial node embedding
        self.node_embedding = np.array(graph.ndata["feat"])
        self.node_embedding[self.node_embedding - 0.0 > 0.0001] = 1.0

        log.info("\t\tsampling the paths")
        basename = os.path.abspath(
            f"{os.path.dirname(os.path.realpath(__file__))}/..",
        )
        # where the sampled paths are read from / written to
        walks_filebase = os.path.join(
            basename, "tmp", "data", self.output_file + "_Walks")
        if not os.path.exists(walks_filebase):
            Path(walks_filebase).mkdir(parents=True, exist_ok=True)
        writeWalksToDisk = WriteWalksToDisk()
        self.walk_files = writeWalksToDisk.write_walks_to_disk(
            self.G,
            os.path.join(walks_filebase, f"{self.output_file}.walks"),
            num_paths=self.num_walks,
            path_length=self.walk_length,
            alpha=0,
            rand=random.Random(0),
            num_workers=self.num_workers,
        )

        nodes_degree = count_textfiles(self.walk_files, self.num_workers)
        if nodes_degree is not None:
            self.build_vocab_(nodes_degree)
            self.ground_true = labels
            # initialize node and context embeddings
            self.make_table()
            self.precalc_sampling()
            self.reset_weights()
        else:
            log.warning("Model not initialized, need the nodes degree")
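    # Note (editorial, not part of the original source): the walk sampling above
    # writes ``num_walks`` walks of length ``walk_length`` for every node, so the
    # corpus read back during training contains roughly
    #     |V| * num_walks * walk_length
    # node occurrences. For example, a 2,708-node graph with the default
    # num_walks=10 and walk_length=80 yields about 2,166,400 occurrences; fit()
    # below uses this same quantity as ``context_total_path``.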
    def build_vocab_(self, nodes_degree):
        """
        Build vocabulary from a sequence of paths (can be a once-only generator stream).
        Sorted by node id.
        """
        # assign a unique index to each word
        self.vocab = {}
        for node_idx, (node, count) in enumerate(
                sorted(nodes_degree.items(), key=lambda x: x[0])):
            v = Vocab()
            v.count = count
            v.index = node_idx
            self.vocab[node] = v
        self.vocab_size = len(self.vocab)
        print(f"total {self.vocab_size} nodes")
    def precalc_sampling(self):
        """
        Precalculate each vocabulary item's threshold for sampling.
        """
        if self.down_sampling:
            print(
                f"frequent-node down sampling, threshold {self.down_sampling};"
                f" progress tallies will be approximate")
            total_nodes = sum(v.count for v in self.vocab.values())
            threshold_count = float(self.down_sampling) * total_nodes

        for v in self.vocab.values():
            prob = ((np.sqrt(v.count / threshold_count) + 1) *
                    (threshold_count / v.count) if self.down_sampling else 1.0)
            v.sample_probability = min(prob, 1.0)
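    # Worked example (editorial, not part of the original source): the keep
    # probability above is word2vec-style sub-sampling. With down_sampling=1e-3
    # and a corpus of 1,000,000 node occurrences, threshold_count = 1,000.
    # A node seen 16,000 times gets
    #     (sqrt(16000 / 1000) + 1) * (1000 / 16000) = (4 + 1) * 0.0625 = 0.3125,
    # i.e. it is kept about 31% of the time, while any node seen at most 1,000
    # times gets a probability >= 1 and is always kept (via min(prob, 1.0)).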
    def reset_weights(self):
        """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary."""
        self.context_embedding = xavier_normal(
            size=(self.vocab_size, self.layer1_size), as_type=np.float32)

        self.centroid = np.zeros((self.n_clusters, self.layer1_size),
                                 dtype=np.float32)
        self.covariance_mat = np.zeros(
            (self.n_clusters, self.layer1_size, self.layer1_size),
            dtype=np.float32)
        self.inv_covariance_mat = np.zeros(
            (self.n_clusters, self.layer1_size, self.layer1_size),
            dtype=np.float32)
        self.pi = np.zeros((self.vocab_size, self.n_clusters),
                           dtype=np.float32)
    def reset_communities_weights(self):
        """Reset the community parameters (centroids, covariances, mixture weights) to an initial (untrained) state, but keep the existing vocabulary."""
        self.centroid = np.zeros((self.n_clusters, self.layer1_size),
                                 dtype=np.float32)
        self.covariance_mat = np.zeros(
            (self.n_clusters, self.layer1_size, self.layer1_size),
            dtype=np.float32)
        self.inv_covariance_mat = np.zeros(
            (self.n_clusters, self.layer1_size, self.layer1_size),
            dtype=np.float32)
        self.pi = np.zeros((self.vocab_size, self.n_clusters),
                           dtype=np.float32)
        print(f"reset communities data| k: {self.n_clusters}")
    def make_table(self, power=0.75):
        """
        Create a table using stored vocabulary word counts for drawing random words
        in the negative-sampling training routines.

        Called internally from `__init__()` once the vocabulary has been built.
        """
        print(f"constructing a table with noise distribution "
              f"from {self.vocab_size} words of size {self.table_size}")
        # table (= list of words) of noise distribution for negative sampling
        self.table = np.zeros(self.table_size, dtype=np.uint32)
        sorted_keys = sorted(self.vocab.keys())
        k_idx = 0
        # compute sum of all power (Z in paper)
        train_words_pow = float(
            sum([v.count**power for k, v in self.vocab.items()]))
        # go through the whole table and fill it up with the word indexes proportional
        # to a word's count**power
        node_idx = sorted_keys[k_idx]
        # normalize count^0.75 by Z
        d1 = self.vocab[node_idx].count**power / train_words_pow
        for tidx in range(self.table_size):
            self.table[tidx] = self.vocab[node_idx].index
            if 1.0 * tidx / self.table_size > d1:
                k_idx += 1
                if k_idx > sorted_keys[-1]:
                    k_idx = sorted_keys[-1]
                node_idx = sorted_keys[k_idx]
                d1 += self.vocab[node_idx].count**power / train_words_pow
        print(f"Max value in the negative sampling table: {max(self.table)}")
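    # Usage note (editorial, not part of the original source): drawing a uniform
    # random index into ``self.table`` returns a node index with probability
    # roughly proportional to count**0.75, e.g. a sketch of how negatives could
    # be drawn from it:
    #     neg = model.table[np.random.randint(0, model.table_size,
    #                                         size=model.negative)]
    # The learners imported above receive the whole model object and can sample
    # negatives from this table.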
    def fit(self):
        # Learning algorithm
        node_learner = Node2Vec(workers=self.num_workers,
                                negative=self.negative,
                                lr=self.lr)
        cont_learner = Context2Vec(
            window_size=self.window_size,
            workers=self.num_workers,
            negative=self.negative,
            lr=self.lr,
        )
        com_learner = Community2Vec(lr=self.lr)

        context_total_path = (self.G.number_of_nodes() * self.num_walks *
                              self.walk_length)
        edges = np.array(list(self.G.edges()))
        print(f"context_total_path: {context_total_path}")
        print(f"node total edges: {self.G.number_of_edges()}")

        log.info("\n_______________________________________")
        log.info("\t\tPRE-TRAINING\n")

        ###########################
        #      PRE-TRAINING       #
        ###########################
        cont_learner.train(
            self,
            paths=combine_files_iter(self.walk_files),
            total_nodes=context_total_path,
            alpha=1,
            chunksize=self.batch_size,
        )

        ###########################
        #   EMBEDDING LEARNING    #
        ###########################
        iter_node = floor(context_total_path / self.G.number_of_edges() / 100)
        iter_com = floor(context_total_path / self.G.number_of_edges() / 100)

        for it in range(self.num_iter):
            alpha = self.alpha
            beta = self.beta
            print("\n_______________________________________\n")
            print(f"\t\tITER-{it}\n")
            print(f"k: {self.n_clusters}")
            self.reset_communities_weights()
            print(
                f"using alpha:{alpha}\tbeta:{beta}\titer_com:{iter_com}\titer_node: {iter_node}"
            )
            start_time = timeit.default_timer()

            com_learner.fit(self, reg_covar=self.reg_covar, n_init=10)

            log.info("Start training node embedding")
            node_learner.train(self,
                               edges=edges,
                               epochs=iter_node,
                               chunksize=self.batch_size)
            log.info("Stop training node embedding")

            log.info("Start training community embedding")
            com_learner.train(self.G.nodes(),
                              self,
                              beta,
                              chunksize=self.batch_size,
                              epochs=iter_com)
            log.info("Stop training community embedding")

            log.info("Start training context embedding")
            cont_learner.train(
                self,
                paths=combine_files_iter(self.walk_files),
                total_nodes=context_total_path,
                alpha=alpha,
                chunksize=self.batch_size,
            )
            log.info("Stop training context embedding")

            print(f"time: {timeit.default_timer() - start_time:.2f}s")

        self.predict = np.argmax(self.pi, axis=1)
    def get_memberships(self):
        """Return the predicted community id for each node (set by `fit()`)."""
        return self.predict
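

if __name__ == "__main__":
    # Minimal usage sketch (editorial, not part of the original module): cluster
    # the Cora citation graph with ComE. The dataset and hyper-parameters here
    # are illustrative assumptions; any DGL graph whose nodes carry a "feat"
    # feature matrix can be passed instead.
    from dgl.data import CoraGraphDataset

    dataset = CoraGraphDataset()
    g = dataset[0]
    model = ComE(
        g,
        n_clusters=dataset.num_classes,
        labels=g.ndata["label"].numpy(),
        num_walks=10,
        walk_length=80,
    )
    model.fit()
    communities = model.get_memberships()  # one community id per node
    print(communities[:10])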