Source code for hype.graph

#!/usr/bin/env python3
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from collections import defaultdict as ddict
import pandas
import numpy as np
from numpy.random import choice
import torch as th
from torch import nn
from torch.utils.data import Dataset as DS
from sklearn.metrics import average_precision_score
from multiprocessing.pool import ThreadPool
from functools import partial
import h5py
from tqdm import tqdm


def load_adjacency_matrix(path, format='hdf5', symmetrize=False):
    if format == 'hdf5':
        with h5py.File(path, 'r') as hf:
            return {
                'ids': hf['ids'].value.astype('int'),
                'neighbors': hf['neighbors'].value.astype('int'),
                'offsets': hf['offsets'].value.astype('int'),
                'weights': hf['weights'].value.astype('float'),
                'objects': hf['objects'].value
            }
    elif format == 'csv':
        df = pandas.read_csv(path, usecols=['id1', 'id2', 'weight'], engine='c')

        if symmetrize:
            rev = df.copy().rename(columns={'id1' : 'id2', 'id2' : 'id1'})
            df = pandas.concat([df, rev])

        idmap = {}
        idlist = []

        def convert(id):
            if id not in idmap:
                idmap[id] = len(idlist)
                idlist.append(id)
            return idmap[id]
        df.loc[:, 'id1'] = df['id1'].apply(convert)
        df.loc[:, 'id2'] = df['id2'].apply(convert)

        groups = df.groupby('id1').apply(lambda x: x.sort_values(by='id2'))
        counts = df.groupby('id1').id2.size()

        ids = groups.index.levels[0].values
        offsets = counts.loc[ids].values
        offsets[1:] = np.cumsum(offsets)[:-1]
        offsets[0] = 0
        neighbors = groups['id2'].values
        weights = groups['weight'].values
        return {
            'ids' : ids.astype('int'),
            'offsets' : offsets.astype('int'),
            'neighbors': neighbors.astype('int'),
            'weights': weights.astype('float'),
            'objects': np.array(idlist)
        }
    else:
        raise RuntimeError(f'Unsupported file format {format}')


[docs]def load_edge_list(path, symmetrize=False):
    """
    Load an edgelist dataset in CSV format.  The CSV file must have at least
    3 columns: ``id1``, ``id2``, and ``weight``.  If the dataset is directed,
    then it is assumed that ``id2`` is the parent of ``id1``.

    Args:
        path (str): path to the CSV file
        symmetrize (bool): If set to ``True``, then for every edge ``A -> B``,
            we create a symmetric edge ``B -> A``

    Return:
        Tuple[np.ndarray[ndim=2], list[str], np.ndarray[ndim=1]]:
            A tuple containiner: ``idx`` an array of edges, ``objects`` a
            list of the unique objects in the graph, and ``weights`` an array
            the same length of ``idx`` containing the weights of each edge
    """
    df = pandas.read_csv(path, usecols=['id1', 'id2', 'weight'], engine='c')
    df.dropna(inplace=True)
    if symmetrize:
        rev = df.copy().rename(columns={'id1' : 'id2', 'id2' : 'id1'})
        df = pandas.concat([df, rev])
    idx, objects = pandas.factorize(df[['id1', 'id2']].values.reshape(-1))
    idx = idx.reshape(-1, 2).astype('int')
    weights = df.weight.values.astype('float')
    return idx, objects.tolist(), weights


class Embedding(nn.Module):
    """
    Base class for Embedding models

    Args:
        size (int): number of embeddings
        dim (int): dimension of the embeddings
        manifold (Manifold): which manifold to use
        sparse (bool): whether or not to use sparse gradients
    """
    def __init__(self, size, dim, manifold, sparse=True):
        super(Embedding, self).__init__()
        self.dim = dim
        self.nobjects = size
        self.manifold = manifold
        self.lt = nn.Embedding(size, dim, sparse=sparse)
        self.dist = manifold.distance
        self.pre_hook = None
        self.post_hook = None
        self.init_weights(manifold)

    def init_weights(self, manifold, scale=1e-4):
        manifold.init_weights(self.lt.weight, scale)

    def forward(self, inputs):
        e = self.lt(inputs)
        with th.no_grad():
            e = self.manifold.normalize(e)
        if self.pre_hook is not None:
            e = self.pre_hook(e)
        fval = self._forward(e)
        return fval

    def embedding(self):
        return list(self.lt.parameters())[0].data.cpu().numpy()

    def optim_params(self, manifold):
        return [{
            'params': self.lt.parameters(),
            'rgrad': manifold.rgrad,
            'expm': manifold.expm,
            'logm': manifold.logm,
            'ptransp': manifold.ptransp,
        }, ]


# This class is now deprecated in favor of BatchedDataset (graph_dataset.pyx)
class Dataset(DS):
    _neg_multiplier = 1
    _ntries = 10
    _sample_dampening = 0.75

    def __init__(self, idx, objects, weights, nnegs, unigram_size=1e8):
        assert idx.ndim == 2 and idx.shape[1] == 2
        assert weights.ndim == 1
        assert len(idx) == len(weights)
        assert nnegs >= 0
        assert unigram_size >= 0

        print('Indexing data')
        self.idx = idx
        self.nnegs = nnegs
        self.burnin = False
        self.objects = objects

        self._weights = ddict(lambda: ddict(int))
        self._counts = np.ones(len(objects), dtype=np.float)
        self.max_tries = self.nnegs * self._ntries
        for i in range(idx.shape[0]):
            t, h = self.idx[i]
            self._counts[h] += weights[i]
            self._weights[t][h] += weights[i]
        self._weights = dict(self._weights)
        nents = int(np.array(list(self._weights.keys())).max())
        assert len(objects) > nents, f'Number of objects do no match'

        if unigram_size > 0:
            c = self._counts ** self._sample_dampening
            self.unigram_table = choice(
                len(objects),
                size=int(unigram_size),
                p=(c / c.sum())
            )

    def __len__(self):
        return self.idx.shape[0]

    def weights(self, inputs, targets):
        return self.fweights(self, inputs, targets)

    def nnegatives(self):
        if self.burnin:
            return self._neg_multiplier * self.nnegs
        else:
            return self.nnegs

    @classmethod
    def collate(cls, batch):
        inputs, targets = zip(*batch)
        return th.cat(inputs, 0), th.cat(targets, 0)


# This function is now deprecated in favor of eval_reconstruction
def eval_reconstruction_slow(adj, lt, distfn):
    ranks = []
    ap_scores = []

    for s, s_types in adj.items():
        s_e = lt[s].expand_as(lt)
        _dists = distfn(s_e, lt).data.cpu().numpy().flatten()
        _dists[s] = 1e+12
        _labels = np.zeros(lt.size(0))
        _dists_masked = _dists.copy()
        _ranks = []
        for o in s_types:
            _dists_masked[o] = np.Inf
            _labels[o] = 1
        for o in s_types:
            d = _dists_masked.copy()
            d[o] = _dists[o]
            r = np.argsort(d)
            _ranks.append(np.where(r == o)[0][0] + 1)
        ranks += _ranks
        ap_scores.append(
            average_precision_score(_labels, -_dists)
        )
    return np.mean(ranks), np.mean(ap_scores)


def reconstruction_worker(adj, lt, distfn, objects, progress=False):
    ranksum = nranks = ap_scores = iters = 0
    labels = np.empty(lt.size(0))
    for object in tqdm(objects) if progress else objects:
        labels.fill(0)
        neighbors = np.array(list(adj[object]))
        dists = distfn(lt[None, object], lt)
        dists[object] = 1e12
        sorted_dists, sorted_idx = dists.sort()
        ranks, = np.where(np.in1d(sorted_idx.cpu().numpy(), neighbors))
        # The above gives us the position of the neighbors in sorted order.  We
        # want to count the number of non-neighbors that occur before each neighbor
        ranks += 1
        N = ranks.shape[0]

        # To account for other positive nearer neighbors, we subtract (N*(N+1)/2)
        # As an example, assume the ranks of the neighbors are:
        # 0, 1, 4, 5, 6, 8
        # For each neighbor, we'd like to return the number of non-neighbors
        # that ranked higher than it.  In this case, we'd return 0+0+2+2+2+3=14
        # Another way of thinking about it is to return
        # 0 + 1 + 4 + 5 + 6 + 8 - (0 + 1 + 2 + 3 + 4 + 5)
        # (0 + 1 + 2 + ... + N) == (N * (N + 1) / 2)
        # Note that we include `N` to account for the source embedding itself
        # always being the nearest neighbor
        ranksum += ranks.sum() - (N * (N - 1) / 2)
        nranks += ranks.shape[0]
        labels[neighbors] = 1
        ap_scores += average_precision_score(labels, -dists.cpu().numpy())
        iters += 1
    return float(ranksum), nranks, ap_scores, iters


[docs]def eval_reconstruction(adj, lt, distfn, workers=1, progress=False):
    '''
    Reconstruction evaluation.  Evaluate how well the embedding is able
    to reconstruct the original input graph.  Specifically, for each node,
    we compute all of its nearest neighbors in the embedding space and rank
    them amongst its non-neighbors.

    Args:
        adj (dict[int, set[int]]): Adjacency list mapping objects to its neighbors
        lt (torch.Tensor[N, dim]): Embedding table with `N` embeddings and `dim`
            dimensionality
        distfn (Callable[[Tensor, Tensor], Tensor]): distance function to use for
            computing nearest neighbors in embedding space
        workers (int): number of workers to use
        progress (bool): display progress bar

    Returns:
        Tuple[float, float]: ``mean_rank``, ``map_rank``
    '''
    objects = np.array(list(adj.keys()))
    if workers > 1:
        with ThreadPool(workers) as pool:
            f = partial(reconstruction_worker, adj, lt, distfn)
            results = pool.map(f, np.array_split(objects, workers))
            results = np.array(results).sum(axis=0).astype(float)
    else:
        results = reconstruction_worker(adj, lt, distfn, objects, progress)
    return float(results[0]) / results[1], float(results[2]) / results[3]