Source code for hype.graph

#!/usr/bin/env python3
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from collections import defaultdict as ddict
import pandas
import numpy as np
from numpy.random import choice
import torch as th
from torch import nn
from torch.utils.data import Dataset as DS
from sklearn.metrics import average_precision_score
from multiprocessing.pool import ThreadPool
from functools import partial
import h5py
from tqdm import tqdm


def load_adjacency_matrix(path, format='hdf5', symmetrize=False):
    if format == 'hdf5':
        with h5py.File(path, 'r') as hf:
            return {
                'ids': hf['ids'].value.astype('int'),
                'neighbors': hf['neighbors'].value.astype('int'),
                'offsets': hf['offsets'].value.astype('int'),
                'weights': hf['weights'].value.astype('float'),
                'objects': hf['objects'].value
            }
    elif format == 'csv':
        df = pandas.read_csv(path, usecols=['id1', 'id2', 'weight'], engine='c')

        if symmetrize:
            rev = df.copy().rename(columns={'id1' : 'id2', 'id2' : 'id1'})
            df = pandas.concat([df, rev])

        idmap = {}
        idlist = []

        def convert(id):
            if id not in idmap:
                idmap[id] = len(idlist)
                idlist.append(id)
            return idmap[id]
        df.loc[:, 'id1'] = df['id1'].apply(convert)
        df.loc[:, 'id2'] = df['id2'].apply(convert)

        groups = df.groupby('id1').apply(lambda x: x.sort_values(by='id2'))
        counts = df.groupby('id1').id2.size()

        ids = groups.index.levels[0].values
        offsets = counts.loc[ids].values
        offsets[1:] = np.cumsum(offsets)[:-1]
        offsets[0] = 0
        neighbors = groups['id2'].values
        weights = groups['weight'].values
        return {
            'ids' : ids.astype('int'),
            'offsets' : offsets.astype('int'),
            'neighbors': neighbors.astype('int'),
            'weights': weights.astype('float'),
            'objects': np.array(idlist)
        }
    else:
        raise RuntimeError(f'Unsupported file format {format}')


[docs]def load_edge_list(path, symmetrize=False): """ Load an edgelist dataset in CSV format. The CSV file must have at least 3 columns: ``id1``, ``id2``, and ``weight``. If the dataset is directed, then it is assumed that ``id2`` is the parent of ``id1``. Args: path (str): path to the CSV file symmetrize (bool): If set to ``True``, then for every edge ``A -> B``, we create a symmetric edge ``B -> A`` Return: Tuple[np.ndarray[ndim=2], list[str], np.ndarray[ndim=1]]: A tuple containiner: ``idx`` an array of edges, ``objects`` a list of the unique objects in the graph, and ``weights`` an array the same length of ``idx`` containing the weights of each edge """ df = pandas.read_csv(path, usecols=['id1', 'id2', 'weight'], engine='c') df.dropna(inplace=True) if symmetrize: rev = df.copy().rename(columns={'id1' : 'id2', 'id2' : 'id1'}) df = pandas.concat([df, rev]) idx, objects = pandas.factorize(df[['id1', 'id2']].values.reshape(-1)) idx = idx.reshape(-1, 2).astype('int') weights = df.weight.values.astype('float') return idx, objects.tolist(), weights
class Embedding(nn.Module): """ Base class for Embedding models Args: size (int): number of embeddings dim (int): dimension of the embeddings manifold (Manifold): which manifold to use sparse (bool): whether or not to use sparse gradients """ def __init__(self, size, dim, manifold, sparse=True): super(Embedding, self).__init__() self.dim = dim self.nobjects = size self.manifold = manifold self.lt = nn.Embedding(size, dim, sparse=sparse) self.dist = manifold.distance self.pre_hook = None self.post_hook = None self.init_weights(manifold) def init_weights(self, manifold, scale=1e-4): manifold.init_weights(self.lt.weight, scale) def forward(self, inputs): e = self.lt(inputs) with th.no_grad(): e = self.manifold.normalize(e) if self.pre_hook is not None: e = self.pre_hook(e) fval = self._forward(e) return fval def embedding(self): return list(self.lt.parameters())[0].data.cpu().numpy() def optim_params(self, manifold): return [{ 'params': self.lt.parameters(), 'rgrad': manifold.rgrad, 'expm': manifold.expm, 'logm': manifold.logm, 'ptransp': manifold.ptransp, }, ] # This class is now deprecated in favor of BatchedDataset (graph_dataset.pyx) class Dataset(DS): _neg_multiplier = 1 _ntries = 10 _sample_dampening = 0.75 def __init__(self, idx, objects, weights, nnegs, unigram_size=1e8): assert idx.ndim == 2 and idx.shape[1] == 2 assert weights.ndim == 1 assert len(idx) == len(weights) assert nnegs >= 0 assert unigram_size >= 0 print('Indexing data') self.idx = idx self.nnegs = nnegs self.burnin = False self.objects = objects self._weights = ddict(lambda: ddict(int)) self._counts = np.ones(len(objects), dtype=np.float) self.max_tries = self.nnegs * self._ntries for i in range(idx.shape[0]): t, h = self.idx[i] self._counts[h] += weights[i] self._weights[t][h] += weights[i] self._weights = dict(self._weights) nents = int(np.array(list(self._weights.keys())).max()) assert len(objects) > nents, f'Number of objects do no match' if unigram_size > 0: c = self._counts ** self._sample_dampening self.unigram_table = choice( len(objects), size=int(unigram_size), p=(c / c.sum()) ) def __len__(self): return self.idx.shape[0] def weights(self, inputs, targets): return self.fweights(self, inputs, targets) def nnegatives(self): if self.burnin: return self._neg_multiplier * self.nnegs else: return self.nnegs @classmethod def collate(cls, batch): inputs, targets = zip(*batch) return th.cat(inputs, 0), th.cat(targets, 0) # This function is now deprecated in favor of eval_reconstruction def eval_reconstruction_slow(adj, lt, distfn): ranks = [] ap_scores = [] for s, s_types in adj.items(): s_e = lt[s].expand_as(lt) _dists = distfn(s_e, lt).data.cpu().numpy().flatten() _dists[s] = 1e+12 _labels = np.zeros(lt.size(0)) _dists_masked = _dists.copy() _ranks = [] for o in s_types: _dists_masked[o] = np.Inf _labels[o] = 1 for o in s_types: d = _dists_masked.copy() d[o] = _dists[o] r = np.argsort(d) _ranks.append(np.where(r == o)[0][0] + 1) ranks += _ranks ap_scores.append( average_precision_score(_labels, -_dists) ) return np.mean(ranks), np.mean(ap_scores) def reconstruction_worker(adj, lt, distfn, objects, progress=False): ranksum = nranks = ap_scores = iters = 0 labels = np.empty(lt.size(0)) for object in tqdm(objects) if progress else objects: labels.fill(0) neighbors = np.array(list(adj[object])) dists = distfn(lt[None, object], lt) dists[object] = 1e12 sorted_dists, sorted_idx = dists.sort() ranks, = np.where(np.in1d(sorted_idx.cpu().numpy(), neighbors)) # The above gives us the position of the neighbors in sorted order. We # want to count the number of non-neighbors that occur before each neighbor ranks += 1 N = ranks.shape[0] # To account for other positive nearer neighbors, we subtract (N*(N+1)/2) # As an example, assume the ranks of the neighbors are: # 0, 1, 4, 5, 6, 8 # For each neighbor, we'd like to return the number of non-neighbors # that ranked higher than it. In this case, we'd return 0+0+2+2+2+3=14 # Another way of thinking about it is to return # 0 + 1 + 4 + 5 + 6 + 8 - (0 + 1 + 2 + 3 + 4 + 5) # (0 + 1 + 2 + ... + N) == (N * (N + 1) / 2) # Note that we include `N` to account for the source embedding itself # always being the nearest neighbor ranksum += ranks.sum() - (N * (N - 1) / 2) nranks += ranks.shape[0] labels[neighbors] = 1 ap_scores += average_precision_score(labels, -dists.cpu().numpy()) iters += 1 return float(ranksum), nranks, ap_scores, iters
[docs]def eval_reconstruction(adj, lt, distfn, workers=1, progress=False): ''' Reconstruction evaluation. Evaluate how well the embedding is able to reconstruct the original input graph. Specifically, for each node, we compute all of its nearest neighbors in the embedding space and rank them amongst its non-neighbors. Args: adj (dict[int, set[int]]): Adjacency list mapping objects to its neighbors lt (torch.Tensor[N, dim]): Embedding table with `N` embeddings and `dim` dimensionality distfn (Callable[[Tensor, Tensor], Tensor]): distance function to use for computing nearest neighbors in embedding space workers (int): number of workers to use progress (bool): display progress bar Returns: Tuple[float, float]: ``mean_rank``, ``map_rank`` ''' objects = np.array(list(adj.keys())) if workers > 1: with ThreadPool(workers) as pool: f = partial(reconstruction_worker, adj, lt, distfn) results = pool.map(f, np.array_split(objects, workers)) results = np.array(results).sum(axis=0).astype(float) else: results = reconstruction_worker(adj, lt, distfn, objects, progress) return float(results[0]) / results[1], float(results[2]) / results[3]