Source code for HAT.HAT

import itertools
from itertools import combinations
import numpy as np
import scipy as sp
import scipy.io
import networkx as nx
import os
import pandas as pd

import HAT.multilinalg
# import HAT

"""
HAT.HAT module contains miscilaneous hypergraph methods
"""


[docs]
def directSimilarity(HG1, HG2, measure='Hamming'):
    """This function computes the direct similarity between two uniform hypergraphs.

    :param HG1: Hypergraph 1
    :type HG1: *Hypergraph*
    :param HG2: Hypergraph 2
    :type HG2: *Hypergraph*
    :param measure: This sepcifies which similarity measure to apply. It defaults to
        ``Hamming``, and ``Spectral-S`` and ``Centrality`` are available as other options
        as well.
    :type measure: str, optional
    :return: Hypergraph similarity
    :rtype: *float*

    References
    ----------
    .. [1] Amit Surana, Can Chen, and Indika Rajapakse. Hypergraph similarity measures. IEEE Transactions on Network Science and Engineering, pages 1-16, 2022.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: Dec 2, 2022
    if measure == 'Hamming':
        return HAT.multilinalg.HammingSimilarity(HG1.laplacianTensor(), HG2.laplacianTensor())
    elif measure == 'Spectral-S':
        return HAT.multilinalg.SpectralHSimilarity(HG1.laplacianTensor(), HG2.laplacianTensor())
    # elif measure == 'Spectral-H': # Not implemented until we get a H-eigenvalue solver
    elif measure == 'Centrality':
        C1 = HG1.centrality()[0]
        C2 = HG2.centrality()[0]
        C1 /= np.linalg.norm(C1)
        C2 /= np.linalg.norm(C2)
        m = np.linalg.norm(C1 - C2) / len(C1)
        return m

    

[docs]
def indirectSimilarity(G1, G2, measure='Hamming', eps=10e-3):
    """This function computes the indirect similarity between two hypergraphs.

    :param G1: Hypergraph 1 expansion
    :type G1: *nx.Graph* or *ndarray*
    :param G2: Hypergraph 2 expansion
    :type G2: *nx.Graph* or *ndarray*
    :param measure: This specifies which similarity measure to apply. It defaults to ``Hamming`` , and
        ``Jaccard`` , ``deltaCon`` , ``Spectral`` , and ``Centrality`` are provided as well. When ``Centrality``
        is used as the similarity measure, ``G1`` and ``G2`` should *ndarray* s of centrality values; Otherwise
        ``G1`` and ``G2`` are *nx.Graph*s or *ndarray** s as adjacency matrices.
    :type measure: *str*, optional
    :param eps: a hyperparameter required for deltaCon similarity, defaults to 10e-3
    :type eps: *float*, optional
    :return: similarity measure
    :rtype: *float*

    References
    ==========
    .. [1] Amit Surana, Can Chen, and Indika Rajapakse. Hypergraph similarity measures. IEEE Transactions on Network Science and Engineering, pages 1-16, 2022.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: Dec 2, 2022
    if isinstance(G1, nx.classes.graph.Graph):
        M1 = nx.adjacency_matrix(G1).todense()
        M1 = np.array(M1)
    else:
        M1 = G1
    if isinstance(G2, nx.classes.graph.Graph):
        M2 = nx.adjacency_matrix(G2).todense()
        M2 = np.array(M2)
    else:
        M2 = G2
    if measure == 'Hamming':
        n = len(M1)
        s = np.linalg.norm(M1 - M2) / (n*(n-1))
    elif measure == 'Jaccard':
        M1 = np.matrix.flatten(M1)
        M2 = np.matrix.flatten(M2)
        M = np.array([M1, M2])
        Mmin = np.min(M, axis=0)
        Mmax = np.max(M, axis=0)
        s = 1 - sum(Mmin)/sum(Mmax)
    elif measure == 'deltaCon':
        D1 = np.diag(sum(M1))
        D2 = np.diag(sum(M2))
        I = np.eye(len(D1))
        S1 = sp.linalg.inv(I - (eps**2) * D1 - (eps*M1))
        S2 = sp.linalg.inv(I - (eps**2) * D2 - (eps*M2))
        M = np.square(np.sqrt(S1) - np.sqrt(S2))
        s = (1 / (len(M1)**2)) * np.sqrt(np.sum(np.sum(M)))
    elif measure == 'Spectral':
        v1, _ = np.linalg.eigh(M1)
        v2, _ = np.linalg.eigh(M2)
        s = (1 / len(v1)) * np.linalg.norm(v1-v2)
    elif measure == 'Centrality':
        # In this case mat1 and mat2 are centrality vectors
        s = (1 / len(v1)) * np.linalg.norm(M1-M2)
    return s



[docs]
def multicorrelations(D, order, mtype='Drezner', idxs=None, v=False, vfreq=1000):
    """This function computes the multicorrelation among pairwise or 2D data.

    :param D: 2D or pairwise data
    :type D: *ndarray*
    :param order: order of the multi-way interactions
    :type order: *int*
    :param mtype: This specifies which multicorrelation measure to use. It defaults to
        ``Drezner`` [1], but ``Wang`` [2] and ``Taylor`` [3] are options as well.
    :type mtype: *str*
    :param idxs: specify which indices of ``D`` to compute multicorrelations of. The default is ``None``, in which case
        all combinations of ``order`` indices are computed.
    :type idxs: *ndarray*, optional
    :param v: verbose, defaults to False
    :type v: bool, optional
    :param vfreq: frequency to display output in verbose mode
    :type vfrea: int, optional
    :return: A vector of the multicorrelation scores computed and a vector of the column indices of
        ``D`` used to compute each multicorrelation.
    :rtype: *(ndarray, ndarray)*

    References
    ----------
    .. [1] Zvi Drezner. Multirelation—a correlation among more than two variables. Computational Statistics & Data Analysis, 19(3):283–292, 1995.
    .. [2] Jianji Wang and Nanning Zheng. Measures of correlation for multiple variables. arXiv preprint arXiv:1401.4827, 2014.
    .. [3] Benjamin M Taylor. A multi-way correlation coefficient. arXiv preprint arXiv:2003.02561, 2020.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: Dec 2, 2022
    R = np.corrcoef(D.T)
    
    if idxs == None:
        idxs = np.array(list(itertools.combinations(range(len(R)), order)))
    
    M = np.zeros(len(idxs),)
    
    if mtype == 'Taylor':
        taylorCoef = 1/np.sqrt(order)

    for i in range(len(idxs)):
        minor = R[np.ix_(idxs[i,:], idxs[i,:])]
        if mtype == 'Drezner':
            w, _ = np.linalg.eigh(minor)
            M[i] = 1 - w[0]
        elif mtype == 'Wang':
            M[i] = pow((1 - np.linalg.det(minor)), 0.5)
        elif mtype == 'Taylor':
            w, _ = np.linalg.eigh(minor)
            M[i] = taylorCoef * np.std(w)

        if v and i % vfreq == 0:
            print(f"{i} / {len(idxs)} mcorrs complete")

    return M, idxs



[docs]
def uniformErdosRenyi(v, e, k):
    """This function generates a uniform, random hypergraph.

    :param v: number of vertices
    :type v: *int*
    :param e: number of edges
    :type e: *int*
    :param k: order of hypergraph
    :type k: *int*
    :return: Hypergraph
    :rtype: *Hypergraph*
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: Dec 2, 2022
    IM = np.zeros((v,e))
    for i in range(e):
        idx = np.random.choice(v, size = k, replace = False)
        IM[idx,i] = 1
    return HAT.Hypergraph(IM)



[docs]
def load(dataset='Karate'):
    """This function loads built-in datasets. Currently only one dataset is available and we are working to expand this.

    :param dataset: sets which dataset to load in, defaults to 'Karate'
    :type dataset: str, optional
    :return: incidence matrix or graph object
    :rtype: *ndarray* or *nx.Graph*
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: Dec 2, 2022

    current_path = os.path.dirname(os.path.realpath(__file__))
    current_path += '/data/'
    if dataset == 'Karate':
        return nx.karate_club_graph()
    elif dataset == 'ArnetMiner Citation':
        mat = sp.io.loadmat(current_path + 'aminer_cocitation.mat')
        S = mat['S']
        return S
    elif dataset == 'ArnetMiner Reference':
        mat = sp.io.loadmat(current_path + 'aminer_coreference.mat')
        S = mat['S']
        return S
    elif dataset == 'Citeseer Citation':
        mat = sp.io.loadmat(current_path + 'citeseer_cocitation.mat')
        S = mat['S']
        return S
    elif dataset == 'Cora Citation':
        mat = sp.io.loadmat(current_path + 'cora_coreference.mat')
        S = mat['S']
        return S
    elif dataset == 'Cora Citation':
        mat = sp.io.loadmat(current_path + 'cora_coreference.mat')
        S = mat['S']
        return S
    elif dataset == 'DBLP':
        mat = sp.io.loadmat(current_path + 'dblp.mat')
        S = mat['S']
        return S



[docs]
def hyperedges2IM(edgeSet):
    """This function constructs an incidence matrix from an edge set.

    :param edgeSet: a :math:`e \\times k` matrix where each row contains :math:`k` integers that are contained within the same hyperedge
    :type edgeSet: *ndarray*
    :return: a :math:`n \times e` incidence matrix where each row of the edge set corresponds to a column of the incidence matrix. :math:`n` is the number of nodes contained in the edgeset.
    :rtype: *ndarray*
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: Dec 2, 2022
    n = np.max(edgeSet)
    e = len(edgeSet)
    IM = np.zeros((n+1,e))
    for e in range(e):
        IM[edgeSet[e,:],e] = 1
    return IM

    

[docs]
def hyperedgeHomophily(H, HG=None, G=None, method='CN'):
    """This function computes the hyperedge homophily score according to the below methods. The homophily score is the average score based on
    structural similarity of the vertices in hypredge `H` in the clique expanded graph `G`. This function is an interface from `HAT` to `networkx`
    link prediction algorithms.

    :param G: a pairwise hypergraph expansion
    :type G: `networkx.Graph`
    :param H: hyperedge containing individual vertices within the edge
    :type H: `ndarray`
    :param method: specifies which structural similarity method to use. This defaults to `CN` common neighbors.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: Dec 6, 2022
    pairwise = list(itertools.combinations(H, 2))

    # Compute pairwise scores with networkx
    if method == 'CN':
        pairwiseScores = nx.common_neighbor_centrality(G, pairwise)
    elif method == 'RA':
        pairwiseScores = nx.resource_allocation_index(G, pairwise)
    elif method == 'JC':
        pairwiseScores = nx.jaccard_coefficient(G, pairwise)
    elif method == 'AA':
        pairwiseScores = nx.adamic_adar_index(G, pairwise)
    elif method == 'PA':
        pairwiseScores = nx.preferential_attachment(G, pairwise)

    # Compute average pairwise score
    pairwiseScores = pairwiseScores[:, 2]
    hyperedgeHomophily = sum(pairwiseScores)/len(pairwiseScores)
    return hyperedgeHomophily



[docs]
def edgeRemoval(HG, p, method='Random'):
    """This function randomly removes edges from a hypergraph. In [1], four primary reasons are given for data missing in pairwise networks:
        1. random edge removal
        2. right censoring
        3. snowball effect
        4. cold-ends
    This method removes edes from hypergraphs according to the multi-way analogue of these.
    
    References
    ----------
    .. [1] Yan, Bowen, and Steve Gregory. "Finding missing edges and communities in incomplete networks." Journal of Physics A: Mathematical and Theoretical 
        44.49 (2011): 495102.
    .. [2] Zhu, Yu-Xiao, et al. "Uncovering missing links with cold ends." Physica A: Statistical Mechanics and its Applications 391.22 (2012): 5769-5778.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: Dec 9, 2022
    IM = HG.IM
    n, e = IM.shape
    
    if method == 'Random':
        # Randome edge removal
        known, unknown = randomRemoval(HG, p)
    elif method == 'RC':
        # Right censoring
        known, unknown = rightCensorRemoval(HG, p)
    elif method == 'SB':
        # Snowball Effect
        known, unknown = snowBallRemoval(HG, p)
    elif method == 'CE':
        # Cold Ends
        known, unknown = coldEndsRemoval(HG, p)
    else:
        print('Enter a valid edge removal method')
        return
    
    # Bradcast from 3d to 2d
    a, _, c = unknown.shape
    unknown = np.reshape(unknown, (a, c))
    a, _, c = known.shape
    known = np.reshape(known, (a, c))

    # Return hypergraph objects
    K = HAT.Hypergraph(known)
    U = HAT.Hypergraph(unknown)

    return K, U



[docs]
def randomRemoval(HG, p):
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: Dec 9, 2022
    IM = HG.IM
    n, e = IM.shape
    knownIdxs = np.random.choice([0, 1], size=(e,), p=[p, 1-p])
    known = IM[:, np.where(knownIdxs == 1)]
    unknown = IM[:, np.where(knownIdxs == 0)]
    return known, unknown



[docs]
def rightCensorRemoval(HG, p):
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: Dec 9, 2022
    IM = HG.IM.copy()
    n, e = IM.shape
    
    # Determine number of known edges in remaining graph
    numknownEdges = sum(np.random.choice([0, 1], size=(e,), p=[p, 1-p]))

    # Vertex degree
    vxDegree = np.sum(IM, axis=1)
    
    # Iteratively remove edges
    removedEdges = 0
    knownIdxs = np.ones(e,)
    while sum(knownIdxs) > numknownEdges:
        # Select vertex with maximum degree
        vx = np.argmax(vxDegree)
        # Select edges vx participates in
        vxEdges = np.where(IM[vx,:] != 0)[0]
        # Select single edge to remove
        removeEdge = np.random.choice(vxEdges)
        # Remove edge from incidence matrix
        IM[:, removeEdge] = 0
        # Remove it from list of known edges
        knownIdxs[removeEdge] = 0
        # Decrease degree of vx
        vxDegree[vx] -= 1
    print(IM)
    print(HG.IM)
    known = HG.IM[:, np.where(knownIdxs == 1)]
    unknown = HG.IM[:, np.where(knownIdxs == 0)]
    return known, unknown



[docs]
def coldEndsRemoval(HG, p):
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: Dec 9, 2022
    IM = HG.IM.copy()
    n, e = IM.shape
    
    # Determine number of known edges in remaining graph
    numknownEdges = sum(np.random.choice([0, 1], size=(e,), p=[p, 1-p]))

    # Vertex degree
    vxDegree = np.sum(IM, axis=1)
    
    # Iteratively remove edges
    removedEdges = 0
    
    knownIdxs = np.ones(e,)
    while sum(knownIdxs) > numknownEdges:
        # Select vertex with maximum degree
        vx = np.argmin(vxDegree)
        if vxDegree[vx] == 1:
            # Does not remove vxc with degree one. The degree value is set
            # to max and the process continues
            vxDegree[vx] = max(vxDegree)
            continue
        # Select edges vx participates in
        vxEdges = np.where(IM[vx,:] != 0)[0]
        # Select single edge to remove
        removeEdge = np.random.choice(vxEdges)
        # Remove edge from incidence matrix
        IM[:, removeEdge] = 0
        # Remove it from list of known edges
        knownIdxs[removeEdge] = 0
        # Decrease degree of vx
        vxDegree[vx] -= 1
    
    known = HG.IM[:, np.where(knownIdxs == 1)]
    unknown = HG.IM[:, np.where(knownIdxs == 0)]
    return known, unknown



[docs]
def snowBallRemoval(HG, p):
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: Dec 9, 2022
    n, e = HG.IM.shape
    source = np.random.choice(n)
    # Clique expand HG
    C = HG.cliqueGraph()
    # Perform BFS on C. BFSedgeList is an ordered list of tuples containing each edge discovered in BFS
    BFSedgeList = list(nx.bfs_tree(C, 5).edges())
    # List of tuples to list preserving order nodes are visited
    orderedVxc = [item for e in BFSedgeList for item in e]
    _, idx = np.unique(orderedVxc, return_index=True)
    # Ordered list vertices are visited
    vxOrder = np.array(orderedVxc)[np.sort(idx)]

    # Set which vertices will remain known in the hypergraph
    numKnownVxc = sum(np.random.choice([0, 1], size=(n,), p=[p, 1-p]))    
    knownVxc = vxOrder[0:numKnownVxc]
    
    # Only include edges where every vertex in the edge is known
    knownIdxs = np.ones(e,)
    for edge in range(e):
        edgeVxc = np.where(HG.IM[:, edge] != 0)
        numRecognizedNodes = np.intersect1d(edgeVxc, knownVxc)
        if len(numRecognizedNodes) != len(edgeVxc):
            knownIdxs[edge] = 0

    known = HG.IM[:, np.where(knownIdxs == 1)]
    unknown = HG.IM[:, np.where(knownIdxs == 0)]
    return known, unknown