Source code for er_evaluation.data_structures._data_structures

import logging

import numpy as np
import pandas as pd
from igraph import Graph


def compress_memberships(*memberships):
    """
    Compress membership vectors to numeric cluster codes on a shared index.

    The given Series are aligned on the union of their indexes (outer join).
    Within each aligned vector, every distinct cluster identifier is replaced
    by its categorical code. Missing values (including those introduced by
    the alignment) stay ``NaN``, which is why the result has a float dtype.
    The index itself is left untouched.

    Args:
        *memberships (Series): membership vectors to compress.

    Returns:
        List of float Series, one per input and in the same order, all sharing
        the same aligned index. Each Series keeps the name it was given by
        the alignment (its own name, or its position when unnamed).

    Examples:
        >>> membership = pd.Series([None, "c1", "c1", "c2", "c2", "c3"], index=[0,1,2,3,4,5])
        >>> compressed, = compress_memberships(membership)
        >>> compressed
        0    NaN
        1    0.0
        2    0.0
        3    1.0
        4    1.0
        5    2.0
        Name: 0, dtype: float64
    """
    aligned = pd.concat(memberships, axis=1)

    # Columns are addressed by position rather than by label: two inputs that
    # share a name produce duplicate column labels, and a label lookup would
    # then return a DataFrame instead of a single column.
    compressed = []
    for position in range(aligned.shape[1]):
        column = aligned.iloc[:, position]
        codes = pd.Categorical(column).codes
        # Categorical encodes missing values as -1; map them back to NaN.
        values = np.where(column.isna(), np.nan, codes)
        compressed.append(pd.Series(values, index=aligned.index, name=column.name))

    return compressed


class MembershipVector(pd.Series):
    """
    Series subclass that validates a membership vector once and logs potential issues.

    To validate a Series ``membership`` representing a membership vector, wrap it:

    .. code::

        membership = MembershipVector(membership)

    When ``membership`` is already a MembershipVector, the very same object is returned and no validation or logging is repeated. Otherwise the data is used to build a new MembershipVector, which is checked with :meth:`ismembership`: an invalid vector is logged as critical and rejected with a ``ValueError``, while an empty vector or one containing NA values is only reported at the info level.

    Passing ``dropna=True`` drops NA values in place. Note that this also applies to an object that is already a MembershipVector, in which case the object passed in is itself modified.

    This wrapper helps avoid duplicate validation and duplicate logging within the er_evaluation package. Externally, you may use :meth:`ismembership` to check that a given pandas Series satisfies the requirements of a membership vector.

    Examples:
        >>> series = pd.Series([1,2,3,3])
        >>> membership = MembershipVector(series)  # Validates the series and logs potential issues.
        >>> membership = MembershipVector(membership)  # Does nothing.
    """

    def __new__(cls, data=None, dropna=False, **kwargs):
        # Already-validated vectors are passed through untouched; __init__ is
        # still invoked on them afterwards and must therefore skip validation.
        already_validated = isinstance(data, MembershipVector)
        return data if already_validated else super().__new__(cls)

    def __init__(self, data=None, dropna=False, **kwargs):
        needs_validation = not isinstance(data, MembershipVector)

        if needs_validation:
            super().__init__(data=data, **kwargs)

            if not ismembership(self):
                logging.critical(f"Invalid membership vector: {self}")
                raise ValueError(f"Invalid membership vector: {self}. Check for duplicated or NA index values.")

            if len(self) == 0:
                logging.info("Membership vector is empty.")
            if self.hasnans:
                logging.info("Membership vector contains NA values.")

        if dropna:
            self.dropna(inplace=True)


def isgraph(obj):
    r"""
    Tell whether the given object is an iGraph :py:class:`Graph`.

    Graph:
        A graph is an igraph :py:class:`Graph` object with vertices representing clustering elements and with edges between all elements belonging to the same cluster. Note that clusters are unnamed in graphs. Example::

            1───2       4
            │   │       │       6
            └─3─┘       5

    Args:
        obj: Any object.

    Returns:
        bool: True if ``obj`` is a Graph instance, False otherwise.

    Examples:
        >>> import igraph
        >>> g = igraph.Graph()
        >>> isgraph(g)
        True
    """
    return isinstance(obj, Graph)


def ismembership(obj):
    r"""
    Tell whether the given object is a membership vector.

    Membership vector:
        A membership vector is a pandas :py:class:`Series` indexed by the elements of :math:`E` and with values corresponding to cluster identifiers. That is, the membership vector maps elements to clusters. Example::

            >>> pd.Series(["c1", "c1", "c1", "c2", "c2", "c3"], index=[0,1,2,3,4,5])
            0    c1
            1    c1
            2    c1
            3    c2
            4    c2
            5    c3
            dtype: object

    Only the index is inspected: it must contain neither duplicated nor NA entries. The values (cluster identifiers) are not checked.

    Args:
        obj: Any object.

    Returns:
        bool: True if membership vector, False otherwise.

    Examples:
        >>> import pandas as pd
        >>> obj = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> ismembership(obj)
        True

        >>> ismembership([1,1,2,3,2,4,4,4])
        False
    """
    if not isinstance(obj, pd.Series):
        return False

    elements = obj.index
    unique_elements = elements.has_duplicates is False
    no_missing_elements = elements.hasnans is False
    return unique_elements and no_missing_elements


def isclusters(obj):
    r"""
    Tell whether the given object is a clusters dictionary.

    Clusters dictionary:
        A clusters dictionary is a Python :py:class:`dict` with keys corresponding to cluster identifiers and values being numpy arrays of cluster elements. Example::

            {'c1': array([0, 1, 2]), 'c2': array([3, 4]), 'c3': array([5])}

    Args:
        obj: Any object.

    Returns:
        bool: True if clusters dictionary, False otherwise.

    Examples:
        >>> from numpy import array
        >>> obj = {'c1': array([0, 1, 2]), 'c2': array([3, 4]), 'c3': array([5])}
        >>> isclusters(obj)
        True

        Dictionary values should be numpy arrays:

        >>> obj = {'c1': [0, 1, 2], 'c2': [3, 4], 'c3': [5]}
        >>> isclusters(obj)
        False

        ⚠️ Warning: Clustering validity is not checked.

        >>> import pandas as pd
        >>> obj = {'c1': array([pd.NA]), 'c2': array([pd.NA])}
        >>> isclusters(obj)
        True

    Notes:
        * Only the container types are checked; this function does not verify that clusters are non-overlapping with unique non-NaN elements.
    """
    if not isinstance(obj, dict):
        return False

    for elements in obj.values():
        if not isinstance(elements, np.ndarray):
            return False
    return True


def ispairs(obj):
    r"""
    Check if given object is a pairs list.

    A pairwise links list is a two-dimensional numpy array with exactly two columns, each row being a link between two elements of the clustering, where each element of a cluster is linked to every other element of the same cluster. Note that clusters are unnamed in pairwise links lists. Example::

        array([[0, 1],
               [0, 2],
               [1, 2],
               [3, 4]])

    Args:
        obj: Any object.

    Returns:
        bool: True if a pairs list, False otherwise. Arrays that are not two-dimensional (scalars, flat arrays, higher-rank arrays) are not pairs lists and yield False.

    Examples:
        >>> from numpy import array
        >>> obj = array([[0, 1], [0, 2], [1, 2], [3, 4]])
        >>> ispairs(obj)
        True

        >>> obj = [[0, 1], [0, 2], [1, 2], [3, 4]]
        >>> ispairs(obj)
        False

        >>> ispairs(array([0, 1]))
        False
    """
    if not isinstance(obj, np.ndarray):
        return False

    # Check the rank before reading shape[1]: 0-d and 1-d arrays have no
    # second axis, and a higher-rank array is not a list of pairs.
    return obj.ndim == 2 and obj.shape[1] == 2


def membership_to_clusters(membership):
    r"""
    Convert a membership vector into a clusters dictionary.

    Elements (index entries) are grouped by their cluster identifier (value).

    Args:
        membership (Series): Membership vector.

    Returns:
        Clusters dictionary mapping each cluster identifier to a numpy array of its elements.

    Examples:
        >>> import pandas as pd
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> membership_to_clusters(membership)
        {1: array([1, 2]), 2: array([3, 5]), 3: array([4]), 4: array([6, 7, 8])}
    """
    membership = MembershipVector(membership)

    # Group the vector by its own values: each group holds the index labels
    # (elements) assigned to one cluster identifier.
    grouped_elements = membership.groupby(membership).groups

    clusters = {}
    for cluster_id, elements in grouped_elements.items():
        clusters[cluster_id] = np.array(elements)
    return clusters


def membership_to_pairs(membership):
    r"""
    Convert a membership vector into a pairs list.

    The vector is first turned into a clusters dictionary, which is then expanded into pairwise links.

    Args:
        membership (Series): Membership vector.

    Returns:
        Pairs list.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> membership_to_pairs(membership)
        array([[1, 2],
               [3, 5],
               [6, 7],
               [6, 8],
               [7, 8]])
    """
    validated = MembershipVector(membership)
    return clusters_to_pairs(membership_to_clusters(validated))


def membership_to_graph(membership):
    r"""
    Convert a membership vector into a Graph.

    Args:
        membership (Series): Membership vector.

    Returns:
        Graph whose vertices are the elements of the membership vector and whose edges link elements of the same cluster.

    Note:
        All elements are converted to string before creating the graph.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> graph = membership_to_graph(membership)

    """
    membership = MembershipVector(membership)

    links = membership_to_pairs(membership)
    # Pass every element as a vertex so that singletons, which appear in no
    # pair, are still part of the graph.
    elements = membership.index.values
    return pairs_to_graph(links, elements)


def clusters_to_pairs(clusters):
    r"""
    Transform clusters dictionary into pairs list.

    Every cluster contributes one row per unordered pair of its elements; singleton clusters contribute no rows.

    Args:
        clusters (dictionary): Dictionary mapping cluster identifiers to numpy array of cluster elements.

    Returns:
        Pairs list: array with two columns, one row per pair. An empty clusters dictionary yields an empty array of shape ``(0, 2)``.

    Examples:
        >>> from numpy import array
        >>> clusters = {1: array([1, 2]), 2: array([3, 5]), 3: array([4]), 4: array([6, 7, 8])}
        >>> clusters_to_pairs(clusters)
        array([[1, 2],
               [3, 5],
               [6, 7],
               [6, 8],
               [7, 8]])
    """
    assert isclusters(clusters)

    def single_cluster_to_pairs(c):
        """
        Return all unordered pairs of elements of the array ``c`` as rows.

        References:
            - Carlos Gameiro (2021) Fast pairwise combinations in NumPy.
                Accessed online on November 1, 2022.
                https://carlostgameiro.medium.com/fast-pairwise-combinations-in-numpy-c29b977c33e2
        """
        # Positions (i, j) with i < j, i.e. the strict upper triangle.
        index = np.stack(np.triu_indices(len(c), k=1), axis=-1)
        return c[index]

    # Stacking an empty list of arrays is an error, so handle it explicitly.
    if len(clusters) == 0:
        return np.zeros(shape=(0, 2))

    # np.vstack is the supported spelling; np.row_stack is a deprecated alias.
    return np.vstack([single_cluster_to_pairs(c) for c in clusters.values()])


def clusters_to_membership(clusters):
    r"""
    Transform clusters dictionary into membership vector.

    Args:
        clusters (dictionary): Dictionary mapping cluster identifiers to numpy array of cluster elements.

    Returns:
        Membership vector indexed by the cluster elements, with the cluster identifiers as values. Elements appear in the order in which the clusters are listed. An empty clusters dictionary yields an empty Series.

    Examples:
        >>> from numpy import array
        >>> clusters = {1: array([1, 2]), 2: array([3, 5]), 3: array([4]), 4: array([6, 7, 8])}
        >>> clusters_to_membership(clusters)
        1    1
        2    1
        3    2
        5    2
        4    3
        6    4
        7    4
        8    4
        dtype: int64
    """
    assert isclusters(clusters)

    # pd.concat rejects an empty list of objects; an empty clustering simply
    # corresponds to an empty membership vector.
    if len(clusters) == 0:
        return pd.Series([], dtype=object)

    # One constant-valued Series per cluster: the cluster identifier is
    # broadcast over the elements of that cluster.
    return pd.concat([pd.Series(cluster_id, index=elements) for cluster_id, elements in clusters.items()])


def clusters_to_graph(clusters):
    r"""
    Convert a clusters dictionary into a Graph.

    Args:
        clusters (dictionary): Dictionary mapping cluster identifiers to numpy array of cluster elements.

    Returns:
        Graph whose vertices are all the cluster elements and whose edges link elements of the same cluster.

    Note:
        All elements are converted to string before creating the graph.

    Examples:
        >>> from numpy import array
        >>> clusters = {1: array([1, 2]), 2: array([3, 5]), 3: array([4]), 4: array([6, 7, 8])}
        >>> graph = clusters_to_graph(clusters)
    """
    assert isclusters(clusters)

    # All elements, across all clusters, become the vertices of the graph.
    all_elements = np.concatenate([*clusters.values()])
    links = clusters_to_pairs(clusters)

    return pairs_to_graph(links, all_elements)


def pairs_to_membership(pairs, indices):
    r"""Convert a pairs list into a membership vector.

    A graph is built from the pairs, and its connected components define the clusters.

    Args:
        pairs (ndarray): array of paired elements.
        indices (ndarray): flat array of all elements to consider (paired and non-paired), including singletons.

    Returns:
        Membership vector

    Examples:
        >>> from numpy import array
        >>> pairs = array([[1, 2], [3, 5], [6, 7], [6, 8], [7, 8]])
        >>> indices = array([1,2,3,4,5,6,7,8])
        >>> pairs_to_membership(pairs, indices)
        1    0
        2    0
        3    1
        4    2
        5    1
        6    3
        7    3
        8    3
        dtype: int64
    """
    assert ispairs(pairs)
    # Every paired element must be one of the declared elements.
    paired_elements = pairs.flatten()
    assert all(np.isin(paired_elements, indices))

    graph = pairs_to_graph(pairs, indices)
    return graph_to_membership(graph)


def pairs_to_clusters(pairs, indices):
    r"""Convert a pairs list into a clusters dictionary.

    The pairs are first converted into a membership vector, which is then grouped into clusters.

    Args:
        pairs (ndarray): array of paired elements.
        indices (ndarray): flat array of all elements to consider (paired and non-paired), including singletons.

    Returns:
        Clusters dictionary.
    """
    assert ispairs(pairs)
    # Every paired element must be one of the declared elements.
    paired_elements = pairs.flatten()
    assert all(np.isin(paired_elements, indices))

    membership = pairs_to_membership(pairs, indices)
    return membership_to_clusters(membership)


def pairs_to_graph(pairs, indices):
    r"""
    Convert a pairs list into a Graph.

    Args:
        pairs (ndarray): array of paired elements.
        indices (ndarray): flat array of all elements to consider (paired and non-paired), including singletons.

    Returns:
        Graph corresponding to the pairs list with given indices as vertices.

    Note:
        All elements are converted to string before creating the graph.

    Examples:
        >>> from numpy import array
        >>> pairs = array([[1, 2], [3, 5], [6, 7], [6, 8], [7, 8]])
        >>> indices = array([1,2,3,4,5,6,7,8])
        >>> graph = pairs_to_graph(pairs, indices)
    """
    assert ispairs(pairs)
    # Every paired element must be one of the declared elements.
    paired_elements = pairs.flatten()
    assert all(np.isin(paired_elements, indices))

    # Vertices are identified by the string form of the elements, and edges
    # refer to their endpoints through those same strings.
    vertex_names = indices.astype(str)
    edge_names = pairs.astype(str)

    graph = Graph()
    graph.add_vertices(vertex_names)
    graph.add_edges(edge_names)
    return graph


def graph_to_membership(graph):
    r"""
    Convert a Graph into a membership vector.

    Each connected component of the graph is one cluster. The vector is indexed by the vertex names and its values are the component numbers.

    Args:
        graph (Graph): igraph Graph object.

    Returns:
        Membership vector

    Examples:
        >>> from numpy import array
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> graph = membership_to_graph(membership)
        >>> graph_to_membership(graph) # Note that cluster identifiers are arbitrary.
        1    0
        2    0
        3    1
        4    2
        5    1
        6    3
        7    3
        8    3
        dtype: int64
    """
    assert isgraph(graph)

    vertex_names = graph.get_vertex_dataframe().name.values
    component_ids = graph.connected_components().membership

    return pd.Series(data=component_ids, index=vertex_names)


def graph_to_clusters(graph):
    r"""
    Convert a Graph into a clusters dictionary.

    The graph is first converted into a membership vector, which is then grouped into clusters.

    Args:
        graph (Graph): igraph Graph object.

    Returns:
        Clusters dictionary.

    Examples:
        >>> from numpy import array
        >>> clusters = {1: array([1, 2]), 2: array([3, 5]), 3: array([4]), 4: array([6, 7, 8])}
        >>> graph = clusters_to_graph(clusters)
        >>> graph_to_clusters(graph) # doctest: +NORMALIZE_WHITESPACE
        {0: array(['1', '2'], dtype=object),
         1: array(['3', '5'], dtype=object),
         2: array(['4'], dtype=object),
         3: array(['6', '7', '8'], dtype=object)}
    """
    assert isgraph(graph)

    membership = graph_to_membership(graph)
    return membership_to_clusters(membership)


def graph_to_pairs(graph):
    r"""
    Transform Graph into pairs list.

    Each edge of the graph becomes one row holding the names of its two endpoint vertices.

    Args:
        graph (Graph): igraph Graph object.

    Returns:
        Pairs list: array with two columns, one row per edge. A graph without edges yields an empty array of shape ``(0, 2)``.

    Examples:
        >>> from numpy import array
        >>> pairs = array([[1, 2], [3, 5], [6, 7], [6, 8], [7, 8]])
        >>> indices = array([1,2,3,4,5,6,7,8])
        >>> graph = pairs_to_graph(pairs, indices)
        >>> graph_to_pairs(graph)
        array([['1', '2'],
               ['3', '5'],
               ['6', '7'],
               ['6', '8'],
               ['7', '8']], dtype='<U1')
    """
    assert isgraph(graph)

    edges = graph.get_edgelist()
    # Without edges, building the array from an empty list would give a flat
    # array of shape (0,), which is not a valid pairs list.
    if not edges:
        return np.empty((0, 2), dtype=str)

    # The edge list refers to vertices by position; translate to vertex names.
    names = graph.get_vertex_dataframe().name.values
    return np.array([[names[source], names[target]] for source, target in edges])