Source code for er_evaluation.summary._summary

import numpy as np
import pandas as pd
from scipy.special import comb

from er_evaluation.data_structures import MembershipVector


def summary_statistics(membership, names=None):
    r"""
    Compute the standard collection of summary statistics for a clustering.

    The returned dictionary always contains:

    * ``number_of_clusters``: number of clusters
    * ``average_cluster_size``: average cluster size
    * ``matching_rate``: matching rate
    * ``H0``, ``H1``, ``H2``: Hill numbers of order 0, 1, and 2

    When ``names`` is provided, two name-based statistics are added:

    * ``homonymy_rate``: proportion of clusters where at least one name is shared with another cluster
    * ``name_variation_rate``: proportion of clusters with name variation within them

    Args:
        membership (Series): Membership vector representation of a clustering.
        names (Series): Names associated with each cluster element. Defaults to None.

    Returns:
        dict: Dictionary mapping statistic name to its value.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> summary_statistics(membership)
        {'number_of_clusters': 4, 'average_cluster_size': 2.0, 'matching_rate': 0.875, 'H0': 3, 'H1': 2.82842712474619, 'H2': 2.6666666666666665}
    """
    membership = MembershipVector(membership)

    statistics = dict(
        number_of_clusters=number_of_clusters(membership),
        average_cluster_size=average_cluster_size(membership),
        matching_rate=matching_rate(membership),
    )
    # Hill numbers are stored under the keys "H0", "H1" and "H2".
    for order in (0, 1, 2):
        statistics[f"H{order}"] = cluster_hill_number(membership, alpha=order)

    # Name-based statistics are only computed when names are supplied.
    if names is None:
        return statistics

    statistics["homonymy_rate"] = homonymy_rate(membership, names)
    statistics["name_variation_rate"] = name_variation_rate(membership, names)
    return statistics


def number_of_clusters(membership):
    r"""
    Count the clusters of a clustering.

    Args:
        membership (Series): Membership vector representation of a clustering.

    Returns:
        int: Number of distinct cluster identifiers. NA identifiers are not counted.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> number_of_clusters(membership)
        4
    """
    clustering = MembershipVector(membership)
    distinct_clusters = clustering.nunique()
    return distinct_clusters


def number_of_links(membership):
    r"""
    Count the pairwise links implied by a clustering.

    Each cluster of size :math:`n` contributes :math:`\binom{n}{2}` links.

    Args:
        membership (Series): Membership vector representation of a clustering.

    Returns:
        float: Number of pairs of elements belonging to the same cluster. The count is computed in floating point (see the example). Note that clusters identified by NA values are excluded.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> number_of_links(membership)
        5.0

    """
    sizes = cluster_sizes(MembershipVector(membership))
    # Number of unordered pairs within each cluster.
    links_per_cluster = comb(sizes, 2)
    return np.sum(links_per_cluster)


def matching_rate(membership):
    r"""
    Compute the **matching rate** for a given clustering.

    Matching rate:
        This is the proportion of elements belonging to clusters of size at least 2.

    Args:
        membership (Series): Membership vector representation of a clustering.

    Returns:
        float: Number of elements in clusters of size at least 2, divided by the number of elements with a non-NA cluster identifier.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> matching_rate(membership)
        0.875
    """
    membership = MembershipVector(membership)

    # Reuse the shared helper rather than re-deriving cluster sizes here.
    sizes = cluster_sizes(membership)

    # Elements are "matched" when their cluster holds more than one element.
    matched_elements = sizes[sizes > 1].sum()

    return matched_elements / membership.count()


def average_cluster_size(membership):
    """
    Compute the mean number of elements per cluster.

    Args:
        membership (Series): Membership vector representation of a clustering.

    Returns:
        float: Average cluster size.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> average_cluster_size(membership)
        2.0
    """
    sizes = cluster_sizes(MembershipVector(membership))
    return sizes.mean()


def cluster_sizes(membership):
    r"""
    Compute the number of elements in each cluster.

    Args:
        membership (Series): Membership vector representation of a clustering.

    Returns:
        Series: Series indexed by cluster identifier whose values are the cluster sizes. Note that NA cluster identifiers are excluded.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> cluster_sizes(membership)
        1    2
        2    2
        3    1
        4    3
        dtype: int64
    """
    clustering = MembershipVector(membership)

    # Group the elements by their own cluster identifier, then count each group.
    elements_by_cluster = clustering.groupby(clustering)
    return elements_by_cluster.count()


def cluster_sizes_distribution(membership):
    r"""
    Compute the distribution of cluster sizes.

    Args:
        membership (Series): Membership vector representation of a clustering.

    Returns:
        Series: Pandas Series indexed by distinct cluster sizes whose values are the number of clusters of that size.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> cluster_sizes_distribution(membership)
        1    1
        2    2
        3    1
        dtype: int64
    """
    sizes = cluster_sizes(MembershipVector(membership))

    # Group the sizes by their own value to count how many clusters share each size.
    clusters_by_size = sizes.groupby(sizes)
    return clusters_by_size.count()


def cluster_hill_number(membership, alpha=1):
    r"""
    Compute Hill number of a given order.

    Hill numbers:
        The Hill number of order :math:`\alpha \geq 0` of a given probability distribution :math:`p_i`, :math:`i =0,1,2, \dots`, is defined as

        .. math::

            H_\alpha = \left(\sum_{i} p_i^{\alpha} \right)^{1/(1-\alpha)}

        and continuously extended at :math:`\alpha = 0, 1, \infty`. Here, we let :math:`p_i` be the proportion of clusters of size :math:`i`.

        The special cases are computed as follows:

        * :math:`\alpha = 0`: the number of distinct cluster sizes.
        * :math:`\alpha = 1`: the exponential of the Shannon entropy, :math:`\exp(-\sum_i p_i \log p_i)`.
        * :math:`\alpha = \infty`: :math:`1 / \max_i p_i`.

    Args:
        membership (Series): Membership vector representation of a clustering.
        alpha (float, optional): Order of the Hill Number. May be ``np.inf``. Defaults to 1.

    Returns:
        float: Hill number of order `alpha` for the given clustering (an ``int`` when ``alpha == 0``).

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> cluster_hill_number(membership, alpha=0)
        3

        >>> cluster_hill_number(membership, alpha=1)
        2.82842712474619

        >>> cluster_hill_number(membership, alpha=np.inf)
        2.0
    """
    membership = MembershipVector(membership)

    cs_dist = cluster_sizes_distribution(membership)
    probs = cs_dist / np.sum(cs_dist)
    # Zero-probability entries would yield log(0) in the order-1 case; drop them.
    probs = probs[probs > 0]

    if alpha == 0:
        return len(probs)
    if alpha == 1:
        return np.exp(-np.sum(probs * np.log(probs)))
    # Use `np.inf`: the `np.Inf` alias was removed in NumPy 2.0 and raises
    # AttributeError there, which broke every call with alpha not in {0, 1}.
    if alpha == np.inf:
        return 1 / np.max(probs)
    return np.sum(probs**alpha) ** (1 / (1 - alpha))


def homonymy_rate(membership, names):
    r"""
    Compute the homonymy rate of a clustering given the names of its elements.

    Homonymy rate:
        The homonymy rate is the proportion of clusters that share a name with another cluster.

    Args:
        membership (Series): Membership vector representation of a clustering.
        names (Series): Series indexed by cluster elements and with values corresponding to the associated name. Note that the index of `membership` should be included in the index of `names`.

    Returns:
        float: homonymy rate

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> names = pd.Series(index=[1,2,3,4,5,6,7,8], data=["n1", "n2", "n3", "n4", "n3", "n1", "n2", "n8"])
        >>> homonymy_rate(membership, names)
        0.5
    """
    membership = MembershipVector(membership)
    assert isinstance(names, pd.Series)
    assert membership.index.isin(names.index).all()

    # One row per element present in both inputs, holding its cluster and its name.
    elements = pd.concat(
        {"membership": membership, "name": names},
        axis=1,
        join="inner",
        copy=False,
    )

    # Occurrences of each name over all elements.
    name_column = elements["name"]
    overall_counts = name_column.groupby(name_column).count().reset_index(name="total_count")

    # Occurrences of each name within each cluster.
    per_cluster_counts = elements.groupby(["name", "membership"]).size().reset_index(name="cluster_count")

    merged = per_cluster_counts.merge(
        overall_counts,
        on="name",
        copy=False,
        validate="m:1",
    )
    # A positive difference means the name also occurs outside this cluster.
    merged["diff"] = merged["total_count"] - merged["cluster_count"]

    shares_a_name = merged.groupby("membership").agg({"diff": "max"}) > 0
    return shares_a_name.mean().values[0]


def name_variation_rate(membership, names):
    r"""
    Compute the name variation rate of a clustering given the names of its elements.

    Name variation rate:
        The name variation rate is the proportion of clusters with name variation within.

    Args:
        membership (Series): Membership vector representation of a clustering.
        names (Series): Series indexed by cluster elements and with values corresponding to the associated name. Note that the index of `membership` should be included in the index of `names`.

    Returns:
        float: Name variation rate.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> names = pd.Series(index=[1,2,3,4,5,6,7,8], data=["n1", "n2", "n3", "n4", "n3", "n1", "n2", "n8"])
        >>> name_variation_rate(membership, names)
        0.5
    """
    membership = MembershipVector(membership)
    assert isinstance(names, pd.Series)
    assert membership.index.isin(names.index).all()

    # One row per element present in both inputs, holding its cluster and its name.
    elements = pd.concat(
        {"membership": membership, "name": names},
        axis=1,
        join="inner",
        copy=False,
    )

    # A cluster exhibits name variation when it holds more than one distinct name.
    distinct_names_per_cluster = elements.groupby("membership").nunique()
    has_variation = distinct_names_per_cluster > 1
    return has_variation.mean().values[0]