# Source code for er_evaluation.summary._summary

import numpy as np
import pandas as pd
from scipy.special import comb

from er_evaluation.data_structures import MembershipVector


def summary_statistics(membership, names=None):
    r"""
    Gather the standard summary statistics of a clustering into one dictionary.

    The following entries are always computed:

    * ``number_of_clusters``: number of clusters
    * ``average_cluster_size``: average cluster size
    * ``matching_rate``: matching rate
    * ``H0``, ``H1``, ``H2``: Hill numbers of order 0, 1, and 2

    When ``names`` is given, two more entries are appended:

    * ``homonymy_rate``: proportion of clusters where at least one name is
      shared with another cluster
    * ``name_variation_rate``: proportion of clusters with name variation
      within them

    Args:
        membership (Series): Membership vector representation of a clustering.
        names (Series): Names associated with each cluster element.
            Defaults to None, in which case the name-based statistics are
            skipped.

    Returns:
        dict: Dictionary of summary statistics.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> summary_statistics(membership)
        {'number_of_clusters': 4, 'average_cluster_size': 2.0, 'matching_rate': 0.875, 'H0': 3, 'H1': 2.82842712474619, 'H2': 2.6666666666666665}
    """
    clustering = MembershipVector(membership)

    # Statistics that only depend on the clustering itself.
    result = {
        "number_of_clusters": number_of_clusters(clustering),
        "average_cluster_size": average_cluster_size(clustering),
        "matching_rate": matching_rate(clustering),
        "H0": cluster_hill_number(clustering, alpha=0),
        "H1": cluster_hill_number(clustering, alpha=1),
        "H2": cluster_hill_number(clustering, alpha=2),
    }

    if names is None:
        return result

    # Name-based statistics are only available when names were supplied.
    result["homonymy_rate"] = homonymy_rate(clustering, names)
    result["name_variation_rate"] = name_variation_rate(clustering, names)
    return result
def number_of_clusters(membership):
    r"""
    Count the distinct clusters of a clustering.

    Args:
        membership (Series): Membership vector representation of a clustering.

    Returns:
        int: Number of unique cluster identifiers. NA identifiers are not
        counted.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> number_of_clusters(membership)
        4
    """
    clustering = MembershipVector(membership)
    n_clusters = clustering.nunique()
    return n_clusters
def matching_rate(membership):
    r"""
    Compute the **matching rate** for a given clustering.

    Matching rate:
        This is the proportion of elements belonging to clusters of size at
        least 2.

    Args:
        membership (Series): Membership vector representation of a clustering.

    Returns:
        float: Proportion of elements (with a non-NA cluster identifier) that
        belong to a cluster containing at least two elements. Elements whose
        cluster identifier is NA are excluded from both the numerator and the
        denominator.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> matching_rate(membership)
        0.875
    """
    membership = MembershipVector(membership)

    # Size of every cluster (groupby drops NA cluster identifiers).
    sizes = membership.groupby(membership).count()

    # Elements living in non-singleton clusters, over all non-NA elements.
    matched_elements = sizes[sizes > 1].sum()
    return matched_elements / membership.count()
def average_cluster_size(membership):
    """
    Compute the mean number of elements per cluster.

    Args:
        membership (Series): Membership vector representation of a clustering.

    Returns:
        float: Average cluster size.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> average_cluster_size(membership)
        2.0
    """
    clustering = MembershipVector(membership)
    sizes = cluster_sizes(clustering)
    return sizes.mean()
def cluster_sizes(membership):
    r"""
    Compute the number of elements in each cluster.

    Args:
        membership (Series): Membership vector representation of a clustering.

    Returns:
        Series: Series indexed by cluster identifier and with values
        corresponding to cluster size. NA cluster identifiers are excluded.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> cluster_sizes(membership)
        1    2
        2    2
        3    1
        4    3
        dtype: int64
    """
    clustering = MembershipVector(membership)

    # Group the elements by their own cluster identifier and count each group.
    by_cluster = clustering.groupby(clustering)
    return by_cluster.count()
def cluster_sizes_distribution(membership):
    r"""
    Compute the distribution of cluster sizes.

    Args:
        membership (Series): Membership vector representation of a clustering.

    Returns:
        Series: Pandas Series indexed by distinct cluster sizes and with
        values corresponding to the number of clusters of that size.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> cluster_sizes_distribution(membership)
        1    1
        2    2
        3    1
        dtype: int64
    """
    clustering = MembershipVector(membership)
    sizes = cluster_sizes(clustering)

    # Group the sizes by their own value to count the clusters of each size.
    by_size = sizes.groupby(sizes)
    return by_size.count()
def cluster_hill_number(membership, alpha=1):
    r"""
    Compute Hill number of a given order.

    Hill numbers:
        The Hill number of order :math:`\alpha \geq 0` of a given probability
        distribution :math:`p_i`, :math:`i = 0, 1, 2, \dots`, is defined as

        .. math::

            H_\alpha = \left(\sum_{i} p_i^{\alpha} \right)^{1/(1-\alpha)}

        and continuously extended at :math:`\alpha = 0, 1, \infty`. Here, we
        let :math:`p_i` be the proportion of clusters of size :math:`i`.

    Args:
        membership (Series): Membership vector representation of a clustering.
        alpha (int or float, optional): Order of the Hill number. Use
            ``np.inf`` for the limiting order at infinity. Defaults to 1.

    Returns:
        float: Hill number of order `alpha` for the given clustering. For
        ``alpha=0`` the result is the integer count of distinct cluster sizes.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> cluster_hill_number(membership, alpha=0)
        3
        >>> cluster_hill_number(membership, alpha=1)
        2.82842712474619
        >>> cluster_hill_number(membership, alpha=np.inf)
        2.0
    """
    membership = MembershipVector(membership)
    cs_dist = cluster_sizes_distribution(membership)

    # Normalize the size counts into a probability distribution and drop
    # zero-probability entries so that log(p) below is always finite.
    probs = cs_dist / np.sum(cs_dist)
    probs = probs[probs > 0]

    if alpha == 0:
        # Limit at 0: number of distinct cluster sizes.
        return len(probs)
    if alpha == 1:
        # Limit at 1: exponential of the Shannon entropy.
        return np.exp(-np.sum(probs * np.log(probs)))
    # NOTE: use ``np.inf``; the ``np.Inf`` alias was removed in NumPy 2.0 and
    # would raise AttributeError here for every alpha other than 0 and 1.
    if alpha == np.inf:
        # Limit at infinity: inverse of the largest probability.
        return 1 / np.max(probs)
    return np.sum(probs**alpha) ** (1 / (1 - alpha))
def homonymy_rate(membership, names):
    r"""
    Compute the homonymy rate of a clustering given the name of each element.

    Homonymy rate:
        The homonymy rate is the proportion of clusters that share a name
        with another cluster.

    Args:
        membership (Series): Membership vector representation of a clustering.
        names (Series): Series indexed by cluster elements and with values
            corresponding to the associated name. Every index label of
            `membership` must be present in the index of `names`.

    Returns:
        float: Homonymy rate.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> names = pd.Series(index=[1,2,3,4,5,6,7,8], data=["n1", "n2", "n3", "n4", "n3", "n1", "n2", "n8"])
        >>> homonymy_rate(membership, names)
        0.5
    """
    membership = MembershipVector(membership)
    assert isinstance(names, pd.Series)
    assert all(membership.index.isin(names.index))

    # Align cluster identifiers and names on their shared index labels.
    table = pd.concat(
        {"membership": membership, "name": names},
        axis=1,
        join="inner",
        copy=False,
    )

    # Occurrences of each name over the whole table.
    name_column = table.name
    overall = name_column.groupby(name_column).count().reset_index(name="total_count")

    # Occurrences of each name inside each cluster.
    pair_sizes = table.groupby(["name", "membership"]).size()
    within = pair_sizes.reset_index(name="cluster_count")

    combined = within.merge(
        overall,
        on="name",
        copy=False,
        validate="m:1",
    )

    # A positive difference means the name also occurs outside this cluster.
    combined["diff"] = combined.total_count - combined.cluster_count

    # A cluster is homonymous if any of its names occurs elsewhere.
    worst_per_cluster = combined.groupby("membership").agg({"diff": "max"})
    is_homonymous = worst_per_cluster > 0
    return is_homonymous.mean().values[0]
def name_variation_rate(membership, names):
    r"""
    Compute the name variation rate of a clustering given the name of each
    element.

    Name variation rate:
        The name variation rate is the proportion of clusters with name
        variation within, i.e. clusters containing more than one distinct
        name.

    Args:
        membership (Series): Membership vector representation of a clustering.
        names (Series): Series indexed by cluster elements and with values
            corresponding to the associated name. Every index label of
            `membership` must be present in the index of `names`.

    Returns:
        float: Name variation rate.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> names = pd.Series(index=[1,2,3,4,5,6,7,8], data=["n1", "n2", "n3", "n4", "n3", "n1", "n2", "n8"])
        >>> name_variation_rate(membership, names)
        0.5
    """
    membership = MembershipVector(membership)
    assert isinstance(names, pd.Series)
    assert all(membership.index.isin(names.index))

    # Align cluster identifiers and names on their shared index labels.
    table = pd.concat(
        {"membership": membership, "name": names},
        axis=1,
        join="inner",
        copy=False,
    )

    # Number of distinct names found in each cluster.
    distinct_names = table.groupby("membership").nunique()

    # A cluster shows name variation when it holds more than one name.
    has_variation = distinct_names > 1
    return has_variation.mean().values[0]